7326b3af2a21d5258c94b000975fea6ad5ddf31d
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return the SrcReg index of the channels that can be immediate float operands
41 * instead of usage of PROGRAM_CONSTANT values through push/pull.
42 */
43 static GLboolean
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allow two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return GL_TRUE;
72 }
73
74 if (opcode > ARRAY_SIZE(opcode_array))
75 return GL_FALSE;
76
77 return arg == opcode_array[opcode] - 1;
78 }
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
/* Release every temporary allocated since register setup, resetting the
 * scratch allocator back to its initial position.
 */
static void release_tmps( struct brw_vs_compile *c )
{
   c->last_tmp = c->first_tmp;
}
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
135 /* The message length for all SEND messages is restricted to [1,15]. This
136 * includes 1 for the header, so anything in slots 14 and above needs to be
137 * placed in a general-purpose register and emitted using a second URB write.
138 */
139 #define MAX_SLOTS_IN_FIRST_URB_WRITE 14
140
141 /**
142 * Determine whether the given vertex output can be written directly to a MRF
143 * or whether it has to be stored in a general-purpose register.
144 */
145 static inline bool can_use_direct_mrf(int vert_result,
146 int first_reladdr_output, int slot)
147 {
148 if (vert_result == VERT_RESULT_HPOS || vert_result == VERT_RESULT_PSIZ) {
149 /* These never go straight into MRF's. They are placed in the MRF by
150 * epilog code.
151 */
152 return false;
153 }
154 if (first_reladdr_output <= vert_result && vert_result < VERT_RESULT_MAX) {
155 /* Relative addressing might be used to access this vert_result, so it
156 * needs to go into a general-purpose register.
157 */
158 return false;
159 }
160 if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE) {
161 /* This output won't go out until the second URB write so it must be
162 * stored in a general-purpose register until then.
163 */
164 return false;
165 }
166 return true;
167 }
168
/**
 * Preallocate GRF register before code emit.
 * Do things as simply as possible.  Allocate and populate all regs
 * ahead of time.
 *
 * Lays out, in order: r0, user clip planes, push constants, vertex
 * inputs, outputs (GRF-staged ones only), program temporaries, address
 * registers, pull-constant load slots, output shadow copies, the control
 * flow stack, and finally the scratch-temporary area.
 */
static void brw_vs_alloc_regs( struct brw_vs_compile *c )
{
   struct intel_context *intel = &c->func.brw->intel;
   GLuint i, reg = 0, slot;
   int attributes_in_vue;
   int first_reladdr_output;
   int max_constant;
   int constant = 0;
   struct brw_vertex_program *vp = c->vp;
   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;

   /* Determine whether to use a real constant buffer or use a block
    * of GRF registers for constants.  The later is faster but only
    * works if everything fits in the GRF.
    * XXX this heuristic/check may need some fine tuning...
    */
   if (c->vp->program.Base.Parameters->NumParameters +
       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
      c->vp->use_const_buffer = GL_TRUE;
   else
      c->vp->use_const_buffer = GL_FALSE;

   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/

   /* r0 -- reserved as usual
    */
   c->r0 = brw_vec8_grf(reg, 0);
   reg++;

   /* User clip planes from curbe:
    */
   if (c->key.userclip_active) {
      if (intel->gen >= 6) {
	 /* NOTE(review): this loop uses <= while the pre-gen6 loop below
	  * uses < — looks like an off-by-one; confirm the intended
	  * semantics of nr_userclip_plane_consts before changing.
	  */
	 for (i = 0; i <= c->key.nr_userclip_plane_consts; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += ALIGN(c->key.nr_userclip_plane_consts, 2) / 2;
      } else {
	 /* Pre-gen6 the clip planes sit at a fixed offset of 6 vec4s into
	  * the curbe.
	  */
	 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
	    c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
						  (i % 2) * 4), 0, 4, 1);
	 }
	 reg += (ALIGN(6 + c->key.nr_userclip_plane_consts, 4) / 4) * 2;
      }

   }

   /* Assign some (probably all) of the vertex program constants to
    * the push constant buffer/CURBE.
    *
    * There's an obvious limit to the number of push constants equal to
    * the number of register available, and that number is smaller
    * than the minimum maximum number of vertex program parameters, so
    * support for pull constants is required if we overflow.
    * Additionally, on gen6 the number of push constants is even
    * lower.
    *
    * When there's relative addressing, we don't know what range of
    * Mesa IR registers can be accessed.  And generally, when relative
    * addressing is used we also have too many constants to load them
    * all as push constants.  So, we'll just support relative
    * addressing out of the pull constant buffers, and try to load as
    * many statically-accessed constants into the push constant buffer
    * as we can.
    */
   if (intel->gen >= 6) {
      /* We can only load 32 regs of push constants. */
      max_constant = 32 * 2 - c->key.nr_userclip_plane_consts;
   } else {
      max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
   }

   /* constant_map maps from ParameterValues[] index to index in the
    * push constant buffer, or -1 if it's only in the pull constant
    * buffer.
    */
   memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
   for (i = 0;
	i < c->vp->program.Base.NumInstructions && constant < max_constant;
	i++) {
      struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
      int arg;

      for (arg = 0; arg < 3 && constant < max_constant; arg++) {
	 /* Only constant-like source files get push slots. */
	 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
	     inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
	     inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
	     inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
	     inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
	    continue;
	 }

	 /* Relative addressing forces the pull path for this program. */
	 if (inst->SrcReg[arg].RelAddr) {
	    c->vp->use_const_buffer = GL_TRUE;
	    continue;
	 }

	 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
	    c->constant_map[inst->SrcReg[arg].Index] = constant++;
	 }
      }
   }

   /* If we ran out of push constant space, then we'll also upload all
    * constants through the pull constant buffer so that they can be
    * accessed no matter what.  For relative addressing (the common
    * case) we need them all in place anyway.
    */
   if (constant == max_constant)
      c->vp->use_const_buffer = GL_TRUE;

   /* Set up the references to the pull parameters if present.  This backend
    * uses a 1:1 mapping from Mesa IR's index to location in the pull constant
    * buffer, while the new VS backend allocates values to the pull buffer on
    * demand.
    */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < params->NumParameters * 4; i++) {
	 c->prog_data.pull_param[i] = &params->ParameterValues[i / 4][i % 4].f;
      }
      c->prog_data.nr_pull_params = i;
   }

   /* Push constants: two vec4s per GRF, addressed via subregister 0 or 4. */
   for (i = 0; i < constant; i++) {
      c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
							  (i % 2) * 4),
					     0, 4, 1);
   }
   reg += (constant + 1) / 2;
   c->prog_data.curb_read_length = reg - 1;
   c->prog_data.nr_params = constant * 4;
   /* XXX 0 causes a bug elsewhere... */
   if (intel->gen < 6 && c->prog_data.nr_params == 0)
      c->prog_data.nr_params = 4;

   /* Allocate input regs:
    */
   c->nr_inputs = 0;
   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (c->prog_data.inputs_read & (1 << i)) {
	 c->nr_inputs++;
	 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }
   /* If there are no inputs, we'll still be reading one attribute's worth
    * because it's required -- see urb_read_length setting.
    */
   if (c->nr_inputs == 0)
      reg++;

   /* Allocate outputs.  The non-position outputs go straight into message regs.
    */
   brw_compute_vue_map(&c->vue_map, intel, c->key.userclip_active,
                       c->prog_data.outputs_written);
   c->first_output = reg;

   first_reladdr_output = get_first_reladdr_output(&c->vp->program);

   for (slot = 0; slot < c->vue_map.num_slots; slot++) {
      int vert_result = c->vue_map.slot_to_vert_result[slot];
      assert(vert_result < Elements(c->regs[PROGRAM_OUTPUT]));
      if (can_use_direct_mrf(vert_result, first_reladdr_output, slot)) {
	 c->regs[PROGRAM_OUTPUT][vert_result] = brw_message_reg(slot + 1);
      } else {
	 /* Staged in a GRF; the epilog copies it to the URB later. */
	 c->regs[PROGRAM_OUTPUT][vert_result] = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   /* Allocate program temporaries:
    */
   for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
      c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
      reg++;
   }

   /* Address reg(s).  Don't try to use the internal address reg until
    * deref time.
    */
   for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
      c->regs[PROGRAM_ADDRESS][i] =  brw_reg(BRW_GENERAL_REGISTER_FILE,
					     reg,
					     0,
					     BRW_REGISTER_TYPE_D,
					     BRW_VERTICAL_STRIDE_8,
					     BRW_WIDTH_8,
					     BRW_HORIZONTAL_STRIDE_1,
					     BRW_SWIZZLE_XXXX,
					     WRITEMASK_X);
      reg++;
   }

   /* One landing register per source-argument slot for pull-constant
    * loads.
    */
   if (c->vp->use_const_buffer) {
      for (i = 0; i < 3; i++) {
	 c->current_const[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
      clear_current_const(c);
   }

   /* Shadow copies for outputs that are also read as sources. */
   for (i = 0; i < 128; i++) {
      if (c->output_regs[i].used_in_src) {
	 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
	 reg++;
      }
   }

   /* Control-flow return stack (two GRFs of 16-bit entries). */
   if (c->needs_stack) {
      c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
      reg += 2;
   }

   /* Some opcodes need an internal temporary:
    */
   c->first_tmp = reg;
   c->last_tmp = reg;		/* for allocation purposes */

   /* Each input reg holds data from two vertices.  The
    * urb_read_length is the number of registers read from *each*
    * vertex urb, so is half the amount:
    */
   c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
   /* Setting this field to 0 leads to undefined behavior according to the
    * the VS_STATE docs.  Our VUEs will always have at least one attribute
    * sitting in them, even if it's padding.
    */
   if (c->prog_data.urb_read_length == 0)
      c->prog_data.urb_read_length = 1;

   /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
    * them to fit the biggest thing they need to.
    */
   attributes_in_vue = MAX2(c->vue_map.num_slots, c->nr_inputs);

   if (intel->gen == 6) {
      /* Each attribute is 32 bytes (2 vec4s), so dividing by 8 gives us the
       * number of 128-byte (1024-bit) units.
       */
      c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 8) / 8;
   } else {
      /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
       * number of 64-byte (512-bit) units.
       */
      c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 4) / 4;
   }

   c->prog_data.total_grf = reg;

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
      printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
      printf("%s reg = %d\n", __FUNCTION__, reg);
   }
}
430
431
432 /**
433 * If an instruction uses a temp reg both as a src and the dest, we
434 * sometimes need to allocate an intermediate temporary.
435 */
436 static void unalias1( struct brw_vs_compile *c,
437 struct brw_reg dst,
438 struct brw_reg arg0,
439 void (*func)( struct brw_vs_compile *,
440 struct brw_reg,
441 struct brw_reg ))
442 {
443 if (dst.file == arg0.file && dst.nr == arg0.nr) {
444 struct brw_compile *p = &c->func;
445 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
446 func(c, tmp, arg0);
447 brw_MOV(p, dst, tmp);
448 release_tmp(c, tmp);
449 }
450 else {
451 func(c, dst, arg0);
452 }
453 }
454
455 /**
456 * \sa unalias2
457 * Checkes if 2-operand instruction needs an intermediate temporary.
458 */
459 static void unalias2( struct brw_vs_compile *c,
460 struct brw_reg dst,
461 struct brw_reg arg0,
462 struct brw_reg arg1,
463 void (*func)( struct brw_vs_compile *,
464 struct brw_reg,
465 struct brw_reg,
466 struct brw_reg ))
467 {
468 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
469 (dst.file == arg1.file && dst.nr == arg1.nr)) {
470 struct brw_compile *p = &c->func;
471 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
472 func(c, tmp, arg0, arg1);
473 brw_MOV(p, dst, tmp);
474 release_tmp(c, tmp);
475 }
476 else {
477 func(c, dst, arg0, arg1);
478 }
479 }
480
481 /**
482 * \sa unalias2
483 * Checkes if 3-operand instruction needs an intermediate temporary.
484 */
485 static void unalias3( struct brw_vs_compile *c,
486 struct brw_reg dst,
487 struct brw_reg arg0,
488 struct brw_reg arg1,
489 struct brw_reg arg2,
490 void (*func)( struct brw_vs_compile *,
491 struct brw_reg,
492 struct brw_reg,
493 struct brw_reg,
494 struct brw_reg ))
495 {
496 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
497 (dst.file == arg1.file && dst.nr == arg1.nr) ||
498 (dst.file == arg2.file && dst.nr == arg2.nr)) {
499 struct brw_compile *p = &c->func;
500 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
501 func(c, tmp, arg0, arg1, arg2);
502 brw_MOV(p, dst, tmp);
503 release_tmp(c, tmp);
504 }
505 else {
506 func(c, dst, arg0, arg1, arg2);
507 }
508 }
509
/* Emit dst = (arg0 <cond> arg1) ? 1.0 : 0.0 per channel.
 *
 * NOTE(review): the MOV of 1.0 appears to execute predicated on the flag
 * set by the preceding CMP (brw_CMP with a null destination), with the
 * final call restoring unpredicated execution — confirm against the
 * brw_eu_emit helpers before relying on this ordering.
 */
static void emit_sop( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      GLuint cond)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0.0f));
   brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
   brw_MOV(p, dst, brw_imm_f(1.0f));
   brw_set_predicate_control_flag_value(p, 0xff);
}
523
/* OPCODE_SEQ: dst = (arg0 == arg1) ? 1.0 : 0.0 */
static void emit_seq( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
}

/* OPCODE_SNE: dst = (arg0 != arg1) ? 1.0 : 0.0 */
static void emit_sne( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
}

/* OPCODE_SLT: dst = (arg0 < arg1) ? 1.0 : 0.0 */
static void emit_slt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
}

/* OPCODE_SLE: dst = (arg0 <= arg1) ? 1.0 : 0.0 */
static void emit_sle( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
}

/* OPCODE_SGT: dst = (arg0 > arg1) ? 1.0 : 0.0 */
static void emit_sgt( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
}

/* OPCODE_SGE: dst = (arg0 >= arg1) ? 1.0 : 0.0 */
static void emit_sge( struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
  emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
}
570
/* OPCODE_CMP: dst = (arg0 < 0) ? arg1 : arg2, per channel.
 * The SEL chooses between arg1/arg2 under the flag set by the CMP;
 * predication is cleared afterwards for subsequent instructions.
 */
static void emit_cmp( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1,
		      struct brw_reg arg2 )
{
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_SEL(p, dst, arg1, arg2);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
581
/* OPCODE_SSG: dst = sign(arg0) in {-1, 0, +1} per channel.
 * Start with 0, then overwrite with -1 where arg0 < 0 and +1 where
 * arg0 > 0, each MOV predicated by the preceding CMP.
 */
static void emit_sign(struct brw_vs_compile *c,
		      struct brw_reg dst,
		      struct brw_reg arg0)
{
   struct brw_compile *p = &c->func;

   brw_MOV(p, dst, brw_imm_f(0));

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(-1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);

   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
   brw_MOV(p, dst, brw_imm_f(1.0));
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
}
598
/* OPCODE_MAX: dst = max(arg0, arg1) per channel.
 * Gen6+ can use SEL with a GE conditional modifier directly; earlier
 * parts emit a CMP followed by a predicated SEL.
 */
static void emit_max( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
617
/* OPCODE_MIN: dst = min(arg0, arg1) per channel.
 * Mirrors emit_max() but with the L (less-than) conditional.
 */
static void emit_min( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg arg0,
		      struct brw_reg arg1 )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   } else {
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
      brw_SEL(p, dst, arg0, arg1);
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   }
}
636
/* OPCODE_ARL: load the address register with floor(src).
 * On gen6+ the value is rounded into a float-typed view of dst first and
 * then converted by the MOV — presumably a gen6 restriction on RNDD's
 * destination; confirm against the gen6 ISA docs.
 */
static void emit_arl(struct brw_compile *p,
		     struct brw_reg dst,
		     struct brw_reg src)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);

      brw_RNDD(p, dst_f, src);
      brw_MOV(p, dst, dst_f);
   } else {
      brw_RNDD(p, dst, src);
   }
}
652
/* Emit a single-source math-box call (EXP, LOG, RSQ, ...) for gen4/5.
 * If dst is not a plain, fully-writemasked GRF the result is staged in a
 * scratch register and copied, working around SEND result handling.
 */
static void emit_math1_gen4(struct brw_vs_compile *c,
			    GLuint function,
			    struct brw_reg dst,
			    struct brw_reg arg0,
			    GLuint precision)
{
   /* There are various odd behaviours with SEND on the simulator.  In
    * addition there are documented issues with the fact that the GEN4
    * processor doesn't do dependency control properly on SEND
    * results.  So, on balance, this kludge to get around failures
    * with writemasked math results looks like it might be necessary
    * whether that turns out to be a simulator bug or not:
    */
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
691
/* Emit a single-source math-box call for gen6, staging both source and
 * destination through scratch registers and switching to align1 mode
 * around the math instruction.
 */
static void
emit_math1_gen6(struct brw_vs_compile *c,
		GLuint function,
		struct brw_reg dst,
		struct brw_reg arg0,
		GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src, tmp_dst;

   /* Something is strange on gen6 math in 16-wide mode, though the
    * docs say it's supposed to work.  Punt to using align1 mode,
    * which doesn't do writemasking and swizzles.
    */
   tmp_src = get_tmp(c);
   tmp_dst = get_tmp(c);

   brw_MOV(p, tmp_src, arg0);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math(p,
	    tmp_dst,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    tmp_src,
	    BRW_MATH_DATA_SCALAR,
	    precision);
   brw_set_access_mode(p, BRW_ALIGN_16);

   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src);
   release_tmp(c, tmp_dst);
}
727
728 static void
729 emit_math1(struct brw_vs_compile *c,
730 GLuint function,
731 struct brw_reg dst,
732 struct brw_reg arg0,
733 GLuint precision)
734 {
735 struct brw_compile *p = &c->func;
736 struct intel_context *intel = &p->brw->intel;
737
738 if (intel->gen >= 6)
739 emit_math1_gen6(c, function, dst, arg0, precision);
740 else
741 emit_math1_gen4(c, function, dst, arg0, precision);
742 }
743
/* Emit a two-source math-box call (POW) for gen4/5.  The second operand
 * is delivered through message register m3; as in emit_math1_gen4(), the
 * result is staged in a scratch register when dst isn't a plain,
 * fully-writemasked GRF.
 */
static void emit_math2_gen4( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = GL_FALSE;

   if (dst.file != BRW_GENERAL_REGISTER_FILE ||
       dst.dw1.bits.writemask != 0xf)
      need_tmp = GL_TRUE;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Second source operand goes out in the math SEND's message payload. */
   brw_MOV(p, brw_message_reg(3), arg1);

   brw_math(p,
	    tmp,
	    function,
	    BRW_MATH_SATURATE_NONE,
	    2,
	    arg0,
	    BRW_MATH_DATA_SCALAR,
	    precision);

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
778
/* Emit a two-source math-box call for gen6, staging both sources and the
 * destination through scratch registers and using align1 mode, mirroring
 * emit_math1_gen6().
 */
static void emit_math2_gen6( struct brw_vs_compile *c,
			     GLuint function,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     GLuint precision)
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp_src0, tmp_src1, tmp_dst;

   tmp_src0 = get_tmp(c);
   tmp_src1 = get_tmp(c);
   tmp_dst = get_tmp(c);

   brw_MOV(p, tmp_src0, arg0);
   brw_MOV(p, tmp_src1, arg1);

   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_math2(p,
	    tmp_dst,
	    function,
	    tmp_src0,
	    tmp_src1);
   brw_set_access_mode(p, BRW_ALIGN_16);

   brw_MOV(p, dst, tmp_dst);

   release_tmp(c, tmp_src0);
   release_tmp(c, tmp_src1);
   release_tmp(c, tmp_dst);
}
810
811 static void emit_math2( struct brw_vs_compile *c,
812 GLuint function,
813 struct brw_reg dst,
814 struct brw_reg arg0,
815 struct brw_reg arg1,
816 GLuint precision)
817 {
818 struct brw_compile *p = &c->func;
819 struct intel_context *intel = &p->brw->intel;
820
821 if (intel->gen >= 6)
822 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
823 else
824 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
825 }
826
/* OPCODE_EXP: dst = {2^floor(x), frac(x), 2^x, 1.0} from arg0.x.
 * dst must not alias arg0 (callers go through unalias1).  The X channel
 * builds 2^floor(x) by constructing the float bit pattern directly:
 * biasing the integer exponent by 127 and shifting it into bits 30:23.
 */
static void emit_exp_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;


   if (dst.dw1.bits.writemask & WRITEMASK_X) {
      struct brw_reg tmp = get_tmp(c);
      struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);

      /* tmp_d = floor(arg0.x) */
      brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));

      /* result[0] = 2.0 ^ tmp */

      /* Adjust exponent for floating point:
       * exp += 127
       */
      brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));

      /* Install exponent and sign.
       * Excess drops off the edge:
       */
      brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
	      tmp_d, brw_imm_d(23));

      release_tmp(c, tmp);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Y) {
      /* result[1] = arg0.x - floor(arg0.x) */
      brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* As with the LOG instruction, we might be better off just
       * doing a taylor expansion here, seeing as we have to do all
       * the prep work.
       *
       * If mathbox partial precision is too low, consider also:
       * result[3] = result[0] * EXP(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_EXP,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(arg0, 0),
		 BRW_MATH_PRECISION_FULL);
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
   }
}
882
883
/* OPCODE_LOG: dst = {exponent(x), mantissa(x), log2(x), 1.0} from arg0.x,
 * extracting exponent and mantissa by bit manipulation of the IEEE float
 * representation.  dst must not alias arg0 (callers go through unalias1).
 */
static void emit_log_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
   GLboolean need_tmp = (dst.dw1.bits.writemask != 0xf ||
			 dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp) {
      tmp = get_tmp(c);
      tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
   }

   /* Perform mant = frexpf(fabsf(x), &exp), adjust exp and mnt
    * according to spec:
    *
    * These almost look like they could be joined up, but not really
    * practical:
    *
    * result[0].f = (x.i & ((1<<31)-1) >> 23) - 127
    * result[1].i = (x.i & ((1<<23)-1) + (127<<23)
    */
   if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
      /* X: strip the sign bit, shift the exponent down, unbias it. */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1U<<31)-1));

      brw_SHR(p,
	      brw_writemask(tmp_ud, WRITEMASK_X),
	      tmp_ud,
	      brw_imm_ud(23));

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_X),
	      retype(tmp_ud, BRW_REGISTER_TYPE_D),	/* does it matter? */
	      brw_imm_d(-127));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
      /* Y: keep the mantissa bits and install a biased exponent of 0,
       * yielding a float in [1, 2).
       */
      brw_AND(p,
	      brw_writemask(tmp_ud, WRITEMASK_Y),
	      brw_swizzle1(arg0_ud, 0),
	      brw_imm_ud((1<<23)-1));

      brw_OR(p,
	     brw_writemask(tmp_ud, WRITEMASK_Y),
	     tmp_ud,
	     brw_imm_ud(127<<23));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_Z) {
      /* result[2] = result[0] + LOG2(result[1]); */

      /* Why bother?  The above is just a hint how to do this with a
       * taylor series.  Maybe we *should* use a taylor series as by
       * the time all the above has been done it's almost certainly
       * quicker than calling the mathbox, even with low precision.
       *
       * Options are:
       *    - result[0] + mathbox.LOG2(result[1])
       *    - mathbox.LOG2(arg0.x)
       *    - result[0] + inline_taylor_approx(result[1])
       */
      emit_math1(c,
		 BRW_MATH_FUNCTION_LOG,
		 brw_writemask(tmp, WRITEMASK_Z),
		 brw_swizzle1(tmp, 1),
		 BRW_MATH_PRECISION_FULL);

      brw_ADD(p,
	      brw_writemask(tmp, WRITEMASK_Z),
	      brw_swizzle1(tmp, 2),
	      brw_swizzle1(tmp, 0));
   }

   if (dst.dw1.bits.writemask & WRITEMASK_W) {
      /* result[3] = 1.0; */
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
   }

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
973
974
/* OPCODE_DST: dst = {1, arg0.y*arg1.y, arg0.z, arg1.w}.
 * Need to unalias - consider swizzles:   r0 = DST r0.xxxx r1
 */
static void emit_dst_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0,
			      struct brw_reg arg1)
{
   struct brw_compile *p = &c->func;

   /* There must be a better way to do this:
    */
   if (dst.dw1.bits.writemask & WRITEMASK_X)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
   if (dst.dw1.bits.writemask & WRITEMASK_Y)
      brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
   if (dst.dw1.bits.writemask & WRITEMASK_Z)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
   if (dst.dw1.bits.writemask & WRITEMASK_W)
      brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
}
995
996
/* OPCODE_XPD: dst = t x u (cross product).
 * The MUL writes t.yzx * u.zxy into the accumulator (null destination);
 * the MAC then computes dst = acc - t.zxy * u.yzx.  The two instructions
 * must stay adjacent because of the implicit accumulator dependency.
 */
static void emit_xpd( struct brw_compile *p,
		      struct brw_reg dst,
		      struct brw_reg t,
		      struct brw_reg u)
{
   brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3),  brw_swizzle(u,2,0,1,3));
   brw_MAC(p, dst,  negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
}
1005
1006
/* OPCODE_LIT: compute lighting coefficients from arg0
 * (dst = {1, max(x,0), x>0 ? z^w : 0, 1}).  dst must not alias arg0.
 *
 * NOTE(review): when need_tmp is false this passes dst to release_tmp();
 * release_tmp's nr check makes that a no-op unless dst happens to be the
 * newest scratch register — verify the allocation order guarantees this.
 */
static void emit_lit_noalias( struct brw_vs_compile *c,
			      struct brw_reg dst,
			      struct brw_reg arg0 )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = dst;
   GLboolean need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);

   if (need_tmp)
      tmp = get_tmp(c);

   /* Default results: Y and Z to 0, X and W to 1. */
   brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
   brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));

   /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
    * to get all channels active inside the IF.  In the clipping code
    * we run with NoMask, so it's not an option and we can use
    * BRW_EXECUTE_1 for all comparisons.
    */
   brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
   brw_IF(p, BRW_EXECUTE_8);
   {
      brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));

      /* Clamp the spotlight base to zero before raising it to a power. */
      brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
      brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z),  brw_swizzle1(arg0,1));
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      emit_math2(c,
		 BRW_MATH_FUNCTION_POW,
		 brw_writemask(dst, WRITEMASK_Z),
		 brw_swizzle1(tmp, 2),
		 brw_swizzle1(arg0, 3),
		 BRW_MATH_PRECISION_PARTIAL);
   }
   brw_ENDIF(p);

   release_tmp(c, tmp);
}
1046
/* OPCODE_LRP: dst = arg0 * arg1 + (1 - arg0) * arg2.
 * dst must not alias any source.  The MUL accumulates (1-arg0)*arg2 into
 * the accumulator; the MAC must immediately follow it because of the
 * implicit accumulator dependency.
 */
static void emit_lrp_noalias(struct brw_vs_compile *c,
			     struct brw_reg dst,
			     struct brw_reg arg0,
			     struct brw_reg arg1,
			     struct brw_reg arg2)
{
   struct brw_compile *p = &c->func;

   brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
   brw_MUL(p, brw_null_reg(), dst, arg2);
   brw_MAC(p, dst, arg0, arg1);
}
1059
/* Return a register holding the value of constant SrcReg[argIndex] for
 * the given instruction, issuing a data-port read from the pull constant
 * buffer only when the value is not already resident in the per-argument
 * constant register (tracked via current_const[argIndex].index).
 */
static struct brw_reg
get_constant(struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_reg const_reg = c->current_const[argIndex].reg;

   assert(argIndex < 3);

   if (c->current_const[argIndex].index != src->Index) {
      /* Keep track of the last constant loaded in this slot, for reuse. */
      c->current_const[argIndex].index = src->Index;

#if 0
      printf("  fetch const[%d] for arg %d into reg %d\n",
             src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif
      /* need to fetch the constant now */
      brw_dp_READ_4_vs(p,
                       const_reg,                      /* writeback dest */
                       16 * src->Index,                /* byte offset */
                       SURF_INDEX_VERT_CONST_BUFFER    /* binding table index */
                       );
   }

   /* replicate lower four floats into upper half (to get XYZWXYZW) */
   const_reg = stride(const_reg, 0, 4, 1);
   const_reg.subnr = 0;

   return const_reg;
}
1093
/**
 * Fetch a relative-addressed constant (const[a0.x + Index]) from the
 * pull constant buffer into this argument slot's register.
 *
 * Unlike get_constant(), the result can never be cached, since the
 * address register contents are only known at run time.  On gen6 the
 * message takes the element index directly; on earlier parts the
 * address must be scaled to a byte offset first.
 */
static struct brw_reg
get_reladdr_constant(struct brw_vs_compile *c,
		     const struct prog_instruction *inst,
		     GLuint argIndex)
{
   const struct prog_src_register *src = &inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg const_reg = c->current_const[argIndex].reg;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   uint32_t offset;

   assert(argIndex < 3);

   /* Can't reuse a reladdr constant load. */
   c->current_const[argIndex].index = -1;

#if 0
   printf("  fetch const[a0.x+%d] for arg %d into reg %d\n",
	  src->Index, argIndex, c->current_const[argIndex].reg.nr);
#endif

   if (intel->gen >= 6) {
      offset = src->Index;
   } else {
      /* Pre-gen6 wants a byte offset: scale the element index by the
       * 16-byte vec4 size.
       * NOTE(review): this temporary is never passed to release_tmp() —
       * looks like a leaked tmp slot; confirm against get_tmp() usage.
       */
      struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
      brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
      addr_reg = byte_addr_reg;
      offset = 16 * src->Index;
   }

   /* fetch the first vec4 */
   brw_dp_READ_4_vs_relative(p,
			     const_reg,
			     addr_reg,
			     offset,
			     SURF_INDEX_VERT_CONST_BUFFER);

   return const_reg;
}
1135
1136
1137
1138 /* TODO: relative addressing!
1139 */
1140 static struct brw_reg get_reg( struct brw_vs_compile *c,
1141 gl_register_file file,
1142 GLuint index )
1143 {
1144 switch (file) {
1145 case PROGRAM_TEMPORARY:
1146 case PROGRAM_INPUT:
1147 case PROGRAM_OUTPUT:
1148 assert(c->regs[file][index].nr != 0);
1149 return c->regs[file][index];
1150 case PROGRAM_STATE_VAR:
1151 case PROGRAM_CONSTANT:
1152 case PROGRAM_UNIFORM:
1153 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1154 return c->regs[PROGRAM_STATE_VAR][index];
1155 case PROGRAM_ADDRESS:
1156 assert(index == 0);
1157 return c->regs[file][index];
1158
1159 case PROGRAM_UNDEFINED: /* undef values */
1160 return brw_null_reg();
1161
1162 case PROGRAM_LOCAL_PARAM:
1163 case PROGRAM_ENV_PARAM:
1164 case PROGRAM_WRITE_ONLY:
1165 default:
1166 assert(0);
1167 return brw_null_reg();
1168 }
1169 }
1170
1171
/**
 * Indirect addressing: get reg[[arg] + offset].
 *
 * Computes the byte addresses for both interleaved vertices (a0.0 from
 * vp_address element 0, a0.1 from element 4), then reads through a VxH
 * indirect region into a fresh temporary.  The whole sequence runs in
 * ALIGN_1 mode because address-register arithmetic is scalar.
 *
 * The returned temporary is intentionally NOT released here; the caller
 * consumes it as a source operand.
 */
static struct brw_reg deref( struct brw_vs_compile *c,
			     struct brw_reg arg,
			     GLint offset,
			     GLuint reg_size )
{
   struct brw_compile *p = &c->func;
   struct brw_reg tmp = get_tmp(c);
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   /* Static part of the address: base register plus the constant offset. */
   GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Set the vertical stride on the register access so that the first
    * 4 components come from a0.0 and the second 4 from a0.1.
    */
   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;

   {
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = vertex 0's index scaled to bytes, plus the static offset */
      brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));

      /* a0.1 = same computation for vertex 1 (address element 4) */
      brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
      brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));

      brw_MOV(p, tmp, indirect);

      brw_pop_insn_state(p);
   }

   /* NOTE: tmp not released */
   return tmp;
}
1211
/**
 * Store \p val to a relative-addressed destination register
 * (reg[a0 + DstReg.Index]).
 *
 * Destination indirect addressing on this hardware is 1x1 (a single
 * address register, not VxH), so the two interleaved vertices' vec4s
 * are written with two separate ALIGN_1 moves, recomputing a0.0 for
 * each vertex's address.
 *
 * NOTE(review): the 'acc' temporary acquired via get_tmp() is not
 * released here — confirm whether that matches the file's tmp policy.
 */
static void
move_to_reladdr_dst(struct brw_vs_compile *c,
		    const struct prog_instruction *inst,
		    struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   int reg_size = 32;
   struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
   struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
   struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
   GLuint byte_offset = base.nr * 32 + base.subnr;
   struct brw_reg indirect = brw_vec4_indirect(0,0);
   struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);

   /* Because destination register indirect addressing can only use
    * one index, we'll write each vertex's vec4 value separately.
    */
   val.width = BRW_WIDTH_4;
   val.vstride = BRW_VERTICAL_STRIDE_4;

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);

   /* Vertex 0: a0.0 = index * reg_size + static base offset */
   brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
   brw_MOV(p, indirect, val);

   /* Vertex 1: index lives at address element 4; its data occupies the
    * second half of the register, hence the extra reg_size/2 bytes.
    */
   brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
   brw_ADD(p, brw_address_reg(0), acc,
	   brw_imm_uw(byte_offset + reg_size / 2));
   brw_MOV(p, indirect, suboffset(val, 4));

   brw_pop_insn_state(p);
}
1246
/**
 * Get the brw reg corresponding to the instruction's [argIndex] src reg.
 *
 * Several fast paths apply before the plain register lookup:
 *  - all-ZERO / all-ONE swizzles, and single-component PROGRAM_CONSTANT
 *    swizzles, are folded into float immediates when the opcode allows it
 *    (brw_vs_arg_can_be_immediate);
 *  - constant-like files come from the push constant registers when
 *    constant_map says they were uploaded, otherwise from the pull
 *    constant buffer;
 *  - relative addressing goes through deref() for GRF files and
 *    get_reladdr_constant() for constant files.
 */
static struct brw_reg
get_src_reg( struct brw_vs_compile *c,
             const struct prog_instruction *inst,
             GLuint argIndex )
{
   const GLuint file = inst->SrcReg[argIndex].File;
   const GLint index = inst->SrcReg[argIndex].Index;
   const GLboolean relAddr = inst->SrcReg[argIndex].RelAddr;

   if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
      const struct prog_src_register *src = &inst->SrcReg[argIndex];

      if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO,
					SWIZZLE_ZERO)) {
	 return brw_imm_f(0.0f);
      } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE,
					       SWIZZLE_ONE)) {
	 if (src->Negate)
	    return brw_imm_f(-1.0F);
	 else
	    return brw_imm_f(1.0F);
      } else if (src->File == PROGRAM_CONSTANT) {
	 const struct gl_program_parameter_list *params;
	 float f;
	 int component = -1;

	 /* Only fully-replicated swizzles can become a scalar immediate;
	  * anything else leaves component == -1 and falls through.
	  */
	 switch (src->Swizzle) {
	 case SWIZZLE_XXXX:
	    component = 0;
	    break;
	 case SWIZZLE_YYYY:
	    component = 1;
	    break;
	 case SWIZZLE_ZZZZ:
	    component = 2;
	    break;
	 case SWIZZLE_WWWW:
	    component = 3;
	    break;
	 }

	 if (component >= 0) {
	    params = c->vp->program.Base.Parameters;
	    f = params->ParameterValues[src->Index][component].f;

	    /* Fold the source modifiers into the immediate value. */
	    if (src->Abs)
	       f = fabs(f);
	    if (src->Negate)
	       f = -f;
	    return brw_imm_f(f);
	 }
      }
   }

   switch (file) {
   case PROGRAM_TEMPORARY:
   case PROGRAM_INPUT:
   case PROGRAM_OUTPUT:
      if (relAddr) {
         return deref(c, c->regs[file][0], index, 32);
      }
      else {
         assert(c->regs[file][index].nr != 0);
         return c->regs[file][index];
      }

   case PROGRAM_STATE_VAR:
   case PROGRAM_CONSTANT:
   case PROGRAM_UNIFORM:
   case PROGRAM_ENV_PARAM:
   case PROGRAM_LOCAL_PARAM:
      if (!relAddr && c->constant_map[index] != -1) {
         /* Take from the push constant buffer if possible. */
         assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
         return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
      } else {
         /* Must be in the pull constant buffer then .*/
         assert(c->vp->use_const_buffer);
         if (relAddr)
            return get_reladdr_constant(c, inst, argIndex);
         else
            return get_constant(c, inst, argIndex);
      }
   case PROGRAM_ADDRESS:
      assert(index == 0);
      return c->regs[file][index];

   case PROGRAM_UNDEFINED:
      /* this is a normal case since we loop over all three src args */
      return brw_null_reg();

   case PROGRAM_WRITE_ONLY:
   default:
      assert(0);
      return brw_null_reg();
   }
}
1352
1353 /**
1354 * Return the brw reg for the given instruction's src argument.
1355 * Will return mangled results for SWZ op. The emit_swz() function
1356 * ignores this result and recalculates taking extended swizzles into
1357 * account.
1358 */
1359 static struct brw_reg get_arg( struct brw_vs_compile *c,
1360 const struct prog_instruction *inst,
1361 GLuint argIndex )
1362 {
1363 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1364 struct brw_reg reg;
1365
1366 if (src->File == PROGRAM_UNDEFINED)
1367 return brw_null_reg();
1368
1369 reg = get_src_reg(c, inst, argIndex);
1370
1371 /* Convert 3-bit swizzle to 2-bit.
1372 */
1373 if (reg.file != BRW_IMMEDIATE_VALUE) {
1374 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1375 GET_SWZ(src->Swizzle, 1),
1376 GET_SWZ(src->Swizzle, 2),
1377 GET_SWZ(src->Swizzle, 3));
1378
1379 /* Note this is ok for non-swizzle ARB_vp instructions */
1380 reg.negate = src->Negate ? 1 : 0;
1381 }
1382
1383 return reg;
1384 }
1385
1386
1387 /**
1388 * Get brw register for the given program dest register.
1389 */
1390 static struct brw_reg get_dst( struct brw_vs_compile *c,
1391 struct prog_dst_register dst )
1392 {
1393 struct brw_reg reg;
1394
1395 switch (dst.File) {
1396 case PROGRAM_TEMPORARY:
1397 case PROGRAM_OUTPUT:
1398 /* register-indirect addressing is only 1x1, not VxH, for
1399 * destination regs. So, for RelAddr we'll return a temporary
1400 * for the dest and do a move of the result to the RelAddr
1401 * register after the instruction emit.
1402 */
1403 if (dst.RelAddr) {
1404 reg = get_tmp(c);
1405 } else {
1406 assert(c->regs[dst.File][dst.Index].nr != 0);
1407 reg = c->regs[dst.File][dst.Index];
1408 }
1409 break;
1410 case PROGRAM_ADDRESS:
1411 assert(dst.Index == 0);
1412 reg = c->regs[dst.File][dst.Index];
1413 break;
1414 case PROGRAM_UNDEFINED:
1415 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1416 reg = brw_null_reg();
1417 break;
1418 default:
1419 assert(0);
1420 reg = brw_null_reg();
1421 }
1422
1423 assert(reg.type != BRW_IMMEDIATE_VALUE);
1424 reg.dw1.bits.writemask = dst.WriteMask;
1425
1426 return reg;
1427 }
1428
1429
/**
 * Emit OPCODE_SWZ, whose extended swizzle supports ZERO/ONE terms and
 * per-component negation that the hardware's 2-bit swizzle can't encode.
 *
 * The result is assembled with up to four MOVs: the plainly-swizzled
 * components, the 0.0 components, the 1.0 components, and finally a
 * negating MOV over the negated components (src.Negate is a 4-bit
 * per-component mask here, so it doubles as a writemask).
 *
 * A temporary is needed when negation applies and dst is not a GRF
 * (e.g. a message register), since the negate pass reads the partial
 * result back.
 */
static void emit_swz( struct brw_vs_compile *c,
		      struct brw_reg dst,
                      const struct prog_instruction *inst)
{
   const GLuint argIndex = 0;
   const struct prog_src_register src = inst->SrcReg[argIndex];
   struct brw_compile *p = &c->func;
   GLuint zeros_mask = 0;
   GLuint ones_mask = 0;
   GLuint src_mask = 0;
   GLubyte src_swz[4];
   GLboolean need_tmp = (src.Negate &&
                         dst.file != BRW_GENERAL_REGISTER_FILE);
   struct brw_reg tmp = dst;
   GLuint i;

   if (need_tmp)
      tmp = get_tmp(c);

   /* Classify each written component: real source channel, 0.0, or 1.0. */
   for (i = 0; i < 4; i++) {
      if (dst.dw1.bits.writemask & (1<<i)) {
	 GLubyte s = GET_SWZ(src.Swizzle, i);
	 switch (s) {
	 case SWIZZLE_X:
	 case SWIZZLE_Y:
	 case SWIZZLE_Z:
	 case SWIZZLE_W:
	    src_mask |= 1<<i;
	    src_swz[i] = s;
	    break;
	 case SWIZZLE_ZERO:
	    zeros_mask |= 1<<i;
	    break;
	 case SWIZZLE_ONE:
	    ones_mask |= 1<<i;
	    break;
	 }
      }
   }

   /* Do src first, in case dst aliases src:
    */
   if (src_mask) {
      struct brw_reg arg0;

      arg0 = get_src_reg(c, inst, argIndex);

      arg0 = brw_swizzle(arg0,
			 src_swz[0], src_swz[1],
			 src_swz[2], src_swz[3]);

      brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
   }

   if (zeros_mask)
      brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));

   if (ones_mask)
      brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));

   /* src.Negate is a per-component mask, used directly as the writemask. */
   if (src.Negate)
      brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));

   if (need_tmp) {
      brw_MOV(p, dst, tmp);
      release_tmp(c, tmp);
   }
}
1498
1499 static int
1500 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1501 {
1502 struct intel_context *intel = &brw->intel;
1503
1504 if (intel->gen >= 6) {
1505 /* URB data written (does not include the message header reg) must
1506 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1507 * section 5.4.3.2.2: URB_INTERLEAVED.
1508 *
1509 * URB entries are allocated on a multiple of 1024 bits, so an
1510 * extra 128 bits written here to make the end align to 256 is
1511 * no problem.
1512 */
1513 if ((mlen % 2) != 1)
1514 mlen++;
1515 }
1516
1517 return mlen;
1518 }
1519
/**
 * Post-vertex-program processing.  Send the results to the URB.
 *
 * Builds the generation-specific vertex header (clip flags / point size
 * in m1, NDC and/or 4D position, user clip distances), copies the
 * remaining shader outputs into message registers following the VUE
 * map, and issues the interleaved URB write -- plus a second write when
 * the outputs overflow the MRF space available to the first one.
 */
static void emit_vertex_write( struct brw_vs_compile *c)
{
   struct brw_compile *p = &c->func;
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
   struct brw_reg ndc;
   int eot;
   GLuint len_vertex_header = 2;
   int i;
   int msg_len;
   int slot;

   /* Copy the input edge flag through to the output when the key asks
    * for it (fixed-function edge flag handling).
    */
   if (c->key.copy_edgeflag) {
      brw_MOV(p,
	      get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
	      get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
   }

   if (intel->gen < 6) {
      /* Build ndc coords */
      ndc = get_tmp(c);
      /* ndc = 1.0 / pos.w */
      emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
      /* ndc.xyz = pos * ndc */
      brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
   }

   /* Update the header for point size, user clipping flags, and -ve rhw
    * workaround.
    */
   if (intel->gen >= 6) {
      struct brw_reg m1 = brw_message_reg(1);

      /* On gen6, m1 has each value in a separate dword, so we never
       * need to mess with a temporary for computing the m1 value.
       */
      brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
	 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
		 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
      }

      /* Set the user clip distances in dword 8-15. (m3-4)*/
      if (c->key.userclip_active) {
	 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
	    struct brw_reg m;
	    if (i < 4)
	       m = brw_message_reg(3);
	    else
	       m = brw_message_reg(4);

	    /* distance i goes into component (i & 3) of m3 or m4 */
	    brw_DP4(p, brw_writemask(m, (1 << (i & 3))),pos, c->userplane[i]);
	 }
      }
   } else if ((c->prog_data.outputs_written &
	       BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
	      c->key.userclip_active || brw->has_negative_rhw_bug) {
      /* Pre-gen6 packs point size and clip flags into bitfields of a
       * single header dword, so build it in a temporary first.
       */
      struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
      GLuint i;

      brw_MOV(p, header1, brw_imm_ud(0));

      brw_set_access_mode(p, BRW_ALIGN_16);

      if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
	 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
	 /* Point size is an 11-bit fixed-point field at bits 8..18. */
	 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
		 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
	 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
		 header1, brw_imm_ud(0x7ff<<8));
      }

      /* Set clip-reject flag bit i when the vertex is outside plane i;
       * the OR is predicated by the preceding DP4's < 0 comparison.
       */
      for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
	 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
	 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
	 brw_CMP(p,
		 vec8(brw_null_reg()),
		 BRW_CONDITIONAL_L,
		 brw_swizzle1(ndc, 3),
		 brw_imm_f(0));

	 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
	 brw_MOV(p, ndc, brw_imm_f(0));
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
      }

      brw_set_access_mode(p, BRW_ALIGN_1);	/* why? */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
      brw_set_access_mode(p, BRW_ALIGN_16);

      release_tmp(c, header1);
   }
   else {
      /* No point size, clipping, or workaround needed: zero the header. */
      brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
   }

   /* Emit the (interleaved) headers for the two vertices - an 8-reg
    * of zeros followed by two sets of NDC coordinates:
    */
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_acc_write_control(p, 0);

   /* The VUE layout is documented in Volume 2a. */
   if (intel->gen >= 6) {
      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the 4D space position
       * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
       * enabled.
       * m3 or 5 is the first vertex element data we fill, which is
       * the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), pos);
      len_vertex_header = 1;
      if (c->key.userclip_active)
	 len_vertex_header += 2;
   } else if (intel->gen == 5) {
      /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
       * dword 0-3 (m1) of the header is indices, point width, clip flags.
       * dword 4-7 (m2) is the ndc position (set above)
       * dword 8-11 (m3) of the vertex header is the 4D space position
       * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
       * m6 is a pad so that the vertex element data is aligned
       * m7 is the first vertex data we fill, which is the vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      brw_MOV(p, brw_message_reg(7), pos);
      len_vertex_header = 6;
   } else {
      /* There are 8 dwords in VUE header pre-Ironlake:
       * dword 0-3 (m1) is indices, point width, clip flags.
       * dword 4-7 (m2) is ndc position (set above)
       *
       * dword 8-11 (m3) is the first vertex data, which we always have be the
       * vertex position.
       */
      brw_MOV(p, brw_message_reg(2), ndc);
      brw_MOV(p, brw_message_reg(3), pos);
      len_vertex_header = 2;
   }

   /* Move variable-addressed, non-overflow outputs to their MRFs. */
   for (slot = len_vertex_header; slot < c->vue_map.num_slots; ++slot) {
      if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE)
	 break;

      int mrf = slot + 1;
      int vert_result = c->vue_map.slot_to_vert_result[slot];
      /* Outputs not living in a GRF were already written directly as
       * message registers by the shader body, so only GRF-resident
       * outputs need copying here.
       */
      if (c->regs[PROGRAM_OUTPUT][vert_result].file ==
          BRW_GENERAL_REGISTER_FILE) {
	 brw_MOV(p, brw_message_reg(mrf),
		 c->regs[PROGRAM_OUTPUT][vert_result]);
      }
   }

   /* End-of-thread only if everything fit in this first write. */
   eot = (slot >= c->vue_map.num_slots);

   /* Message header, plus the (first part of the) VUE. */
   msg_len = 1 + slot;
   msg_len = align_interleaved_urb_mlen(brw, msg_len);
   /* Any outputs beyond BRW_MAX_MRF should be in the second URB write */
   assert (msg_len <= BRW_MAX_MRF - 1);

   brw_urb_WRITE(p,
		 brw_null_reg(), /* dest */
		 0,		/* starting mrf reg nr */
		 c->r0,		/* src */
		 0,		/* allocate */
		 1,		/* used */
		 msg_len,
		 0,		/* response len */
		 eot, 		/* eot */
		 eot, 		/* writes complete */
		 0, 		/* urb destination offset */
		 BRW_URB_SWIZZLE_INTERLEAVE);

   if (slot < c->vue_map.num_slots) {
      /* Not all of the vertex outputs/results fit into the MRF.
       * Move the overflowed attributes from the GRF to the MRF and
       * issue another brw_urb_WRITE().
       */
      GLuint mrf = 1;
      for (; slot < c->vue_map.num_slots; ++slot) {
	 int vert_result = c->vue_map.slot_to_vert_result[slot];
	 /* move from GRF to MRF */
	 brw_MOV(p, brw_message_reg(mrf),
		 c->regs[PROGRAM_OUTPUT][vert_result]);
	 mrf++;
      }

      brw_urb_WRITE(p,
		    brw_null_reg(), /* dest */
		    0,		/* starting mrf reg nr */
		    c->r0,	/* src */
		    0,		/* allocate */
		    1,		/* used */
		    align_interleaved_urb_mlen(brw, mrf),
		    0,		/* response len */
		    1,		/* eot */
		    1,		/* writes complete */
		    MAX_SLOTS_IN_FIRST_URB_WRITE / 2, /* urb destination offset */
		    BRW_URB_SWIZZLE_INTERLEAVE);
   }
}
1743
/**
 * Return GL_TRUE if the previously-emitted instruction already left a
 * copy of \p val in the accumulator, letting the MAD case skip its
 * explicit "MOV acc, arg2".
 *
 * This relies on MOV/MAC/MUL implicitly updating the accumulator with
 * their destination write (AccWrtCtrl is enabled in brw_old_vs_emit),
 * so we match the previous instruction's destination fields against
 * val: direct addressing, no negate/abs, ALIGN16 mode, matching
 * execution size / file / type / register / subregister, and a full
 * XYZW writemask.
 */
static GLboolean
accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
{
   struct brw_compile *p = &c->func;
   struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];

   if (p->nr_insn == 0)
      return GL_FALSE;

   if (val.address_mode != BRW_ADDRESS_DIRECT)
      return GL_FALSE;

   if (val.negate || val.abs)
      return GL_FALSE;

   switch (prev_insn->header.opcode) {
   case BRW_OPCODE_MOV:
   case BRW_OPCODE_MAC:
   case BRW_OPCODE_MUL:
      if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
	  prev_insn->header.execution_size == val.width &&
	  prev_insn->bits1.da1.dest_reg_file == val.file &&
	  prev_insn->bits1.da1.dest_reg_type == val.type &&
	  prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
	  prev_insn->bits1.da1.dest_reg_nr == val.nr &&
	  prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
	  prev_insn->bits1.da16.dest_writemask == 0xf)
	 return GL_TRUE;
      else
	 return GL_FALSE;
   default:
      return GL_FALSE;
   }
}
1778
1779 static uint32_t
1780 get_predicate(const struct prog_instruction *inst)
1781 {
1782 if (inst->DstReg.CondMask == COND_TR)
1783 return BRW_PREDICATE_NONE;
1784
1785 /* All of GLSL only produces predicates for COND_NE and one channel per
1786 * vector. Fail badly if someone starts doing something else, as it might
1787 * mean infinite looping or something.
1788 *
1789 * We'd like to support all the condition codes, but our hardware doesn't
1790 * quite match the Mesa IR, which is modeled after the NV extensions. For
1791 * those, the instruction may update the condition codes or not, then any
1792 * later instruction may use one of those condition codes. For gen4, the
1793 * instruction may update the flags register based on one of the condition
1794 * codes output by the instruction, and then further instructions may
1795 * predicate on that. We can probably support this, but it won't
1796 * necessarily be easy.
1797 */
1798 assert(inst->DstReg.CondMask == COND_NE);
1799
1800 switch (inst->DstReg.CondSwizzle) {
1801 case SWIZZLE_XXXX:
1802 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1803 case SWIZZLE_YYYY:
1804 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1805 case SWIZZLE_ZZZZ:
1806 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1807 case SWIZZLE_WWWW:
1808 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1809 default:
1810 _mesa_problem(NULL, "Unexpected predicate: 0x%08x\n",
1811 inst->DstReg.CondMask);
1812 return BRW_PREDICATE_NORMAL;
1813 }
1814 }
1815
1816 static void
1817 brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
1818 {
1819 struct brw_compile *p = &c->func;
1820 int i;
1821
1822 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
1823 if (!(c->prog_data.inputs_read & (1 << i)))
1824 continue;
1825
1826 if (c->key.gl_fixed_input_size[i] != 0) {
1827 struct brw_reg reg = c->regs[PROGRAM_INPUT][i];
1828
1829 brw_MUL(p,
1830 brw_writemask(reg, (1 << c->key.gl_fixed_input_size[i]) - 1),
1831 reg, brw_imm_f(1.0 / 65536.0));
1832 }
1833 }
1834 }
1835
1836 /* Emit the vertex program instructions here.
1837 */
1838 void brw_old_vs_emit(struct brw_vs_compile *c )
1839 {
1840 #define MAX_IF_DEPTH 32
1841 #define MAX_LOOP_DEPTH 32
1842 struct brw_compile *p = &c->func;
1843 struct brw_context *brw = p->brw;
1844 struct intel_context *intel = &brw->intel;
1845 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1846 GLuint insn, loop_depth = 0;
1847 struct brw_instruction *loop_inst[MAX_LOOP_DEPTH] = { 0 };
1848 int if_depth_in_loop[MAX_LOOP_DEPTH];
1849 const struct brw_indirect stack_index = brw_indirect(0, 0);
1850 GLuint index;
1851 GLuint file;
1852
1853 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1854 printf("vs-mesa:\n");
1855 _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1856 GL_TRUE);
1857 printf("\n");
1858 }
1859
1860 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1861 brw_set_access_mode(p, BRW_ALIGN_16);
1862 if_depth_in_loop[loop_depth] = 0;
1863
1864 brw_set_acc_write_control(p, 1);
1865
1866 for (insn = 0; insn < nr_insns; insn++) {
1867 GLuint i;
1868 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1869
1870 /* Message registers can't be read, so copy the output into GRF
1871 * register if they are used in source registers
1872 */
1873 for (i = 0; i < 3; i++) {
1874 struct prog_src_register *src = &inst->SrcReg[i];
1875 GLuint index = src->Index;
1876 GLuint file = src->File;
1877 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1878 c->output_regs[index].used_in_src = GL_TRUE;
1879 }
1880
1881 switch (inst->Opcode) {
1882 case OPCODE_CAL:
1883 case OPCODE_RET:
1884 c->needs_stack = GL_TRUE;
1885 break;
1886 default:
1887 break;
1888 }
1889 }
1890
1891 /* Static register allocation
1892 */
1893 brw_vs_alloc_regs(c);
1894
1895 brw_vs_rescale_gl_fixed(c);
1896
1897 if (c->needs_stack)
1898 brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
1899
1900 for (insn = 0; insn < nr_insns; insn++) {
1901
1902 const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1903 struct brw_reg args[3], dst;
1904 GLuint i;
1905
1906 #if 0
1907 printf("%d: ", insn);
1908 _mesa_print_instruction(inst);
1909 #endif
1910
1911 /* Get argument regs. SWZ is special and does this itself.
1912 */
1913 if (inst->Opcode != OPCODE_SWZ)
1914 for (i = 0; i < 3; i++) {
1915 const struct prog_src_register *src = &inst->SrcReg[i];
1916 index = src->Index;
1917 file = src->File;
1918 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) {
1919 /* Can't just make get_arg "do the right thing" here because
1920 * other callers of get_arg and get_src_reg don't expect any
1921 * special behavior for the c->output_regs[index].used_in_src
1922 * case.
1923 */
1924 args[i] = c->output_regs[index].reg;
1925 args[i].dw1.bits.swizzle =
1926 BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1927 GET_SWZ(src->Swizzle, 1),
1928 GET_SWZ(src->Swizzle, 2),
1929 GET_SWZ(src->Swizzle, 3));
1930
1931 /* Note this is ok for non-swizzle ARB_vp instructions */
1932 args[i].negate = src->Negate ? 1 : 0;
1933 } else
1934 args[i] = get_arg(c, inst, i);
1935 }
1936
1937 /* Get dest regs. Note that it is possible for a reg to be both
1938 * dst and arg, given the static allocation of registers. So
1939 * care needs to be taken emitting multi-operation instructions.
1940 */
1941 index = inst->DstReg.Index;
1942 file = inst->DstReg.File;
1943 if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
1944 /* Can't just make get_dst "do the right thing" here because other
1945 * callers of get_dst don't expect any special behavior for the
1946 * c->output_regs[index].used_in_src case.
1947 */
1948 dst = brw_writemask(c->output_regs[index].reg, inst->DstReg.WriteMask);
1949 else
1950 dst = get_dst(c, inst->DstReg);
1951
1952 if (inst->SaturateMode != SATURATE_OFF) {
1953 _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
1954 inst->SaturateMode);
1955 }
1956
1957 switch (inst->Opcode) {
1958 case OPCODE_ABS:
1959 args[0].negate = false;
1960 brw_MOV(p, dst, brw_abs(args[0]));
1961 break;
1962 case OPCODE_ADD:
1963 brw_ADD(p, dst, args[0], args[1]);
1964 break;
1965 case OPCODE_COS:
1966 emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
1967 break;
1968 case OPCODE_DP2:
1969 brw_DP2(p, dst, args[0], args[1]);
1970 break;
1971 case OPCODE_DP3:
1972 brw_DP3(p, dst, args[0], args[1]);
1973 break;
1974 case OPCODE_DP4:
1975 brw_DP4(p, dst, args[0], args[1]);
1976 break;
1977 case OPCODE_DPH:
1978 brw_DPH(p, dst, args[0], args[1]);
1979 break;
1980 case OPCODE_DST:
1981 unalias2(c, dst, args[0], args[1], emit_dst_noalias);
1982 break;
1983 case OPCODE_EXP:
1984 unalias1(c, dst, args[0], emit_exp_noalias);
1985 break;
1986 case OPCODE_EX2:
1987 emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
1988 break;
1989 case OPCODE_ARL:
1990 emit_arl(p, dst, args[0]);
1991 break;
1992 case OPCODE_FLR:
1993 brw_RNDD(p, dst, args[0]);
1994 break;
1995 case OPCODE_FRC:
1996 brw_FRC(p, dst, args[0]);
1997 break;
1998 case OPCODE_LOG:
1999 unalias1(c, dst, args[0], emit_log_noalias);
2000 break;
2001 case OPCODE_LG2:
2002 emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
2003 break;
2004 case OPCODE_LIT:
2005 unalias1(c, dst, args[0], emit_lit_noalias);
2006 break;
2007 case OPCODE_LRP:
2008 unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
2009 break;
2010 case OPCODE_MAD:
2011 if (!accumulator_contains(c, args[2]))
2012 brw_MOV(p, brw_acc_reg(), args[2]);
2013 brw_MAC(p, dst, args[0], args[1]);
2014 break;
2015 case OPCODE_CMP:
2016 emit_cmp(p, dst, args[0], args[1], args[2]);
2017 break;
2018 case OPCODE_MAX:
2019 emit_max(p, dst, args[0], args[1]);
2020 break;
2021 case OPCODE_MIN:
2022 emit_min(p, dst, args[0], args[1]);
2023 break;
2024 case OPCODE_MOV:
2025 brw_MOV(p, dst, args[0]);
2026 break;
2027 case OPCODE_MUL:
2028 brw_MUL(p, dst, args[0], args[1]);
2029 break;
2030 case OPCODE_POW:
2031 emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
2032 break;
2033 case OPCODE_RCP:
2034 emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
2035 break;
2036 case OPCODE_RSQ:
2037 emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
2038 break;
2039
2040 case OPCODE_SEQ:
2041 unalias2(c, dst, args[0], args[1], emit_seq);
2042 break;
2043 case OPCODE_SIN:
2044 emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
2045 break;
2046 case OPCODE_SNE:
2047 unalias2(c, dst, args[0], args[1], emit_sne);
2048 break;
2049 case OPCODE_SGE:
2050 unalias2(c, dst, args[0], args[1], emit_sge);
2051 break;
2052 case OPCODE_SGT:
2053 unalias2(c, dst, args[0], args[1], emit_sgt);
2054 break;
2055 case OPCODE_SLT:
2056 unalias2(c, dst, args[0], args[1], emit_slt);
2057 break;
2058 case OPCODE_SLE:
2059 unalias2(c, dst, args[0], args[1], emit_sle);
2060 break;
2061 case OPCODE_SSG:
2062 unalias1(c, dst, args[0], emit_sign);
2063 break;
2064 case OPCODE_SUB:
2065 brw_ADD(p, dst, args[0], negate(args[1]));
2066 break;
2067 case OPCODE_SWZ:
2068 /* The args[0] value can't be used here as it won't have
2069 * correctly encoded the full swizzle:
2070 */
2071 emit_swz(c, dst, inst);
2072 break;
2073 case OPCODE_TRUNC:
2074 /* round toward zero */
2075 brw_RNDZ(p, dst, args[0]);
2076 break;
2077 case OPCODE_XPD:
2078 emit_xpd(p, dst, args[0], args[1]);
2079 break;
2080 case OPCODE_IF: {
2081 struct brw_instruction *if_inst = brw_IF(p, BRW_EXECUTE_8);
2082 /* Note that brw_IF smashes the predicate_control field. */
2083 if_inst->header.predicate_control = get_predicate(inst);
2084 if_depth_in_loop[loop_depth]++;
2085 break;
2086 }
2087 case OPCODE_ELSE:
2088 clear_current_const(c);
2089 brw_ELSE(p);
2090 break;
2091 case OPCODE_ENDIF:
2092 clear_current_const(c);
2093 brw_ENDIF(p);
2094 if_depth_in_loop[loop_depth]--;
2095 break;
2096 case OPCODE_BGNLOOP:
2097 clear_current_const(c);
2098 loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
2099 if_depth_in_loop[loop_depth] = 0;
2100 break;
2101 case OPCODE_BRK:
2102 brw_set_predicate_control(p, get_predicate(inst));
2103 brw_BREAK(p, if_depth_in_loop[loop_depth]);
2104 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2105 break;
2106 case OPCODE_CONT:
2107 brw_set_predicate_control(p, get_predicate(inst));
2108 if (intel->gen >= 6) {
2109 gen6_CONT(p, loop_inst[loop_depth - 1]);
2110 } else {
2111 brw_CONT(p, if_depth_in_loop[loop_depth]);
2112 }
2113 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2114 break;
2115
2116 case OPCODE_ENDLOOP: {
2117 clear_current_const(c);
2118 struct brw_instruction *inst0, *inst1;
2119 GLuint br = 1;
2120
2121 loop_depth--;
2122
2123 if (intel->gen == 5)
2124 br = 2;
2125
2126 inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
2127
2128 if (intel->gen < 6) {
2129 /* patch all the BREAK/CONT instructions from last BEGINLOOP */
2130 while (inst0 > loop_inst[loop_depth]) {
2131 inst0--;
2132 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
2133 inst0->bits3.if_else.jump_count == 0) {
2134 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
2135 } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
2136 inst0->bits3.if_else.jump_count == 0) {
2137 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
2138 }
2139 }
2140 }
2141 }
2142 break;
2143
2144 case OPCODE_BRA:
2145 brw_set_predicate_control(p, get_predicate(inst));
2146 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2147 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2148 break;
2149 case OPCODE_CAL:
2150 brw_set_access_mode(p, BRW_ALIGN_1);
2151 brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
2152 brw_set_access_mode(p, BRW_ALIGN_16);
2153 brw_ADD(p, get_addr_reg(stack_index),
2154 get_addr_reg(stack_index), brw_imm_d(4));
2155 brw_save_call(p, inst->Comment, p->nr_insn);
2156 brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
2157 break;
2158 case OPCODE_RET:
2159 brw_ADD(p, get_addr_reg(stack_index),
2160 get_addr_reg(stack_index), brw_imm_d(-4));
2161 brw_set_access_mode(p, BRW_ALIGN_1);
2162 brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
2163 brw_set_access_mode(p, BRW_ALIGN_16);
2164 break;
2165 case OPCODE_END:
2166 emit_vertex_write(c);
2167 break;
2168 case OPCODE_PRINT:
2169 /* no-op */
2170 break;
2171 case OPCODE_BGNSUB:
2172 brw_save_label(p, inst->Comment, p->nr_insn);
2173 break;
2174 case OPCODE_ENDSUB:
2175 /* no-op */
2176 break;
2177 default:
2178 _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
2179 inst->Opcode, inst->Opcode < MAX_OPCODE ?
2180 _mesa_opcode_string(inst->Opcode) :
2181 "unknown");
2182 }
2183
2184 /* Set the predication update on the last instruction of the native
2185 * instruction sequence.
2186 *
2187 * This would be problematic if it was set on a math instruction,
2188 * but that shouldn't be the case with the current GLSL compiler.
2189 */
2190 if (inst->CondUpdate) {
2191 struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];
2192
2193 assert(hw_insn->header.destreg__conditionalmod == 0);
2194 hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
2195 }
2196
2197 if ((inst->DstReg.File == PROGRAM_OUTPUT)
2198 && (inst->DstReg.Index != VERT_RESULT_HPOS)
2199 && c->output_regs[inst->DstReg.Index].used_in_src) {
2200 brw_MOV(p, get_dst(c, inst->DstReg), dst);
2201 }
2202
2203 /* Result color clamping.
2204 *
2205 * When destination register is an output register and
2206 * it's primary/secondary front/back color, we have to clamp
2207 * the result to [0,1]. This is done by enabling the
2208 * saturation bit for the last instruction.
2209 *
2210 * We don't use brw_set_saturate() as it modifies
2211 * p->current->header.saturate, which affects all the subsequent
2212 * instructions. Instead, we directly modify the header
2213 * of the last (already stored) instruction.
2214 */
2215 if (inst->DstReg.File == PROGRAM_OUTPUT &&
2216 c->key.clamp_vertex_color) {
2217 if ((inst->DstReg.Index == VERT_RESULT_COL0)
2218 || (inst->DstReg.Index == VERT_RESULT_COL1)
2219 || (inst->DstReg.Index == VERT_RESULT_BFC0)
2220 || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
2221 p->store[p->nr_insn-1].header.saturate = 1;
2222 }
2223 }
2224
2225 if (inst->DstReg.RelAddr) {
2226 assert(inst->DstReg.File == PROGRAM_TEMPORARY||
2227 inst->DstReg.File == PROGRAM_OUTPUT);
2228 move_to_reladdr_dst(c, inst, dst);
2229 }
2230
2231 release_tmps(c);
2232 }
2233
2234 brw_resolve_cals(p);
2235 brw_set_uip_jip(p);
2236
2237 brw_optimize(p);
2238
2239 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
2240 int i;
2241
2242 printf("vs-native:\n");
2243 for (i = 0; i < p->nr_insn; i++)
2244 brw_disasm(stdout, &p->store[i], intel->gen);
2245 printf("\n");
2246 }
2247 }