i965: Make the userclip flag for the VUE map come from VS prog data.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vs_emit.c
1 /*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32
33 #include "main/macros.h"
34 #include "program/program.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "brw_context.h"
38 #include "brw_vs.h"
39
40 /* Return whether the given SrcReg argument of the given opcode can be an
41 * immediate float operand instead of a PROGRAM_CONSTANT value loaded through push/pull.
42 */
43 static bool
44 brw_vs_arg_can_be_immediate(enum prog_opcode opcode, int arg)
45 {
46 int opcode_array[] = {
47 [OPCODE_MOV] = 1,
48 [OPCODE_ADD] = 2,
49 [OPCODE_CMP] = 3,
50 [OPCODE_DP2] = 2,
51 [OPCODE_DP3] = 2,
52 [OPCODE_DP4] = 2,
53 [OPCODE_DPH] = 2,
54 [OPCODE_MAX] = 2,
55 [OPCODE_MIN] = 2,
56 [OPCODE_MUL] = 2,
57 [OPCODE_SEQ] = 2,
58 [OPCODE_SGE] = 2,
59 [OPCODE_SGT] = 2,
60 [OPCODE_SLE] = 2,
61 [OPCODE_SLT] = 2,
62 [OPCODE_SNE] = 2,
63 [OPCODE_XPD] = 2,
64 };
65
66 /* These opcodes get broken down in a way that allows two
67 * args to be immediates.
68 */
69 if (opcode == OPCODE_MAD || opcode == OPCODE_LRP) {
70 if (arg == 1 || arg == 2)
71 return true;
72 }
73
74 if (opcode >= ARRAY_SIZE(opcode_array))
75 return false;
76
77 return arg == opcode_array[opcode] - 1;
78 }
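/* A quick usage sketch of the table above (illustrative only, guarded out
 * like the other debug snippets in this file): for a binary opcode such as
 * ADD the entry is 2, so only the last source (arg == 1) may be immediate.
 */
#if 0
static void check_immediate_args(void)
{
   assert(!brw_vs_arg_can_be_immediate(OPCODE_ADD, 0));
   assert(brw_vs_arg_can_be_immediate(OPCODE_ADD, 1));
   /* MAD and LRP are special-cased: args 1 and 2 may both be immediate. */
   assert(brw_vs_arg_can_be_immediate(OPCODE_MAD, 1));
   assert(brw_vs_arg_can_be_immediate(OPCODE_MAD, 2));
}
#endif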
79
80 static struct brw_reg get_tmp( struct brw_vs_compile *c )
81 {
82 struct brw_reg tmp = brw_vec8_grf(c->last_tmp, 0);
83
84 if (++c->last_tmp > c->prog_data.total_grf)
85 c->prog_data.total_grf = c->last_tmp;
86
87 return tmp;
88 }
89
90 static void release_tmp( struct brw_vs_compile *c, struct brw_reg tmp )
91 {
92 if (tmp.nr == c->last_tmp-1)
93 c->last_tmp--;
94 }
95
96 static void release_tmps( struct brw_vs_compile *c )
97 {
98 c->last_tmp = c->first_tmp;
99 }
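/* The three helpers above implement a simple bump-pointer allocator over the
 * GRF tail.  A sketch of the intended LIFO discipline (assumption: callers
 * release in reverse order, since release_tmp() only rewinds when handed the
 * most recently allocated register):
 */
#if 0
static void tmp_alloc_example(struct brw_vs_compile *c)
{
   struct brw_reg a = get_tmp(c);   /* grf N     */
   struct brw_reg b = get_tmp(c);   /* grf N + 1 */
   release_tmp(c, b);               /* last_tmp rewinds to N + 1 */
   release_tmp(c, a);               /* last_tmp rewinds to N     */
   release_tmps(c);                 /* or: drop everything back to first_tmp */
}
#endif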
100
101 static int
102 get_first_reladdr_output(struct gl_vertex_program *vp)
103 {
104 int i;
105 int first_reladdr_output = VERT_RESULT_MAX;
106
107 for (i = 0; i < vp->Base.NumInstructions; i++) {
108 struct prog_instruction *inst = vp->Base.Instructions + i;
109
110 if (inst->DstReg.File == PROGRAM_OUTPUT &&
111 inst->DstReg.RelAddr &&
112 inst->DstReg.Index < first_reladdr_output)
113 first_reladdr_output = inst->DstReg.Index;
114 }
115
116 return first_reladdr_output;
117 }
118
119 /* Clears the record of which vp_const_buffer elements have been
120 * loaded into our constant buffer registers, for the starts of new
121 * blocks after control flow.
122 */
123 static void
124 clear_current_const(struct brw_vs_compile *c)
125 {
126 unsigned int i;
127
128 if (c->vp->use_const_buffer) {
129 for (i = 0; i < 3; i++) {
130 c->current_const[i].index = -1;
131 }
132 }
133 }
134
135 /* The message length for all SEND messages is restricted to [1,15]. This
136 * includes 1 for the header, so anything in slots 14 and above needs to be
137 * placed in a general-purpose register and emitted using a second URB write.
138 */
139 #define MAX_SLOTS_IN_FIRST_URB_WRITE 14
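/* Worked example: a VUE with 17 slots puts slots 0..13 in the first write
 * (1 header + 14 data regs = 15, the SEND limit) and stages slots 14..16 in
 * GRFs for a second URB write -- see emit_vertex_write() below.
 */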
140
141 /**
142 * Determine whether the given vertex output can be written directly to a MRF
143 * or whether it has to be stored in a general-purpose register.
144 */
145 static inline bool can_use_direct_mrf(int vert_result,
146 int first_reladdr_output, int slot)
147 {
148 if (vert_result == VERT_RESULT_HPOS || vert_result == VERT_RESULT_PSIZ) {
149 /* These never go straight into MRFs. They are placed in the MRF by
150 * epilog code.
151 */
152 return false;
153 }
154 if (first_reladdr_output <= vert_result && vert_result < VERT_RESULT_MAX) {
155 /* Relative addressing might be used to access this vert_result, so it
156 * needs to go into a general-purpose register.
157 */
158 return false;
159 }
160 if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE) {
161 /* This output won't go out until the second URB write so it must be
162 * stored in a general-purpose register until then.
163 */
164 return false;
165 }
166 return true;
167 }
168
169 /**
170 * Preallocate GRF register before code emit.
171 * Do things as simply as possible. Allocate and populate all regs
172 * ahead of time.
173 */
174 static void brw_vs_alloc_regs( struct brw_vs_compile *c )
175 {
176 struct brw_context *brw = c->func.brw;
177 struct intel_context *intel = &c->func.brw->intel;
178 GLuint i, reg = 0, slot;
179 int attributes_in_vue;
180 int first_reladdr_output;
181 int max_constant;
182 int constant = 0;
183 struct brw_vertex_program *vp = c->vp;
184 const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
185
186 /* Determine whether to use a real constant buffer or use a block
187 * of GRF registers for constants. The latter is faster but only
188 * works if everything fits in the GRF.
189 * XXX this heuristic/check may need some fine tuning...
190 */
191 if (c->vp->program.Base.Parameters->NumParameters +
192 c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
193 c->vp->use_const_buffer = true;
194 else
195 c->vp->use_const_buffer = false;
196
197 /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
198
199 /* r0 -- reserved as usual
200 */
201 c->r0 = brw_vec8_grf(reg, 0);
202 reg++;
203
204 /* User clip planes from curbe:
205 */
206 if (c->key.userclip_active) {
207 if (intel->gen >= 6) {
208 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
209 c->userplane[i] = stride(brw_vec4_grf(reg + i / 2,
210 (i % 2) * 4), 0, 4, 1);
211 }
212 reg += ALIGN(c->key.nr_userclip_plane_consts, 2) / 2;
213 } else {
214 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
215 c->userplane[i] = stride(brw_vec4_grf(reg + (6 + i) / 2,
216 (i % 2) * 4), 0, 4, 1);
217 }
218 reg += (ALIGN(6 + c->key.nr_userclip_plane_consts, 4) / 4) * 2;
219 }
220
221 }
222
223 /* Assign some (probably all) of the vertex program constants to
224 * the push constant buffer/CURBE.
225 *
226 * There's an obvious limit to the number of push constants, equal to
227 * the number of registers available, and that number is smaller
228 * than the minimum maximum number of vertex program parameters, so
229 * support for pull constants is required if we overflow.
230 * Additionally, on gen6 the number of push constants is even
231 * lower.
232 *
233 * When there's relative addressing, we don't know what range of
234 * Mesa IR registers can be accessed. And generally, when relative
235 * addressing is used we also have too many constants to load them
236 * all as push constants. So, we'll just support relative
237 * addressing out of the pull constant buffers, and try to load as
238 * many statically-accessed constants into the push constant buffer
239 * as we can.
240 */
241 if (intel->gen >= 6) {
242 /* We can only load 32 regs of push constants. */
243 max_constant = 32 * 2 - c->key.nr_userclip_plane_consts;
244 } else {
245 max_constant = BRW_MAX_GRF - 20 - c->vp->program.Base.NumTemporaries;
246 }
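/* Worked example (gen6): 32 push-constant registers hold 64 vec4s, so with
 * e.g. all 6 user clip plane constants in use, up to 58 program parameters
 * can be pushed before the remainder must come from the pull buffer.
 */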
247
248 /* constant_map maps from ParameterValues[] index to index in the
249 * push constant buffer, or -1 if it's only in the pull constant
250 * buffer.
251 */
252 memset(c->constant_map, -1, c->vp->program.Base.Parameters->NumParameters);
253 for (i = 0;
254 i < c->vp->program.Base.NumInstructions && constant < max_constant;
255 i++) {
256 struct prog_instruction *inst = &c->vp->program.Base.Instructions[i];
257 int arg;
258
259 for (arg = 0; arg < 3 && constant < max_constant; arg++) {
260 if (inst->SrcReg[arg].File != PROGRAM_STATE_VAR &&
261 inst->SrcReg[arg].File != PROGRAM_CONSTANT &&
262 inst->SrcReg[arg].File != PROGRAM_UNIFORM &&
263 inst->SrcReg[arg].File != PROGRAM_ENV_PARAM &&
264 inst->SrcReg[arg].File != PROGRAM_LOCAL_PARAM) {
265 continue;
266 }
267
268 if (inst->SrcReg[arg].RelAddr) {
269 c->vp->use_const_buffer = true;
270 continue;
271 }
272
273 if (c->constant_map[inst->SrcReg[arg].Index] == -1) {
274 c->constant_map[inst->SrcReg[arg].Index] = constant++;
275 }
276 }
277 }
278
279 /* If we ran out of push constant space, then we'll also upload all
280 * constants through the pull constant buffer so that they can be
281 * accessed no matter what. For relative addressing (the common
282 * case) we need them all in place anyway.
283 */
284 if (constant == max_constant)
285 c->vp->use_const_buffer = true;
286
287 /* Set up the references to the pull parameters if present. This backend
288 * uses a 1:1 mapping from Mesa IR's index to location in the pull constant
289 * buffer, while the new VS backend allocates values to the pull buffer on
290 * demand.
291 */
292 if (c->vp->use_const_buffer) {
293 for (i = 0; i < params->NumParameters * 4; i++) {
294 c->prog_data.pull_param[i] = &params->ParameterValues[i / 4][i % 4].f;
295 }
296 c->prog_data.nr_pull_params = i;
297 }
298
299 for (i = 0; i < constant; i++) {
300 c->regs[PROGRAM_STATE_VAR][i] = stride(brw_vec4_grf(reg + i / 2,
301 (i % 2) * 4),
302 0, 4, 1);
303 }
304 reg += (constant + 1) / 2;
305 c->prog_data.curb_read_length = reg - 1;
306 c->prog_data.nr_params = constant * 4;
307 /* XXX 0 causes a bug elsewhere... */
308 if (intel->gen < 6 && c->prog_data.nr_params == 0)
309 c->prog_data.nr_params = 4;
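/* Worked example: with no clip planes and 5 push constants, two constants
 * pack per register, so reg advances from 1 to 4, curb_read_length is 3
 * (everything after r0), and nr_params is 5 * 4 = 20 floats.
 */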
310
311 /* Allocate input regs:
312 */
313 c->nr_inputs = 0;
314 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
315 if (c->prog_data.inputs_read & BITFIELD64_BIT(i)) {
316 c->nr_inputs++;
317 c->regs[PROGRAM_INPUT][i] = brw_vec8_grf(reg, 0);
318 reg++;
319 }
320 }
321 /* If there are no inputs, we'll still be reading one attribute's worth
322 * because it's required -- see urb_read_length setting.
323 */
324 if (c->nr_inputs == 0)
325 reg++;
326
327 /* Allocate outputs. The non-position outputs go straight into message regs.
328 */
329 brw_compute_vue_map(&c->vue_map, intel, &c->prog_data);
330 c->first_output = reg;
331
332 first_reladdr_output = get_first_reladdr_output(&c->vp->program);
333
334 for (slot = 0; slot < c->vue_map.num_slots; slot++) {
335 int vert_result = c->vue_map.slot_to_vert_result[slot];
336 assert(vert_result < Elements(c->regs[PROGRAM_OUTPUT]));
337 if (can_use_direct_mrf(vert_result, first_reladdr_output, slot)) {
338 c->regs[PROGRAM_OUTPUT][vert_result] = brw_message_reg(slot + 1);
339 } else {
340 c->regs[PROGRAM_OUTPUT][vert_result] = brw_vec8_grf(reg, 0);
341 reg++;
342 }
343 }
344
345 /* Allocate program temporaries:
346 */
347 for (i = 0; i < c->vp->program.Base.NumTemporaries; i++) {
348 c->regs[PROGRAM_TEMPORARY][i] = brw_vec8_grf(reg, 0);
349 reg++;
350 }
351
352 /* Address reg(s). Don't try to use the internal address reg until
353 * deref time.
354 */
355 for (i = 0; i < c->vp->program.Base.NumAddressRegs; i++) {
356 c->regs[PROGRAM_ADDRESS][i] = brw_reg(BRW_GENERAL_REGISTER_FILE,
357 reg,
358 0,
359 BRW_REGISTER_TYPE_D,
360 BRW_VERTICAL_STRIDE_8,
361 BRW_WIDTH_8,
362 BRW_HORIZONTAL_STRIDE_1,
363 BRW_SWIZZLE_XXXX,
364 WRITEMASK_X);
365 reg++;
366 }
367
368 if (c->vp->use_const_buffer) {
369 for (i = 0; i < 3; i++) {
370 c->current_const[i].reg = brw_vec8_grf(reg, 0);
371 reg++;
372 }
373 clear_current_const(c);
374 }
375
376 for (i = 0; i < 128; i++) {
377 if (c->output_regs[i].used_in_src) {
378 c->output_regs[i].reg = brw_vec8_grf(reg, 0);
379 reg++;
380 }
381 }
382
383 if (c->needs_stack) {
384 c->stack = brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg, 0);
385 reg += 2;
386 }
387
388 /* Some opcodes need an internal temporary:
389 */
390 c->first_tmp = reg;
391 c->last_tmp = reg; /* for allocation purposes */
392
393 /* Each input reg holds data from two vertices. The
394 * urb_read_length is the number of registers read from *each*
395 * vertex urb, so is half the amount:
396 */
397 c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
398 /* Setting this field to 0 leads to undefined behavior according to the
399 * VS_STATE docs. Our VUEs will always have at least one attribute
400 * sitting in them, even if it's padding.
401 */
402 if (c->prog_data.urb_read_length == 0)
403 c->prog_data.urb_read_length = 1;
404
405 /* The VS VUEs are shared by VF (outputting our inputs) and VS, so size
406 * them to fit the biggest thing they need to.
407 */
408 attributes_in_vue = MAX2(c->vue_map.num_slots, c->nr_inputs);
409
410 if (intel->gen == 6) {
411 /* Each attribute is 16 bytes (1 vec4), so dividing by 8 gives us the
412 * number of 128-byte (1024-bit) units.
413 */
414 c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 8) / 8;
415 } else {
416 /* Each attribute is 16 bytes (1 vec4), so dividing by 4 gives us the
417 * number of 64-byte (512-bit) units.
418 */
419 c->prog_data.urb_entry_size = ALIGN(attributes_in_vue, 4) / 4;
420 }
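/* Worked example: attributes_in_vue == 10 gives ALIGN(10, 8) / 8 == 2
 * 1024-bit units on gen6, and ALIGN(10, 4) / 4 == 3 512-bit units on
 * earlier parts.
 */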
421
422 c->prog_data.total_grf = reg;
423
424 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
425 printf("%s NumAddrRegs %d\n", __FUNCTION__, c->vp->program.Base.NumAddressRegs);
426 printf("%s NumTemps %d\n", __FUNCTION__, c->vp->program.Base.NumTemporaries);
427 printf("%s reg = %d\n", __FUNCTION__, reg);
428 }
429 }
430
431
432 /**
433 * If an instruction uses a temp reg both as a src and the dest, we
434 * sometimes need to allocate an intermediate temporary.
435 */
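/* For example, "EXP r0, r0.x": emit_exp_noalias() writes r0.x before it has
 * finished reading r0.x for the other channels, so the result must be staged
 * through a temporary and copied back.
 */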
436 static void unalias1( struct brw_vs_compile *c,
437 struct brw_reg dst,
438 struct brw_reg arg0,
439 void (*func)( struct brw_vs_compile *,
440 struct brw_reg,
441 struct brw_reg ))
442 {
443 if (dst.file == arg0.file && dst.nr == arg0.nr) {
444 struct brw_compile *p = &c->func;
445 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
446 func(c, tmp, arg0);
447 brw_MOV(p, dst, tmp);
448 release_tmp(c, tmp);
449 }
450 else {
451 func(c, dst, arg0);
452 }
453 }
454
455 /**
456 * \sa unalias1
457 * Checks if a 2-operand instruction needs an intermediate temporary.
458 */
459 static void unalias2( struct brw_vs_compile *c,
460 struct brw_reg dst,
461 struct brw_reg arg0,
462 struct brw_reg arg1,
463 void (*func)( struct brw_vs_compile *,
464 struct brw_reg,
465 struct brw_reg,
466 struct brw_reg ))
467 {
468 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
469 (dst.file == arg1.file && dst.nr == arg1.nr)) {
470 struct brw_compile *p = &c->func;
471 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
472 func(c, tmp, arg0, arg1);
473 brw_MOV(p, dst, tmp);
474 release_tmp(c, tmp);
475 }
476 else {
477 func(c, dst, arg0, arg1);
478 }
479 }
480
481 /**
482 * \sa unalias2
483 * Checks if a 3-operand instruction needs an intermediate temporary.
484 */
485 static void unalias3( struct brw_vs_compile *c,
486 struct brw_reg dst,
487 struct brw_reg arg0,
488 struct brw_reg arg1,
489 struct brw_reg arg2,
490 void (*func)( struct brw_vs_compile *,
491 struct brw_reg,
492 struct brw_reg,
493 struct brw_reg,
494 struct brw_reg ))
495 {
496 if ((dst.file == arg0.file && dst.nr == arg0.nr) ||
497 (dst.file == arg1.file && dst.nr == arg1.nr) ||
498 (dst.file == arg2.file && dst.nr == arg2.nr)) {
499 struct brw_compile *p = &c->func;
500 struct brw_reg tmp = brw_writemask(get_tmp(c), dst.dw1.bits.writemask);
501 func(c, tmp, arg0, arg1, arg2);
502 brw_MOV(p, dst, tmp);
503 release_tmp(c, tmp);
504 }
505 else {
506 func(c, dst, arg0, arg1, arg2);
507 }
508 }
509
510 static void emit_sop( struct brw_vs_compile *c,
511 struct brw_reg dst,
512 struct brw_reg arg0,
513 struct brw_reg arg1,
514 GLuint cond)
515 {
516 struct brw_compile *p = &c->func;
517
518 brw_MOV(p, dst, brw_imm_f(0.0f));
519 brw_CMP(p, brw_null_reg(), cond, arg0, arg1);
520 brw_MOV(p, dst, brw_imm_f(1.0f));
521 brw_set_predicate_control_flag_value(p, 0xff);
522 }
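/* How the sequence above works (relying on brw_CMP() switching the compile
 * state to normal predication when its destination is the null register):
 * dst is first cleared, the CMP sets the flag register per channel, the
 * second MOV then lands 1.0 only in the channels that passed, and the final
 * call resets predication for subsequent instructions.
 */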
523
524 static void emit_seq( struct brw_vs_compile *c,
525 struct brw_reg dst,
526 struct brw_reg arg0,
527 struct brw_reg arg1 )
528 {
529 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_EQ);
530 }
531
532 static void emit_sne( struct brw_vs_compile *c,
533 struct brw_reg dst,
534 struct brw_reg arg0,
535 struct brw_reg arg1 )
536 {
537 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_NEQ);
538 }
539 static void emit_slt( struct brw_vs_compile *c,
540 struct brw_reg dst,
541 struct brw_reg arg0,
542 struct brw_reg arg1 )
543 {
544 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_L);
545 }
546
547 static void emit_sle( struct brw_vs_compile *c,
548 struct brw_reg dst,
549 struct brw_reg arg0,
550 struct brw_reg arg1 )
551 {
552 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_LE);
553 }
554
555 static void emit_sgt( struct brw_vs_compile *c,
556 struct brw_reg dst,
557 struct brw_reg arg0,
558 struct brw_reg arg1 )
559 {
560 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_G);
561 }
562
563 static void emit_sge( struct brw_vs_compile *c,
564 struct brw_reg dst,
565 struct brw_reg arg0,
566 struct brw_reg arg1 )
567 {
568 emit_sop(c, dst, arg0, arg1, BRW_CONDITIONAL_GE);
569 }
570
571 static void emit_cmp( struct brw_compile *p,
572 struct brw_reg dst,
573 struct brw_reg arg0,
574 struct brw_reg arg1,
575 struct brw_reg arg2 )
576 {
577 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
578 brw_SEL(p, dst, arg1, arg2);
579 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
580 }
581
582 static void emit_sign(struct brw_vs_compile *c,
583 struct brw_reg dst,
584 struct brw_reg arg0)
585 {
586 struct brw_compile *p = &c->func;
587
588 brw_MOV(p, dst, brw_imm_f(0));
589
590 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, brw_imm_f(0));
591 brw_MOV(p, dst, brw_imm_f(-1.0));
592 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
593
594 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, arg0, brw_imm_f(0));
595 brw_MOV(p, dst, brw_imm_f(1.0));
596 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
597 }
598
599 static void emit_max( struct brw_compile *p,
600 struct brw_reg dst,
601 struct brw_reg arg0,
602 struct brw_reg arg1 )
603 {
604 struct intel_context *intel = &p->brw->intel;
605
606 if (intel->gen >= 6) {
607 brw_set_conditionalmod(p, BRW_CONDITIONAL_GE);
608 brw_SEL(p, dst, arg0, arg1);
609 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
610 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
611 } else {
612 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, arg0, arg1);
613 brw_SEL(p, dst, arg0, arg1);
614 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
615 }
616 }
617
618 static void emit_min( struct brw_compile *p,
619 struct brw_reg dst,
620 struct brw_reg arg0,
621 struct brw_reg arg1 )
622 {
623 struct intel_context *intel = &p->brw->intel;
624
625 if (intel->gen >= 6) {
626 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
627 brw_SEL(p, dst, arg0, arg1);
628 brw_set_conditionalmod(p, BRW_CONDITIONAL_NONE);
629 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
630 } else {
631 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_L, arg0, arg1);
632 brw_SEL(p, dst, arg0, arg1);
633 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
634 }
635 }
636
637 static void emit_arl(struct brw_compile *p,
638 struct brw_reg dst,
639 struct brw_reg src)
640 {
641 struct intel_context *intel = &p->brw->intel;
642
643 if (intel->gen >= 6) {
644 struct brw_reg dst_f = retype(dst, BRW_REGISTER_TYPE_F);
645
646 brw_RNDD(p, dst_f, src);
647 brw_MOV(p, dst, dst_f);
648 } else {
649 brw_RNDD(p, dst, src);
650 }
651 }
652
653 static void emit_math1_gen4(struct brw_vs_compile *c,
654 GLuint function,
655 struct brw_reg dst,
656 struct brw_reg arg0,
657 GLuint precision)
658 {
659 /* There are various odd behaviours with SEND on the simulator. In
660 * addition there are documented issues with the fact that the GEN4
661 * processor doesn't do dependency control properly on SEND
662 * results. So, on balance, this kludge to get around failures
663 * with writemasked math results looks like it might be necessary
664 * whether that turns out to be a simulator bug or not:
665 */
666 struct brw_compile *p = &c->func;
667 struct brw_reg tmp = dst;
668 bool need_tmp = false;
669
670 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
671 dst.dw1.bits.writemask != 0xf)
672 need_tmp = true;
673
674 if (need_tmp)
675 tmp = get_tmp(c);
676
677 brw_math(p,
678 tmp,
679 function,
680 BRW_MATH_SATURATE_NONE,
681 2,
682 arg0,
683 BRW_MATH_DATA_SCALAR,
684 precision);
685
686 if (need_tmp) {
687 brw_MOV(p, dst, tmp);
688 release_tmp(c, tmp);
689 }
690 }
691
692 static void
693 emit_math1_gen6(struct brw_vs_compile *c,
694 GLuint function,
695 struct brw_reg dst,
696 struct brw_reg arg0,
697 GLuint precision)
698 {
699 struct brw_compile *p = &c->func;
700 struct brw_reg tmp_src, tmp_dst;
701
702 /* Something is strange on gen6 math in 16-wide mode, though the
703 * docs say it's supposed to work. Punt to using align1 mode,
704 * which doesn't do writemasking and swizzles.
705 */
706 tmp_src = get_tmp(c);
707 tmp_dst = get_tmp(c);
708
709 brw_MOV(p, tmp_src, arg0);
710
711 brw_set_access_mode(p, BRW_ALIGN_1);
712 brw_math(p,
713 tmp_dst,
714 function,
715 BRW_MATH_SATURATE_NONE,
716 2,
717 tmp_src,
718 BRW_MATH_DATA_SCALAR,
719 precision);
720 brw_set_access_mode(p, BRW_ALIGN_16);
721
722 brw_MOV(p, dst, tmp_dst);
723
724 release_tmp(c, tmp_src);
725 release_tmp(c, tmp_dst);
726 }
727
728 static void
729 emit_math1(struct brw_vs_compile *c,
730 GLuint function,
731 struct brw_reg dst,
732 struct brw_reg arg0,
733 GLuint precision)
734 {
735 struct brw_compile *p = &c->func;
736 struct intel_context *intel = &p->brw->intel;
737
738 if (intel->gen >= 6)
739 emit_math1_gen6(c, function, dst, arg0, precision);
740 else
741 emit_math1_gen4(c, function, dst, arg0, precision);
742 }
743
744 static void emit_math2_gen4( struct brw_vs_compile *c,
745 GLuint function,
746 struct brw_reg dst,
747 struct brw_reg arg0,
748 struct brw_reg arg1,
749 GLuint precision)
750 {
751 struct brw_compile *p = &c->func;
752 struct brw_reg tmp = dst;
753 bool need_tmp = false;
754
755 if (dst.file != BRW_GENERAL_REGISTER_FILE ||
756 dst.dw1.bits.writemask != 0xf)
757 need_tmp = true;
758
759 if (need_tmp)
760 tmp = get_tmp(c);
761
762 brw_MOV(p, brw_message_reg(3), arg1);
763
764 brw_math(p,
765 tmp,
766 function,
767 BRW_MATH_SATURATE_NONE,
768 2,
769 arg0,
770 BRW_MATH_DATA_SCALAR,
771 precision);
772
773 if (need_tmp) {
774 brw_MOV(p, dst, tmp);
775 release_tmp(c, tmp);
776 }
777 }
778
779 static void emit_math2_gen6( struct brw_vs_compile *c,
780 GLuint function,
781 struct brw_reg dst,
782 struct brw_reg arg0,
783 struct brw_reg arg1,
784 GLuint precision)
785 {
786 struct brw_compile *p = &c->func;
787 struct brw_reg tmp_src0, tmp_src1, tmp_dst;
788
789 tmp_src0 = get_tmp(c);
790 tmp_src1 = get_tmp(c);
791 tmp_dst = get_tmp(c);
792
793 brw_MOV(p, tmp_src0, arg0);
794 brw_MOV(p, tmp_src1, arg1);
795
796 brw_set_access_mode(p, BRW_ALIGN_1);
797 brw_math2(p,
798 tmp_dst,
799 function,
800 tmp_src0,
801 tmp_src1);
802 brw_set_access_mode(p, BRW_ALIGN_16);
803
804 brw_MOV(p, dst, tmp_dst);
805
806 release_tmp(c, tmp_src0);
807 release_tmp(c, tmp_src1);
808 release_tmp(c, tmp_dst);
809 }
810
811 static void emit_math2( struct brw_vs_compile *c,
812 GLuint function,
813 struct brw_reg dst,
814 struct brw_reg arg0,
815 struct brw_reg arg1,
816 GLuint precision)
817 {
818 struct brw_compile *p = &c->func;
819 struct intel_context *intel = &p->brw->intel;
820
821 if (intel->gen >= 6)
822 emit_math2_gen6(c, function, dst, arg0, arg1, precision);
823 else
824 emit_math2_gen4(c, function, dst, arg0, arg1, precision);
825 }
826
827 static void emit_exp_noalias( struct brw_vs_compile *c,
828 struct brw_reg dst,
829 struct brw_reg arg0 )
830 {
831 struct brw_compile *p = &c->func;
832
833
834 if (dst.dw1.bits.writemask & WRITEMASK_X) {
835 struct brw_reg tmp = get_tmp(c);
836 struct brw_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
837
838 /* tmp_d = floor(arg0.x) */
839 brw_RNDD(p, tmp_d, brw_swizzle1(arg0, 0));
840
841 /* result[0] = 2.0 ^ tmp */
842
843 /* Adjust exponent for floating point:
844 * exp += 127
845 */
846 brw_ADD(p, brw_writemask(tmp_d, WRITEMASK_X), tmp_d, brw_imm_d(127));
847
848 /* Install exponent and sign.
849 * Excess drops off the edge:
850 */
851 brw_SHL(p, brw_writemask(retype(dst, BRW_REGISTER_TYPE_D), WRITEMASK_X),
852 tmp_d, brw_imm_d(23));
853
854 release_tmp(c, tmp);
855 }
856
857 if (dst.dw1.bits.writemask & WRITEMASK_Y) {
858 /* result[1] = arg0.x - floor(arg0.x) */
859 brw_FRC(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0, 0));
860 }
861
862 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
863 /* As with the LOG instruction, we might be better off just
864 * doing a Taylor expansion here, seeing as we have to do all
865 * the prep work.
866 *
867 * If mathbox partial precision is too low, consider also:
868 * result[3] = result[0] * EXP(result[1])
869 */
870 emit_math1(c,
871 BRW_MATH_FUNCTION_EXP,
872 brw_writemask(dst, WRITEMASK_Z),
873 brw_swizzle1(arg0, 0),
874 BRW_MATH_PRECISION_FULL);
875 }
876
877 if (dst.dw1.bits.writemask & WRITEMASK_W) {
878 /* result[3] = 1.0; */
879 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), brw_imm_f(1));
880 }
881 }
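/* A host-side sketch (assumption: IEEE-754 single precision) of the
 * X-channel bit trick above -- 2^n is formed directly from the biased
 * exponent, just as the ADD/SHL pair does on the EU:
 */
#if 0
#include <stdint.h>
#include <string.h>

static float exp2_from_int(int n)
{
   uint32_t bits = (uint32_t)(n + 127) << 23;   /* sign 0, mantissa 0 */
   float f;
   memcpy(&f, &bits, sizeof(f));
   return f;                                    /* n == 3 -> 8.0f */
}
#endif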
882
883
884 static void emit_log_noalias( struct brw_vs_compile *c,
885 struct brw_reg dst,
886 struct brw_reg arg0 )
887 {
888 struct brw_compile *p = &c->func;
889 struct brw_reg tmp = dst;
890 struct brw_reg tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
891 struct brw_reg arg0_ud = retype(arg0, BRW_REGISTER_TYPE_UD);
892 bool need_tmp = (dst.dw1.bits.writemask != 0xf ||
893 dst.file != BRW_GENERAL_REGISTER_FILE);
894
895 if (need_tmp) {
896 tmp = get_tmp(c);
897 tmp_ud = retype(tmp, BRW_REGISTER_TYPE_UD);
898 }
899
900 /* Perform mant = frexpf(fabsf(x), &exp), adjusting exp and mant
901 * according to the spec:
902 *
903 * These almost look like they could be joined up, but it's not really
904 * practical:
905 *
906 * result[0].f = ((x.i & ((1<<31)-1)) >> 23) - 127
907 * result[1].i = (x.i & ((1<<23)-1)) | (127<<23)
908 */
909 if (dst.dw1.bits.writemask & WRITEMASK_XZ) {
910 brw_AND(p,
911 brw_writemask(tmp_ud, WRITEMASK_X),
912 brw_swizzle1(arg0_ud, 0),
913 brw_imm_ud((1U<<31)-1));
914
915 brw_SHR(p,
916 brw_writemask(tmp_ud, WRITEMASK_X),
917 tmp_ud,
918 brw_imm_ud(23));
919
920 brw_ADD(p,
921 brw_writemask(tmp, WRITEMASK_X),
922 retype(tmp_ud, BRW_REGISTER_TYPE_D), /* does it matter? */
923 brw_imm_d(-127));
924 }
925
926 if (dst.dw1.bits.writemask & WRITEMASK_YZ) {
927 brw_AND(p,
928 brw_writemask(tmp_ud, WRITEMASK_Y),
929 brw_swizzle1(arg0_ud, 0),
930 brw_imm_ud((1<<23)-1));
931
932 brw_OR(p,
933 brw_writemask(tmp_ud, WRITEMASK_Y),
934 tmp_ud,
935 brw_imm_ud(127<<23));
936 }
937
938 if (dst.dw1.bits.writemask & WRITEMASK_Z) {
939 /* result[2] = result[0] + LOG2(result[1]); */
940
941 /* Why bother? The above is just a hint at how to do this with a
942 * Taylor series. Maybe we *should* use a Taylor series, as by
943 * the time all the above has been done it's almost certainly
944 * quicker than calling the mathbox, even with low precision.
945 *
946 * Options are:
947 * - result[0] + mathbox.LOG2(result[1])
948 * - mathbox.LOG2(arg0.x)
949 * - result[0] + inline_taylor_approx(result[1])
950 */
951 emit_math1(c,
952 BRW_MATH_FUNCTION_LOG,
953 brw_writemask(tmp, WRITEMASK_Z),
954 brw_swizzle1(tmp, 1),
955 BRW_MATH_PRECISION_FULL);
956
957 brw_ADD(p,
958 brw_writemask(tmp, WRITEMASK_Z),
959 brw_swizzle1(tmp, 2),
960 brw_swizzle1(tmp, 0));
961 }
962
963 if (dst.dw1.bits.writemask & WRITEMASK_W) {
964 /* result[3] = 1.0; */
965 brw_MOV(p, brw_writemask(tmp, WRITEMASK_W), brw_imm_f(1));
966 }
967
968 if (need_tmp) {
969 brw_MOV(p, dst, tmp);
970 release_tmp(c, tmp);
971 }
972 }
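/* Worked example for the decomposition above: x = 6.0f has biased exponent
 * bits of 129 and a mantissa of 1.5, so result[0] = 129 - 127 = 2,
 * result[1] = 1.5, and result[2] = 2 + log2(1.5) ~= 2.585, which is log2(6).
 */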
973
974
975 /* Need to unalias - consider swizzles: r0 = DST r0.xxxx r1
976 */
977 static void emit_dst_noalias( struct brw_vs_compile *c,
978 struct brw_reg dst,
979 struct brw_reg arg0,
980 struct brw_reg arg1)
981 {
982 struct brw_compile *p = &c->func;
983
984 /* There must be a better way to do this:
985 */
986 if (dst.dw1.bits.writemask & WRITEMASK_X)
987 brw_MOV(p, brw_writemask(dst, WRITEMASK_X), brw_imm_f(1.0));
988 if (dst.dw1.bits.writemask & WRITEMASK_Y)
989 brw_MUL(p, brw_writemask(dst, WRITEMASK_Y), arg0, arg1);
990 if (dst.dw1.bits.writemask & WRITEMASK_Z)
991 brw_MOV(p, brw_writemask(dst, WRITEMASK_Z), arg0);
992 if (dst.dw1.bits.writemask & WRITEMASK_W)
993 brw_MOV(p, brw_writemask(dst, WRITEMASK_W), arg1);
994 }
995
996
997 static void emit_xpd( struct brw_compile *p,
998 struct brw_reg dst,
999 struct brw_reg t,
1000 struct brw_reg u)
1001 {
1002 brw_MUL(p, brw_null_reg(), brw_swizzle(t, 1,2,0,3), brw_swizzle(u,2,0,1,3));
1003 brw_MAC(p, dst, negate(brw_swizzle(t, 2,0,1,3)), brw_swizzle(u,1,2,0,3));
1004 }
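/* This is the standard cross-product expansion
 *    dst = t.yzxw * u.zxyw - t.zxyw * u.yzxw
 * with the first product staged in the accumulator by the MUL and the
 * subtraction folded into the MAC via the negated source.
 */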
1005
1006
1007 static void emit_lit_noalias( struct brw_vs_compile *c,
1008 struct brw_reg dst,
1009 struct brw_reg arg0 )
1010 {
1011 struct brw_compile *p = &c->func;
1012 struct brw_reg tmp = dst;
1013 bool need_tmp = (dst.file != BRW_GENERAL_REGISTER_FILE);
1014
1015 if (need_tmp)
1016 tmp = get_tmp(c);
1017
1018 brw_MOV(p, brw_writemask(dst, WRITEMASK_YZ), brw_imm_f(0));
1019 brw_MOV(p, brw_writemask(dst, WRITEMASK_XW), brw_imm_f(1));
1020
1021 /* Need to use BRW_EXECUTE_8 and also do an 8-wide compare in order
1022 * to get all channels active inside the IF. In the clipping code
1023 * we run with NoMask, so it's not an option and we can use
1024 * BRW_EXECUTE_1 for all comparisons.
1025 */
1026 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,0), brw_imm_f(0));
1027 brw_IF(p, BRW_EXECUTE_8);
1028 {
1029 brw_MOV(p, brw_writemask(dst, WRITEMASK_Y), brw_swizzle1(arg0,0));
1030
1031 brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_G, brw_swizzle1(arg0,1), brw_imm_f(0));
1032 brw_MOV(p, brw_writemask(tmp, WRITEMASK_Z), brw_swizzle1(arg0,1));
1033 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1034
1035 emit_math2(c,
1036 BRW_MATH_FUNCTION_POW,
1037 brw_writemask(dst, WRITEMASK_Z),
1038 brw_swizzle1(tmp, 2),
1039 brw_swizzle1(arg0, 3),
1040 BRW_MATH_PRECISION_PARTIAL);
1041 }
1042 brw_ENDIF(p);
1043
1044 release_tmp(c, tmp);
1045 }
1046
1047 static void emit_lrp_noalias(struct brw_vs_compile *c,
1048 struct brw_reg dst,
1049 struct brw_reg arg0,
1050 struct brw_reg arg1,
1051 struct brw_reg arg2)
1052 {
1053 struct brw_compile *p = &c->func;
1054
1055 brw_ADD(p, dst, negate(arg0), brw_imm_f(1.0));
1056 brw_MUL(p, brw_null_reg(), dst, arg2);
1057 brw_MAC(p, dst, arg0, arg1);
1058 }
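/* LRP computes dst = arg0 * arg1 + (1 - arg0) * arg2.  The ADD forms
 * (1 - arg0) in dst, the MUL stages (1 - arg0) * arg2 in the accumulator,
 * and the MAC adds arg0 * arg1 -- which is also why the noalias variant is
 * required: dst is used as scratch before the final result lands.
 */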
1059
1060 static struct brw_reg
1061 get_constant(struct brw_vs_compile *c,
1062 const struct prog_instruction *inst,
1063 GLuint argIndex)
1064 {
1065 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1066 struct brw_compile *p = &c->func;
1067 struct brw_reg const_reg = c->current_const[argIndex].reg;
1068
1069 assert(argIndex < 3);
1070
1071 if (c->current_const[argIndex].index != src->Index) {
1072 /* Keep track of the last constant loaded in this slot, for reuse. */
1073 c->current_const[argIndex].index = src->Index;
1074
1075 #if 0
1076 printf(" fetch const[%d] for arg %d into reg %d\n",
1077 src->Index, argIndex, c->current_const[argIndex].reg.nr);
1078 #endif
1079 /* need to fetch the constant now */
1080 brw_dp_READ_4_vs(p,
1081 const_reg, /* writeback dest */
1082 16 * src->Index, /* byte offset */
1083 SURF_INDEX_VERT_CONST_BUFFER /* binding table index */
1084 );
1085 }
1086
1087 /* replicate lower four floats into upper half (to get XYZWXYZW) */
1088 const_reg = stride(const_reg, 0, 4, 1);
1089 const_reg.subnr = 0;
1090
1091 return const_reg;
1092 }
1093
1094 static struct brw_reg
1095 get_reladdr_constant(struct brw_vs_compile *c,
1096 const struct prog_instruction *inst,
1097 GLuint argIndex)
1098 {
1099 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1100 struct brw_compile *p = &c->func;
1101 struct brw_context *brw = p->brw;
1102 struct intel_context *intel = &brw->intel;
1103 struct brw_reg const_reg = c->current_const[argIndex].reg;
1104 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1105 uint32_t offset;
1106
1107 assert(argIndex < 3);
1108
1109 /* Can't reuse a reladdr constant load. */
1110 c->current_const[argIndex].index = -1;
1111
1112 #if 0
1113 printf(" fetch const[a0.x+%d] for arg %d into reg %d\n",
1114 src->Index, argIndex, c->current_const[argIndex].reg.nr);
1115 #endif
1116
1117 if (intel->gen >= 6) {
1118 offset = src->Index;
1119 } else {
1120 struct brw_reg byte_addr_reg = retype(get_tmp(c), BRW_REGISTER_TYPE_D);
1121 brw_MUL(p, byte_addr_reg, addr_reg, brw_imm_d(16));
1122 addr_reg = byte_addr_reg;
1123 offset = 16 * src->Index;
1124 }
1125
1126 /* fetch the first vec4 */
1127 brw_dp_READ_4_vs_relative(p,
1128 const_reg,
1129 addr_reg,
1130 offset,
1131 SURF_INDEX_VERT_CONST_BUFFER);
1132
1133 return const_reg;
1134 }
1135
1136
1137
1138 /* TODO: relative addressing!
1139 */
1140 static struct brw_reg get_reg( struct brw_vs_compile *c,
1141 gl_register_file file,
1142 GLuint index )
1143 {
1144 switch (file) {
1145 case PROGRAM_TEMPORARY:
1146 case PROGRAM_INPUT:
1147 case PROGRAM_OUTPUT:
1148 assert(c->regs[file][index].nr != 0);
1149 return c->regs[file][index];
1150 case PROGRAM_STATE_VAR:
1151 case PROGRAM_CONSTANT:
1152 case PROGRAM_UNIFORM:
1153 assert(c->regs[PROGRAM_STATE_VAR][index].nr != 0);
1154 return c->regs[PROGRAM_STATE_VAR][index];
1155 case PROGRAM_ADDRESS:
1156 assert(index == 0);
1157 return c->regs[file][index];
1158
1159 case PROGRAM_UNDEFINED: /* undef values */
1160 return brw_null_reg();
1161
1162 case PROGRAM_LOCAL_PARAM:
1163 case PROGRAM_ENV_PARAM:
1164 case PROGRAM_WRITE_ONLY:
1165 default:
1166 assert(0);
1167 return brw_null_reg();
1168 }
1169 }
1170
1171
1172 /**
1173 * Indirect addressing: get reg[[arg] + offset].
1174 */
1175 static struct brw_reg deref( struct brw_vs_compile *c,
1176 struct brw_reg arg,
1177 GLint offset,
1178 GLuint reg_size )
1179 {
1180 struct brw_compile *p = &c->func;
1181 struct brw_reg tmp = get_tmp(c);
1182 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1183 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1184 GLuint byte_offset = arg.nr * 32 + arg.subnr + offset * reg_size;
1185 struct brw_reg indirect = brw_vec4_indirect(0,0);
1186 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1187
1188 /* Set the vertical stride on the register access so that the first
1189 * 4 components come from a0.0 and the second 4 from a0.1.
1190 */
1191 indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
1192
1193 {
1194 brw_push_insn_state(p);
1195 brw_set_access_mode(p, BRW_ALIGN_1);
1196
1197 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1198 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1199
1200 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1201 brw_ADD(p, brw_address_reg(1), acc, brw_imm_uw(byte_offset));
1202
1203 brw_MOV(p, tmp, indirect);
1204
1205 brw_pop_insn_state(p);
1206 }
1207
1208 /* NOTE: tmp not released */
1209 return tmp;
1210 }
1211
1212 static void
1213 move_to_reladdr_dst(struct brw_vs_compile *c,
1214 const struct prog_instruction *inst,
1215 struct brw_reg val)
1216 {
1217 struct brw_compile *p = &c->func;
1218 int reg_size = 32;
1219 struct brw_reg addr_reg = c->regs[PROGRAM_ADDRESS][0];
1220 struct brw_reg vp_address = retype(vec1(addr_reg), BRW_REGISTER_TYPE_D);
1221 struct brw_reg base = c->regs[inst->DstReg.File][inst->DstReg.Index];
1222 GLuint byte_offset = base.nr * 32 + base.subnr;
1223 struct brw_reg indirect = brw_vec4_indirect(0,0);
1224 struct brw_reg acc = retype(vec1(get_tmp(c)), BRW_REGISTER_TYPE_UW);
1225
1226 /* Because destination register indirect addressing can only use
1227 * one index, we'll write each vertex's vec4 value separately.
1228 */
1229 val.width = BRW_WIDTH_4;
1230 val.vstride = BRW_VERTICAL_STRIDE_4;
1231
1232 brw_push_insn_state(p);
1233 brw_set_access_mode(p, BRW_ALIGN_1);
1234
1235 brw_MUL(p, acc, vp_address, brw_imm_uw(reg_size));
1236 brw_ADD(p, brw_address_reg(0), acc, brw_imm_uw(byte_offset));
1237 brw_MOV(p, indirect, val);
1238
1239 brw_MUL(p, acc, suboffset(vp_address, 4), brw_imm_uw(reg_size));
1240 brw_ADD(p, brw_address_reg(0), acc,
1241 brw_imm_uw(byte_offset + reg_size / 2));
1242 brw_MOV(p, indirect, suboffset(val, 4));
1243
1244 brw_pop_insn_state(p);
1245 }
1246
1247 /**
1248 * Get brw reg corresponding to the instruction's [argIndex] src reg.
1249 * TODO: relative addressing!
1250 */
1251 static struct brw_reg
1252 get_src_reg( struct brw_vs_compile *c,
1253 const struct prog_instruction *inst,
1254 GLuint argIndex )
1255 {
1256 const GLuint file = inst->SrcReg[argIndex].File;
1257 const GLint index = inst->SrcReg[argIndex].Index;
1258 const bool relAddr = inst->SrcReg[argIndex].RelAddr;
1259
1260 if (brw_vs_arg_can_be_immediate(inst->Opcode, argIndex)) {
1261 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1262
1263 if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ZERO,
1264 SWIZZLE_ZERO,
1265 SWIZZLE_ZERO,
1266 SWIZZLE_ZERO)) {
1267 return brw_imm_f(0.0f);
1268 } else if (src->Swizzle == MAKE_SWIZZLE4(SWIZZLE_ONE,
1269 SWIZZLE_ONE,
1270 SWIZZLE_ONE,
1271 SWIZZLE_ONE)) {
1272 if (src->Negate)
1273 return brw_imm_f(-1.0F);
1274 else
1275 return brw_imm_f(1.0F);
1276 } else if (src->File == PROGRAM_CONSTANT) {
1277 const struct gl_program_parameter_list *params;
1278 float f;
1279 int component = -1;
1280
1281 switch (src->Swizzle) {
1282 case SWIZZLE_XXXX:
1283 component = 0;
1284 break;
1285 case SWIZZLE_YYYY:
1286 component = 1;
1287 break;
1288 case SWIZZLE_ZZZZ:
1289 component = 2;
1290 break;
1291 case SWIZZLE_WWWW:
1292 component = 3;
1293 break;
1294 }
1295
1296 if (component >= 0) {
1297 params = c->vp->program.Base.Parameters;
1298 f = params->ParameterValues[src->Index][component].f;
1299
1300 if (src->Abs)
1301 f = fabs(f);
1302 if (src->Negate)
1303 f = -f;
1304 return brw_imm_f(f);
1305 }
1306 }
1307 }
1308
1309 switch (file) {
1310 case PROGRAM_TEMPORARY:
1311 case PROGRAM_INPUT:
1312 case PROGRAM_OUTPUT:
1313 if (relAddr) {
1314 return deref(c, c->regs[file][0], index, 32);
1315 }
1316 else {
1317 assert(c->regs[file][index].nr != 0);
1318 return c->regs[file][index];
1319 }
1320
1321 case PROGRAM_STATE_VAR:
1322 case PROGRAM_CONSTANT:
1323 case PROGRAM_UNIFORM:
1324 case PROGRAM_ENV_PARAM:
1325 case PROGRAM_LOCAL_PARAM:
1326 if (!relAddr && c->constant_map[index] != -1) {
1327 /* Take from the push constant buffer if possible. */
1328 assert(c->regs[PROGRAM_STATE_VAR][c->constant_map[index]].nr != 0);
1329 return c->regs[PROGRAM_STATE_VAR][c->constant_map[index]];
1330 } else {
1331 /* Must be in the pull constant buffer then. */
1332 assert(c->vp->use_const_buffer);
1333 if (relAddr)
1334 return get_reladdr_constant(c, inst, argIndex);
1335 else
1336 return get_constant(c, inst, argIndex);
1337 }
1338 case PROGRAM_ADDRESS:
1339 assert(index == 0);
1340 return c->regs[file][index];
1341
1342 case PROGRAM_UNDEFINED:
1343 /* this is a normal case since we loop over all three src args */
1344 return brw_null_reg();
1345
1346 case PROGRAM_WRITE_ONLY:
1347 default:
1348 assert(0);
1349 return brw_null_reg();
1350 }
1351 }
1352
1353 /**
1354 * Return the brw reg for the given instruction's src argument.
1355 * Will return mangled results for SWZ op. The emit_swz() function
1356 * ignores this result and recalculates taking extended swizzles into
1357 * account.
1358 */
1359 static struct brw_reg get_arg( struct brw_vs_compile *c,
1360 const struct prog_instruction *inst,
1361 GLuint argIndex )
1362 {
1363 const struct prog_src_register *src = &inst->SrcReg[argIndex];
1364 struct brw_reg reg;
1365
1366 if (src->File == PROGRAM_UNDEFINED)
1367 return brw_null_reg();
1368
1369 reg = get_src_reg(c, inst, argIndex);
1370
1371 /* Convert 3-bit swizzle to 2-bit.
1372 */
1373 if (reg.file != BRW_IMMEDIATE_VALUE) {
1374 reg.dw1.bits.swizzle = BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
1375 GET_SWZ(src->Swizzle, 1),
1376 GET_SWZ(src->Swizzle, 2),
1377 GET_SWZ(src->Swizzle, 3));
1378
1379 /* Note this is ok for non-swizzle ARB_vp instructions */
1380 reg.negate = src->Negate ? 1 : 0;
1381 }
1382
1383 return reg;
1384 }
1385
1386
1387 /**
1388 * Get brw register for the given program dest register.
1389 */
1390 static struct brw_reg get_dst( struct brw_vs_compile *c,
1391 struct prog_dst_register dst )
1392 {
1393 struct brw_reg reg;
1394
1395 switch (dst.File) {
1396 case PROGRAM_TEMPORARY:
1397 case PROGRAM_OUTPUT:
1398 /* register-indirect addressing is only 1x1, not VxH, for
1399 * destination regs. So, for RelAddr we'll return a temporary
1400 * for the dest and do a move of the result to the RelAddr
1401 * register after the instruction emit.
1402 */
1403 if (dst.RelAddr) {
1404 reg = get_tmp(c);
1405 } else {
1406 assert(c->regs[dst.File][dst.Index].nr != 0);
1407 reg = c->regs[dst.File][dst.Index];
1408 }
1409 break;
1410 case PROGRAM_ADDRESS:
1411 assert(dst.Index == 0);
1412 reg = c->regs[dst.File][dst.Index];
1413 break;
1414 case PROGRAM_UNDEFINED:
1415 /* we may hit this for OPCODE_END, OPCODE_KIL, etc */
1416 reg = brw_null_reg();
1417 break;
1418 default:
1419 assert(0);
1420 reg = brw_null_reg();
1421 }
1422
1423 assert(reg.type != BRW_IMMEDIATE_VALUE);
1424 reg.dw1.bits.writemask = dst.WriteMask;
1425
1426 return reg;
1427 }
1428
1429
1430 static void emit_swz( struct brw_vs_compile *c,
1431 struct brw_reg dst,
1432 const struct prog_instruction *inst)
1433 {
1434 const GLuint argIndex = 0;
1435 const struct prog_src_register src = inst->SrcReg[argIndex];
1436 struct brw_compile *p = &c->func;
1437 GLuint zeros_mask = 0;
1438 GLuint ones_mask = 0;
1439 GLuint src_mask = 0;
1440 GLubyte src_swz[4];
1441 bool need_tmp = (src.Negate &&
1442 dst.file != BRW_GENERAL_REGISTER_FILE);
1443 struct brw_reg tmp = dst;
1444 GLuint i;
1445
1446 if (need_tmp)
1447 tmp = get_tmp(c);
1448
1449 for (i = 0; i < 4; i++) {
1450 if (dst.dw1.bits.writemask & (1<<i)) {
1451 GLubyte s = GET_SWZ(src.Swizzle, i);
1452 switch (s) {
1453 case SWIZZLE_X:
1454 case SWIZZLE_Y:
1455 case SWIZZLE_Z:
1456 case SWIZZLE_W:
1457 src_mask |= 1<<i;
1458 src_swz[i] = s;
1459 break;
1460 case SWIZZLE_ZERO:
1461 zeros_mask |= 1<<i;
1462 break;
1463 case SWIZZLE_ONE:
1464 ones_mask |= 1<<i;
1465 break;
1466 }
1467 }
1468 }
1469
1470 /* Do src first, in case dst aliases src:
1471 */
1472 if (src_mask) {
1473 struct brw_reg arg0;
1474
1475 arg0 = get_src_reg(c, inst, argIndex);
1476
1477 arg0 = brw_swizzle(arg0,
1478 src_swz[0], src_swz[1],
1479 src_swz[2], src_swz[3]);
1480
1481 brw_MOV(p, brw_writemask(tmp, src_mask), arg0);
1482 }
1483
1484 if (zeros_mask)
1485 brw_MOV(p, brw_writemask(tmp, zeros_mask), brw_imm_f(0));
1486
1487 if (ones_mask)
1488 brw_MOV(p, brw_writemask(tmp, ones_mask), brw_imm_f(1));
1489
1490 if (src.Negate)
1491 brw_MOV(p, brw_writemask(tmp, src.Negate), negate(tmp));
1492
1493 if (need_tmp) {
1494 brw_MOV(p, dst, tmp);
1495 release_tmp(c, tmp);
1496 }
1497 }
1498
1499 static int
1500 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
1501 {
1502 struct intel_context *intel = &brw->intel;
1503
1504 if (intel->gen >= 6) {
1505 /* URB data written (does not include the message header reg) must
1506 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
1507 * section 5.4.3.2.2: URB_INTERLEAVED.
1508 *
1509 * URB entries are allocated on a multiple of 1024 bits, so an
1510 * extra 128 bits written here to make the end align to 256 is
1511 * no problem.
1512 */
1513 if ((mlen % 2) != 1)
1514 mlen++;
1515 }
1516
1517 return mlen;
1518 }
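/* Worked example: a message of 1 header + 3 data regs (mlen 4) is padded to
 * mlen 5 on gen6+ so the written data portion becomes 4 regs, a multiple of
 * 2; an already-odd mlen passes through unchanged.
 */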
1519
1520 /**
1521 * Post-vertex-program processing. Send the results to the URB.
1522 */
1523 static void emit_vertex_write( struct brw_vs_compile *c)
1524 {
1525 struct brw_compile *p = &c->func;
1526 struct brw_context *brw = p->brw;
1527 struct intel_context *intel = &brw->intel;
1528 struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
1529 struct brw_reg ndc;
1530 int eot;
1531 GLuint len_vertex_header = 2;
1532 int i;
1533 int msg_len;
1534 int slot;
1535
1536 if (c->key.copy_edgeflag) {
1537 brw_MOV(p,
1538 get_reg(c, PROGRAM_OUTPUT, VERT_RESULT_EDGE),
1539 get_reg(c, PROGRAM_INPUT, VERT_ATTRIB_EDGEFLAG));
1540 }
1541
1542 if (intel->gen < 6) {
1543 /* Build ndc coords */
1544 ndc = get_tmp(c);
1545 /* ndc = 1.0 / pos.w */
1546 emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
1547 /* ndc.xyz = pos * ndc */
1548 brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
1549 }
1550
1551 /* Update the header for point size, user clipping flags, and -ve rhw
1552 * workaround.
1553 */
1554 if (intel->gen >= 6) {
1555 struct brw_reg m1 = brw_message_reg(1);
1556
1557 /* On gen6, m1 has each value in a separate dword, so we never
1558 * need to mess with a temporary for computing the m1 value.
1559 */
1560 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1561 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1562 brw_MOV(p, brw_writemask(m1, WRITEMASK_W),
1563 brw_swizzle1(c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ], 0));
1564 }
1565
1566 /* Set the user clip distances in dword 8-15. (m3-4)*/
1567 if (c->key.userclip_active) {
1568 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1569 struct brw_reg m;
1570 if (i < 4)
1571 m = brw_message_reg(3);
1572 else
1573 m = brw_message_reg(4);
1574
1575 brw_DP4(p, brw_writemask(m, (1 << (i & 3))), pos, c->userplane[i]);
1576 }
1577 }
1578 } else if ((c->prog_data.outputs_written &
1579 BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
1580 c->key.userclip_active || brw->has_negative_rhw_bug) {
1581 struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
1582 GLuint i;
1583
1584 brw_MOV(p, header1, brw_imm_ud(0));
1585
1586 brw_set_access_mode(p, BRW_ALIGN_16);
1587
1588 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
1589 struct brw_reg psiz = c->regs[PROGRAM_OUTPUT][VERT_RESULT_PSIZ];
1590 brw_MUL(p, brw_writemask(header1, WRITEMASK_W),
1591 brw_swizzle1(psiz, 0), brw_imm_f(1<<11));
1592 brw_AND(p, brw_writemask(header1, WRITEMASK_W),
1593 header1, brw_imm_ud(0x7ff<<8));
1594 }
1595
1596 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
1597 brw_set_conditionalmod(p, BRW_CONDITIONAL_L);
1598 brw_DP4(p, brw_null_reg(), pos, c->userplane[i]);
1599 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<i));
1600 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1601 }
1602
1603 /* i965 clipping workaround:
1604 * 1) Test for -ve rhw
1605 * 2) If set,
1606 * set ndc = (0,0,0,0)
1607 * set ucp[6] = 1
1608 *
1609 * Later, clipping will detect ucp[6] and ensure the primitive is
1610 * clipped against all fixed planes.
1611 */
1612 if (brw->has_negative_rhw_bug) {
1613 brw_CMP(p,
1614 vec8(brw_null_reg()),
1615 BRW_CONDITIONAL_L,
1616 brw_swizzle1(ndc, 3),
1617 brw_imm_f(0));
1618
1619 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
1620 brw_MOV(p, ndc, brw_imm_f(0));
1621 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1622 }
1623
1624 brw_set_access_mode(p, BRW_ALIGN_1); /* why? */
1625 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), header1);
1626 brw_set_access_mode(p, BRW_ALIGN_16);
1627
1628 release_tmp(c, header1);
1629 }
1630 else {
1631 brw_MOV(p, retype(brw_message_reg(1), BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
1632 }
1633
1634 /* Emit the (interleaved) headers for the two vertices - an 8-reg
1635 * of zeros followed by two sets of NDC coordinates:
1636 */
1637 brw_set_access_mode(p, BRW_ALIGN_1);
1638 brw_set_acc_write_control(p, 0);
1639
1640 /* The VUE layout is documented in Volume 2a. */
1641 if (intel->gen >= 6) {
1642 /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
1643 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1644 * dword 4-7 (m2) is the 4D space position
1645 * dword 8-15 (m3,m4) of the vertex header is the user clip distance if
1646 * enabled.
1647 * m3 (or m5 when the clip distances are enabled) is the first vertex
1648 * element data we fill, which is the vertex position.
1649 */
1650 brw_MOV(p, brw_message_reg(2), pos);
1651 len_vertex_header = 1;
1652 if (c->key.userclip_active)
1653 len_vertex_header += 2;
1654 } else if (intel->gen == 5) {
1655 /* There are 20 DWs (D0-D19) in VUE header on Ironlake:
1656 * dword 0-3 (m1) of the header is indices, point width, clip flags.
1657 * dword 4-7 (m2) is the ndc position (set above)
1658 * dword 8-11 (m3) of the vertex header is the 4D space position
1659 * dword 12-19 (m4,m5) of the vertex header is the user clip distance.
1660 * m6 is a pad so that the vertex element data is aligned
1661 * m7 is the first vertex data we fill, which is the vertex position.
1662 */
1663 brw_MOV(p, brw_message_reg(2), ndc);
1664 brw_MOV(p, brw_message_reg(3), pos);
1665 brw_MOV(p, brw_message_reg(7), pos);
1666 len_vertex_header = 6;
1667 } else {
1668 /* There are 8 dwords in VUE header pre-Ironlake:
1669 * dword 0-3 (m1) is indices, point width, clip flags.
1670 * dword 4-7 (m2) is ndc position (set above)
1671 *
1672 * dword 8-11 (m3) is the first vertex data, which we always have be the
1673 * vertex position.
1674 */
1675 brw_MOV(p, brw_message_reg(2), ndc);
1676 brw_MOV(p, brw_message_reg(3), pos);
1677 len_vertex_header = 2;
1678 }
1679
1680 /* Move variable-addressed, non-overflow outputs to their MRFs. */
1681 for (slot = len_vertex_header; slot < c->vue_map.num_slots; ++slot) {
1682 if (slot >= MAX_SLOTS_IN_FIRST_URB_WRITE)
1683 break;
1684
1685 int mrf = slot + 1;
1686 int vert_result = c->vue_map.slot_to_vert_result[slot];
1687 if (c->regs[PROGRAM_OUTPUT][vert_result].file ==
1688 BRW_GENERAL_REGISTER_FILE) {
1689 brw_MOV(p, brw_message_reg(mrf),
1690 c->regs[PROGRAM_OUTPUT][vert_result]);
1691 }
1692 }
1693
1694 eot = (slot >= c->vue_map.num_slots);
1695
1696 /* Message header, plus the (first part of the) VUE. */
1697 msg_len = 1 + slot;
1698 msg_len = align_interleaved_urb_mlen(brw, msg_len);
1699 /* Any outputs beyond BRW_MAX_MRF should be in the second URB write */
1700 assert(msg_len <= BRW_MAX_MRF - 1);
1701
1702 brw_urb_WRITE(p,
1703 brw_null_reg(), /* dest */
1704 0, /* starting mrf reg nr */
1705 c->r0, /* src */
1706 0, /* allocate */
1707 1, /* used */
1708 msg_len,
1709 0, /* response len */
1710 eot, /* eot */
1711 eot, /* writes complete */
1712 0, /* urb destination offset */
1713 BRW_URB_SWIZZLE_INTERLEAVE);
1714
1715 if (slot < c->vue_map.num_slots) {
1716 /* Not all of the vertex outputs/results fit into the MRF.
1717 * Move the overflowed attributes from the GRF to the MRF and
1718 * issue another brw_urb_WRITE().
1719 */
1720 GLuint mrf = 1;
1721 for (; slot < c->vue_map.num_slots; ++slot) {
1722 int vert_result = c->vue_map.slot_to_vert_result[slot];
1723 /* move from GRF to MRF */
1724 brw_MOV(p, brw_message_reg(mrf),
1725 c->regs[PROGRAM_OUTPUT][vert_result]);
1726 mrf++;
1727 }
1728
1729 brw_urb_WRITE(p,
1730 brw_null_reg(), /* dest */
1731 0, /* starting mrf reg nr */
1732 c->r0, /* src */
1733 0, /* allocate */
1734 1, /* used */
1735 align_interleaved_urb_mlen(brw, mrf),
1736 0, /* response len */
1737 1, /* eot */
1738 1, /* writes complete */
1739 MAX_SLOTS_IN_FIRST_URB_WRITE / 2, /* urb destination offset */
1740 BRW_URB_SWIZZLE_INTERLEAVE);
1741 }
1742 }
1743
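/**
 * Detect whether the previous instruction already left exactly \p val in
 * the accumulator (an unmodified, fully-written MOV/MAC/MUL destination),
 * so that the OPCODE_MAD case below can skip the explicit MOV to the
 * accumulator.
 */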
1744 static bool
1745 accumulator_contains(struct brw_vs_compile *c, struct brw_reg val)
1746 {
1747 struct brw_compile *p = &c->func;
1748 struct brw_instruction *prev_insn = &p->store[p->nr_insn - 1];
1749
1750 if (p->nr_insn == 0)
1751 return false;
1752
1753 if (val.address_mode != BRW_ADDRESS_DIRECT)
1754 return false;
1755
1756 if (val.negate || val.abs)
1757 return false;
1758
1759 switch (prev_insn->header.opcode) {
1760 case BRW_OPCODE_MOV:
1761 case BRW_OPCODE_MAC:
1762 case BRW_OPCODE_MUL:
1763 if (prev_insn->header.access_mode == BRW_ALIGN_16 &&
1764 prev_insn->header.execution_size == val.width &&
1765 prev_insn->bits1.da1.dest_reg_file == val.file &&
1766 prev_insn->bits1.da1.dest_reg_type == val.type &&
1767 prev_insn->bits1.da1.dest_address_mode == val.address_mode &&
1768 prev_insn->bits1.da1.dest_reg_nr == val.nr &&
1769 prev_insn->bits1.da16.dest_subreg_nr == val.subnr / 16 &&
1770 prev_insn->bits1.da16.dest_writemask == 0xf)
1771 return true;
1772 else
1773 return false;
1774 default:
1775 return false;
1776 }
1777 }
1778
1779 static uint32_t
1780 get_predicate(const struct prog_instruction *inst)
1781 {
1782 if (inst->DstReg.CondMask == COND_TR)
1783 return BRW_PREDICATE_NONE;
1784
1785 /* GLSL only produces predicates for COND_NE and one channel per
1786 * vector. Fail badly if someone starts doing something else, as it might
1787 * mean infinite looping or something.
1788 *
1789 * We'd like to support all the condition codes, but our hardware doesn't
1790 * quite match the Mesa IR, which is modeled after the NV extensions. For
1791 * those, the instruction may update the condition codes or not, then any
1792 * later instruction may use one of those condition codes. For gen4, the
1793 * instruction may update the flags register based on one of the condition
1794 * codes output by the instruction, and then further instructions may
1795 * predicate on that. We can probably support this, but it won't
1796 * necessarily be easy.
1797 */
1798 assert(inst->DstReg.CondMask == COND_NE);
1799
1800 switch (inst->DstReg.CondSwizzle) {
1801 case SWIZZLE_XXXX:
1802 return BRW_PREDICATE_ALIGN16_REPLICATE_X;
1803 case SWIZZLE_YYYY:
1804 return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
1805 case SWIZZLE_ZZZZ:
1806 return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
1807 case SWIZZLE_WWWW:
1808 return BRW_PREDICATE_ALIGN16_REPLICATE_W;
1809 default:
1810 _mesa_problem(NULL, "Unexpected predicate swizzle: 0x%08x\n",
1811 inst->DstReg.CondSwizzle);
1812 return BRW_PREDICATE_NORMAL;
1813 }
1814 }
1815
1816 static void
1817 brw_vs_rescale_gl_fixed(struct brw_vs_compile *c)
1818 {
1819 struct brw_compile *p = &c->func;
1820 int i;
1821
1822 for (i = 0; i < VERT_ATTRIB_MAX; i++) {
1823 if (!(c->prog_data.inputs_read & BITFIELD64_BIT(i)))
1824 continue;
1825
1826 if (c->key.gl_fixed_input_size[i] != 0) {
1827 struct brw_reg reg = c->regs[PROGRAM_INPUT][i];
1828
1829 brw_MUL(p,
1830 brw_writemask(reg, (1 << c->key.gl_fixed_input_size[i]) - 1),
1831 reg, brw_imm_f(1.0 / 65536.0));
1832 }
1833 }
1834 }
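/* For example, a GL_FIXED attribute value of 0x00010000 (65536 in 16.16
 * fixed point) scales to 1.0f here.
 */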
1835
1836 /* Emit the vertex program instructions here.
1837 */
1838 void brw_old_vs_emit(struct brw_vs_compile *c )
1839 {
1840 #define MAX_IF_DEPTH 32
1841 #define MAX_LOOP_DEPTH 32
1842 struct brw_compile *p = &c->func;
1843 struct brw_context *brw = p->brw;
1844 struct intel_context *intel = &brw->intel;
1845 const GLuint nr_insns = c->vp->program.Base.NumInstructions;
1846 GLuint insn;
1847 const struct brw_indirect stack_index = brw_indirect(0, 0);
1848 GLuint index;
1849 GLuint file;
1850
1851 if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
1852 printf("vs-mesa:\n");
1853 _mesa_fprint_program_opt(stdout, &c->vp->program.Base, PROG_PRINT_DEBUG,
1854 true);
1855 printf("\n");
1856 }
1857
1858 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1859 brw_set_access_mode(p, BRW_ALIGN_16);
1860
1861 brw_set_acc_write_control(p, 1);
1862
1863 for (insn = 0; insn < nr_insns; insn++) {
1864 GLuint i;
1865 struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
1866
1867 /* Message registers can't be read, so copy each output into a GRF
1868 * register if it is later used as a source register.
1869 */
1870 for (i = 0; i < 3; i++) {
1871 struct prog_src_register *src = &inst->SrcReg[i];
1872 GLuint index = src->Index;
1873 GLuint file = src->File;
1874 if (file == PROGRAM_OUTPUT && index != VERT_RESULT_HPOS)
1875 c->output_regs[index].used_in_src = true;
1876 }

      switch (inst->Opcode) {
      case OPCODE_CAL:
      case OPCODE_RET:
         c->needs_stack = true;
         break;
      default:
         break;
      }
   }

   /* Static register allocation
    */
   brw_vs_alloc_regs(c);

   brw_vs_rescale_gl_fixed(c);

   if (c->needs_stack)
      brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));

   for (insn = 0; insn < nr_insns; insn++) {

      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
      struct brw_reg args[3], dst;
      GLuint i;

#if 0
      printf("%d: ", insn);
      _mesa_print_instruction(inst);
#endif

      /* Get argument regs. SWZ is special and does this itself.
       */
      if (inst->Opcode != OPCODE_SWZ)
         for (i = 0; i < 3; i++) {
            const struct prog_src_register *src = &inst->SrcReg[i];
            index = src->Index;
            file = src->File;
            if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src) {
               /* Can't just make get_arg "do the right thing" here because
                * other callers of get_arg and get_src_reg don't expect any
                * special behavior for the c->output_regs[index].used_in_src
                * case.
                */
               args[i] = c->output_regs[index].reg;
               args[i].dw1.bits.swizzle =
                  BRW_SWIZZLE4(GET_SWZ(src->Swizzle, 0),
                               GET_SWZ(src->Swizzle, 1),
                               GET_SWZ(src->Swizzle, 2),
                               GET_SWZ(src->Swizzle, 3));

               /* Note this is ok for non-swizzle ARB_vp instructions */
               args[i].negate = src->Negate ? 1 : 0;
            } else
               args[i] = get_arg(c, inst, i);
         }

      /* Get dest regs. Note that it is possible for a reg to be both
       * dst and arg, given the static allocation of registers. So
       * care needs to be taken emitting multi-operation instructions.
       */
      index = inst->DstReg.Index;
      file = inst->DstReg.File;
      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
         /* Can't just make get_dst "do the right thing" here because other
          * callers of get_dst don't expect any special behavior for the
          * c->output_regs[index].used_in_src case.
          */
         dst = brw_writemask(c->output_regs[index].reg, inst->DstReg.WriteMask);
      else
         dst = get_dst(c, inst->DstReg);

      if (inst->SaturateMode != SATURATE_OFF) {
         _mesa_problem(NULL, "Unsupported saturate %d in vertex shader",
                       inst->SaturateMode);
      }

      switch (inst->Opcode) {
      case OPCODE_ABS:
         args[0].negate = false;
         brw_MOV(p, dst, brw_abs(args[0]));
         break;
      case OPCODE_ADD:
         brw_ADD(p, dst, args[0], args[1]);
         break;
      case OPCODE_COS:
         emit_math1(c, BRW_MATH_FUNCTION_COS, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_DP2:
         brw_DP2(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP3:
         brw_DP3(p, dst, args[0], args[1]);
         break;
      case OPCODE_DP4:
         brw_DP4(p, dst, args[0], args[1]);
         break;
      case OPCODE_DPH:
         brw_DPH(p, dst, args[0], args[1]);
         break;
      case OPCODE_DST:
         unalias2(c, dst, args[0], args[1], emit_dst_noalias);
         break;
      case OPCODE_EXP:
         unalias1(c, dst, args[0], emit_exp_noalias);
         break;
      case OPCODE_EX2:
         emit_math1(c, BRW_MATH_FUNCTION_EXP, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_ARL:
         emit_arl(p, dst, args[0]);
         break;
      case OPCODE_FLR:
         brw_RNDD(p, dst, args[0]);
         break;
      case OPCODE_FRC:
         brw_FRC(p, dst, args[0]);
         break;
      case OPCODE_LOG:
         unalias1(c, dst, args[0], emit_log_noalias);
         break;
      case OPCODE_LG2:
         emit_math1(c, BRW_MATH_FUNCTION_LOG, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_LIT:
         unalias1(c, dst, args[0], emit_lit_noalias);
         break;
      case OPCODE_LRP:
         unalias3(c, dst, args[0], args[1], args[2], emit_lrp_noalias);
         break;
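      /* MAD is emitted as MAC, which computes args[0] * args[1] plus the
       * accumulator, so args[2] has to be loaded into the accumulator first
       * unless the previous instruction already left it there.
       */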
      case OPCODE_MAD:
         if (!accumulator_contains(c, args[2]))
            brw_MOV(p, brw_acc_reg(), args[2]);
         brw_MAC(p, dst, args[0], args[1]);
         break;
      case OPCODE_CMP:
         emit_cmp(p, dst, args[0], args[1], args[2]);
         break;
      case OPCODE_MAX:
         emit_max(p, dst, args[0], args[1]);
         break;
      case OPCODE_MIN:
         emit_min(p, dst, args[0], args[1]);
         break;
      case OPCODE_MOV:
         brw_MOV(p, dst, args[0]);
         break;
      case OPCODE_MUL:
         brw_MUL(p, dst, args[0], args[1]);
         break;
      case OPCODE_POW:
         emit_math2(c, BRW_MATH_FUNCTION_POW, dst, args[0], args[1], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RCP:
         emit_math1(c, BRW_MATH_FUNCTION_INV, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_RSQ:
         emit_math1(c, BRW_MATH_FUNCTION_RSQ, dst, brw_abs(args[0]), BRW_MATH_PRECISION_FULL);
         break;

      case OPCODE_SEQ:
         unalias2(c, dst, args[0], args[1], emit_seq);
         break;
      case OPCODE_SIN:
         emit_math1(c, BRW_MATH_FUNCTION_SIN, dst, args[0], BRW_MATH_PRECISION_FULL);
         break;
      case OPCODE_SNE:
         unalias2(c, dst, args[0], args[1], emit_sne);
         break;
      case OPCODE_SGE:
         unalias2(c, dst, args[0], args[1], emit_sge);
         break;
      case OPCODE_SGT:
         unalias2(c, dst, args[0], args[1], emit_sgt);
         break;
      case OPCODE_SLT:
         unalias2(c, dst, args[0], args[1], emit_slt);
         break;
      case OPCODE_SLE:
         unalias2(c, dst, args[0], args[1], emit_sle);
         break;
      case OPCODE_SSG:
         unalias1(c, dst, args[0], emit_sign);
         break;
      case OPCODE_SUB:
         brw_ADD(p, dst, args[0], negate(args[1]));
         break;
      case OPCODE_SWZ:
         /* The args[0] value can't be used here as it won't have
          * correctly encoded the full swizzle:
          */
         emit_swz(c, dst, inst);
         break;
      case OPCODE_TRUNC:
         /* round toward zero */
         brw_RNDZ(p, dst, args[0]);
         break;
      case OPCODE_XPD:
         emit_xpd(p, dst, args[0], args[1]);
         break;
      case OPCODE_IF: {
         struct brw_instruction *if_inst = brw_IF(p, BRW_EXECUTE_8);
         /* Note that brw_IF smashes the predicate_control field. */
         if_inst->header.predicate_control = get_predicate(inst);
         break;
      }
      case OPCODE_ELSE:
         clear_current_const(c);
         brw_ELSE(p);
         break;
      case OPCODE_ENDIF:
         clear_current_const(c);
         brw_ENDIF(p);
         break;
      case OPCODE_BGNLOOP:
         clear_current_const(c);
         brw_DO(p, BRW_EXECUTE_8);
         break;
      case OPCODE_BRK:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_BREAK(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case OPCODE_CONT:
         brw_set_predicate_control(p, get_predicate(inst));
         if (intel->gen >= 6) {
            gen6_CONT(p);
         } else {
            brw_CONT(p);
         }
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case OPCODE_ENDLOOP:
         clear_current_const(c);
         brw_WHILE(p);
         break;

      case OPCODE_BRA:
         brw_set_predicate_control(p, get_predicate(inst));
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
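      /* Subroutine calls use a software stack: push the return address
       * (the IP three 16-byte instructions ahead, i.e. just past the jump
       * emitted below), bump the stack pointer by one dword, and record
       * the jump so brw_resolve_cals() can patch in the real target offset
       * once all labels are known.
       */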
      case OPCODE_CAL:
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_ADD(p, deref_1d(stack_index, 0), brw_ip_reg(), brw_imm_d(3*16));
         brw_set_access_mode(p, BRW_ALIGN_16);
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(4));
         brw_save_call(p, inst->Comment, p->nr_insn);
         brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
         break;
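      /* The matching pop: step the stack pointer back one dword and jump
       * to the saved return address.
       */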
      case OPCODE_RET:
         brw_ADD(p, get_addr_reg(stack_index),
                 get_addr_reg(stack_index), brw_imm_d(-4));
         brw_set_access_mode(p, BRW_ALIGN_1);
         brw_MOV(p, brw_ip_reg(), deref_1d(stack_index, 0));
         brw_set_access_mode(p, BRW_ALIGN_16);
         break;
      case OPCODE_END:
         emit_vertex_write(c);
         break;
      case OPCODE_PRINT:
         /* no-op */
         break;
      case OPCODE_BGNSUB:
         brw_save_label(p, inst->Comment, p->nr_insn);
         break;
      case OPCODE_ENDSUB:
         /* no-op */
         break;
      default:
         _mesa_problem(NULL, "Unsupported opcode %i (%s) in vertex shader",
                       inst->Opcode, inst->Opcode < MAX_OPCODE ?
                       _mesa_opcode_string(inst->Opcode) :
                       "unknown");
      }

      /* Set the predication update on the last instruction of the native
       * instruction sequence.
       *
       * This would be problematic if it was set on a math instruction,
       * but that shouldn't be the case with the current GLSL compiler.
       */
      if (inst->CondUpdate) {
         struct brw_instruction *hw_insn = &p->store[p->nr_insn - 1];

         assert(hw_insn->header.destreg__conditionalmod == 0);
         hw_insn->header.destreg__conditionalmod = BRW_CONDITIONAL_NZ;
      }

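      /* If this output is also read as a source, the result was computed
       * in its GRF copy; propagate it to the real output register now.
       */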
      if ((inst->DstReg.File == PROGRAM_OUTPUT)
          && (inst->DstReg.Index != VERT_RESULT_HPOS)
          && c->output_regs[inst->DstReg.Index].used_in_src) {
         brw_MOV(p, get_dst(c, inst->DstReg), dst);
      }

      /* Result color clamping.
       *
       * When the destination register is an output register and
       * it's a primary/secondary front/back color, we have to clamp
       * the result to [0,1]. This is done by enabling the
       * saturation bit for the last instruction.
       *
       * We don't use brw_set_saturate() as it modifies
       * p->current->header.saturate, which affects all the subsequent
       * instructions. Instead, we directly modify the header
       * of the last (already stored) instruction.
       */
      if (inst->DstReg.File == PROGRAM_OUTPUT &&
          c->key.clamp_vertex_color) {
         if ((inst->DstReg.Index == VERT_RESULT_COL0)
             || (inst->DstReg.Index == VERT_RESULT_COL1)
             || (inst->DstReg.Index == VERT_RESULT_BFC0)
             || (inst->DstReg.Index == VERT_RESULT_BFC1)) {
            p->store[p->nr_insn-1].header.saturate = 1;
         }
      }

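      /* A relatively-addressed destination was written through a stand-in
       * register; move_to_reladdr_dst() stores the result to its actual
       * location via the address register.
       */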
      if (inst->DstReg.RelAddr) {
         assert(inst->DstReg.File == PROGRAM_TEMPORARY ||
                inst->DstReg.File == PROGRAM_OUTPUT);
         move_to_reladdr_dst(c, inst, dst);
      }

      release_tmps(c);
   }

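   /* All labels and call sites are known now: patch the recorded CAL jumps
    * and fill in the jump targets for the flow-control instructions.
    */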
   brw_resolve_cals(p);
   brw_set_uip_jip(p);

   brw_optimize(p);

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      int i;

      printf("vs-native:\n");
      for (i = 0; i < p->nr_insn; i++)
         brw_disasm(stdout, &p->store[i], intel->gen);
      printf("\n");
   }
}