i965/vs: Store texturing results into a vec4 temporary.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
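/* The ALU1/ALU2 macros below expand into convenience constructors (NOT,
 * MOV, ADD, MUL, ...) that only build a vec4_instruction; the caller
 * still hands the result to emit() or emit_before() to append it to the
 * instruction stream, e.g. emit(MOV(dst, src)).
 */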
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU2(ADD)
117 ALU2(MUL)
118 ALU2(MACH)
119 ALU2(AND)
120 ALU2(OR)
121 ALU2(XOR)
122 ALU2(DP3)
123 ALU2(DP4)
124 ALU2(DPH)
125 ALU2(SHL)
126 ALU2(SHR)
127 ALU2(ASR)
128
129 /** Gen4 predicated IF. */
130 vec4_instruction *
131 vec4_visitor::IF(uint32_t predicate)
132 {
133 vec4_instruction *inst;
134
135 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
136 inst->predicate = predicate;
137
138 return inst;
139 }
140
141 /** Gen6+ IF with embedded comparison. */
142 vec4_instruction *
143 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
144 {
145 assert(intel->gen >= 6);
146
147 vec4_instruction *inst;
148
149 resolve_ud_negate(&src0);
150 resolve_ud_negate(&src1);
151
152 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
153 src0, src1);
154 inst->conditional_mod = condition;
155
156 return inst;
157 }
158
159 /**
160 * CMP: Sets the low bit of the destination channels with the result
161 * of the comparison, while the upper bits are undefined, and updates
162 * the flag register with the packed 16 bits of the result.
163 */
164 vec4_instruction *
165 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
166 {
167 vec4_instruction *inst;
168
169 /* original gen4 does type conversion to the destination type
170          * before comparison, producing garbage results for floating
171 * point comparisons.
172 */
173 if (intel->gen == 4) {
174 dst.type = src0.type;
175 if (dst.file == HW_REG)
176 dst.fixed_hw_reg.type = dst.type;
177 }
178
179 resolve_ud_negate(&src0);
180 resolve_ud_negate(&src1);
181
182 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
183 inst->conditional_mod = condition;
184
185 return inst;
186 }
187
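/* Scratch reads and writes are send messages (data port accesses), so
 * their payload has to be staged in message registers: base_mrf is the
 * first MRF of the payload and mlen its length in registers, counting
 * the message header.
 */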
188 vec4_instruction *
189 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
190 {
191 vec4_instruction *inst;
192
193 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
194 dst, index);
195 inst->base_mrf = 14;
196 inst->mlen = 2;
197
198 return inst;
199 }
200
201 vec4_instruction *
202 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
203 {
204 vec4_instruction *inst;
205
206 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
207 dst, src, index);
208 inst->base_mrf = 13;
209 inst->mlen = 3;
210
211 return inst;
212 }
213
214 void
215 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
216 {
217 static enum opcode dot_opcodes[] = {
218 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
219 };
220
221 emit(dot_opcodes[elements - 2], dst, src0, src1);
222 }
223
224 src_reg
225 vec4_visitor::fix_math_operand(src_reg src)
226 {
227 /* The gen6 math instruction ignores the source modifiers --
228 * swizzle, abs, negate, and at least some parts of the register
229 * region description.
230 *
231 * Rather than trying to enumerate all these cases, *always* expand the
232 * operand to a temp GRF for gen6.
233 *
234 * For gen7, keep the operand as-is, except if immediate, which gen7 still
235 * can't use.
236 */
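   /* For example, a negated or swizzled operand (say, -val or val.zzzz)
    * would otherwise reach the gen6 math unit with those modifiers
    * silently ignored; copying it through a plain vec4 temporary makes
    * the MOV apply them first.
    */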
237
238 if (intel->gen == 7 && src.file != IMM)
239 return src;
240
241 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
242 expanded.type = src.type;
243 emit(MOV(expanded, src));
244 return src_reg(expanded);
245 }
246
247 void
248 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
249 {
250 src = fix_math_operand(src);
251
252 if (dst.writemask != WRITEMASK_XYZW) {
253 /* The gen6 math instruction must be align1, so we can't do
254 * writemasks.
255 */
256 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
257
258 emit(opcode, temp_dst, src);
259
260 emit(MOV(dst, src_reg(temp_dst)));
261 } else {
262 emit(opcode, dst, src);
263 }
264 }
265
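/* On gen4/5, math is a message to the shared math unit rather than an
 * ordinary instruction, so the operand is passed through message
 * registers: base_mrf names the first MRF and mlen the message length.
 */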
266 void
267 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
268 {
269 vec4_instruction *inst = emit(opcode, dst, src);
270 inst->base_mrf = 1;
271 inst->mlen = 1;
272 }
273
274 void
275 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
276 {
277 switch (opcode) {
278 case SHADER_OPCODE_RCP:
279 case SHADER_OPCODE_RSQ:
280 case SHADER_OPCODE_SQRT:
281 case SHADER_OPCODE_EXP2:
282 case SHADER_OPCODE_LOG2:
283 case SHADER_OPCODE_SIN:
284 case SHADER_OPCODE_COS:
285 break;
286 default:
287 assert(!"not reached: bad math opcode");
288 return;
289 }
290
291 if (intel->gen >= 6) {
292 return emit_math1_gen6(opcode, dst, src);
293 } else {
294 return emit_math1_gen4(opcode, dst, src);
295 }
296 }
297
298 void
299 vec4_visitor::emit_math2_gen6(enum opcode opcode,
300 dst_reg dst, src_reg src0, src_reg src1)
301 {
302 src0 = fix_math_operand(src0);
303 src1 = fix_math_operand(src1);
304
305 if (dst.writemask != WRITEMASK_XYZW) {
306 /* The gen6 math instruction must be align1, so we can't do
307 * writemasks.
308 */
309 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
310 temp_dst.type = dst.type;
311
312 emit(opcode, temp_dst, src0, src1);
313
314 emit(MOV(dst, src_reg(temp_dst)));
315 } else {
316 emit(opcode, dst, src0, src1);
317 }
318 }
319
320 void
321 vec4_visitor::emit_math2_gen4(enum opcode opcode,
322 dst_reg dst, src_reg src0, src_reg src1)
323 {
324 vec4_instruction *inst = emit(opcode, dst, src0, src1);
325 inst->base_mrf = 1;
326 inst->mlen = 2;
327 }
328
329 void
330 vec4_visitor::emit_math(enum opcode opcode,
331 dst_reg dst, src_reg src0, src_reg src1)
332 {
333 switch (opcode) {
334 case SHADER_OPCODE_POW:
335 case SHADER_OPCODE_INT_QUOTIENT:
336 case SHADER_OPCODE_INT_REMAINDER:
337 break;
338 default:
339 assert(!"not reached: unsupported binary math opcode");
340 return;
341 }
342
343 if (intel->gen >= 6) {
344 return emit_math2_gen6(opcode, dst, src0, src1);
345 } else {
346 return emit_math2_gen4(opcode, dst, src0, src1);
347 }
348 }
349
350 void
351 vec4_visitor::visit_instructions(const exec_list *list)
352 {
353 foreach_list(node, list) {
354 ir_instruction *ir = (ir_instruction *)node;
355
356 base_ir = ir;
357 ir->accept(this);
358 }
359 }
360
361
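/* Returns the size of a type in vec4 (128-bit) register slots: scalars
 * and vectors take one slot each, matrices one per column, and arrays
 * and structs the sum of their elements.
 */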
362 static int
363 type_size(const struct glsl_type *type)
364 {
365 unsigned int i;
366 int size;
367
368 switch (type->base_type) {
369 case GLSL_TYPE_UINT:
370 case GLSL_TYPE_INT:
371 case GLSL_TYPE_FLOAT:
372 case GLSL_TYPE_BOOL:
373 if (type->is_matrix()) {
374 return type->matrix_columns;
375 } else {
376          /* Regardless of the size of the vector, it gets a vec4. This is bad
377 * packing for things like floats, but otherwise arrays become a
378 * mess. Hopefully a later pass over the code can pack scalars
379 * down if appropriate.
380 */
381 return 1;
382 }
383 case GLSL_TYPE_ARRAY:
384 assert(type->length > 0);
385 return type_size(type->fields.array) * type->length;
386 case GLSL_TYPE_STRUCT:
387 size = 0;
388 for (i = 0; i < type->length; i++) {
389 size += type_size(type->fields.structure[i].type);
390 }
391 return size;
392 case GLSL_TYPE_SAMPLER:
393 /* Samplers take up one slot in UNIFORMS[], but they're baked in
394 * at link time.
395 */
396 return 1;
397 default:
398 assert(0);
399 return 0;
400 }
401 }
402
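/* Allocate a new virtual GRF of the given size (in vec4 slots) and
 * return its index.  The bookkeeping arrays grow geometrically (16, 32,
 * 64, ...) so repeated allocations stay cheap.
 */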
403 int
404 vec4_visitor::virtual_grf_alloc(int size)
405 {
406 if (virtual_grf_array_size <= virtual_grf_count) {
407 if (virtual_grf_array_size == 0)
408 virtual_grf_array_size = 16;
409 else
410 virtual_grf_array_size *= 2;
411 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
412 virtual_grf_array_size);
413 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
414 virtual_grf_array_size);
415 }
416 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
417 virtual_grf_reg_count += size;
418 virtual_grf_sizes[virtual_grf_count] = size;
419 return virtual_grf_count++;
420 }
421
422 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
423 {
424 init();
425
426 this->file = GRF;
427 this->reg = v->virtual_grf_alloc(type_size(type));
428
429 if (type->is_array() || type->is_record()) {
430 this->swizzle = BRW_SWIZZLE_NOOP;
431 } else {
432 this->swizzle = swizzle_for_size(type->vector_elements);
433 }
434
435 this->type = brw_type_for_base_type(type);
436 }
437
438 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
439 {
440 init();
441
442 this->file = GRF;
443 this->reg = v->virtual_grf_alloc(type_size(type));
444
445 if (type->is_array() || type->is_record()) {
446 this->writemask = WRITEMASK_XYZW;
447 } else {
448 this->writemask = (1 << type->vector_elements) - 1;
449 }
450
451 this->type = brw_type_for_base_type(type);
452 }
453
454 /* Our support for uniforms is piggy-backed on the struct
455 * gl_fragment_program, because that's where the values actually
456 * get stored, rather than in some global gl_shader_program uniform
457 * store.
458 */
459 void
460 vec4_visitor::setup_uniform_values(ir_variable *ir)
461 {
462 int namelen = strlen(ir->name);
463
464 /* The data for our (non-builtin) uniforms is stored in a series of
465 * gl_uniform_driver_storage structs for each subcomponent that
466 * glGetUniformLocation() could name. We know it's been set up in the same
467 * order we'd walk the type, so walk the list of storage and find anything
468 * with our name, or the prefix of a component that starts with our name.
469 */
470 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
471 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
472
473 if (strncmp(ir->name, storage->name, namelen) != 0 ||
474 (storage->name[namelen] != 0 &&
475 storage->name[namelen] != '.' &&
476 storage->name[namelen] != '[')) {
477 continue;
478 }
479
480 gl_constant_value *components = storage->storage;
481 unsigned vector_count = (MAX2(storage->array_elements, 1) *
482 storage->type->matrix_columns);
483
484 for (unsigned s = 0; s < vector_count; s++) {
485 uniform_vector_size[uniforms] = storage->type->vector_elements;
486
487 int i;
488 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
489 c->prog_data.param[uniforms * 4 + i] = &components->f;
490 components++;
491 }
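         /* Pad the remaining components of this slot with a pointer to a
          * zero constant, so every uniform always contributes four param
          * entries.
          */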
492 for (; i < 4; i++) {
493 static float zero = 0;
494 c->prog_data.param[uniforms * 4 + i] = &zero;
495 }
496
497 uniforms++;
498 }
499 }
500 }
501
502 void
503 vec4_visitor::setup_uniform_clipplane_values()
504 {
505 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
506
507 if (intel->gen < 6) {
508 /* Pre-Gen6, we compact clip planes. For example, if the user
509 * enables just clip planes 0, 1, and 3, we will enable clip planes
510 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
511 * plane 2. This simplifies the implementation of the Gen6 clip
512 * thread.
513 */
514 int compacted_clipplane_index = 0;
515 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
516 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
517 continue;
518
519 this->uniform_vector_size[this->uniforms] = 4;
520 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
521 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
522 for (int j = 0; j < 4; ++j) {
523 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
524 }
525 ++compacted_clipplane_index;
526 ++this->uniforms;
527 }
528 } else {
529 /* In Gen6 and later, we don't compact clip planes, because this
530 * simplifies the implementation of gl_ClipDistance.
531 */
532 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
533 this->uniform_vector_size[this->uniforms] = 4;
534 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
535 this->userplane[i].type = BRW_REGISTER_TYPE_F;
536 for (int j = 0; j < 4; ++j) {
537 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
538 }
539 ++this->uniforms;
540 }
541 }
542 }
543
544 /* Our support for builtin uniforms is even scarier than non-builtin.
545 * It sits on top of the PROG_STATE_VAR parameters that are
546 * automatically updated from GL context state.
547 */
548 void
549 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
550 {
551 const ir_state_slot *const slots = ir->state_slots;
552 assert(ir->state_slots != NULL);
553
554 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
555 /* This state reference has already been setup by ir_to_mesa,
556 * but we'll get the same index back here. We can reference
557 * ParameterValues directly, since unlike brw_fs.cpp, we never
558 * add new state references during compile.
559 */
560 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
561 (gl_state_index *)slots[i].tokens);
562 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
563
564 this->uniform_vector_size[this->uniforms] = 0;
565 /* Add each of the unique swizzled channels of the element.
566 * This will end up matching the size of the glsl_type of this field.
567 */
568 int last_swiz = -1;
569 for (unsigned int j = 0; j < 4; j++) {
570 int swiz = GET_SWZ(slots[i].swizzle, j);
571 last_swiz = swiz;
572
573 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
574 if (swiz <= last_swiz)
575 this->uniform_vector_size[this->uniforms]++;
576 }
577 this->uniforms++;
578 }
579 }
580
581 dst_reg *
582 vec4_visitor::variable_storage(ir_variable *var)
583 {
584 return (dst_reg *)hash_table_find(this->variable_ht, var);
585 }
586
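/* Set the flag register from a boolean rvalue and report which predicate
 * a following IF or SEL should use.  The ALIGN16_ALL4H / ALIGN16_ANY4H
 * predicates test all four (or any) channels of a vec4 comparison, which
 * is how all_equal, any_nequal and any() collapse a vector condition to
 * a scalar one.
 */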
587 void
588 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
589 {
590 ir_expression *expr = ir->as_expression();
591
592 *predicate = BRW_PREDICATE_NORMAL;
593
594 if (expr) {
595 src_reg op[2];
596 vec4_instruction *inst;
597
598 assert(expr->get_num_operands() <= 2);
599 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
600 expr->operands[i]->accept(this);
601 op[i] = this->result;
602
603 resolve_ud_negate(&op[i]);
604 }
605
606 switch (expr->operation) {
607 case ir_unop_logic_not:
608 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
609 inst->conditional_mod = BRW_CONDITIONAL_Z;
610 break;
611
612 case ir_binop_logic_xor:
613 inst = emit(XOR(dst_null_d(), op[0], op[1]));
614 inst->conditional_mod = BRW_CONDITIONAL_NZ;
615 break;
616
617 case ir_binop_logic_or:
618 inst = emit(OR(dst_null_d(), op[0], op[1]));
619 inst->conditional_mod = BRW_CONDITIONAL_NZ;
620 break;
621
622 case ir_binop_logic_and:
623 inst = emit(AND(dst_null_d(), op[0], op[1]));
624 inst->conditional_mod = BRW_CONDITIONAL_NZ;
625 break;
626
627 case ir_unop_f2b:
628 if (intel->gen >= 6) {
629 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
630 } else {
631 inst = emit(MOV(dst_null_f(), op[0]));
632 inst->conditional_mod = BRW_CONDITIONAL_NZ;
633 }
634 break;
635
636 case ir_unop_i2b:
637 if (intel->gen >= 6) {
638 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
639 } else {
640 inst = emit(MOV(dst_null_d(), op[0]));
641 inst->conditional_mod = BRW_CONDITIONAL_NZ;
642 }
643 break;
644
645 case ir_binop_all_equal:
646 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
647 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
648 break;
649
650 case ir_binop_any_nequal:
651 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
652 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
653 break;
654
655 case ir_unop_any:
656 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
657 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
658 break;
659
660 case ir_binop_greater:
661 case ir_binop_gequal:
662 case ir_binop_less:
663 case ir_binop_lequal:
664 case ir_binop_equal:
665 case ir_binop_nequal:
666 emit(CMP(dst_null_d(), op[0], op[1],
667 brw_conditional_for_comparison(expr->operation)));
668 break;
669
670 default:
671 assert(!"not reached");
672 break;
673 }
674 return;
675 }
676
677 ir->accept(this);
678
679 resolve_ud_negate(&this->result);
680
681 if (intel->gen >= 6) {
682 vec4_instruction *inst = emit(AND(dst_null_d(),
683 this->result, src_reg(1)));
684 inst->conditional_mod = BRW_CONDITIONAL_NZ;
685 } else {
686 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
687 inst->conditional_mod = BRW_CONDITIONAL_NZ;
688 }
689 }
690
691 /**
692 * Emit a gen6 IF statement with the comparison folded into the IF
693 * instruction.
694 */
695 void
696 vec4_visitor::emit_if_gen6(ir_if *ir)
697 {
698 ir_expression *expr = ir->condition->as_expression();
699
700 if (expr) {
701 src_reg op[2];
702 dst_reg temp;
703
704 assert(expr->get_num_operands() <= 2);
705 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
706 expr->operands[i]->accept(this);
707 op[i] = this->result;
708 }
709
710 switch (expr->operation) {
711 case ir_unop_logic_not:
712 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
713 return;
714
715 case ir_binop_logic_xor:
716 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
717 return;
718
719 case ir_binop_logic_or:
720 temp = dst_reg(this, glsl_type::bool_type);
721 emit(OR(temp, op[0], op[1]));
722 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
723 return;
724
725 case ir_binop_logic_and:
726 temp = dst_reg(this, glsl_type::bool_type);
727 emit(AND(temp, op[0], op[1]));
728 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
729 return;
730
731 case ir_unop_f2b:
732 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
733 return;
734
735 case ir_unop_i2b:
736 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
737 return;
738
739 case ir_binop_greater:
740 case ir_binop_gequal:
741 case ir_binop_less:
742 case ir_binop_lequal:
743 case ir_binop_equal:
744 case ir_binop_nequal:
745 emit(IF(op[0], op[1],
746 brw_conditional_for_comparison(expr->operation)));
747 return;
748
749 case ir_binop_all_equal:
750 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
751 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
752 return;
753
754 case ir_binop_any_nequal:
755 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
756 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
757 return;
758
759 case ir_unop_any:
760 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
761 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
762 return;
763
764 default:
765 assert(!"not reached");
766 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
767 return;
768 }
769 return;
770 }
771
772 ir->condition->accept(this);
773
774 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
775 }
776
777 static dst_reg
778 with_writemask(dst_reg const & r, int mask)
779 {
780 dst_reg result = r;
781 result.writemask = mask;
782 return result;
783 }
784
785 void
786 vec4_visitor::emit_attribute_fixups()
787 {
788 dst_reg sign_recovery_shift;
789 dst_reg normalize_factor;
790 dst_reg es3_normalize_factor;
791
792 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
793 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
794 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
795 dst_reg reg(ATTR, i);
796 dst_reg reg_d = reg;
797 reg_d.type = BRW_REGISTER_TYPE_D;
798 dst_reg reg_ud = reg;
799 reg_ud.type = BRW_REGISTER_TYPE_UD;
800
801 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
802 * come in as floating point conversions of the integer values.
803 */
804 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
805 dst_reg dst = reg;
806 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
807 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
808 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
809 }
810
811 /* Do sign recovery for 2101010 formats if required. */
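            /* Each 10-bit component (2-bit for W) is shifted up so its
             * sign bit lands in bit 31 and then arithmetically shifted
             * back down, which sign-extends it: hence the <22,22,22,30>
             * shift counts (32-10 and 32-2).
             */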
812 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
813 if (sign_recovery_shift.file == BAD_FILE) {
814 /* shift constant: <22,22,22,30> */
815 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
816 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
817 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
818 }
819
820 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
821 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
822 }
823
824 /* Apply BGRA swizzle if required. */
825 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
826 src_reg temp = src_reg(reg);
827 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
828 emit(MOV(reg, temp));
829 }
830
831 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
832 /* ES 3.0 has different rules for converting signed normalized
833 * fixed-point numbers than desktop GL.
834 */
835 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
836 /* According to equation 2.2 of the ES 3.0 specification,
837 * signed normalization conversion is done by:
838 *
839 * f = c / (2^(b-1)-1)
840 */
841 if (es3_normalize_factor.file == BAD_FILE) {
842 /* mul constant: 1 / (2^(b-1) - 1) */
843 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
844 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
845 src_reg(1.0f / ((1<<9) - 1))));
846 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
847 src_reg(1.0f / ((1<<1) - 1))));
848 }
849
850 dst_reg dst = reg;
851 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
852 emit(MOV(dst, src_reg(reg_d)));
853 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
854 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
855 } else {
856 /* The following equations are from the OpenGL 3.2 specification:
857 *
858 * 2.1 unsigned normalization
859 * f = c/(2^n-1)
860 *
861 * 2.2 signed normalization
862 * f = (2c+1)/(2^n-1)
863 *
864 * Both of these share a common divisor, which is represented by
865 * "normalize_factor" in the code below.
866 */
867 if (normalize_factor.file == BAD_FILE) {
868 /* 1 / (2^b - 1) for b=<10,10,10,2> */
869 normalize_factor = dst_reg(this, glsl_type::vec4_type);
870 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
871 src_reg(1.0f / ((1<<10) - 1))));
872 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
873 src_reg(1.0f / ((1<<2) - 1))));
874 }
875
876 dst_reg dst = reg;
877 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
878 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
879
880 /* For signed normalization, we want the numerator to be 2c+1. */
881 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
882 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
883 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
884 }
885
886 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
887 }
888 }
889
890 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
891 dst_reg dst = reg;
892 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
893 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
894 }
895 }
896 }
897 }
898
899 void
900 vec4_visitor::visit(ir_variable *ir)
901 {
902 dst_reg *reg = NULL;
903
904 if (variable_storage(ir))
905 return;
906
907 switch (ir->mode) {
908 case ir_var_in:
909 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
910 break;
911
912 case ir_var_out:
913 reg = new(mem_ctx) dst_reg(this, ir->type);
914
915 for (int i = 0; i < type_size(ir->type); i++) {
916 output_reg[ir->location + i] = *reg;
917 output_reg[ir->location + i].reg_offset = i;
918 output_reg[ir->location + i].type =
919 brw_type_for_base_type(ir->type->get_scalar_type());
920 output_reg_annotation[ir->location + i] = ir->name;
921 }
922 break;
923
924 case ir_var_auto:
925 case ir_var_temporary:
926 reg = new(mem_ctx) dst_reg(this, ir->type);
927 break;
928
929 case ir_var_uniform:
930 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
931
932 /* Thanks to the lower_ubo_reference pass, we will see only
933 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
934 * variables, so no need for them to be in variable_ht.
935 */
936 if (ir->uniform_block != -1)
937 return;
938
939 /* Track how big the whole uniform variable is, in case we need to put a
940 * copy of its data into pull constants for array access.
941 */
942 this->uniform_size[this->uniforms] = type_size(ir->type);
943
944 if (!strncmp(ir->name, "gl_", 3)) {
945 setup_builtin_uniform_values(ir);
946 } else {
947 setup_uniform_values(ir);
948 }
949 break;
950
951 case ir_var_system_value:
952 /* VertexID is stored by the VF as the last vertex element, but
953 * we don't represent it with a flag in inputs_read, so we call
954 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
955 */
956 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
957 prog_data->uses_vertexid = true;
958
959 switch (ir->location) {
960 case SYSTEM_VALUE_VERTEX_ID:
961 reg->writemask = WRITEMASK_X;
962 break;
963 case SYSTEM_VALUE_INSTANCE_ID:
964 reg->writemask = WRITEMASK_Y;
965 break;
966 default:
967 assert(!"not reached");
968 break;
969 }
970 break;
971
972 default:
973 assert(!"not reached");
974 }
975
976 reg->type = brw_type_for_base_type(ir->type);
977 hash_table_insert(this->variable_ht, reg, ir);
978 }
979
980 void
981 vec4_visitor::visit(ir_loop *ir)
982 {
983 dst_reg counter;
984
985 /* We don't want debugging output to print the whole body of the
986 * loop as the annotation.
987 */
988 this->base_ir = NULL;
989
990 if (ir->counter != NULL) {
991 this->base_ir = ir->counter;
992 ir->counter->accept(this);
993 counter = *(variable_storage(ir->counter));
994
995 if (ir->from != NULL) {
996 this->base_ir = ir->from;
997 ir->from->accept(this);
998
999 emit(MOV(counter, this->result));
1000 }
1001 }
1002
1003 emit(BRW_OPCODE_DO);
1004
1005 if (ir->to) {
1006 this->base_ir = ir->to;
1007 ir->to->accept(this);
1008
1009 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1010 brw_conditional_for_comparison(ir->cmp)));
1011
1012 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1013 inst->predicate = BRW_PREDICATE_NORMAL;
1014 }
1015
1016 visit_instructions(&ir->body_instructions);
1017
1018
1019 if (ir->increment) {
1020 this->base_ir = ir->increment;
1021 ir->increment->accept(this);
1022 emit(ADD(counter, src_reg(counter), this->result));
1023 }
1024
1025 emit(BRW_OPCODE_WHILE);
1026 }
1027
1028 void
1029 vec4_visitor::visit(ir_loop_jump *ir)
1030 {
1031 switch (ir->mode) {
1032 case ir_loop_jump::jump_break:
1033 emit(BRW_OPCODE_BREAK);
1034 break;
1035 case ir_loop_jump::jump_continue:
1036 emit(BRW_OPCODE_CONTINUE);
1037 break;
1038 }
1039 }
1040
1041
1042 void
1043 vec4_visitor::visit(ir_function_signature *ir)
1044 {
1045 assert(0);
1046 (void)ir;
1047 }
1048
1049 void
1050 vec4_visitor::visit(ir_function *ir)
1051 {
1052 /* Ignore function bodies other than main() -- we shouldn't see calls to
1053 * them since they should all be inlined.
1054 */
1055 if (strcmp(ir->name, "main") == 0) {
1056 const ir_function_signature *sig;
1057 exec_list empty;
1058
1059 sig = ir->matching_signature(&empty);
1060
1061 assert(sig);
1062
1063 visit_instructions(&sig->body);
1064 }
1065 }
1066
1067 bool
1068 vec4_visitor::try_emit_sat(ir_expression *ir)
1069 {
1070 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1071 if (!sat_src)
1072 return false;
1073
1074 sat_src->accept(this);
1075 src_reg src = this->result;
1076
1077 this->result = src_reg(this, ir->type);
1078 vec4_instruction *inst;
1079 inst = emit(MOV(dst_reg(this->result), src));
1080 inst->saturate = true;
1081
1082 return true;
1083 }
1084
1085 void
1086 vec4_visitor::emit_bool_comparison(unsigned int op,
1087 dst_reg dst, src_reg src0, src_reg src1)
1088 {
1089 /* original gen4 does destination conversion before comparison. */
1090 if (intel->gen < 5)
1091 dst.type = src0.type;
1092
1093 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1094
1095 dst.type = BRW_REGISTER_TYPE_D;
1096 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1097 }
1098
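/* MIN/MAX: gen6+ can fold the comparison into SEL via a conditional
 * modifier; gen4/5 need an explicit CMP to set the flag register and a
 * predicated SEL afterwards.
 */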
1099 void
1100 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1101 src_reg src0, src_reg src1)
1102 {
1103 vec4_instruction *inst;
1104
1105 if (intel->gen >= 6) {
1106 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1107 inst->conditional_mod = conditionalmod;
1108 } else {
1109 emit(CMP(dst, src0, src1, conditionalmod));
1110
1111 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1112 inst->predicate = BRW_PREDICATE_NORMAL;
1113 }
1114 }
1115
1116 void
1117 vec4_visitor::visit(ir_expression *ir)
1118 {
1119 unsigned int operand;
1120 src_reg op[Elements(ir->operands)];
1121 src_reg result_src;
1122 dst_reg result_dst;
1123 vec4_instruction *inst;
1124
1125 if (try_emit_sat(ir))
1126 return;
1127
1128 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1129 this->result.file = BAD_FILE;
1130 ir->operands[operand]->accept(this);
1131 if (this->result.file == BAD_FILE) {
1132 printf("Failed to get tree for expression operand:\n");
1133 ir->operands[operand]->print();
1134 exit(1);
1135 }
1136 op[operand] = this->result;
1137
1138 /* Matrix expression operands should have been broken down to vector
1139 * operations already.
1140 */
1141 assert(!ir->operands[operand]->type->is_matrix());
1142 }
1143
1144 int vector_elements = ir->operands[0]->type->vector_elements;
1145 if (ir->operands[1]) {
1146 vector_elements = MAX2(vector_elements,
1147 ir->operands[1]->type->vector_elements);
1148 }
1149
1150 this->result.file = BAD_FILE;
1151
1152 /* Storage for our result. Ideally for an assignment we'd be using
1153 * the actual storage for the result here, instead.
1154 */
1155 result_src = src_reg(this, ir->type);
1156 /* convenience for the emit functions below. */
1157 result_dst = dst_reg(result_src);
1158 /* If nothing special happens, this is the result. */
1159 this->result = result_src;
1160 /* Limit writes to the channels that will be used by result_src later.
1161 * This does limit this temp's use as a temporary for multi-instruction
1162 * sequences.
1163 */
1164 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1165
1166 switch (ir->operation) {
1167 case ir_unop_logic_not:
1168 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1169     * the ones' complement of the whole register, not just bit 0.
1170 */
1171 emit(XOR(result_dst, op[0], src_reg(1)));
1172 break;
1173 case ir_unop_neg:
1174 op[0].negate = !op[0].negate;
1175 this->result = op[0];
1176 break;
1177 case ir_unop_abs:
1178 op[0].abs = true;
1179 op[0].negate = false;
1180 this->result = op[0];
1181 break;
1182
1183 case ir_unop_sign:
1184 emit(MOV(result_dst, src_reg(0.0f)));
1185
1186 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1187 inst = emit(MOV(result_dst, src_reg(1.0f)));
1188 inst->predicate = BRW_PREDICATE_NORMAL;
1189
1190 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1191 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1192 inst->predicate = BRW_PREDICATE_NORMAL;
1193
1194 break;
1195
1196 case ir_unop_rcp:
1197 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1198 break;
1199
1200 case ir_unop_exp2:
1201 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1202 break;
1203 case ir_unop_log2:
1204 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1205 break;
1206 case ir_unop_exp:
1207 case ir_unop_log:
1208 assert(!"not reached: should be handled by ir_explog_to_explog2");
1209 break;
1210 case ir_unop_sin:
1211 case ir_unop_sin_reduced:
1212 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1213 break;
1214 case ir_unop_cos:
1215 case ir_unop_cos_reduced:
1216 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1217 break;
1218
1219 case ir_unop_dFdx:
1220 case ir_unop_dFdy:
1221 assert(!"derivatives not valid in vertex shader");
1222 break;
1223
1224 case ir_unop_noise:
1225 assert(!"not reached: should be handled by lower_noise");
1226 break;
1227
1228 case ir_binop_add:
1229 emit(ADD(result_dst, op[0], op[1]));
1230 break;
1231 case ir_binop_sub:
1232 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1233 break;
1234
1235 case ir_binop_mul:
1236 if (ir->type->is_integer()) {
1237 /* For integer multiplication, the MUL uses the low 16 bits
1238 * of one of the operands (src0 on gen6, src1 on gen7). The
1239 * MACH accumulates in the contribution of the upper 16 bits
1240 * of that operand.
1241 *
1242 * FINISHME: Emit just the MUL if we know an operand is small
1243 * enough.
1244 */
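         /* After the MUL/MACH pair the accumulator holds the low 32 bits
          * of the full product (MACH's own high-half result is thrown
          * away via the null destination), so the final MOV copies the
          * product out of the accumulator.
          */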
1245 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1246
1247 emit(MUL(acc, op[0], op[1]));
1248 emit(MACH(dst_null_d(), op[0], op[1]));
1249 emit(MOV(result_dst, src_reg(acc)));
1250 } else {
1251 emit(MUL(result_dst, op[0], op[1]));
1252 }
1253 break;
1254 case ir_binop_div:
1255 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1256 assert(ir->type->is_integer());
1257 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1258 break;
1259 case ir_binop_mod:
1260 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1261 assert(ir->type->is_integer());
1262 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1263 break;
1264
1265 case ir_binop_less:
1266 case ir_binop_greater:
1267 case ir_binop_lequal:
1268 case ir_binop_gequal:
1269 case ir_binop_equal:
1270 case ir_binop_nequal: {
1271 emit(CMP(result_dst, op[0], op[1],
1272 brw_conditional_for_comparison(ir->operation)));
1273 emit(AND(result_dst, result_src, src_reg(0x1)));
1274 break;
1275 }
1276
1277 case ir_binop_all_equal:
1278 /* "==" operator producing a scalar boolean. */
1279 if (ir->operands[0]->type->is_vector() ||
1280 ir->operands[1]->type->is_vector()) {
1281 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1282 emit(MOV(result_dst, src_reg(0)));
1283 inst = emit(MOV(result_dst, src_reg(1)));
1284 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1285 } else {
1286 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1287 emit(AND(result_dst, result_src, src_reg(0x1)));
1288 }
1289 break;
1290 case ir_binop_any_nequal:
1291 /* "!=" operator producing a scalar boolean. */
1292 if (ir->operands[0]->type->is_vector() ||
1293 ir->operands[1]->type->is_vector()) {
1294 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1295
1296 emit(MOV(result_dst, src_reg(0)));
1297 inst = emit(MOV(result_dst, src_reg(1)));
1298 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1299 } else {
1300 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1301 emit(AND(result_dst, result_src, src_reg(0x1)));
1302 }
1303 break;
1304
1305 case ir_unop_any:
1306 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1307 emit(MOV(result_dst, src_reg(0)));
1308
1309 inst = emit(MOV(result_dst, src_reg(1)));
1310 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1311 break;
1312
1313 case ir_binop_logic_xor:
1314 emit(XOR(result_dst, op[0], op[1]));
1315 break;
1316
1317 case ir_binop_logic_or:
1318 emit(OR(result_dst, op[0], op[1]));
1319 break;
1320
1321 case ir_binop_logic_and:
1322 emit(AND(result_dst, op[0], op[1]));
1323 break;
1324
1325 case ir_binop_dot:
1326 assert(ir->operands[0]->type->is_vector());
1327 assert(ir->operands[0]->type == ir->operands[1]->type);
1328 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1329 break;
1330
1331 case ir_unop_sqrt:
1332 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1333 break;
1334 case ir_unop_rsq:
1335 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1336 break;
1337
1338 case ir_unop_bitcast_i2f:
1339 case ir_unop_bitcast_u2f:
1340 this->result = op[0];
1341 this->result.type = BRW_REGISTER_TYPE_F;
1342 break;
1343
1344 case ir_unop_bitcast_f2i:
1345 this->result = op[0];
1346 this->result.type = BRW_REGISTER_TYPE_D;
1347 break;
1348
1349 case ir_unop_bitcast_f2u:
1350 this->result = op[0];
1351 this->result.type = BRW_REGISTER_TYPE_UD;
1352 break;
1353
1354 case ir_unop_i2f:
1355 case ir_unop_i2u:
1356 case ir_unop_u2i:
1357 case ir_unop_u2f:
1358 case ir_unop_b2f:
1359 case ir_unop_b2i:
1360 case ir_unop_f2i:
1361 case ir_unop_f2u:
1362 emit(MOV(result_dst, op[0]));
1363 break;
1364 case ir_unop_f2b:
1365 case ir_unop_i2b: {
1366 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1367 emit(AND(result_dst, result_src, src_reg(1)));
1368 break;
1369 }
1370
1371 case ir_unop_trunc:
1372 emit(RNDZ(result_dst, op[0]));
1373 break;
1374 case ir_unop_ceil:
1375 op[0].negate = !op[0].negate;
1376 inst = emit(RNDD(result_dst, op[0]));
1377 this->result.negate = true;
1378 break;
1379 case ir_unop_floor:
1380 inst = emit(RNDD(result_dst, op[0]));
1381 break;
1382 case ir_unop_fract:
1383 inst = emit(FRC(result_dst, op[0]));
1384 break;
1385 case ir_unop_round_even:
1386 emit(RNDE(result_dst, op[0]));
1387 break;
1388
1389 case ir_binop_min:
1390 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1391 break;
1392 case ir_binop_max:
1393 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1394 break;
1395
1396 case ir_binop_pow:
1397 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1398 break;
1399
1400 case ir_unop_bit_not:
1401 inst = emit(NOT(result_dst, op[0]));
1402 break;
1403 case ir_binop_bit_and:
1404 inst = emit(AND(result_dst, op[0], op[1]));
1405 break;
1406 case ir_binop_bit_xor:
1407 inst = emit(XOR(result_dst, op[0], op[1]));
1408 break;
1409 case ir_binop_bit_or:
1410 inst = emit(OR(result_dst, op[0], op[1]));
1411 break;
1412
1413 case ir_binop_lshift:
1414 inst = emit(SHL(result_dst, op[0], op[1]));
1415 break;
1416
1417 case ir_binop_rshift:
1418 if (ir->type->base_type == GLSL_TYPE_INT)
1419 inst = emit(ASR(result_dst, op[0], op[1]));
1420 else
1421 inst = emit(SHR(result_dst, op[0], op[1]));
1422 break;
1423
1424 case ir_binop_ubo_load: {
1425 ir_constant *uniform_block = ir->operands[0]->as_constant();
1426 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1427 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1428 src_reg offset = op[1];
1429
1430 /* Now, load the vector from that offset. */
1431 assert(ir->type->is_vector() || ir->type->is_scalar());
1432
1433 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1434 packed_consts.type = result.type;
1435 src_reg surf_index =
1436 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1437 if (const_offset_ir) {
1438 offset = src_reg(const_offset / 16);
1439 } else {
1440 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1441 }
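      /* Each pull-constant load fetches an aligned 16-byte (vec4) slot,
       * so the byte offset is scaled down to a slot index above; the
       * swizzle fixup below then starts the read at channel
       * const_offset % 16 / 4 within that slot.
       */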
1442
1443 vec4_instruction *pull =
1444 emit(new(mem_ctx) vec4_instruction(this,
1445 VS_OPCODE_PULL_CONSTANT_LOAD,
1446 dst_reg(packed_consts),
1447 surf_index,
1448 offset));
1449 pull->base_mrf = 14;
1450 pull->mlen = 1;
1451
1452 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1453 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1454 const_offset % 16 / 4,
1455 const_offset % 16 / 4,
1456 const_offset % 16 / 4);
1457
1458 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1459 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1460 emit(CMP(result_dst, packed_consts, src_reg(0u),
1461 BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result, src_reg(0x1)));
1463 } else {
1464 emit(MOV(result_dst, packed_consts));
1465 }
1466 break;
1467 }
1468
1469 case ir_quadop_vector:
1470 assert(!"not reached: should be handled by lower_quadop_vector");
1471 break;
1472 }
1473 }
1474
1475
1476 void
1477 vec4_visitor::visit(ir_swizzle *ir)
1478 {
1479 src_reg src;
1480 int i = 0;
1481 int swizzle[4];
1482
1483 /* Note that this is only swizzles in expressions, not those on the left
1484 * hand side of an assignment, which do write masking. See ir_assignment
1485 * for that.
1486 */
1487
1488 ir->val->accept(this);
1489 src = this->result;
1490 assert(src.file != BAD_FILE);
1491
1492 for (i = 0; i < ir->type->vector_elements; i++) {
1493 switch (i) {
1494 case 0:
1495 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1496 break;
1497 case 1:
1498 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1499 break;
1500 case 2:
1501 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1502 break;
1503 case 3:
1504 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1505 break;
1506 }
1507 }
1508 for (; i < 4; i++) {
1509 /* Replicate the last channel out. */
1510 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1511 }
1512
1513 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1514
1515 this->result = src;
1516 }
1517
1518 void
1519 vec4_visitor::visit(ir_dereference_variable *ir)
1520 {
1521 const struct glsl_type *type = ir->type;
1522 dst_reg *reg = variable_storage(ir->var);
1523
1524 if (!reg) {
1525 fail("Failed to find variable storage for %s\n", ir->var->name);
1526 this->result = src_reg(brw_null_reg());
1527 return;
1528 }
1529
1530 this->result = src_reg(*reg);
1531
1532 /* System values get their swizzle from the dst_reg writemask */
1533 if (ir->var->mode == ir_var_system_value)
1534 return;
1535
1536 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1537 this->result.swizzle = swizzle_for_size(type->vector_elements);
1538 }
1539
1540 void
1541 vec4_visitor::visit(ir_dereference_array *ir)
1542 {
1543 ir_constant *constant_index;
1544 src_reg src;
1545 int element_size = type_size(ir->type);
1546
1547 constant_index = ir->array_index->constant_expression_value();
1548
1549 ir->array->accept(this);
1550 src = this->result;
1551
1552 if (constant_index) {
1553 src.reg_offset += constant_index->value.i[0] * element_size;
1554 } else {
1555 /* Variable index array dereference. It eats the "vec4" of the
1556 * base of the array and an index that offsets the Mesa register
1557 * index.
1558 */
1559 ir->array_index->accept(this);
1560
1561 src_reg index_reg;
1562
1563 if (element_size == 1) {
1564 index_reg = this->result;
1565 } else {
1566 index_reg = src_reg(this, glsl_type::int_type);
1567
1568 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1569 }
1570
1571 if (src.reladdr) {
1572 src_reg temp = src_reg(this, glsl_type::int_type);
1573
1574 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1575
1576 index_reg = temp;
1577 }
1578
1579 src.reladdr = ralloc(mem_ctx, src_reg);
1580 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1581 }
1582
1583 /* If the type is smaller than a vec4, replicate the last channel out. */
1584 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1585 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1586 else
1587 src.swizzle = BRW_SWIZZLE_NOOP;
1588 src.type = brw_type_for_base_type(ir->type);
1589
1590 this->result = src;
1591 }
1592
1593 void
1594 vec4_visitor::visit(ir_dereference_record *ir)
1595 {
1596 unsigned int i;
1597 const glsl_type *struct_type = ir->record->type;
1598 int offset = 0;
1599
1600 ir->record->accept(this);
1601
1602 for (i = 0; i < struct_type->length; i++) {
1603 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1604 break;
1605 offset += type_size(struct_type->fields.structure[i].type);
1606 }
1607
1608 /* If the type is smaller than a vec4, replicate the last channel out. */
1609 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1610 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1611 else
1612 this->result.swizzle = BRW_SWIZZLE_NOOP;
1613 this->result.type = brw_type_for_base_type(ir->type);
1614
1615 this->result.reg_offset += offset;
1616 }
1617
1618 /**
1619 * We want to be careful in assignment setup to hit the actual storage
1620 * instead of potentially using a temporary like we might with the
1621 * ir_dereference handler.
1622 */
1623 static dst_reg
1624 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1625 {
1626 /* The LHS must be a dereference. If the LHS is a variable indexed array
1627     * access of a vector, it must be separated into a series of conditional moves
1628 * before reaching this point (see ir_vec_index_to_cond_assign).
1629 */
1630 assert(ir->as_dereference());
1631 ir_dereference_array *deref_array = ir->as_dereference_array();
1632 if (deref_array) {
1633 assert(!deref_array->array->type->is_vector());
1634 }
1635
1636 /* Use the rvalue deref handler for the most part. We'll ignore
1637 * swizzles in it and write swizzles using writemask, though.
1638 */
1639 ir->accept(v);
1640 return dst_reg(v->result);
1641 }
1642
1643 void
1644 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1645 const struct glsl_type *type, uint32_t predicate)
1646 {
1647 if (type->base_type == GLSL_TYPE_STRUCT) {
1648 for (unsigned int i = 0; i < type->length; i++) {
1649 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1650 }
1651 return;
1652 }
1653
1654 if (type->is_array()) {
1655 for (unsigned int i = 0; i < type->length; i++) {
1656 emit_block_move(dst, src, type->fields.array, predicate);
1657 }
1658 return;
1659 }
1660
1661 if (type->is_matrix()) {
1662 const struct glsl_type *vec_type;
1663
1664 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1665 type->vector_elements, 1);
1666
1667 for (int i = 0; i < type->matrix_columns; i++) {
1668 emit_block_move(dst, src, vec_type, predicate);
1669 }
1670 return;
1671 }
1672
1673 assert(type->is_scalar() || type->is_vector());
1674
1675 dst->type = brw_type_for_base_type(type);
1676 src->type = dst->type;
1677
1678 dst->writemask = (1 << type->vector_elements) - 1;
1679
1680 src->swizzle = swizzle_for_size(type->vector_elements);
1681
1682 vec4_instruction *inst = emit(MOV(*dst, *src));
1683 inst->predicate = predicate;
1684
1685 dst->reg_offset++;
1686 src->reg_offset++;
1687 }
1688
1689
1690 /* If the RHS processing resulted in an instruction generating a
1691 * temporary value, and it would be easy to rewrite the instruction to
1692 * generate its result right into the LHS instead, do so. This ends
1693 * up reliably removing instructions where it can be tricky to do so
1694 * later without real UD chain information.
1695 */
1696 bool
1697 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1698 dst_reg dst,
1699 src_reg src,
1700 vec4_instruction *pre_rhs_inst,
1701 vec4_instruction *last_rhs_inst)
1702 {
1703 /* This could be supported, but it would take more smarts. */
1704 if (ir->condition)
1705 return false;
1706
1707 if (pre_rhs_inst == last_rhs_inst)
1708 return false; /* No instructions generated to work with. */
1709
1710 /* Make sure the last instruction generated our source reg. */
1711 if (src.file != GRF ||
1712 src.file != last_rhs_inst->dst.file ||
1713 src.reg != last_rhs_inst->dst.reg ||
1714 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1715 src.reladdr ||
1716 src.abs ||
1717 src.negate ||
1718 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1719 return false;
1720
1721    /* Check that the last instruction fully initialized the channels
1722 * we want to use, in the order we want to use them. We could
1723 * potentially reswizzle the operands of many instructions so that
1724 * we could handle out of order channels, but don't yet.
1725 */
1726
1727 for (unsigned i = 0; i < 4; i++) {
1728 if (dst.writemask & (1 << i)) {
1729 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1730 return false;
1731
1732 if (BRW_GET_SWZ(src.swizzle, i) != i)
1733 return false;
1734 }
1735 }
1736
1737 /* Success! Rewrite the instruction. */
1738 last_rhs_inst->dst.file = dst.file;
1739 last_rhs_inst->dst.reg = dst.reg;
1740 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1741 last_rhs_inst->dst.reladdr = dst.reladdr;
1742 last_rhs_inst->dst.writemask &= dst.writemask;
1743
1744 return true;
1745 }
1746
1747 void
1748 vec4_visitor::visit(ir_assignment *ir)
1749 {
1750 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1751 uint32_t predicate = BRW_PREDICATE_NONE;
1752
1753 if (!ir->lhs->type->is_scalar() &&
1754 !ir->lhs->type->is_vector()) {
1755 ir->rhs->accept(this);
1756 src_reg src = this->result;
1757
1758 if (ir->condition) {
1759 emit_bool_to_cond_code(ir->condition, &predicate);
1760 }
1761
1762 /* emit_block_move doesn't account for swizzles in the source register.
1763 * This should be ok, since the source register is a structure or an
1764 * array, and those can't be swizzled. But double-check to be sure.
1765 */
1766 assert(src.swizzle ==
1767 (ir->rhs->type->is_matrix()
1768 ? swizzle_for_size(ir->rhs->type->vector_elements)
1769 : BRW_SWIZZLE_NOOP));
1770
1771 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1772 return;
1773 }
1774
1775 /* Now we're down to just a scalar/vector with writemasks. */
1776 int i;
1777
1778 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1779 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1780
1781 ir->rhs->accept(this);
1782
1783 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1784
1785 src_reg src = this->result;
1786
1787 int swizzles[4];
1788 int first_enabled_chan = 0;
1789 int src_chan = 0;
1790
1791 assert(ir->lhs->type->is_vector() ||
1792 ir->lhs->type->is_scalar());
1793 dst.writemask = ir->write_mask;
1794
1795 for (int i = 0; i < 4; i++) {
1796 if (dst.writemask & (1 << i)) {
1797 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1798 break;
1799 }
1800 }
1801
1802 /* Swizzle a small RHS vector into the channels being written.
1803 *
1804 * glsl ir treats write_mask as dictating how many channels are
1805 * present on the RHS while in our instructions we need to make
1806 * those channels appear in the slots of the vec4 they're written to.
1807 */
1808 for (int i = 0; i < 4; i++) {
1809 if (dst.writemask & (1 << i))
1810 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1811 else
1812 swizzles[i] = first_enabled_chan;
1813 }
1814 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1815 swizzles[2], swizzles[3]);
1816
1817 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1818 return;
1819 }
1820
1821 if (ir->condition) {
1822 emit_bool_to_cond_code(ir->condition, &predicate);
1823 }
1824
1825 for (i = 0; i < type_size(ir->lhs->type); i++) {
1826 vec4_instruction *inst = emit(MOV(dst, src));
1827 inst->predicate = predicate;
1828
1829 dst.reg_offset++;
1830 src.reg_offset++;
1831 }
1832 }
1833
1834 void
1835 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1836 {
1837 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1838 foreach_list(node, &ir->components) {
1839 ir_constant *field_value = (ir_constant *)node;
1840
1841 emit_constant_values(dst, field_value);
1842 }
1843 return;
1844 }
1845
1846 if (ir->type->is_array()) {
1847 for (unsigned int i = 0; i < ir->type->length; i++) {
1848 emit_constant_values(dst, ir->array_elements[i]);
1849 }
1850 return;
1851 }
1852
1853 if (ir->type->is_matrix()) {
1854 for (int i = 0; i < ir->type->matrix_columns; i++) {
1855 float *vec = &ir->value.f[i * ir->type->vector_elements];
1856
1857 for (int j = 0; j < ir->type->vector_elements; j++) {
1858 dst->writemask = 1 << j;
1859 dst->type = BRW_REGISTER_TYPE_F;
1860
1861 emit(MOV(*dst, src_reg(vec[j])));
1862 }
1863 dst->reg_offset++;
1864 }
1865 return;
1866 }
1867
1868 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
1869
1870 for (int i = 0; i < ir->type->vector_elements; i++) {
1871 if (!(remaining_writemask & (1 << i)))
1872 continue;
1873
1874 dst->writemask = 1 << i;
1875 dst->type = brw_type_for_base_type(ir->type);
1876
1877 /* Find other components that match the one we're about to
1878 * write. Emits fewer instructions for things like vec4(0.5,
1879 * 1.5, 1.5, 1.5).
1880 */
1881 for (int j = i + 1; j < ir->type->vector_elements; j++) {
1882 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1883 if (ir->value.b[i] == ir->value.b[j])
1884 dst->writemask |= (1 << j);
1885 } else {
1886 /* u, i, and f storage all line up, so no need for a
1887 * switch case for comparing each type.
1888 */
1889 if (ir->value.u[i] == ir->value.u[j])
1890 dst->writemask |= (1 << j);
1891 }
1892 }
1893
1894 switch (ir->type->base_type) {
1895 case GLSL_TYPE_FLOAT:
1896 emit(MOV(*dst, src_reg(ir->value.f[i])));
1897 break;
1898 case GLSL_TYPE_INT:
1899 emit(MOV(*dst, src_reg(ir->value.i[i])));
1900 break;
1901 case GLSL_TYPE_UINT:
1902 emit(MOV(*dst, src_reg(ir->value.u[i])));
1903 break;
1904 case GLSL_TYPE_BOOL:
1905 emit(MOV(*dst, src_reg(ir->value.b[i])));
1906 break;
1907 default:
1908 assert(!"Non-float/uint/int/bool constant");
1909 break;
1910 }
1911
1912 remaining_writemask &= ~dst->writemask;
1913 }
1914 dst->reg_offset++;
1915 }
1916
1917 void
1918 vec4_visitor::visit(ir_constant *ir)
1919 {
1920 dst_reg dst = dst_reg(this, ir->type);
1921 this->result = src_reg(dst);
1922
1923 emit_constant_values(&dst, ir);
1924 }
1925
1926 void
1927 vec4_visitor::visit(ir_call *ir)
1928 {
1929 assert(!"not reached");
1930 }
1931
1932 void
1933 vec4_visitor::visit(ir_texture *ir)
1934 {
1935 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
1936
1937 /* Should be lowered by do_lower_texture_projection */
1938 assert(!ir->projector);
1939
1940 /* Generate code to compute all the subexpression trees. This has to be
1941 * done before loading any values into MRFs for the sampler message since
1942 * generating these values may involve SEND messages that need the MRFs.
1943 */
1944 src_reg coordinate;
1945 if (ir->coordinate) {
1946 ir->coordinate->accept(this);
1947 coordinate = this->result;
1948 }
1949
1950 src_reg shadow_comparitor;
1951 if (ir->shadow_comparitor) {
1952 ir->shadow_comparitor->accept(this);
1953 shadow_comparitor = this->result;
1954 }
1955
1956 const glsl_type *lod_type;
1957 src_reg lod, dPdx, dPdy;
1958 switch (ir->op) {
1959 case ir_tex:
1960 lod = src_reg(0.0f);
1961 lod_type = glsl_type::float_type;
1962 break;
1963 case ir_txf:
1964 case ir_txl:
1965 case ir_txs:
1966 ir->lod_info.lod->accept(this);
1967 lod = this->result;
1968 lod_type = ir->lod_info.lod->type;
1969 break;
1970 case ir_txd:
1971 ir->lod_info.grad.dPdx->accept(this);
1972 dPdx = this->result;
1973
1974 ir->lod_info.grad.dPdy->accept(this);
1975 dPdy = this->result;
1976
1977 lod_type = ir->lod_info.grad.dPdx->type;
1978 break;
1979 case ir_txb:
1980 break;
1981 }
1982
1983 vec4_instruction *inst = NULL;
1984 switch (ir->op) {
1985 case ir_tex:
1986 case ir_txl:
1987 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
1988 break;
1989 case ir_txd:
1990 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
1991 break;
1992 case ir_txf:
1993 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
1994 break;
1995 case ir_txs:
1996 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
1997 break;
1998 case ir_txb:
1999 assert(!"TXB is not valid for vertex shaders.");
2000 }
2001
2002 /* Texel offsets go in the message header; Gen4 also requires headers. */
2003 inst->header_present = ir->offset || intel->gen < 5;
2004 inst->base_mrf = 2;
2005 inst->mlen = inst->header_present + 1; /* always at least one */
2006 inst->sampler = sampler;
2007 inst->dst = dst_reg(this, ir->type);
2008 inst->dst.writemask = WRITEMASK_XYZW;
2009 inst->shadow_compare = ir->shadow_comparitor != NULL;
2010
2011 if (ir->offset != NULL && ir->op != ir_txf)
2012 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2013
2014 /* MRF for the first parameter */
2015 int param_base = inst->base_mrf + inst->header_present;
2016
2017 if (ir->op == ir_txs) {
2018 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2019 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2020 } else {
2021 int i, coord_mask = 0, zero_mask = 0;
2022 /* Load the coordinate */
2023 /* FINISHME: gl_clamp_mask and saturate */
2024 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2025 coord_mask |= (1 << i);
2026 for (; i < 4; i++)
2027 zero_mask |= (1 << i);
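      /* For a two-component coordinate, for instance, coord_mask ends up as
       * WRITEMASK_XY and zero_mask as WRITEMASK_ZW, so the unused MRF
       * channels are zeroed below.
       */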
2028
2029 if (ir->offset && ir->op == ir_txf) {
2030 /* It appears that the ld instruction used for txf does its
2031 * address bounds check before adding in the offset. To work
2032 * around this, just add the integer offset to the integer
2033 * texel coordinate, and don't put the offset in the header.
2034 */
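         /* For example, a texelFetchOffset() call with a constant offset of
          * (1, 2) has 1 and 2 folded into the x and y texel coordinates by
          * the ADDs below, rather than being sent in the message header.
          */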
2035 ir_constant *offset = ir->offset->as_constant();
2036 assert(offset);
2037
2038 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2039 src_reg src = coordinate;
2040 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2041 BRW_GET_SWZ(src.swizzle, j),
2042 BRW_GET_SWZ(src.swizzle, j),
2043 BRW_GET_SWZ(src.swizzle, j));
2044 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2045 src, offset->value.i[j]));
2046 }
2047 } else {
2048 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2049 coordinate));
2050 }
2051 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2052 src_reg(0)));
2053      /* Load the shadow comparator */
2054 if (ir->shadow_comparitor) {
2055 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2056 WRITEMASK_X),
2057 shadow_comparitor));
2058 inst->mlen++;
2059 }
2060
2061 /* Load the LOD info */
2062 if (ir->op == ir_tex || ir->op == ir_txl) {
2063 int mrf, writemask;
2064 if (intel->gen >= 5) {
2065 mrf = param_base + 1;
2066 if (ir->shadow_comparitor) {
2067 writemask = WRITEMASK_Y;
2068 /* mlen already incremented */
2069 } else {
2070 writemask = WRITEMASK_X;
2071 inst->mlen++;
2072 }
2073 } else /* intel->gen == 4 */ {
2074 mrf = param_base;
2075 writemask = WRITEMASK_Z;
2076 }
2077 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2078 } else if (ir->op == ir_txf) {
2079 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2080 lod));
2081 } else if (ir->op == ir_txd) {
2082 const glsl_type *type = lod_type;
2083
2084 if (intel->gen >= 5) {
2085 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2086 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2087 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2088 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2089 inst->mlen++;
2090
2091 if (ir->type->vector_elements == 3) {
2092 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2093 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2094 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2095 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2096 inst->mlen++;
2097 }
2098 } else /* intel->gen == 4 */ {
2099 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2100 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2101 inst->mlen += 2;
2102 }
2103 }
2104 }
2105
2106 emit(inst);
2107
2108    /* Fix up the number of layers (the Z component) for cube map arrays:
2109     * the hardware returns faces * layers, but the spec requires just layers.
2110     */
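   /* For example, a cube map array with 4 layers reports 24 in Z; dividing
    * by 6 gives back the 4 layers the spec expects.
    */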
2111 if (ir->op == ir_txs) {
2112 glsl_type const *type = ir->sampler->type;
2113 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2114 type->sampler_array) {
2115 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2116 with_writemask(inst->dst, WRITEMASK_Z),
2117 src_reg(inst->dst), src_reg(6));
2118 }
2119 }
2120
2121 swizzle_result(ir, src_reg(inst->dst), sampler);
2122 }
2123
2124 void
2125 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2126 {
2127 int s = c->key.tex.swizzles[sampler];
2128
2129 this->result = src_reg(this, ir->type);
2130 dst_reg swizzled_result(this->result);
2131
2132 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2133 || s == SWIZZLE_NOOP) {
2134 emit(MOV(swizzled_result, orig_val));
2135 return;
2136 }
2137
2138 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2139 int swizzle[4];
2140
2141 for (int i = 0; i < 4; i++) {
2142 switch (GET_SWZ(s, i)) {
2143 case SWIZZLE_ZERO:
2144 zero_mask |= (1 << i);
2145 break;
2146 case SWIZZLE_ONE:
2147 one_mask |= (1 << i);
2148 break;
2149 default:
2150 copy_mask |= (1 << i);
2151 swizzle[i] = GET_SWZ(s, i);
2152 break;
2153 }
2154 }
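   /* As an illustration, a swizzle of (ZERO, ZERO, ZERO, W) yields
    * zero_mask == WRITEMASK_XYZ and copy_mask == WRITEMASK_W, so .xyz are
    * written with 0.0f and .w is copied from the sampler result.
    */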
2155
2156 if (copy_mask) {
2157 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2158 swizzled_result.writemask = copy_mask;
2159 emit(MOV(swizzled_result, orig_val));
2160 }
2161
2162 if (zero_mask) {
2163 swizzled_result.writemask = zero_mask;
2164 emit(MOV(swizzled_result, src_reg(0.0f)));
2165 }
2166
2167 if (one_mask) {
2168 swizzled_result.writemask = one_mask;
2169 emit(MOV(swizzled_result, src_reg(1.0f)));
2170 }
2171 }
2172
2173 void
2174 vec4_visitor::visit(ir_return *ir)
2175 {
2176 assert(!"not reached");
2177 }
2178
2179 void
2180 vec4_visitor::visit(ir_discard *ir)
2181 {
2182 assert(!"not reached");
2183 }
2184
2185 void
2186 vec4_visitor::visit(ir_if *ir)
2187 {
2188    /* Don't point the annotation at the if statement, because then the
2189     * condition plus the then and else blocks would all get printed.
2190     */
2191 this->base_ir = ir->condition;
2192
2193 if (intel->gen == 6) {
2194 emit_if_gen6(ir);
2195 } else {
2196 uint32_t predicate;
2197 emit_bool_to_cond_code(ir->condition, &predicate);
2198 emit(IF(predicate));
2199 }
2200
2201 visit_instructions(&ir->then_instructions);
2202
2203 if (!ir->else_instructions.is_empty()) {
2204 this->base_ir = ir->condition;
2205 emit(BRW_OPCODE_ELSE);
2206
2207 visit_instructions(&ir->else_instructions);
2208 }
2209
2210 this->base_ir = ir->condition;
2211 emit(BRW_OPCODE_ENDIF);
2212 }
2213
2214 void
2215 vec4_visitor::emit_ndc_computation()
2216 {
2217 /* Get the position */
2218 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2219
2220 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2221 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2222 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2223
2224 current_annotation = "NDC";
2225 dst_reg ndc_w = ndc;
2226 ndc_w.writemask = WRITEMASK_W;
2227 src_reg pos_w = pos;
2228 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2229 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2230
2231 dst_reg ndc_xyz = ndc;
2232 ndc_xyz.writemask = WRITEMASK_XYZ;
2233
2234 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2235 }
2236
2237 void
2238 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2239 {
2240 if (intel->gen < 6 &&
2241 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2242 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2243 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2244 dst_reg header1_w = header1;
2245 header1_w.writemask = WRITEMASK_W;
2246 GLuint i;
2247
2248 emit(MOV(header1, 0u));
2249
2250 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2251 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2252
2253 current_annotation = "Point size";
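         /* The point width appears to live in bits 8..18 of the first
          * header dword as a fixed-point value, hence the 1 << 11 scale and
          * the 0x7ff << 8 mask below.
          */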
2254 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2255 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2256 }
2257
2258 current_annotation = "Clipping flags";
2259 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2260 vec4_instruction *inst;
2261
2262 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2263 src_reg(this->userplane[i])));
2264 inst->conditional_mod = BRW_CONDITIONAL_L;
2265
2266 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2267 inst->predicate = BRW_PREDICATE_NORMAL;
2268 }
2269
2270 /* i965 clipping workaround:
2271       * 1) Test for a negative rhw (1/w).
2272       * 2) If it is negative,
2273 * set ndc = (0,0,0,0)
2274 * set ucp[6] = 1
2275 *
2276 * Later, clipping will detect ucp[6] and ensure the primitive is
2277 * clipped against all fixed planes.
2278 */
2279 if (brw->has_negative_rhw_bug) {
2280 #if 0
2281 /* FINISHME */
2282 brw_CMP(p,
2283 vec8(brw_null_reg()),
2284 BRW_CONDITIONAL_L,
2285 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2286 brw_imm_f(0));
2287
2288 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2289 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2290 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2291 #endif
2292 }
2293
2294 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2295 } else if (intel->gen < 6) {
2296 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2297 } else {
2298 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2299 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2300 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2301 src_reg(output_reg[VERT_RESULT_PSIZ])));
2302 }
2303 }
2304 }
2305
2306 void
2307 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2308 {
2309 if (intel->gen < 6) {
2310 /* Clip distance slots are set aside in gen5, but they are not used. It
2311 * is not clear whether we actually need to set aside space for them,
2312 * but the performance cost is negligible.
2313 */
2314 return;
2315 }
2316
2317 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2318 *
2319 * "If a linked set of shaders forming the vertex stage contains no
2320 * static write to gl_ClipVertex or gl_ClipDistance, but the
2321 * application has requested clipping against user clip planes through
2322 * the API, then the coordinate written to gl_Position is used for
2323 * comparison against the user clip planes."
2324 *
2325 * This function is only called if the shader didn't write to
2326 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2327 * if the user wrote to it; otherwise we use gl_Position.
2328 */
2329 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2330 if (!(c->prog_data.outputs_written
2331 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2332 clip_vertex = VERT_RESULT_HPOS;
2333 }
2334
2335 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2336 ++i) {
2337 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2338 src_reg(output_reg[clip_vertex]),
2339 src_reg(this->userplane[i + offset])));
2340 }
2341 }
2342
2343 void
2344 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2345 {
2346 assert (vert_result < VERT_RESULT_MAX);
2347 reg.type = output_reg[vert_result].type;
2348 current_annotation = output_reg_annotation[vert_result];
2349 /* Copy the register, saturating if necessary */
2350 vec4_instruction *inst = emit(MOV(reg,
2351 src_reg(output_reg[vert_result])));
2352 if ((vert_result == VERT_RESULT_COL0 ||
2353 vert_result == VERT_RESULT_COL1 ||
2354 vert_result == VERT_RESULT_BFC0 ||
2355 vert_result == VERT_RESULT_BFC1) &&
2356 c->key.clamp_vertex_color) {
2357 inst->saturate = true;
2358 }
2359 }
2360
2361 void
2362 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2363 {
2364 struct brw_reg hw_reg = brw_message_reg(mrf);
2365 dst_reg reg = dst_reg(MRF, mrf);
2366 reg.type = BRW_REGISTER_TYPE_F;
2367
2368 switch (vert_result) {
2369 case VERT_RESULT_PSIZ:
2370 /* PSIZ is always in slot 0, and is coupled with other flags. */
2371 current_annotation = "indices, point width, clip flags";
2372 emit_psiz_and_flags(hw_reg);
2373 break;
2374 case BRW_VERT_RESULT_NDC:
2375 current_annotation = "NDC";
2376 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2377 break;
2378 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2379 case VERT_RESULT_HPOS:
2380 current_annotation = "gl_Position";
2381 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2382 break;
2383 case VERT_RESULT_CLIP_DIST0:
2384 case VERT_RESULT_CLIP_DIST1:
2385 if (this->c->key.uses_clip_distance) {
2386 emit_generic_urb_slot(reg, vert_result);
2387 } else {
2388 current_annotation = "user clip distances";
2389 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2390 }
2391 break;
2392 case VERT_RESULT_EDGE:
2393      /* This is present when doing unfilled polygons.  We're supposed to copy
2394       * the edge flag from the user-provided vertex array
2395       * (glEdgeFlagPointer); otherwise we copy the current value of that
2396       * attribute (which starts as 1.0f).  The clipper then uses this to
2397       * determine which edges should be drawn as wireframe.
2398       */
2399 current_annotation = "edge flag";
2400 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2401 glsl_type::float_type, WRITEMASK_XYZW))));
2402 break;
2403 case BRW_VERT_RESULT_PAD:
2404 /* No need to write to this slot */
2405 break;
2406 default:
2407 emit_generic_urb_slot(reg, vert_result);
2408 break;
2409 }
2410 }
2411
2412 static int
2413 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2414 {
2415 struct intel_context *intel = &brw->intel;
2416
2417 if (intel->gen >= 6) {
2418 /* URB data written (does not include the message header reg) must
2419 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2420 * section 5.4.3.2.2: URB_INTERLEAVED.
2421 *
2422 * URB entries are allocated on a multiple of 1024 bits, so an
2423 * extra 128 bits written here to make the end align to 256 is
2424 * no problem.
2425 */
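      /* For example, an mlen of 4 (one header register plus three data
       * registers) becomes 5, so that four data registers, i.e. two
       * 256-bit units, are written.
       */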
2426 if ((mlen % 2) != 1)
2427 mlen++;
2428 }
2429
2430 return mlen;
2431 }
2432
2433 /**
2434 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2435 * complete the VS thread.
2436 *
2437 * The VUE layout is documented in Volume 2a.
2438 */
2439 void
2440 vec4_visitor::emit_urb_writes()
2441 {
2442 /* MRF 0 is reserved for the debugger, so start with message header
2443 * in MRF 1.
2444 */
2445 int base_mrf = 1;
2446 int mrf = base_mrf;
2447 /* In the process of generating our URB write message contents, we
2448 * may need to unspill a register or load from an array. Those
2449 * reads would use MRFs 14-15.
2450 */
2451 int max_usable_mrf = 13;
2452
2453    /* The following assertion verifies that max_usable_mrf leaves an even
2454     * number of URB write data registers, which meets gen6's requirement
2455     * on write length alignment.
2456     */
2457 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2458
2459 /* First mrf is the g0-based message header containing URB handles and such,
2460 * which is implied in VS_OPCODE_URB_WRITE.
2461 */
2462 mrf++;
2463
2464 if (intel->gen < 6) {
2465 emit_ndc_computation();
2466 }
2467
2468 /* Set up the VUE data for the first URB write */
2469 int slot;
2470 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2471 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2472
2473 /* If this was max_usable_mrf, we can't fit anything more into this URB
2474 * WRITE.
2475 */
2476 if (mrf > max_usable_mrf) {
2477 slot++;
2478 break;
2479 }
2480 }
2481
2482 current_annotation = "URB write";
2483 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2484 inst->base_mrf = base_mrf;
2485 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2486 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2487
2488 /* Optional second URB write */
2489 if (!inst->eot) {
2490 mrf = base_mrf + 1;
2491
2492 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2493 assert(mrf < max_usable_mrf);
2494
2495 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2496 }
2497
2498 current_annotation = "URB write";
2499 inst = emit(VS_OPCODE_URB_WRITE);
2500 inst->base_mrf = base_mrf;
2501 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2502 inst->eot = true;
2503      /* URB destination offset.  In the previous write we used MRFs 1-13
2504       * minus the one header MRF, i.e. 12 payload registers.  The URB
2505       * offset is in URB row increments, and each of our MRFs is half of
2506       * one of those, since we're doing interleaved writes.
2507       */
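      /* With base_mrf == 1 and max_usable_mrf == 13 this works out to an
       * offset of 6 URB rows for the second write.
       */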
2508 inst->offset = (max_usable_mrf - base_mrf) / 2;
2509 }
2510 }
2511
2512 src_reg
2513 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2514 src_reg *reladdr, int reg_offset)
2515 {
2516 /* Because we store the values to scratch interleaved like our
2517 * vertex data, we need to scale the vec4 index by 2.
2518 */
2519 int message_header_scale = 2;
2520
2521 /* Pre-gen6, the message header uses byte offsets instead of vec4
2522 * (16-byte) offset units.
2523 */
2524 if (intel->gen < 6)
2525 message_header_scale *= 16;
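   /* For example, reg_offset == 3 becomes a constant offset of 6 on gen6+,
    * or a byte offset of 96 on earlier generations.
    */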
2526
2527 if (reladdr) {
2528 src_reg index = src_reg(this, glsl_type::int_type);
2529
2530 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2531 emit_before(inst, MUL(dst_reg(index),
2532 index, src_reg(message_header_scale)));
2533
2534 return index;
2535 } else {
2536 return src_reg(reg_offset * message_header_scale);
2537 }
2538 }
2539
2540 src_reg
2541 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2542 src_reg *reladdr, int reg_offset)
2543 {
2544 if (reladdr) {
2545 src_reg index = src_reg(this, glsl_type::int_type);
2546
2547 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2548
2549 /* Pre-gen6, the message header uses byte offsets instead of vec4
2550 * (16-byte) offset units.
2551 */
2552 if (intel->gen < 6) {
2553 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2554 }
2555
2556 return index;
2557 } else {
2558 int message_header_scale = intel->gen < 6 ? 16 : 1;
2559 return src_reg(reg_offset * message_header_scale);
2560 }
2561 }
2562
2563 /**
2564 * Emits an instruction before @inst to load the value named by @orig_src
2565 * from scratch space at @base_offset to @temp.
2566 *
2567 * @base_offset is measured in 32-byte units (the size of a register).
2568 */
2569 void
2570 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2571 dst_reg temp, src_reg orig_src,
2572 int base_offset)
2573 {
2574 int reg_offset = base_offset + orig_src.reg_offset;
2575 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2576
2577 emit_before(inst, SCRATCH_READ(temp, index));
2578 }
2579
2580 /**
2581 * Emits an instruction after @inst to store the value to be written
2582 * to @orig_dst to scratch space at @base_offset, from @temp.
2583 *
2584 * @base_offset is measured in 32-byte units (the size of a register).
2585 */
2586 void
2587 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2588 {
2589 int reg_offset = base_offset + inst->dst.reg_offset;
2590 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2591
2592 /* Create a temporary register to store *inst's result in.
2593 *
2594 * We have to be careful in MOVing from our temporary result register in
2595 * the scratch write. If we swizzle from channels of the temporary that
2596 * weren't initialized, it will confuse live interval analysis, which will
2597 * make spilling fail to make progress.
2598 */
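   /* For a destination writemask of .xz, for instance, the code below
    * builds a source swizzle of .xxzx, so only channels that were actually
    * written are ever read back.
    */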
2599 src_reg temp = src_reg(this, glsl_type::vec4_type);
2600 temp.type = inst->dst.type;
2601 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2602 int swizzles[4];
2603 for (int i = 0; i < 4; i++)
2604 if (inst->dst.writemask & (1 << i))
2605 swizzles[i] = i;
2606 else
2607 swizzles[i] = first_writemask_chan;
2608 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2609 swizzles[2], swizzles[3]);
2610
2611 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2612 inst->dst.writemask));
2613 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2614 write->predicate = inst->predicate;
2615 write->ir = inst->ir;
2616 write->annotation = inst->annotation;
2617 inst->insert_after(write);
2618
2619 inst->dst.file = temp.file;
2620 inst->dst.reg = temp.reg;
2621 inst->dst.reg_offset = temp.reg_offset;
2622 inst->dst.reladdr = NULL;
2623 }
2624
2625 /**
2626 * We can't generally support array access in GRF space, because a
2627 * single instruction's destination can only span 2 contiguous
2628 * registers. So, we send all GRF arrays that get variable index
2629 * access to scratch space.
2630 */
2631 void
2632 vec4_visitor::move_grf_array_access_to_scratch()
2633 {
2634 int scratch_loc[this->virtual_grf_count];
2635
2636 for (int i = 0; i < this->virtual_grf_count; i++) {
2637 scratch_loc[i] = -1;
2638 }
2639
2640 /* First, calculate the set of virtual GRFs that need to be punted
2641 * to scratch due to having any array access on them, and where in
2642 * scratch.
2643 */
2644 foreach_list(node, &this->instructions) {
2645 vec4_instruction *inst = (vec4_instruction *)node;
2646
2647 if (inst->dst.file == GRF && inst->dst.reladdr &&
2648 scratch_loc[inst->dst.reg] == -1) {
2649 scratch_loc[inst->dst.reg] = c->last_scratch;
2650 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2651 }
2652
2653 for (int i = 0 ; i < 3; i++) {
2654 src_reg *src = &inst->src[i];
2655
2656 if (src->file == GRF && src->reladdr &&
2657 scratch_loc[src->reg] == -1) {
2658 scratch_loc[src->reg] = c->last_scratch;
2659 c->last_scratch += this->virtual_grf_sizes[src->reg];
2660 }
2661 }
2662 }
2663
2664 /* Now, for anything that will be accessed through scratch, rewrite
2665 * it to load/store. Note that this is a _safe list walk, because
2666 * we may generate a new scratch_write instruction after the one
2667 * we're processing.
2668 */
2669 foreach_list_safe(node, &this->instructions) {
2670 vec4_instruction *inst = (vec4_instruction *)node;
2671
2672 /* Set up the annotation tracking for new generated instructions. */
2673 base_ir = inst->ir;
2674 current_annotation = inst->annotation;
2675
2676 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2677 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2678 }
2679
2680 for (int i = 0 ; i < 3; i++) {
2681 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2682 continue;
2683
2684 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2685
2686 emit_scratch_read(inst, temp, inst->src[i],
2687 scratch_loc[inst->src[i].reg]);
2688
2689 inst->src[i].file = temp.file;
2690 inst->src[i].reg = temp.reg;
2691 inst->src[i].reg_offset = temp.reg_offset;
2692 inst->src[i].reladdr = NULL;
2693 }
2694 }
2695 }
2696
2697 /**
2698 * Emits an instruction before @inst to load the value named by @orig_src
2699 * from the pull constant buffer (surface) at @base_offset to @temp.
2700 */
2701 void
2702 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2703 dst_reg temp, src_reg orig_src,
2704 int base_offset)
2705 {
2706 int reg_offset = base_offset + orig_src.reg_offset;
2707 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2708 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2709 vec4_instruction *load;
2710
2711 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2712 temp, index, offset);
2713 load->base_mrf = 14;
2714 load->mlen = 1;
2715 emit_before(inst, load);
2716 }
2717
2718 /**
2719 * Implements array access of uniforms by inserting a
2720 * PULL_CONSTANT_LOAD instruction.
2721 *
2722 * Unlike temporary GRF array access (where we don't support it due to
2723 * the difficulty of doing relative addressing on instruction
2724 * destinations), we could potentially do array access of uniforms
2725 * that were loaded in GRF space as push constants. In real-world
2726 * usage we've seen, though, the arrays being used are always larger
2727 * than we could load as push constants, so just always move all
2728 * uniform array access out to a pull constant buffer.
2729 */
2730 void
2731 vec4_visitor::move_uniform_array_access_to_pull_constants()
2732 {
2733 int pull_constant_loc[this->uniforms];
2734
2735 for (int i = 0; i < this->uniforms; i++) {
2736 pull_constant_loc[i] = -1;
2737 }
2738
2739 /* Walk through and find array access of uniforms. Put a copy of that
2740 * uniform in the pull constant buffer.
2741 *
2742 * Note that we don't move constant-indexed accesses to arrays. No
2743 * testing has been done of the performance impact of this choice.
2744 */
2745 foreach_list_safe(node, &this->instructions) {
2746 vec4_instruction *inst = (vec4_instruction *)node;
2747
2748 for (int i = 0 ; i < 3; i++) {
2749 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2750 continue;
2751
2752 int uniform = inst->src[i].reg;
2753
2754 /* If this array isn't already present in the pull constant buffer,
2755 * add it.
2756 */
2757 if (pull_constant_loc[uniform] == -1) {
2758 const float **values = &prog_data->param[uniform * 4];
2759
2760 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
2761
2762 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2763 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2764 }
2765 }
2766
2767 /* Set up the annotation tracking for new generated instructions. */
2768 base_ir = inst->ir;
2769 current_annotation = inst->annotation;
2770
2771 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2772
2773 emit_pull_constant_load(inst, temp, inst->src[i],
2774 pull_constant_loc[uniform]);
2775
2776 inst->src[i].file = temp.file;
2777 inst->src[i].reg = temp.reg;
2778 inst->src[i].reg_offset = temp.reg_offset;
2779 inst->src[i].reladdr = NULL;
2780 }
2781 }
2782
2783 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2784 * no need to track them as larger-than-vec4 objects. This will be
2785 * relied on in cutting out unused uniform vectors from push
2786 * constants.
2787 */
2788 split_uniform_registers();
2789 }
2790
2791 void
2792 vec4_visitor::resolve_ud_negate(src_reg *reg)
2793 {
2794 if (reg->type != BRW_REGISTER_TYPE_UD ||
2795 !reg->negate)
2796 return;
2797
2798 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2799 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2800 *reg = temp;
2801 }
2802
2803 vec4_visitor::vec4_visitor(struct brw_context *brw,
2804 struct brw_vs_compile *c,
2805 struct gl_shader_program *prog,
2806 struct brw_shader *shader,
2807 void *mem_ctx)
2808 {
2809 this->c = c;
2810 this->brw = brw;
2811 this->intel = &brw->intel;
2812 this->ctx = &intel->ctx;
2813 this->prog = prog;
2814 this->shader = shader;
2815
2816 this->mem_ctx = mem_ctx;
2817 this->failed = false;
2818
2819 this->base_ir = NULL;
2820 this->current_annotation = NULL;
2821 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2822
2823 this->c = c;
2824 this->vp = &c->vp->program;
2825 this->prog_data = &c->prog_data;
2826
2827 this->variable_ht = hash_table_ctor(0,
2828 hash_table_pointer_hash,
2829 hash_table_pointer_compare);
2830
2831 this->virtual_grf_def = NULL;
2832 this->virtual_grf_use = NULL;
2833 this->virtual_grf_sizes = NULL;
2834 this->virtual_grf_count = 0;
2835 this->virtual_grf_reg_map = NULL;
2836 this->virtual_grf_reg_count = 0;
2837 this->virtual_grf_array_size = 0;
2838 this->live_intervals_valid = false;
2839
2840 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2841
2842 this->uniforms = 0;
2843 }
2844
2845 vec4_visitor::~vec4_visitor()
2846 {
2847 hash_table_dtor(this->variable_ht);
2848 }
2849
2850
2851 void
2852 vec4_visitor::fail(const char *format, ...)
2853 {
2854 va_list va;
2855 char *msg;
2856
2857 if (failed)
2858 return;
2859
2860 failed = true;
2861
2862 va_start(va, format);
2863 msg = ralloc_vasprintf(mem_ctx, format, va);
2864 va_end(va);
2865 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
2866
2867 this->fail_msg = msg;
2868
2869 if (INTEL_DEBUG & DEBUG_VS) {
2870 fprintf(stderr, "%s", msg);
2871 }
2872 }
2873
2874 } /* namespace brw */