glsl: Replace most default cases in switches on GLSL type
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
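/* Convenience builders: each ALU1/ALU2 expansion below defines a method that
 * constructs a single vec4 instruction of the named opcode without emitting
 * it, so callers can tweak fields (predicate, conditional_mod, saturate)
 * before passing it to emit().
 */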
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172 * before the comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
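   /* Map a 2-, 3- or 4-component dot product onto DP2/DP3/DP4; callers pass
    * the operand's vector_elements, so (elements - 2) indexes this table.
    */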
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
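/* Pre-gen6, math is performed by a message to the shared math unit, so the
 * instruction records which MRF the payload starts in and how many registers
 * long the message is; the generator builds the actual send from these.
 */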
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
387
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 * w z y x w z y x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
451
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
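/* Return how many vec4 slots a variable of the given type occupies in this
 * backend: one per matrix column, one per vector or scalar, the element size
 * times the length for arrays, and the sum of the members for structs.
 */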
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of the size of the vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 case GLSL_TYPE_VOID:
513 case GLSL_TYPE_ERROR:
514 assert(0);
515 break;
516 }
517
518 return 0;
519 }
520
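/* Allocate a new virtual GRF of `size' vec4 slots, growing the bookkeeping
 * arrays by doubling when they fill up, and return the new register's index.
 */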
521 int
522 vec4_visitor::virtual_grf_alloc(int size)
523 {
524 if (virtual_grf_array_size <= virtual_grf_count) {
525 if (virtual_grf_array_size == 0)
526 virtual_grf_array_size = 16;
527 else
528 virtual_grf_array_size *= 2;
529 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
530 virtual_grf_array_size);
531 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
532 virtual_grf_array_size);
533 }
534 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
535 virtual_grf_reg_count += size;
536 virtual_grf_sizes[virtual_grf_count] = size;
537 return virtual_grf_count++;
538 }
539
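/* This constructor and the dst_reg one below allocate a fresh virtual GRF
 * sized for the given GLSL type and choose a default swizzle or writemask
 * matching the type's vector width (full XYZW for arrays and records).
 */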
540 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
541 {
542 init();
543
544 this->file = GRF;
545 this->reg = v->virtual_grf_alloc(type_size(type));
546
547 if (type->is_array() || type->is_record()) {
548 this->swizzle = BRW_SWIZZLE_NOOP;
549 } else {
550 this->swizzle = swizzle_for_size(type->vector_elements);
551 }
552
553 this->type = brw_type_for_base_type(type);
554 }
555
556 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
557 {
558 init();
559
560 this->file = GRF;
561 this->reg = v->virtual_grf_alloc(type_size(type));
562
563 if (type->is_array() || type->is_record()) {
564 this->writemask = WRITEMASK_XYZW;
565 } else {
566 this->writemask = (1 << type->vector_elements) - 1;
567 }
568
569 this->type = brw_type_for_base_type(type);
570 }
571
572 /* Our support for uniforms is piggy-backed on the struct
573 * gl_fragment_program, because that's where the values actually
574 * get stored, rather than in some global gl_shader_program uniform
575 * store.
576 */
577 void
578 vec4_visitor::setup_uniform_values(ir_variable *ir)
579 {
580 int namelen = strlen(ir->name);
581
582 /* The data for our (non-builtin) uniforms is stored in a series of
583 * gl_uniform_driver_storage structs for each subcomponent that
584 * glGetUniformLocation() could name. We know it's been set up in the same
585 * order we'd walk the type, so walk the list of storage and find anything
586 * with our name, or the prefix of a component that starts with our name.
587 */
588 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
589 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
590
591 if (strncmp(ir->name, storage->name, namelen) != 0 ||
592 (storage->name[namelen] != 0 &&
593 storage->name[namelen] != '.' &&
594 storage->name[namelen] != '[')) {
595 continue;
596 }
597
598 gl_constant_value *components = storage->storage;
599 unsigned vector_count = (MAX2(storage->array_elements, 1) *
600 storage->type->matrix_columns);
601
602 for (unsigned s = 0; s < vector_count; s++) {
603 uniform_vector_size[uniforms] = storage->type->vector_elements;
604
605 int i;
606 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
607 c->prog_data.param[uniforms * 4 + i] = &components->f;
608 components++;
609 }
610 for (; i < 4; i++) {
611 static float zero = 0;
612 c->prog_data.param[uniforms * 4 + i] = &zero;
613 }
614
615 uniforms++;
616 }
617 }
618 }
619
620 void
621 vec4_visitor::setup_uniform_clipplane_values()
622 {
623 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
624
625 if (intel->gen < 6) {
626 /* Pre-Gen6, we compact clip planes. For example, if the user
627 * enables just clip planes 0, 1, and 3, we will enable clip planes
628 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
629 * plane 2. This simplifies the implementation of the Gen6 clip
630 * thread.
631 */
632 int compacted_clipplane_index = 0;
633 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
634 if (!(c->key.userclip_planes_enabled_gen_4_5 & (1 << i)))
635 continue;
636
637 this->uniform_vector_size[this->uniforms] = 4;
638 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
639 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
640 for (int j = 0; j < 4; ++j) {
641 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
642 }
643 ++compacted_clipplane_index;
644 ++this->uniforms;
645 }
646 } else {
647 /* In Gen6 and later, we don't compact clip planes, because this
648 * simplifies the implementation of gl_ClipDistance.
649 */
650 for (int i = 0; i < c->key.nr_userclip_plane_consts; ++i) {
651 this->uniform_vector_size[this->uniforms] = 4;
652 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
653 this->userplane[i].type = BRW_REGISTER_TYPE_F;
654 for (int j = 0; j < 4; ++j) {
655 c->prog_data.param[this->uniforms * 4 + j] = &clip_planes[i][j];
656 }
657 ++this->uniforms;
658 }
659 }
660 }
661
662 /* Our support for builtin uniforms is even scarier than non-builtin.
663 * It sits on top of the PROG_STATE_VAR parameters that are
664 * automatically updated from GL context state.
665 */
666 void
667 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
668 {
669 const ir_state_slot *const slots = ir->state_slots;
670 assert(ir->state_slots != NULL);
671
672 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
673 /* This state reference has already been setup by ir_to_mesa,
674 * but we'll get the same index back here. We can reference
675 * ParameterValues directly, since unlike brw_fs.cpp, we never
676 * add new state references during compile.
677 */
678 int index = _mesa_add_state_reference(this->vp->Base.Parameters,
679 (gl_state_index *)slots[i].tokens);
680 float *values = &this->vp->Base.Parameters->ParameterValues[index][0].f;
681
682 this->uniform_vector_size[this->uniforms] = 0;
683 /* Add each of the unique swizzled channels of the element.
684 * This will end up matching the size of the glsl_type of this field.
685 */
686 int last_swiz = -1;
687 for (unsigned int j = 0; j < 4; j++) {
688 int swiz = GET_SWZ(slots[i].swizzle, j);
689 last_swiz = swiz;
690
691 c->prog_data.param[this->uniforms * 4 + j] = &values[swiz];
692 if (swiz <= last_swiz)
693 this->uniform_vector_size[this->uniforms]++;
694 }
695 this->uniforms++;
696 }
697 }
698
699 dst_reg *
700 vec4_visitor::variable_storage(ir_variable *var)
701 {
702 return (dst_reg *)hash_table_find(this->variable_ht, var);
703 }
704
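/* Evaluate a boolean rvalue and emit whatever comparison sets the flag
 * register for it, so that a following predicated instruction executes only
 * where the condition holds.  *predicate receives the predication mode the
 * caller should use (normal, or ALL4H/ANY4H for vector comparisons).
 */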
705 void
706 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
707 {
708 ir_expression *expr = ir->as_expression();
709
710 *predicate = BRW_PREDICATE_NORMAL;
711
712 if (expr) {
713 src_reg op[2];
714 vec4_instruction *inst;
715
716 assert(expr->get_num_operands() <= 2);
717 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
718 expr->operands[i]->accept(this);
719 op[i] = this->result;
720
721 resolve_ud_negate(&op[i]);
722 }
723
724 switch (expr->operation) {
725 case ir_unop_logic_not:
726 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
727 inst->conditional_mod = BRW_CONDITIONAL_Z;
728 break;
729
730 case ir_binop_logic_xor:
731 inst = emit(XOR(dst_null_d(), op[0], op[1]));
732 inst->conditional_mod = BRW_CONDITIONAL_NZ;
733 break;
734
735 case ir_binop_logic_or:
736 inst = emit(OR(dst_null_d(), op[0], op[1]));
737 inst->conditional_mod = BRW_CONDITIONAL_NZ;
738 break;
739
740 case ir_binop_logic_and:
741 inst = emit(AND(dst_null_d(), op[0], op[1]));
742 inst->conditional_mod = BRW_CONDITIONAL_NZ;
743 break;
744
745 case ir_unop_f2b:
746 if (intel->gen >= 6) {
747 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
748 } else {
749 inst = emit(MOV(dst_null_f(), op[0]));
750 inst->conditional_mod = BRW_CONDITIONAL_NZ;
751 }
752 break;
753
754 case ir_unop_i2b:
755 if (intel->gen >= 6) {
756 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
757 } else {
758 inst = emit(MOV(dst_null_d(), op[0]));
759 inst->conditional_mod = BRW_CONDITIONAL_NZ;
760 }
761 break;
762
763 case ir_binop_all_equal:
764 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
765 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
766 break;
767
768 case ir_binop_any_nequal:
769 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
770 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
771 break;
772
773 case ir_unop_any:
774 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
775 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
776 break;
777
778 case ir_binop_greater:
779 case ir_binop_gequal:
780 case ir_binop_less:
781 case ir_binop_lequal:
782 case ir_binop_equal:
783 case ir_binop_nequal:
784 emit(CMP(dst_null_d(), op[0], op[1],
785 brw_conditional_for_comparison(expr->operation)));
786 break;
787
788 default:
789 assert(!"not reached");
790 break;
791 }
792 return;
793 }
794
795 ir->accept(this);
796
797 resolve_ud_negate(&this->result);
798
799 if (intel->gen >= 6) {
800 vec4_instruction *inst = emit(AND(dst_null_d(),
801 this->result, src_reg(1)));
802 inst->conditional_mod = BRW_CONDITIONAL_NZ;
803 } else {
804 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
805 inst->conditional_mod = BRW_CONDITIONAL_NZ;
806 }
807 }
808
809 /**
810 * Emit a gen6 IF statement with the comparison folded into the IF
811 * instruction.
812 */
813 void
814 vec4_visitor::emit_if_gen6(ir_if *ir)
815 {
816 ir_expression *expr = ir->condition->as_expression();
817
818 if (expr) {
819 src_reg op[2];
820 dst_reg temp;
821
822 assert(expr->get_num_operands() <= 2);
823 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
824 expr->operands[i]->accept(this);
825 op[i] = this->result;
826 }
827
828 switch (expr->operation) {
829 case ir_unop_logic_not:
830 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
831 return;
832
833 case ir_binop_logic_xor:
834 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
835 return;
836
837 case ir_binop_logic_or:
838 temp = dst_reg(this, glsl_type::bool_type);
839 emit(OR(temp, op[0], op[1]));
840 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
841 return;
842
843 case ir_binop_logic_and:
844 temp = dst_reg(this, glsl_type::bool_type);
845 emit(AND(temp, op[0], op[1]));
846 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
847 return;
848
849 case ir_unop_f2b:
850 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
851 return;
852
853 case ir_unop_i2b:
854 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
855 return;
856
857 case ir_binop_greater:
858 case ir_binop_gequal:
859 case ir_binop_less:
860 case ir_binop_lequal:
861 case ir_binop_equal:
862 case ir_binop_nequal:
863 emit(IF(op[0], op[1],
864 brw_conditional_for_comparison(expr->operation)));
865 return;
866
867 case ir_binop_all_equal:
868 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
869 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
870 return;
871
872 case ir_binop_any_nequal:
873 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
874 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
875 return;
876
877 case ir_unop_any:
878 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
880 return;
881
882 default:
883 assert(!"not reached");
884 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
885 return;
886 }
887 return;
888 }
889
890 ir->condition->accept(this);
891
892 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
893 }
894
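/* Return a copy of register r with its writemask replaced by mask. */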
895 static dst_reg
896 with_writemask(dst_reg const & r, int mask)
897 {
898 dst_reg result = r;
899 result.writemask = mask;
900 return result;
901 }
902
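/* Emit code that undoes vertex-fetch limitations recorded in the per-attribute
 * workaround flags: GL_FIXED rescaling, sign recovery and normalization for
 * packed 2/10/10/10 formats, and BGRA channel swizzling.
 */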
903 void
904 vec4_visitor::emit_attribute_fixups()
905 {
906 dst_reg sign_recovery_shift;
907 dst_reg normalize_factor;
908 dst_reg es3_normalize_factor;
909
910 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
911 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
912 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
913 dst_reg reg(ATTR, i);
914 dst_reg reg_d = reg;
915 reg_d.type = BRW_REGISTER_TYPE_D;
916 dst_reg reg_ud = reg;
917 reg_ud.type = BRW_REGISTER_TYPE_UD;
918
919 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
920 * come in as floating point conversions of the integer values.
921 */
922 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
923 dst_reg dst = reg;
924 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
925 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
926 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
927 }
928
929 /* Do sign recovery for 2101010 formats if required. */
930 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
931 if (sign_recovery_shift.file == BAD_FILE) {
932 /* shift constant: <22,22,22,30> */
933 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
934 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
935 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
936 }
937
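            /* Shift each packed component up so its sign bit lands in bit 31
             * (by 22 for the 10-bit XYZ fields, 30 for the 2-bit W field),
             * then arithmetic-shift it back down to sign-extend it to a full
             * 32-bit integer.  For example, a 10-bit 0x3ff becomes -1.
             */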
938 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
939 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
940 }
941
942 /* Apply BGRA swizzle if required. */
943 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
944 src_reg temp = src_reg(reg);
945 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
946 emit(MOV(reg, temp));
947 }
948
949 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
950 /* ES 3.0 has different rules for converting signed normalized
951 * fixed-point numbers than desktop GL.
952 */
953 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
954 /* According to equation 2.2 of the ES 3.0 specification,
955 * signed normalization conversion is done by:
956 *
957 * f = c / (2^(b-1)-1)
958 */
959 if (es3_normalize_factor.file == BAD_FILE) {
960 /* mul constant: 1 / (2^(b-1) - 1) */
961 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
962 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
963 src_reg(1.0f / ((1<<9) - 1))));
964 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
965 src_reg(1.0f / ((1<<1) - 1))));
966 }
967
968 dst_reg dst = reg;
969 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
970 emit(MOV(dst, src_reg(reg_d)));
971 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
972 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
973 } else {
974 /* The following equations are from the OpenGL 3.2 specification:
975 *
976 * 2.1 unsigned normalization
977 * f = c/(2^n-1)
978 *
979 * 2.2 signed normalization
980 * f = (2c+1)/(2^n-1)
981 *
982 * Both of these share a common divisor, which is represented by
983 * "normalize_factor" in the code below.
984 */
985 if (normalize_factor.file == BAD_FILE) {
986 /* 1 / (2^b - 1) for b=<10,10,10,2> */
987 normalize_factor = dst_reg(this, glsl_type::vec4_type);
988 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
989 src_reg(1.0f / ((1<<10) - 1))));
990 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
991 src_reg(1.0f / ((1<<2) - 1))));
992 }
993
994 dst_reg dst = reg;
995 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
996 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
997
998 /* For signed normalization, we want the numerator to be 2c+1. */
999 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1000 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1001 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1002 }
1003
1004 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1005 }
1006 }
1007
1008 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1009 dst_reg dst = reg;
1010 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1011 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1012 }
1013 }
1014 }
1015 }
1016
1017 void
1018 vec4_visitor::visit(ir_variable *ir)
1019 {
1020 dst_reg *reg = NULL;
1021
1022 if (variable_storage(ir))
1023 return;
1024
1025 switch (ir->mode) {
1026 case ir_var_shader_in:
1027 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1028 break;
1029
1030 case ir_var_shader_out:
1031 reg = new(mem_ctx) dst_reg(this, ir->type);
1032
1033 for (int i = 0; i < type_size(ir->type); i++) {
1034 output_reg[ir->location + i] = *reg;
1035 output_reg[ir->location + i].reg_offset = i;
1036 output_reg[ir->location + i].type =
1037 brw_type_for_base_type(ir->type->get_scalar_type());
1038 output_reg_annotation[ir->location + i] = ir->name;
1039 }
1040 break;
1041
1042 case ir_var_auto:
1043 case ir_var_temporary:
1044 reg = new(mem_ctx) dst_reg(this, ir->type);
1045 break;
1046
1047 case ir_var_uniform:
1048 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1049
1050 /* Thanks to the lower_ubo_reference pass, we will see only
1051 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1052 * variables, so no need for them to be in variable_ht.
1053 */
1054 if (ir->uniform_block != -1)
1055 return;
1056
1057 /* Track how big the whole uniform variable is, in case we need to put a
1058 * copy of its data into pull constants for array access.
1059 */
1060 this->uniform_size[this->uniforms] = type_size(ir->type);
1061
1062 if (!strncmp(ir->name, "gl_", 3)) {
1063 setup_builtin_uniform_values(ir);
1064 } else {
1065 setup_uniform_values(ir);
1066 }
1067 break;
1068
1069 case ir_var_system_value:
1070 /* VertexID is stored by the VF as the last vertex element, but
1071 * we don't represent it with a flag in inputs_read, so we call
1072 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1073 */
1074 reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1075 prog_data->uses_vertexid = true;
1076
1077 switch (ir->location) {
1078 case SYSTEM_VALUE_VERTEX_ID:
1079 reg->writemask = WRITEMASK_X;
1080 break;
1081 case SYSTEM_VALUE_INSTANCE_ID:
1082 reg->writemask = WRITEMASK_Y;
1083 break;
1084 default:
1085 assert(!"not reached");
1086 break;
1087 }
1088 break;
1089
1090 default:
1091 assert(!"not reached");
1092 }
1093
1094 reg->type = brw_type_for_base_type(ir->type);
1095 hash_table_insert(this->variable_ht, reg, ir);
1096 }
1097
1098 void
1099 vec4_visitor::visit(ir_loop *ir)
1100 {
1101 dst_reg counter;
1102
1103 /* We don't want debugging output to print the whole body of the
1104 * loop as the annotation.
1105 */
1106 this->base_ir = NULL;
1107
1108 if (ir->counter != NULL) {
1109 this->base_ir = ir->counter;
1110 ir->counter->accept(this);
1111 counter = *(variable_storage(ir->counter));
1112
1113 if (ir->from != NULL) {
1114 this->base_ir = ir->from;
1115 ir->from->accept(this);
1116
1117 emit(MOV(counter, this->result));
1118 }
1119 }
1120
1121 emit(BRW_OPCODE_DO);
1122
1123 if (ir->to) {
1124 this->base_ir = ir->to;
1125 ir->to->accept(this);
1126
1127 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1128 brw_conditional_for_comparison(ir->cmp)));
1129
1130 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1131 inst->predicate = BRW_PREDICATE_NORMAL;
1132 }
1133
1134 visit_instructions(&ir->body_instructions);
1135
1136
1137 if (ir->increment) {
1138 this->base_ir = ir->increment;
1139 ir->increment->accept(this);
1140 emit(ADD(counter, src_reg(counter), this->result));
1141 }
1142
1143 emit(BRW_OPCODE_WHILE);
1144 }
1145
1146 void
1147 vec4_visitor::visit(ir_loop_jump *ir)
1148 {
1149 switch (ir->mode) {
1150 case ir_loop_jump::jump_break:
1151 emit(BRW_OPCODE_BREAK);
1152 break;
1153 case ir_loop_jump::jump_continue:
1154 emit(BRW_OPCODE_CONTINUE);
1155 break;
1156 }
1157 }
1158
1159
1160 void
1161 vec4_visitor::visit(ir_function_signature *ir)
1162 {
1163 assert(0);
1164 (void)ir;
1165 }
1166
1167 void
1168 vec4_visitor::visit(ir_function *ir)
1169 {
1170 /* Ignore function bodies other than main() -- we shouldn't see calls to
1171 * them since they should all be inlined.
1172 */
1173 if (strcmp(ir->name, "main") == 0) {
1174 const ir_function_signature *sig;
1175 exec_list empty;
1176
1177 sig = ir->matching_signature(&empty);
1178
1179 assert(sig);
1180
1181 visit_instructions(&sig->body);
1182 }
1183 }
1184
1185 bool
1186 vec4_visitor::try_emit_sat(ir_expression *ir)
1187 {
1188 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1189 if (!sat_src)
1190 return false;
1191
1192 sat_src->accept(this);
1193 src_reg src = this->result;
1194
1195 this->result = src_reg(this, ir->type);
1196 vec4_instruction *inst;
1197 inst = emit(MOV(dst_reg(this->result), src));
1198 inst->saturate = true;
1199
1200 return true;
1201 }
1202
1203 void
1204 vec4_visitor::emit_bool_comparison(unsigned int op,
1205 dst_reg dst, src_reg src0, src_reg src1)
1206 {
1207 /* original gen4 does destination conversion before comparison. */
1208 if (intel->gen < 5)
1209 dst.type = src0.type;
1210
1211 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1212
1213 dst.type = BRW_REGISTER_TYPE_D;
1214 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1215 }
1216
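/* Emit a MIN/MAX as a SEL.  On gen6+ SEL can evaluate the comparison itself
 * through its conditional mod; gen4/5 need a separate CMP to set the flag
 * register, followed by a predicated SEL.
 */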
1217 void
1218 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1219 src_reg src0, src_reg src1)
1220 {
1221 vec4_instruction *inst;
1222
1223 if (intel->gen >= 6) {
1224 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1225 inst->conditional_mod = conditionalmod;
1226 } else {
1227 emit(CMP(dst, src0, src1, conditionalmod));
1228
1229 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1230 inst->predicate = BRW_PREDICATE_NORMAL;
1231 }
1232 }
1233
1234 void
1235 vec4_visitor::visit(ir_expression *ir)
1236 {
1237 unsigned int operand;
1238 src_reg op[Elements(ir->operands)];
1239 src_reg result_src;
1240 dst_reg result_dst;
1241 vec4_instruction *inst;
1242
1243 if (try_emit_sat(ir))
1244 return;
1245
1246 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1247 this->result.file = BAD_FILE;
1248 ir->operands[operand]->accept(this);
1249 if (this->result.file == BAD_FILE) {
1250 printf("Failed to get tree for expression operand:\n");
1251 ir->operands[operand]->print();
1252 exit(1);
1253 }
1254 op[operand] = this->result;
1255
1256 /* Matrix expression operands should have been broken down to vector
1257 * operations already.
1258 */
1259 assert(!ir->operands[operand]->type->is_matrix());
1260 }
1261
1262 int vector_elements = ir->operands[0]->type->vector_elements;
1263 if (ir->operands[1]) {
1264 vector_elements = MAX2(vector_elements,
1265 ir->operands[1]->type->vector_elements);
1266 }
1267
1268 this->result.file = BAD_FILE;
1269
1270 /* Storage for our result. Ideally for an assignment we'd be using
1271 * the actual storage for the result here, instead.
1272 */
1273 result_src = src_reg(this, ir->type);
1274 /* convenience for the emit functions below. */
1275 result_dst = dst_reg(result_src);
1276 /* If nothing special happens, this is the result. */
1277 this->result = result_src;
1278 /* Limit writes to the channels that will be used by result_src later.
1279 * This does limit this temp's use as a temporary for multi-instruction
1280 * sequences.
1281 */
1282 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1283
1284 switch (ir->operation) {
1285 case ir_unop_logic_not:
1286 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1287 * ones' complement of the whole register, not just bit 0.
1288 */
1289 emit(XOR(result_dst, op[0], src_reg(1)));
1290 break;
1291 case ir_unop_neg:
1292 op[0].negate = !op[0].negate;
1293 this->result = op[0];
1294 break;
1295 case ir_unop_abs:
1296 op[0].abs = true;
1297 op[0].negate = false;
1298 this->result = op[0];
1299 break;
1300
1301 case ir_unop_sign:
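      /* sign(x): start with 0.0, then use predicated MOVs to overwrite the
       * channels that compare greater than zero with 1.0 and the channels
       * that compare less than zero with -1.0.
       */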
1302 emit(MOV(result_dst, src_reg(0.0f)));
1303
1304 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1305 inst = emit(MOV(result_dst, src_reg(1.0f)));
1306 inst->predicate = BRW_PREDICATE_NORMAL;
1307
1308 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1309 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1310 inst->predicate = BRW_PREDICATE_NORMAL;
1311
1312 break;
1313
1314 case ir_unop_rcp:
1315 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1316 break;
1317
1318 case ir_unop_exp2:
1319 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1320 break;
1321 case ir_unop_log2:
1322 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1323 break;
1324 case ir_unop_exp:
1325 case ir_unop_log:
1326 assert(!"not reached: should be handled by ir_explog_to_explog2");
1327 break;
1328 case ir_unop_sin:
1329 case ir_unop_sin_reduced:
1330 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1331 break;
1332 case ir_unop_cos:
1333 case ir_unop_cos_reduced:
1334 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1335 break;
1336
1337 case ir_unop_dFdx:
1338 case ir_unop_dFdy:
1339 assert(!"derivatives not valid in vertex shader");
1340 break;
1341
1342 case ir_unop_noise:
1343 assert(!"not reached: should be handled by lower_noise");
1344 break;
1345
1346 case ir_binop_add:
1347 emit(ADD(result_dst, op[0], op[1]));
1348 break;
1349 case ir_binop_sub:
1350 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1351 break;
1352
1353 case ir_binop_mul:
1354 if (ir->type->is_integer()) {
1355 /* For integer multiplication, the MUL uses the low 16 bits
1356 * of one of the operands (src0 on gen6, src1 on gen7). The
1357 * MACH accumulates in the contribution of the upper 16 bits
1358 * of that operand.
1359 *
1360 * FINISHME: Emit just the MUL if we know an operand is small
1361 * enough.
1362 */
1363 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1364
1365 emit(MUL(acc, op[0], op[1]));
1366 emit(MACH(dst_null_d(), op[0], op[1]));
1367 emit(MOV(result_dst, src_reg(acc)));
1368 } else {
1369 emit(MUL(result_dst, op[0], op[1]));
1370 }
1371 break;
1372 case ir_binop_div:
1373 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1374 assert(ir->type->is_integer());
1375 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1376 break;
1377 case ir_binop_mod:
1378 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1379 assert(ir->type->is_integer());
1380 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1381 break;
1382
1383 case ir_binop_less:
1384 case ir_binop_greater:
1385 case ir_binop_lequal:
1386 case ir_binop_gequal:
1387 case ir_binop_equal:
1388 case ir_binop_nequal: {
1389 emit(CMP(result_dst, op[0], op[1],
1390 brw_conditional_for_comparison(ir->operation)));
1391 emit(AND(result_dst, result_src, src_reg(0x1)));
1392 break;
1393 }
1394
1395 case ir_binop_all_equal:
1396 /* "==" operator producing a scalar boolean. */
1397 if (ir->operands[0]->type->is_vector() ||
1398 ir->operands[1]->type->is_vector()) {
1399 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1400 emit(MOV(result_dst, src_reg(0)));
1401 inst = emit(MOV(result_dst, src_reg(1)));
1402 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1403 } else {
1404 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1405 emit(AND(result_dst, result_src, src_reg(0x1)));
1406 }
1407 break;
1408 case ir_binop_any_nequal:
1409 /* "!=" operator producing a scalar boolean. */
1410 if (ir->operands[0]->type->is_vector() ||
1411 ir->operands[1]->type->is_vector()) {
1412 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1413
1414 emit(MOV(result_dst, src_reg(0)));
1415 inst = emit(MOV(result_dst, src_reg(1)));
1416 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1417 } else {
1418 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1419 emit(AND(result_dst, result_src, src_reg(0x1)));
1420 }
1421 break;
1422
1423 case ir_unop_any:
1424 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1425 emit(MOV(result_dst, src_reg(0)));
1426
1427 inst = emit(MOV(result_dst, src_reg(1)));
1428 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1429 break;
1430
1431 case ir_binop_logic_xor:
1432 emit(XOR(result_dst, op[0], op[1]));
1433 break;
1434
1435 case ir_binop_logic_or:
1436 emit(OR(result_dst, op[0], op[1]));
1437 break;
1438
1439 case ir_binop_logic_and:
1440 emit(AND(result_dst, op[0], op[1]));
1441 break;
1442
1443 case ir_binop_dot:
1444 assert(ir->operands[0]->type->is_vector());
1445 assert(ir->operands[0]->type == ir->operands[1]->type);
1446 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1447 break;
1448
1449 case ir_unop_sqrt:
1450 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1451 break;
1452 case ir_unop_rsq:
1453 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1454 break;
1455
1456 case ir_unop_bitcast_i2f:
1457 case ir_unop_bitcast_u2f:
1458 this->result = op[0];
1459 this->result.type = BRW_REGISTER_TYPE_F;
1460 break;
1461
1462 case ir_unop_bitcast_f2i:
1463 this->result = op[0];
1464 this->result.type = BRW_REGISTER_TYPE_D;
1465 break;
1466
1467 case ir_unop_bitcast_f2u:
1468 this->result = op[0];
1469 this->result.type = BRW_REGISTER_TYPE_UD;
1470 break;
1471
1472 case ir_unop_i2f:
1473 case ir_unop_i2u:
1474 case ir_unop_u2i:
1475 case ir_unop_u2f:
1476 case ir_unop_b2f:
1477 case ir_unop_b2i:
1478 case ir_unop_f2i:
1479 case ir_unop_f2u:
1480 emit(MOV(result_dst, op[0]));
1481 break;
1482 case ir_unop_f2b:
1483 case ir_unop_i2b: {
1484 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1485 emit(AND(result_dst, result_src, src_reg(1)));
1486 break;
1487 }
1488
1489 case ir_unop_trunc:
1490 emit(RNDZ(result_dst, op[0]));
1491 break;
1492 case ir_unop_ceil:
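      /* ceil(x) == -floor(-x): negate the operand, round toward -infinity
       * with RNDD, and mark the result negated.  For example, ceil(1.25)
       * = -floor(-1.25) = -(-2) = 2.
       */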
1493 op[0].negate = !op[0].negate;
1494 inst = emit(RNDD(result_dst, op[0]));
1495 this->result.negate = true;
1496 break;
1497 case ir_unop_floor:
1498 inst = emit(RNDD(result_dst, op[0]));
1499 break;
1500 case ir_unop_fract:
1501 inst = emit(FRC(result_dst, op[0]));
1502 break;
1503 case ir_unop_round_even:
1504 emit(RNDE(result_dst, op[0]));
1505 break;
1506
1507 case ir_binop_min:
1508 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1509 break;
1510 case ir_binop_max:
1511 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1512 break;
1513
1514 case ir_binop_pow:
1515 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1516 break;
1517
1518 case ir_unop_bit_not:
1519 inst = emit(NOT(result_dst, op[0]));
1520 break;
1521 case ir_binop_bit_and:
1522 inst = emit(AND(result_dst, op[0], op[1]));
1523 break;
1524 case ir_binop_bit_xor:
1525 inst = emit(XOR(result_dst, op[0], op[1]));
1526 break;
1527 case ir_binop_bit_or:
1528 inst = emit(OR(result_dst, op[0], op[1]));
1529 break;
1530
1531 case ir_binop_lshift:
1532 inst = emit(SHL(result_dst, op[0], op[1]));
1533 break;
1534
1535 case ir_binop_rshift:
1536 if (ir->type->base_type == GLSL_TYPE_INT)
1537 inst = emit(ASR(result_dst, op[0], op[1]));
1538 else
1539 inst = emit(SHR(result_dst, op[0], op[1]));
1540 break;
1541
1542 case ir_binop_ubo_load: {
1543 ir_constant *uniform_block = ir->operands[0]->as_constant();
1544 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1545 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1546 src_reg offset = op[1];
1547
1548 /* Now, load the vector from that offset. */
1549 assert(ir->type->is_vector() || ir->type->is_scalar());
1550
1551 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1552 packed_consts.type = result.type;
1553 src_reg surf_index =
1554 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1555 if (const_offset_ir) {
1556 offset = src_reg(const_offset / 16);
1557 } else {
1558 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1559 }
1560
1561 vec4_instruction *pull =
1562 emit(new(mem_ctx) vec4_instruction(this,
1563 VS_OPCODE_PULL_CONSTANT_LOAD,
1564 dst_reg(packed_consts),
1565 surf_index,
1566 offset));
1567 pull->base_mrf = 14;
1568 pull->mlen = 1;
1569
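      /* The pull constant load fetches a whole aligned vec4, so offset each
       * swizzle channel by the dword position of const_offset within that
       * vec4 (zero when the offset isn't known at compile time) to read the
       * requested components.
       */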
1570 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1571 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1572 const_offset % 16 / 4,
1573 const_offset % 16 / 4,
1574 const_offset % 16 / 4);
1575
1576 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1577 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1578 emit(CMP(result_dst, packed_consts, src_reg(0u),
1579 BRW_CONDITIONAL_NZ));
1580 emit(AND(result_dst, result, src_reg(0x1)));
1581 } else {
1582 emit(MOV(result_dst, packed_consts));
1583 }
1584 break;
1585 }
1586
1587 case ir_quadop_vector:
1588 assert(!"not reached: should be handled by lower_quadop_vector");
1589 break;
1590
1591 case ir_unop_pack_half_2x16:
1592 emit_pack_half_2x16(result_dst, op[0]);
1593 break;
1594 case ir_unop_unpack_half_2x16:
1595 emit_unpack_half_2x16(result_dst, op[0]);
1596 break;
1597 case ir_unop_pack_snorm_2x16:
1598 case ir_unop_pack_unorm_2x16:
1599 case ir_unop_unpack_snorm_2x16:
1600 case ir_unop_unpack_unorm_2x16:
1601 assert(!"not reached: should be handled by lower_packing_builtins");
1602 break;
1603 case ir_unop_unpack_half_2x16_split_x:
1604 case ir_unop_unpack_half_2x16_split_y:
1605 case ir_binop_pack_half_2x16_split:
1606 assert(!"not reached: should not occur in vertex shader");
1607 break;
1608 }
1609 }
1610
1611
1612 void
1613 vec4_visitor::visit(ir_swizzle *ir)
1614 {
1615 src_reg src;
1616 int i = 0;
1617 int swizzle[4];
1618
1619 /* Note that this is only swizzles in expressions, not those on the left
1620 * hand side of an assignment, which do write masking. See ir_assignment
1621 * for that.
1622 */
1623
1624 ir->val->accept(this);
1625 src = this->result;
1626 assert(src.file != BAD_FILE);
1627
1628 for (i = 0; i < ir->type->vector_elements; i++) {
1629 switch (i) {
1630 case 0:
1631 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1632 break;
1633 case 1:
1634 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1635 break;
1636 case 2:
1637 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1638 break;
1639 case 3:
1640 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1641 break;
1642 }
1643 }
1644 for (; i < 4; i++) {
1645 /* Replicate the last channel out. */
1646 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1647 }
1648
1649 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1650
1651 this->result = src;
1652 }
1653
1654 void
1655 vec4_visitor::visit(ir_dereference_variable *ir)
1656 {
1657 const struct glsl_type *type = ir->type;
1658 dst_reg *reg = variable_storage(ir->var);
1659
1660 if (!reg) {
1661 fail("Failed to find variable storage for %s\n", ir->var->name);
1662 this->result = src_reg(brw_null_reg());
1663 return;
1664 }
1665
1666 this->result = src_reg(*reg);
1667
1668 /* System values get their swizzle from the dst_reg writemask */
1669 if (ir->var->mode == ir_var_system_value)
1670 return;
1671
1672 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1673 this->result.swizzle = swizzle_for_size(type->vector_elements);
1674 }
1675
1676 void
1677 vec4_visitor::visit(ir_dereference_array *ir)
1678 {
1679 ir_constant *constant_index;
1680 src_reg src;
1681 int element_size = type_size(ir->type);
1682
1683 constant_index = ir->array_index->constant_expression_value();
1684
1685 ir->array->accept(this);
1686 src = this->result;
1687
1688 if (constant_index) {
1689 src.reg_offset += constant_index->value.i[0] * element_size;
1690 } else {
1691 /* Variable index array dereference. It eats the "vec4" of the
1692 * base of the array and an index that offsets the Mesa register
1693 * index.
1694 */
1695 ir->array_index->accept(this);
1696
1697 src_reg index_reg;
1698
1699 if (element_size == 1) {
1700 index_reg = this->result;
1701 } else {
1702 index_reg = src_reg(this, glsl_type::int_type);
1703
1704 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1705 }
1706
1707 if (src.reladdr) {
1708 src_reg temp = src_reg(this, glsl_type::int_type);
1709
1710 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1711
1712 index_reg = temp;
1713 }
1714
1715 src.reladdr = ralloc(mem_ctx, src_reg);
1716 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1717 }
1718
1719 /* If the type is smaller than a vec4, replicate the last channel out. */
1720 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1721 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1722 else
1723 src.swizzle = BRW_SWIZZLE_NOOP;
1724 src.type = brw_type_for_base_type(ir->type);
1725
1726 this->result = src;
1727 }
1728
1729 void
1730 vec4_visitor::visit(ir_dereference_record *ir)
1731 {
1732 unsigned int i;
1733 const glsl_type *struct_type = ir->record->type;
1734 int offset = 0;
1735
1736 ir->record->accept(this);
1737
1738 for (i = 0; i < struct_type->length; i++) {
1739 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1740 break;
1741 offset += type_size(struct_type->fields.structure[i].type);
1742 }
1743
1744 /* If the type is smaller than a vec4, replicate the last channel out. */
1745 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1746 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1747 else
1748 this->result.swizzle = BRW_SWIZZLE_NOOP;
1749 this->result.type = brw_type_for_base_type(ir->type);
1750
1751 this->result.reg_offset += offset;
1752 }
1753
1754 /**
1755 * We want to be careful in assignment setup to hit the actual storage
1756 * instead of potentially using a temporary like we might with the
1757 * ir_dereference handler.
1758 */
1759 static dst_reg
1760 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1761 {
1762 /* The LHS must be a dereference. If the LHS is a variable indexed array
1763 * access of a vector, it must be separated into a series of conditional moves
1764 * before reaching this point (see ir_vec_index_to_cond_assign).
1765 */
1766 assert(ir->as_dereference());
1767 ir_dereference_array *deref_array = ir->as_dereference_array();
1768 if (deref_array) {
1769 assert(!deref_array->array->type->is_vector());
1770 }
1771
1772 /* Use the rvalue deref handler for the most part. We'll ignore
1773 * swizzles in it and write swizzles using writemask, though.
1774 */
1775 ir->accept(v);
1776 return dst_reg(v->result);
1777 }
1778
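/* Copy an aggregate (struct, array, or matrix) from *src to *dst one vec4 at
 * a time, recursing through the type and advancing reg_offset on both
 * registers as each element is moved.
 */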
1779 void
1780 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1781 const struct glsl_type *type, uint32_t predicate)
1782 {
1783 if (type->base_type == GLSL_TYPE_STRUCT) {
1784 for (unsigned int i = 0; i < type->length; i++) {
1785 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1786 }
1787 return;
1788 }
1789
1790 if (type->is_array()) {
1791 for (unsigned int i = 0; i < type->length; i++) {
1792 emit_block_move(dst, src, type->fields.array, predicate);
1793 }
1794 return;
1795 }
1796
1797 if (type->is_matrix()) {
1798 const struct glsl_type *vec_type;
1799
1800 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1801 type->vector_elements, 1);
1802
1803 for (int i = 0; i < type->matrix_columns; i++) {
1804 emit_block_move(dst, src, vec_type, predicate);
1805 }
1806 return;
1807 }
1808
1809 assert(type->is_scalar() || type->is_vector());
1810
1811 dst->type = brw_type_for_base_type(type);
1812 src->type = dst->type;
1813
1814 dst->writemask = (1 << type->vector_elements) - 1;
1815
1816 src->swizzle = swizzle_for_size(type->vector_elements);
1817
1818 vec4_instruction *inst = emit(MOV(*dst, *src));
1819 inst->predicate = predicate;
1820
1821 dst->reg_offset++;
1822 src->reg_offset++;
1823 }
1824
1825
1826 /* If the RHS processing resulted in an instruction generating a
1827 * temporary value, and it would be easy to rewrite the instruction to
1828 * generate its result right into the LHS instead, do so. This ends
1829 * up reliably removing instructions where it can be tricky to do so
1830 * later without real UD chain information.
1831 */
1832 bool
1833 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1834 dst_reg dst,
1835 src_reg src,
1836 vec4_instruction *pre_rhs_inst,
1837 vec4_instruction *last_rhs_inst)
1838 {
1839 /* This could be supported, but it would take more smarts. */
1840 if (ir->condition)
1841 return false;
1842
1843 if (pre_rhs_inst == last_rhs_inst)
1844 return false; /* No instructions generated to work with. */
1845
1846 /* Make sure the last instruction generated our source reg. */
1847 if (src.file != GRF ||
1848 src.file != last_rhs_inst->dst.file ||
1849 src.reg != last_rhs_inst->dst.reg ||
1850 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1851 src.reladdr ||
1852 src.abs ||
1853 src.negate ||
1854 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1855 return false;
1856
1857 /* Check that the last instruction fully initialized the channels
1858 * we want to use, in the order we want to use them. We could
1859 * potentially reswizzle the operands of many instructions so that
1860 * we could handle out of order channels, but don't yet.
1861 */
1862
1863 for (unsigned i = 0; i < 4; i++) {
1864 if (dst.writemask & (1 << i)) {
1865 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1866 return false;
1867
1868 if (BRW_GET_SWZ(src.swizzle, i) != i)
1869 return false;
1870 }
1871 }
1872
1873 /* Success! Rewrite the instruction. */
1874 last_rhs_inst->dst.file = dst.file;
1875 last_rhs_inst->dst.reg = dst.reg;
1876 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1877 last_rhs_inst->dst.reladdr = dst.reladdr;
1878 last_rhs_inst->dst.writemask &= dst.writemask;
1879
1880 return true;
1881 }
1882
1883 void
1884 vec4_visitor::visit(ir_assignment *ir)
1885 {
1886 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1887 uint32_t predicate = BRW_PREDICATE_NONE;
1888
1889 if (!ir->lhs->type->is_scalar() &&
1890 !ir->lhs->type->is_vector()) {
1891 ir->rhs->accept(this);
1892 src_reg src = this->result;
1893
1894 if (ir->condition) {
1895 emit_bool_to_cond_code(ir->condition, &predicate);
1896 }
1897
1898 /* emit_block_move doesn't account for swizzles in the source register.
1899 * This should be ok, since the source register is a structure or an
1900 * array, and those can't be swizzled. But double-check to be sure.
1901 */
1902 assert(src.swizzle ==
1903 (ir->rhs->type->is_matrix()
1904 ? swizzle_for_size(ir->rhs->type->vector_elements)
1905 : BRW_SWIZZLE_NOOP));
1906
1907 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1908 return;
1909 }
1910
1911 /* Now we're down to just a scalar/vector with writemasks. */
1912 int i;
1913
1914 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1915 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1916
1917 ir->rhs->accept(this);
1918
1919 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1920
1921 src_reg src = this->result;
1922
1923 int swizzles[4];
1924 int first_enabled_chan = 0;
1925 int src_chan = 0;
1926
1927 assert(ir->lhs->type->is_vector() ||
1928 ir->lhs->type->is_scalar());
1929 dst.writemask = ir->write_mask;
1930
1931 for (int i = 0; i < 4; i++) {
1932 if (dst.writemask & (1 << i)) {
1933 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1934 break;
1935 }
1936 }
1937
1938 /* Swizzle a small RHS vector into the channels being written.
1939 *
1940 * GLSL IR treats write_mask as dictating how many channels are
1941 * present on the RHS, while in our instructions we need to make
1942 * those channels appear in the slots of the vec4 they're written to.
1943 */
1944 for (int i = 0; i < 4; i++) {
1945 if (dst.writemask & (1 << i))
1946 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1947 else
1948 swizzles[i] = first_enabled_chan;
1949 }
1950 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1951 swizzles[2], swizzles[3]);
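/* For instance (hypothetical case), writing a two-channel RHS into the
 * .yz slots of the LHS produces swizzles[] = {Y, X, Y, Y}: the RHS's
 * first two channels are routed into the .y and .z slots being written,
 * and the unwritten slots just repeat an already-enabled channel.
 */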
1952
1953 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1954 return;
1955 }
1956
1957 if (ir->condition) {
1958 emit_bool_to_cond_code(ir->condition, &predicate);
1959 }
1960
1961 for (i = 0; i < type_size(ir->lhs->type); i++) {
1962 vec4_instruction *inst = emit(MOV(dst, src));
1963 inst->predicate = predicate;
1964
1965 dst.reg_offset++;
1966 src.reg_offset++;
1967 }
1968 }
1969
1970 void
1971 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1972 {
1973 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1974 foreach_list(node, &ir->components) {
1975 ir_constant *field_value = (ir_constant *)node;
1976
1977 emit_constant_values(dst, field_value);
1978 }
1979 return;
1980 }
1981
1982 if (ir->type->is_array()) {
1983 for (unsigned int i = 0; i < ir->type->length; i++) {
1984 emit_constant_values(dst, ir->array_elements[i]);
1985 }
1986 return;
1987 }
1988
1989 if (ir->type->is_matrix()) {
1990 for (int i = 0; i < ir->type->matrix_columns; i++) {
1991 float *vec = &ir->value.f[i * ir->type->vector_elements];
1992
1993 for (int j = 0; j < ir->type->vector_elements; j++) {
1994 dst->writemask = 1 << j;
1995 dst->type = BRW_REGISTER_TYPE_F;
1996
1997 emit(MOV(*dst, src_reg(vec[j])));
1998 }
1999 dst->reg_offset++;
2000 }
2001 return;
2002 }
2003
2004 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2005
2006 for (int i = 0; i < ir->type->vector_elements; i++) {
2007 if (!(remaining_writemask & (1 << i)))
2008 continue;
2009
2010 dst->writemask = 1 << i;
2011 dst->type = brw_type_for_base_type(ir->type);
2012
2013 /* Find other components that match the one we're about to
2014 * write. Emits fewer instructions for things like vec4(0.5,
2015 * 1.5, 1.5, 1.5).
2016 */
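/* For instance, vec4(0.5, 1.5, 1.5, 1.5) ends up as one MOV with
 * writemask .x carrying 0.5f and one MOV with writemask .yzw carrying
 * 1.5f, instead of four separate MOVs.
 */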
2017 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2018 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2019 if (ir->value.b[i] == ir->value.b[j])
2020 dst->writemask |= (1 << j);
2021 } else {
2022 /* u, i, and f storage all line up, so no need for a
2023 * switch case for comparing each type.
2024 */
2025 if (ir->value.u[i] == ir->value.u[j])
2026 dst->writemask |= (1 << j);
2027 }
2028 }
2029
2030 switch (ir->type->base_type) {
2031 case GLSL_TYPE_FLOAT:
2032 emit(MOV(*dst, src_reg(ir->value.f[i])));
2033 break;
2034 case GLSL_TYPE_INT:
2035 emit(MOV(*dst, src_reg(ir->value.i[i])));
2036 break;
2037 case GLSL_TYPE_UINT:
2038 emit(MOV(*dst, src_reg(ir->value.u[i])));
2039 break;
2040 case GLSL_TYPE_BOOL:
2041 emit(MOV(*dst, src_reg(ir->value.b[i])));
2042 break;
2043 default:
2044 assert(!"Non-float/uint/int/bool constant");
2045 break;
2046 }
2047
2048 remaining_writemask &= ~dst->writemask;
2049 }
2050 dst->reg_offset++;
2051 }
2052
2053 void
2054 vec4_visitor::visit(ir_constant *ir)
2055 {
2056 dst_reg dst = dst_reg(this, ir->type);
2057 this->result = src_reg(dst);
2058
2059 emit_constant_values(&dst, ir);
2060 }
2061
2062 void
2063 vec4_visitor::visit(ir_call *ir)
2064 {
2065 assert(!"not reached");
2066 }
2067
2068 void
2069 vec4_visitor::visit(ir_texture *ir)
2070 {
2071 int sampler = _mesa_get_sampler_uniform_value(ir->sampler, prog, &vp->Base);
2072
2073 /* Should be lowered by do_lower_texture_projection */
2074 assert(!ir->projector);
2075
2076 /* Generate code to compute all the subexpression trees. This has to be
2077 * done before loading any values into MRFs for the sampler message since
2078 * generating these values may involve SEND messages that need the MRFs.
2079 */
2080 src_reg coordinate;
2081 if (ir->coordinate) {
2082 ir->coordinate->accept(this);
2083 coordinate = this->result;
2084 }
2085
2086 src_reg shadow_comparitor;
2087 if (ir->shadow_comparitor) {
2088 ir->shadow_comparitor->accept(this);
2089 shadow_comparitor = this->result;
2090 }
2091
2092 const glsl_type *lod_type;
2093 src_reg lod, dPdx, dPdy;
2094 switch (ir->op) {
2095 case ir_tex:
2096 lod = src_reg(0.0f);
2097 lod_type = glsl_type::float_type;
2098 break;
2099 case ir_txf:
2100 case ir_txl:
2101 case ir_txs:
2102 ir->lod_info.lod->accept(this);
2103 lod = this->result;
2104 lod_type = ir->lod_info.lod->type;
2105 break;
2106 case ir_txd:
2107 ir->lod_info.grad.dPdx->accept(this);
2108 dPdx = this->result;
2109
2110 ir->lod_info.grad.dPdy->accept(this);
2111 dPdy = this->result;
2112
2113 lod_type = ir->lod_info.grad.dPdx->type;
2114 break;
2115 case ir_txb:
2116 break;
2117 }
2118
2119 vec4_instruction *inst = NULL;
2120 switch (ir->op) {
2121 case ir_tex:
2122 case ir_txl:
2123 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2124 break;
2125 case ir_txd:
2126 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2127 break;
2128 case ir_txf:
2129 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2130 break;
2131 case ir_txs:
2132 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2133 break;
2134 case ir_txb:
2135 assert(!"TXB is not valid for vertex shaders.");
2136 }
2137
2138 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2139
2140 /* Texel offsets go in the message header; Gen4 also requires headers. */
2141 inst->header_present = use_texture_offset || intel->gen < 5;
2142 inst->base_mrf = 2;
2143 inst->mlen = inst->header_present + 1; /* always at least one */
2144 inst->sampler = sampler;
2145 inst->dst = dst_reg(this, ir->type);
2146 inst->dst.writemask = WRITEMASK_XYZW;
2147 inst->shadow_compare = ir->shadow_comparitor != NULL;
2148
2149 if (use_texture_offset)
2150 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2151
2152 /* MRF for the first parameter */
2153 int param_base = inst->base_mrf + inst->header_present;
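/* For example, with base_mrf == 2 and a header present, the first
 * parameter lands in MRF 3; with no header it would go in MRF 2.
 */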
2154
2155 if (ir->op == ir_txs) {
2156 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2157 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2158 } else {
2159 int i, coord_mask = 0, zero_mask = 0;
2160 /* Load the coordinate */
2161 /* FINISHME: gl_clamp_mask and saturate */
2162 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2163 coord_mask |= (1 << i);
2164 for (; i < 4; i++)
2165 zero_mask |= (1 << i);
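/* e.g. for a vec2 coordinate this gives coord_mask = .xy and
 * zero_mask = .zw, so the unused MRF channels are zeroed below.
 */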
2166
2167 if (ir->offset && ir->op == ir_txf) {
2168 /* It appears that the ld instruction used for txf does its
2169 * address bounds check before adding in the offset. To work
2170 * around this, just add the integer offset to the integer
2171 * texel coordinate, and don't put the offset in the header.
2172 */
2173 ir_constant *offset = ir->offset->as_constant();
2174 assert(offset);
2175
2176 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2177 src_reg src = coordinate;
2178 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2179 BRW_GET_SWZ(src.swizzle, j),
2180 BRW_GET_SWZ(src.swizzle, j),
2181 BRW_GET_SWZ(src.swizzle, j));
2182 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2183 src, offset->value.i[j]));
2184 }
2185 } else {
2186 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2187 coordinate));
2188 }
2189 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2190 src_reg(0)));
2191 /* Load the shadow comparitor */
2192 if (ir->shadow_comparitor) {
2193 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2194 WRITEMASK_X),
2195 shadow_comparitor));
2196 inst->mlen++;
2197 }
2198
2199 /* Load the LOD info */
2200 if (ir->op == ir_tex || ir->op == ir_txl) {
2201 int mrf, writemask;
2202 if (intel->gen >= 5) {
2203 mrf = param_base + 1;
2204 if (ir->shadow_comparitor) {
2205 writemask = WRITEMASK_Y;
2206 /* mlen already incremented */
2207 } else {
2208 writemask = WRITEMASK_X;
2209 inst->mlen++;
2210 }
2211 } else /* intel->gen == 4 */ {
2212 mrf = param_base;
2213 writemask = WRITEMASK_Z;
2214 }
2215 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2216 } else if (ir->op == ir_txf) {
2217 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W),
2218 lod));
2219 } else if (ir->op == ir_txd) {
2220 const glsl_type *type = lod_type;
2221
2222 if (intel->gen >= 5) {
2223 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2224 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2225 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2226 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2227 inst->mlen++;
2228
2229 if (ir->type->vector_elements == 3) {
2230 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2231 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2232 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2233 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2234 inst->mlen++;
2235 }
2236 } else /* intel->gen == 4 */ {
2237 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2238 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2239 inst->mlen += 2;
2240 }
2241 }
2242 }
2243
2244 emit(inst);
2245
2246 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2247 * spec requires layers.
2248 */
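/* For instance (hypothetical sizes), a cube map array allocated with 4
 * layers reports 24 in .z here; the integer divide by 6 below recovers
 * the 4 layers the spec asks for.
 */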
2249 if (ir->op == ir_txs) {
2250 glsl_type const *type = ir->sampler->type;
2251 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2252 type->sampler_array) {
2253 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2254 with_writemask(inst->dst, WRITEMASK_Z),
2255 src_reg(inst->dst), src_reg(6));
2256 }
2257 }
2258
2259 swizzle_result(ir, src_reg(inst->dst), sampler);
2260 }
2261
2262 void
2263 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2264 {
2265 int s = c->key.tex.swizzles[sampler];
2266
2267 this->result = src_reg(this, ir->type);
2268 dst_reg swizzled_result(this->result);
2269
2270 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2271 || s == SWIZZLE_NOOP) {
2272 emit(MOV(swizzled_result, orig_val));
2273 return;
2274 }
2275
2276 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2277 int swizzle[4];
2278
2279 for (int i = 0; i < 4; i++) {
2280 switch (GET_SWZ(s, i)) {
2281 case SWIZZLE_ZERO:
2282 zero_mask |= (1 << i);
2283 break;
2284 case SWIZZLE_ONE:
2285 one_mask |= (1 << i);
2286 break;
2287 default:
2288 copy_mask |= (1 << i);
2289 swizzle[i] = GET_SWZ(s, i);
2290 break;
2291 }
2292 }
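/* For example (hypothetical key), a swizzle of (ZERO, ZERO, ZERO, W)
 * yields zero_mask = .xyz and copy_mask = .w, so .xyz are filled with
 * 0.0f below and only .w is copied from the sampled value.
 */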
2293
2294 if (copy_mask) {
2295 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2296 swizzled_result.writemask = copy_mask;
2297 emit(MOV(swizzled_result, orig_val));
2298 }
2299
2300 if (zero_mask) {
2301 swizzled_result.writemask = zero_mask;
2302 emit(MOV(swizzled_result, src_reg(0.0f)));
2303 }
2304
2305 if (one_mask) {
2306 swizzled_result.writemask = one_mask;
2307 emit(MOV(swizzled_result, src_reg(1.0f)));
2308 }
2309 }
2310
2311 void
2312 vec4_visitor::visit(ir_return *ir)
2313 {
2314 assert(!"not reached");
2315 }
2316
2317 void
2318 vec4_visitor::visit(ir_discard *ir)
2319 {
2320 assert(!"not reached");
2321 }
2322
2323 void
2324 vec4_visitor::visit(ir_if *ir)
2325 {
2326 /* Don't point the annotation at the if statement, because then it plus
2327 * the then and else blocks get printed.
2328 */
2329 this->base_ir = ir->condition;
2330
2331 if (intel->gen == 6) {
2332 emit_if_gen6(ir);
2333 } else {
2334 uint32_t predicate;
2335 emit_bool_to_cond_code(ir->condition, &predicate);
2336 emit(IF(predicate));
2337 }
2338
2339 visit_instructions(&ir->then_instructions);
2340
2341 if (!ir->else_instructions.is_empty()) {
2342 this->base_ir = ir->condition;
2343 emit(BRW_OPCODE_ELSE);
2344
2345 visit_instructions(&ir->else_instructions);
2346 }
2347
2348 this->base_ir = ir->condition;
2349 emit(BRW_OPCODE_ENDIF);
2350 }
2351
2352 void
2353 vec4_visitor::emit_ndc_computation()
2354 {
2355 /* Get the position */
2356 src_reg pos = src_reg(output_reg[VERT_RESULT_HPOS]);
2357
2358 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2359 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2360 output_reg[BRW_VERT_RESULT_NDC] = ndc;
2361
2362 current_annotation = "NDC";
2363 dst_reg ndc_w = ndc;
2364 ndc_w.writemask = WRITEMASK_W;
2365 src_reg pos_w = pos;
2366 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2367 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2368
2369 dst_reg ndc_xyz = ndc;
2370 ndc_xyz.writemask = WRITEMASK_XYZ;
2371
2372 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2373 }
2374
2375 void
2376 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2377 {
2378 if (intel->gen < 6 &&
2379 ((c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) ||
2380 c->key.userclip_active || brw->has_negative_rhw_bug)) {
2381 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2382 dst_reg header1_w = header1;
2383 header1_w.writemask = WRITEMASK_W;
2384 GLuint i;
2385
2386 emit(MOV(header1, 0u));
2387
2388 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2389 src_reg psiz = src_reg(output_reg[VERT_RESULT_PSIZ]);
2390
2391 current_annotation = "Point size";
2392 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2393 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
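/* Illustrative arithmetic: a gl_PointSize of 4.0f scales to
 * 4.0 * 2048 = 0x2000, which lies within the 0x7ff << 8 mask.
 */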
2394 }
2395
2396 current_annotation = "Clipping flags";
2397 for (i = 0; i < c->key.nr_userclip_plane_consts; i++) {
2398 vec4_instruction *inst;
2399
2400 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VERT_RESULT_HPOS]),
2401 src_reg(this->userplane[i])));
2402 inst->conditional_mod = BRW_CONDITIONAL_L;
2403
2404 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2405 inst->predicate = BRW_PREDICATE_NORMAL;
2406 }
2407
2408 /* i965 clipping workaround:
2409 * 1) Test for -ve rhw
2410 * 2) If set,
2411 * set ndc = (0,0,0,0)
2412 * set ucp[6] = 1
2413 *
2414 * Later, clipping will detect ucp[6] and ensure the primitive is
2415 * clipped against all fixed planes.
2416 */
2417 if (brw->has_negative_rhw_bug) {
2418 #if 0
2419 /* FINISHME */
2420 brw_CMP(p,
2421 vec8(brw_null_reg()),
2422 BRW_CONDITIONAL_L,
2423 brw_swizzle1(output_reg[BRW_VERT_RESULT_NDC], 3),
2424 brw_imm_f(0));
2425
2426 brw_OR(p, brw_writemask(header1, WRITEMASK_W), header1, brw_imm_ud(1<<6));
2427 brw_MOV(p, output_reg[BRW_VERT_RESULT_NDC], brw_imm_f(0));
2428 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2429 #endif
2430 }
2431
2432 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2433 } else if (intel->gen < 6) {
2434 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2435 } else {
2436 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2437 if (c->prog_data.outputs_written & BITFIELD64_BIT(VERT_RESULT_PSIZ)) {
2438 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2439 src_reg(output_reg[VERT_RESULT_PSIZ])));
2440 }
2441 }
2442 }
2443
2444 void
2445 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2446 {
2447 if (intel->gen < 6) {
2448 /* Clip distance slots are set aside in gen5, but they are not used. It
2449 * is not clear whether we actually need to set aside space for them,
2450 * but the performance cost is negligible.
2451 */
2452 return;
2453 }
2454
2455 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2456 *
2457 * "If a linked set of shaders forming the vertex stage contains no
2458 * static write to gl_ClipVertex or gl_ClipDistance, but the
2459 * application has requested clipping against user clip planes through
2460 * the API, then the coordinate written to gl_Position is used for
2461 * comparison against the user clip planes."
2462 *
2463 * This function is only called if the shader didn't write to
2464 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2465 * if the user wrote to it; otherwise we use gl_Position.
2466 */
2467 gl_vert_result clip_vertex = VERT_RESULT_CLIP_VERTEX;
2468 if (!(c->prog_data.outputs_written
2469 & BITFIELD64_BIT(VERT_RESULT_CLIP_VERTEX))) {
2470 clip_vertex = VERT_RESULT_HPOS;
2471 }
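/* Each call covers up to four planes. For example (hypothetical count),
 * with 6 user clip planes enabled, offset 0 handles planes 0-3 and
 * offset 4 handles planes 4-5 in the .xy channels of its slot.
 */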
2472
2473 for (int i = 0; i + offset < c->key.nr_userclip_plane_consts && i < 4;
2474 ++i) {
2475 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2476 src_reg(output_reg[clip_vertex]),
2477 src_reg(this->userplane[i + offset])));
2478 }
2479 }
2480
2481 void
2482 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int vert_result)
2483 {
2484 assert (vert_result < VERT_RESULT_MAX);
2485 reg.type = output_reg[vert_result].type;
2486 current_annotation = output_reg_annotation[vert_result];
2487 /* Copy the register, saturating if necessary */
2488 vec4_instruction *inst = emit(MOV(reg,
2489 src_reg(output_reg[vert_result])));
2490 if ((vert_result == VERT_RESULT_COL0 ||
2491 vert_result == VERT_RESULT_COL1 ||
2492 vert_result == VERT_RESULT_BFC0 ||
2493 vert_result == VERT_RESULT_BFC1) &&
2494 c->key.clamp_vertex_color) {
2495 inst->saturate = true;
2496 }
2497 }
2498
2499 void
2500 vec4_visitor::emit_urb_slot(int mrf, int vert_result)
2501 {
2502 struct brw_reg hw_reg = brw_message_reg(mrf);
2503 dst_reg reg = dst_reg(MRF, mrf);
2504 reg.type = BRW_REGISTER_TYPE_F;
2505
2506 switch (vert_result) {
2507 case VERT_RESULT_PSIZ:
2508 /* PSIZ is always in slot 0, and is coupled with other flags. */
2509 current_annotation = "indices, point width, clip flags";
2510 emit_psiz_and_flags(hw_reg);
2511 break;
2512 case BRW_VERT_RESULT_NDC:
2513 current_annotation = "NDC";
2514 emit(MOV(reg, src_reg(output_reg[BRW_VERT_RESULT_NDC])));
2515 break;
2516 case BRW_VERT_RESULT_HPOS_DUPLICATE:
2517 case VERT_RESULT_HPOS:
2518 current_annotation = "gl_Position";
2519 emit(MOV(reg, src_reg(output_reg[VERT_RESULT_HPOS])));
2520 break;
2521 case VERT_RESULT_CLIP_DIST0:
2522 case VERT_RESULT_CLIP_DIST1:
2523 if (this->c->key.uses_clip_distance) {
2524 emit_generic_urb_slot(reg, vert_result);
2525 } else {
2526 current_annotation = "user clip distances";
2527 emit_clip_distances(hw_reg, (vert_result - VERT_RESULT_CLIP_DIST0) * 4);
2528 }
2529 break;
2530 case VERT_RESULT_EDGE:
2531 /* This is present when doing unfilled polygons. We're supposed to copy
2532 * the edge flag from the user-provided vertex array
2533 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2534 * of that attribute (starts as 1.0f). This is then used in clipping to
2535 * determine which edges should be drawn as wireframe.
2536 */
2537 current_annotation = "edge flag";
2538 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2539 glsl_type::float_type, WRITEMASK_XYZW))));
2540 break;
2541 case BRW_VERT_RESULT_PAD:
2542 /* No need to write to this slot */
2543 break;
2544 default:
2545 emit_generic_urb_slot(reg, vert_result);
2546 break;
2547 }
2548 }
2549
2550 static int
2551 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2552 {
2553 struct intel_context *intel = &brw->intel;
2554
2555 if (intel->gen >= 6) {
2556 /* URB data written (does not include the message header reg) must
2557 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2558 * section 5.4.3.2.2: URB_INTERLEAVED.
2559 *
2560 * URB entries are allocated on a multiple of 1024 bits, so an
2561 * extra 128 bits written here to make the end align to 256 is
2562 * no problem.
2563 */
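/* As called from emit_urb_writes(), mlen counts the header register plus
 * the data registers, so it must end up odd: e.g. a header plus 7 data
 * regs (mlen 8) gets padded to mlen 9.
 */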
2564 if ((mlen % 2) != 1)
2565 mlen++;
2566 }
2567
2568 return mlen;
2569 }
2570
2571 /**
2572 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2573 * complete the VS thread.
2574 *
2575 * The VUE layout is documented in Volume 2a.
2576 */
2577 void
2578 vec4_visitor::emit_urb_writes()
2579 {
2580 /* MRF 0 is reserved for the debugger, so start with message header
2581 * in MRF 1.
2582 */
2583 int base_mrf = 1;
2584 int mrf = base_mrf;
2585 /* In the process of generating our URB write message contents, we
2586 * may need to unspill a register or load from an array. Those
2587 * reads would use MRFs 14-15.
2588 */
2589 int max_usable_mrf = 13;
2590
2591 /* The following assertion verifies that max_usable_mrf causes an
2592 * even-numbered amount of URB write data, which will meet gen6's
2593 * requirements for length alignment.
2594 */
2595 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2596
2597 /* First mrf is the g0-based message header containing URB handles and such,
2598 * which is implied in VS_OPCODE_URB_WRITE.
2599 */
2600 mrf++;
2601
2602 if (intel->gen < 6) {
2603 emit_ndc_computation();
2604 }
2605
2606 /* Set up the VUE data for the first URB write */
2607 int slot;
2608 for (slot = 0; slot < c->prog_data.vue_map.num_slots; ++slot) {
2609 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2610
2611 /* If this was max_usable_mrf, we can't fit anything more into this URB
2612 * WRITE.
2613 */
2614 if (mrf > max_usable_mrf) {
2615 slot++;
2616 break;
2617 }
2618 }
2619
2620 current_annotation = "URB write";
2621 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2622 inst->base_mrf = base_mrf;
2623 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2624 inst->eot = (slot >= c->prog_data.vue_map.num_slots);
2625
2626 /* Optional second URB write */
2627 if (!inst->eot) {
2628 mrf = base_mrf + 1;
2629
2630 for (; slot < c->prog_data.vue_map.num_slots; ++slot) {
2631 assert(mrf < max_usable_mrf);
2632
2633 emit_urb_slot(mrf++, c->prog_data.vue_map.slot_to_vert_result[slot]);
2634 }
2635
2636 current_annotation = "URB write";
2637 inst = emit(VS_OPCODE_URB_WRITE);
2638 inst->base_mrf = base_mrf;
2639 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2640 inst->eot = true;
2641 /* URB destination offset. In the previous write, we got MRFs
2642 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2643 * URB row increments, and each of our MRFs is half of one of
2644 * those, since we're doing interleaved writes.
2645 */
2646 inst->offset = (max_usable_mrf - base_mrf) / 2;
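/* With base_mrf == 1 and max_usable_mrf == 13, that works out to
 * (13 - 1) / 2 == 6 URB rows.
 */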
2647 }
2648 }
2649
2650 src_reg
2651 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2652 src_reg *reladdr, int reg_offset)
2653 {
2654 /* Because we store the values to scratch interleaved like our
2655 * vertex data, we need to scale the vec4 index by 2.
2656 */
2657 int message_header_scale = 2;
2658
2659 /* Pre-gen6, the message header uses byte offsets instead of vec4
2660 * (16-byte) offset units.
2661 */
2662 if (intel->gen < 6)
2663 message_header_scale *= 16;
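/* e.g. a value at reg_offset 3 yields an offset of 6 on gen6+, or a
 * byte offset of 96 on older parts.
 */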
2664
2665 if (reladdr) {
2666 src_reg index = src_reg(this, glsl_type::int_type);
2667
2668 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2669 emit_before(inst, MUL(dst_reg(index),
2670 index, src_reg(message_header_scale)));
2671
2672 return index;
2673 } else {
2674 return src_reg(reg_offset * message_header_scale);
2675 }
2676 }
2677
2678 src_reg
2679 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2680 src_reg *reladdr, int reg_offset)
2681 {
2682 if (reladdr) {
2683 src_reg index = src_reg(this, glsl_type::int_type);
2684
2685 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2686
2687 /* Pre-gen6, the message header uses byte offsets instead of vec4
2688 * (16-byte) offset units.
2689 */
2690 if (intel->gen < 6) {
2691 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2692 }
2693
2694 return index;
2695 } else {
2696 int message_header_scale = intel->gen < 6 ? 16 : 1;
2697 return src_reg(reg_offset * message_header_scale);
2698 }
2699 }
2700
2701 /**
2702 * Emits an instruction before @inst to load the value named by @orig_src
2703 * from scratch space at @base_offset to @temp.
2704 *
2705 * @base_offset is measured in 32-byte units (the size of a register).
2706 */
2707 void
2708 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2709 dst_reg temp, src_reg orig_src,
2710 int base_offset)
2711 {
2712 int reg_offset = base_offset + orig_src.reg_offset;
2713 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2714
2715 emit_before(inst, SCRATCH_READ(temp, index));
2716 }
2717
2718 /**
2719 * Emits an instruction after @inst to store the value to be written
2720 * to @orig_dst to scratch space at @base_offset, from @temp.
2721 *
2722 * @base_offset is measured in 32-byte units (the size of a register).
2723 */
2724 void
2725 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2726 {
2727 int reg_offset = base_offset + inst->dst.reg_offset;
2728 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2729
2730 /* Create a temporary register to store *inst's result in.
2731 *
2732 * We have to be careful in MOVing from our temporary result register in
2733 * the scratch write. If we swizzle from channels of the temporary that
2734 * weren't initialized, it will confuse live interval analysis, which will
2735 * make spilling fail to make progress.
2736 */
2737 src_reg temp = src_reg(this, glsl_type::vec4_type);
2738 temp.type = inst->dst.type;
2739 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2740 int swizzles[4];
2741 for (int i = 0; i < 4; i++)
2742 if (inst->dst.writemask & (1 << i))
2743 swizzles[i] = i;
2744 else
2745 swizzles[i] = first_writemask_chan;
2746 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2747 swizzles[2], swizzles[3]);
2748
2749 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2750 inst->dst.writemask));
2751 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2752 write->predicate = inst->predicate;
2753 write->ir = inst->ir;
2754 write->annotation = inst->annotation;
2755 inst->insert_after(write);
2756
2757 inst->dst.file = temp.file;
2758 inst->dst.reg = temp.reg;
2759 inst->dst.reg_offset = temp.reg_offset;
2760 inst->dst.reladdr = NULL;
2761 }
2762
2763 /**
2764 * We can't generally support array access in GRF space, because a
2765 * single instruction's destination can only span 2 contiguous
2766 * registers. So, we send all GRF arrays that get variable index
2767 * access to scratch space.
2768 */
2769 void
2770 vec4_visitor::move_grf_array_access_to_scratch()
2771 {
2772 int scratch_loc[this->virtual_grf_count];
2773
2774 for (int i = 0; i < this->virtual_grf_count; i++) {
2775 scratch_loc[i] = -1;
2776 }
2777
2778 /* First, calculate the set of virtual GRFs that need to be punted
2779 * to scratch due to having any array access on them, and where in
2780 * scratch.
2781 */
2782 foreach_list(node, &this->instructions) {
2783 vec4_instruction *inst = (vec4_instruction *)node;
2784
2785 if (inst->dst.file == GRF && inst->dst.reladdr &&
2786 scratch_loc[inst->dst.reg] == -1) {
2787 scratch_loc[inst->dst.reg] = c->last_scratch;
2788 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2789 }
2790
2791 for (int i = 0 ; i < 3; i++) {
2792 src_reg *src = &inst->src[i];
2793
2794 if (src->file == GRF && src->reladdr &&
2795 scratch_loc[src->reg] == -1) {
2796 scratch_loc[src->reg] = c->last_scratch;
2797 c->last_scratch += this->virtual_grf_sizes[src->reg];
2798 }
2799 }
2800 }
2801
2802 /* Now, for anything that will be accessed through scratch, rewrite
2803 * it to load/store. Note that this is a _safe list walk, because
2804 * we may generate a new scratch_write instruction after the one
2805 * we're processing.
2806 */
2807 foreach_list_safe(node, &this->instructions) {
2808 vec4_instruction *inst = (vec4_instruction *)node;
2809
2810 /* Set up the annotation tracking for new generated instructions. */
2811 base_ir = inst->ir;
2812 current_annotation = inst->annotation;
2813
2814 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2815 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2816 }
2817
2818 for (int i = 0 ; i < 3; i++) {
2819 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2820 continue;
2821
2822 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2823
2824 emit_scratch_read(inst, temp, inst->src[i],
2825 scratch_loc[inst->src[i].reg]);
2826
2827 inst->src[i].file = temp.file;
2828 inst->src[i].reg = temp.reg;
2829 inst->src[i].reg_offset = temp.reg_offset;
2830 inst->src[i].reladdr = NULL;
2831 }
2832 }
2833 }
2834
2835 /**
2836 * Emits an instruction before @inst to load the value named by @orig_src
2837 * from the pull constant buffer (surface) at @base_offset to @temp.
2838 */
2839 void
2840 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2841 dst_reg temp, src_reg orig_src,
2842 int base_offset)
2843 {
2844 int reg_offset = base_offset + orig_src.reg_offset;
2845 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2846 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2847 vec4_instruction *load;
2848
2849 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2850 temp, index, offset);
2851 load->base_mrf = 14;
2852 load->mlen = 1;
2853 emit_before(inst, load);
2854 }
2855
2856 /**
2857 * Implements array access of uniforms by inserting a
2858 * PULL_CONSTANT_LOAD instruction.
2859 *
2860 * Unlike temporary GRF array access (which we don't support, due to
2861 * the difficulty of doing relative addressing on instruction
2862 * destinations), we could potentially do array access of uniforms
2863 * that were loaded in GRF space as push constants. In real-world
2864 * usage we've seen, though, the arrays being used are always larger
2865 * than we could load as push constants, so just always move all
2866 * uniform array access out to a pull constant buffer.
2867 */
2868 void
2869 vec4_visitor::move_uniform_array_access_to_pull_constants()
2870 {
2871 int pull_constant_loc[this->uniforms];
2872
2873 for (int i = 0; i < this->uniforms; i++) {
2874 pull_constant_loc[i] = -1;
2875 }
2876
2877 /* Walk through and find array access of uniforms. Put a copy of that
2878 * uniform in the pull constant buffer.
2879 *
2880 * Note that we don't move constant-indexed accesses to arrays. No
2881 * testing has been done of the performance impact of this choice.
2882 */
2883 foreach_list_safe(node, &this->instructions) {
2884 vec4_instruction *inst = (vec4_instruction *)node;
2885
2886 for (int i = 0 ; i < 3; i++) {
2887 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2888 continue;
2889
2890 int uniform = inst->src[i].reg;
2891
2892 /* If this array isn't already present in the pull constant buffer,
2893 * add it.
2894 */
2895 if (pull_constant_loc[uniform] == -1) {
2896 const float **values = &prog_data->param[uniform * 4];
2897
2898 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
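/* For example (hypothetical count), if 20 pull params have been
 * recorded so far, this uniform array starts at vec4 slot 5 of the
 * pull constant buffer.
 */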
2899
2900 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2901 prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
2902 }
2903 }
2904
2905 /* Set up the annotation tracking for new generated instructions. */
2906 base_ir = inst->ir;
2907 current_annotation = inst->annotation;
2908
2909 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2910
2911 emit_pull_constant_load(inst, temp, inst->src[i],
2912 pull_constant_loc[uniform]);
2913
2914 inst->src[i].file = temp.file;
2915 inst->src[i].reg = temp.reg;
2916 inst->src[i].reg_offset = temp.reg_offset;
2917 inst->src[i].reladdr = NULL;
2918 }
2919 }
2920
2921 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2922 * no need to track them as larger-than-vec4 objects. This will be
2923 * relied on in cutting out unused uniform vectors from push
2924 * constants.
2925 */
2926 split_uniform_registers();
2927 }
2928
2929 void
2930 vec4_visitor::resolve_ud_negate(src_reg *reg)
2931 {
2932 if (reg->type != BRW_REGISTER_TYPE_UD ||
2933 !reg->negate)
2934 return;
2935
2936 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2937 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2938 *reg = temp;
2939 }
2940
2941 vec4_visitor::vec4_visitor(struct brw_context *brw,
2942 struct brw_vs_compile *c,
2943 struct gl_shader_program *prog,
2944 struct brw_shader *shader,
2945 void *mem_ctx)
2946 {
2947 this->c = c;
2948 this->brw = brw;
2949 this->intel = &brw->intel;
2950 this->ctx = &intel->ctx;
2951 this->prog = prog;
2952 this->shader = shader;
2953
2954 this->mem_ctx = mem_ctx;
2955 this->failed = false;
2956
2957 this->base_ir = NULL;
2958 this->current_annotation = NULL;
2959 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
2960
2962 this->vp = &c->vp->program;
2963 this->prog_data = &c->prog_data;
2964
2965 this->variable_ht = hash_table_ctor(0,
2966 hash_table_pointer_hash,
2967 hash_table_pointer_compare);
2968
2969 this->virtual_grf_def = NULL;
2970 this->virtual_grf_use = NULL;
2971 this->virtual_grf_sizes = NULL;
2972 this->virtual_grf_count = 0;
2973 this->virtual_grf_reg_map = NULL;
2974 this->virtual_grf_reg_count = 0;
2975 this->virtual_grf_array_size = 0;
2976 this->live_intervals_valid = false;
2977
2978 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2979
2980 this->uniforms = 0;
2981 }
2982
2983 vec4_visitor::~vec4_visitor()
2984 {
2985 hash_table_dtor(this->variable_ht);
2986 }
2987
2988
2989 void
2990 vec4_visitor::fail(const char *format, ...)
2991 {
2992 va_list va;
2993 char *msg;
2994
2995 if (failed)
2996 return;
2997
2998 failed = true;
2999
3000 va_start(va, format);
3001 msg = ralloc_vasprintf(mem_ctx, format, va);
3002 va_end(va);
3003 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3004
3005 this->fail_msg = msg;
3006
3007 if (INTEL_DEBUG & DEBUG_VS) {
3008 fprintf(stderr, "%s", msg);
3009 }
3010 }
3011
3012 } /* namespace brw */