i965/vs: Fix textureGrad() with shadow samplers on Haswell.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 #define ALU3(op) \
111 vec4_instruction * \
112 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
113 { \
114 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
115 src0, src1, src2); \
116 }
117
118 ALU1(NOT)
119 ALU1(MOV)
120 ALU1(FRC)
121 ALU1(RNDD)
122 ALU1(RNDE)
123 ALU1(RNDZ)
124 ALU1(F32TO16)
125 ALU1(F16TO32)
126 ALU2(ADD)
127 ALU2(MUL)
128 ALU2(MACH)
129 ALU2(AND)
130 ALU2(OR)
131 ALU2(XOR)
132 ALU2(DP3)
133 ALU2(DP4)
134 ALU2(DPH)
135 ALU2(SHL)
136 ALU2(SHR)
137 ALU2(ASR)
138 ALU3(LRP)
139
140 /** Gen4 predicated IF. */
141 vec4_instruction *
142 vec4_visitor::IF(uint32_t predicate)
143 {
144 vec4_instruction *inst;
145
146 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
147 inst->predicate = predicate;
148
149 return inst;
150 }
151
152 /** Gen6+ IF with embedded comparison. */
153 vec4_instruction *
154 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
155 {
156 assert(intel->gen >= 6);
157
158 vec4_instruction *inst;
159
160 resolve_ud_negate(&src0);
161 resolve_ud_negate(&src1);
162
163 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
164 src0, src1);
165 inst->conditional_mod = condition;
166
167 return inst;
168 }
169
170 /**
171 * CMP: Sets the low bit of the destination channels with the result
172 * of the comparison, while the upper bits are undefined, and updates
173 * the flag register with the packed 16 bits of the result.
174 */
175 vec4_instruction *
176 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
177 {
178 vec4_instruction *inst;
179
180 /* original gen4 does type conversion to the destination type
181  * before comparison, producing garbage results for floating
182 * point comparisons.
183 */
184 if (intel->gen == 4) {
185 dst.type = src0.type;
186 if (dst.file == HW_REG)
187 dst.fixed_hw_reg.type = dst.type;
188 }
189
190 resolve_ud_negate(&src0);
191 resolve_ud_negate(&src1);
192
193 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
194 inst->conditional_mod = condition;
195
196 return inst;
197 }
198
199 vec4_instruction *
200 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
201 {
202 vec4_instruction *inst;
203
204 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
205 dst, index);
206 inst->base_mrf = 14;
207 inst->mlen = 2;
208
209 return inst;
210 }
211
212 vec4_instruction *
213 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
214 {
215 vec4_instruction *inst;
216
217 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
218 dst, src, index);
219 inst->base_mrf = 13;
220 inst->mlen = 3;
221
222 return inst;
223 }
224
225 void
226 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
227 {
228 static enum opcode dot_opcodes[] = {
229 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
230 };
231
232 emit(dot_opcodes[elements - 2], dst, src0, src1);
233 }
234
235 src_reg
236 vec4_visitor::fix_3src_operand(src_reg src)
237 {
238 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
239 * able to use vertical stride of zero to replicate the vec4 uniform, like
240 *
241 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
242 *
243 * But you can't, since vertical stride is always four in three-source
244 * instructions. Instead, insert a MOV instruction to do the replication so
245 * that the three-source instruction can consume it.
246 */
247
248 /* The MOV is only needed if the source is a uniform or immediate. */
249 if (src.file != UNIFORM && src.file != IMM)
250 return src;
251
252 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
253 expanded.type = src.type;
254 emit(MOV(expanded, src));
255 return src_reg(expanded);
256 }
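/* For illustration: the ir_triop_lrp case later in this file routes each LRP
 * operand through fix_3src_operand() first, so any uniform or immediate
 * argument is MOV'd into a temporary GRF that the three-source instruction
 * can read with its fixed vertical stride.
 */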
257
258 src_reg
259 vec4_visitor::fix_math_operand(src_reg src)
260 {
261 /* The gen6 math instruction ignores the source modifiers --
262 * swizzle, abs, negate, and at least some parts of the register
263 * region description.
264 *
265 * Rather than trying to enumerate all these cases, *always* expand the
266 * operand to a temp GRF for gen6.
267 *
268 * For gen7, keep the operand as-is, except if immediate, which gen7 still
269 * can't use.
270 */
271
272 if (intel->gen == 7 && src.file != IMM)
273 return src;
274
275 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
276 expanded.type = src.type;
277 emit(MOV(expanded, src));
278 return src_reg(expanded);
279 }
280
281 void
282 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
283 {
284 src = fix_math_operand(src);
285
286 if (dst.writemask != WRITEMASK_XYZW) {
287 /* The gen6 math instruction must be align1, so we can't do
288 * writemasks.
289 */
290 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
291
292 emit(opcode, temp_dst, src);
293
294 emit(MOV(dst, src_reg(temp_dst)));
295 } else {
296 emit(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
302 {
303 vec4_instruction *inst = emit(opcode, dst, src);
304 inst->base_mrf = 1;
305 inst->mlen = 1;
306 }
307
308 void
309 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
310 {
311 switch (opcode) {
312 case SHADER_OPCODE_RCP:
313 case SHADER_OPCODE_RSQ:
314 case SHADER_OPCODE_SQRT:
315 case SHADER_OPCODE_EXP2:
316 case SHADER_OPCODE_LOG2:
317 case SHADER_OPCODE_SIN:
318 case SHADER_OPCODE_COS:
319 break;
320 default:
321 assert(!"not reached: bad math opcode");
322 return;
323 }
324
325 if (intel->gen >= 6) {
326 return emit_math1_gen6(opcode, dst, src);
327 } else {
328 return emit_math1_gen4(opcode, dst, src);
329 }
330 }
331
332 void
333 vec4_visitor::emit_math2_gen6(enum opcode opcode,
334 dst_reg dst, src_reg src0, src_reg src1)
335 {
336 src0 = fix_math_operand(src0);
337 src1 = fix_math_operand(src1);
338
339 if (dst.writemask != WRITEMASK_XYZW) {
340 /* The gen6 math instruction must be align1, so we can't do
341 * writemasks.
342 */
343 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
344 temp_dst.type = dst.type;
345
346 emit(opcode, temp_dst, src0, src1);
347
348 emit(MOV(dst, src_reg(temp_dst)));
349 } else {
350 emit(opcode, dst, src0, src1);
351 }
352 }
353
354 void
355 vec4_visitor::emit_math2_gen4(enum opcode opcode,
356 dst_reg dst, src_reg src0, src_reg src1)
357 {
358 vec4_instruction *inst = emit(opcode, dst, src0, src1);
359 inst->base_mrf = 1;
360 inst->mlen = 2;
361 }
362
363 void
364 vec4_visitor::emit_math(enum opcode opcode,
365 dst_reg dst, src_reg src0, src_reg src1)
366 {
367 switch (opcode) {
368 case SHADER_OPCODE_POW:
369 case SHADER_OPCODE_INT_QUOTIENT:
370 case SHADER_OPCODE_INT_REMAINDER:
371 break;
372 default:
373 assert(!"not reached: unsupported binary math opcode");
374 return;
375 }
376
377 if (intel->gen >= 6) {
378 return emit_math2_gen6(opcode, dst, src0, src1);
379 } else {
380 return emit_math2_gen4(opcode, dst, src0, src1);
381 }
382 }
383
384 void
385 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
386 {
387 if (intel->gen < 7)
388 assert(!"ir_unop_pack_half_2x16 should be lowered");
389
390 assert(dst.type == BRW_REGISTER_TYPE_UD);
391 assert(src0.type == BRW_REGISTER_TYPE_F);
392
393 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
394 *
395 * Because this instruction does not have a 16-bit floating-point type,
396 * the destination data type must be Word (W).
397 *
398 * The destination must be DWord-aligned and specify a horizontal stride
399 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
400 * each destination channel and the upper word is not modified.
401 *
402 * The above restriction implies that the f32to16 instruction must use
403 * align1 mode, because only in align1 mode is it possible to specify
404 * horizontal stride. We choose here to defy the hardware docs and emit
405 * align16 instructions.
406 *
407 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
408 * instructions. I was partially successful in that the code passed all
409 * tests. However, the code was dubiously correct and fragile, and the
410 * tests were not harsh enough to probe that frailty. Not trusting the
411 * code, I chose instead to remain in align16 mode in defiance of the hw
412 * docs).
413 *
414 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
415 * simulator, emitting a f32to16 in align16 mode with UD as destination
416 * data type is safe. The behavior differs from that specified in the PRM
417 * in that the upper word of each destination channel is cleared to 0.
418 */
419
420 dst_reg tmp_dst(this, glsl_type::uvec2_type);
421 src_reg tmp_src(tmp_dst);
422
423 #if 0
424 /* Verify the undocumented behavior on which the following instructions
425 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
426 * then the result of the bit-or instruction below will be incorrect.
427 *
428 * You should inspect the disasm output in order to verify that the MOV is
429 * not optimized away.
430 */
431 emit(MOV(tmp_dst, src_reg(0x12345678u)));
432 #endif
433
434 /* Give tmp the form below, where "." means untouched.
435 *
436 * w z y x w z y x
437 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
438 *
439  * The upper word of each write-channel must be 0 for the following
440  * bit-shift and bit-or instructions to work. Note that this
441 * relies on the undocumented hardware behavior mentioned above.
442 */
443 tmp_dst.writemask = WRITEMASK_XY;
444 emit(F32TO16(tmp_dst, src0));
445
446 /* Give the write-channels of dst the form:
447 * 0xhhhh0000
448 */
449 tmp_src.swizzle = SWIZZLE_Y;
450 emit(SHL(dst, tmp_src, src_reg(16u)));
451
452 /* Finally, give the write-channels of dst the form of packHalf2x16's
453 * output:
454 * 0xhhhhllll
455 */
456 tmp_src.swizzle = SWIZZLE_X;
457 emit(OR(dst, src_reg(dst), tmp_src));
458 }
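/* Worked example (assuming the upper-word-clearing behavior noted above):
 * packing (1.0, -2.0), whose half-float encodings are 0x3C00 and 0xC000:
 *
 *    after F32TO16:  tmp.x = 0x00003C00, tmp.y = 0x0000C000
 *    after SHL:      dst   = 0xC0000000
 *    after OR:       dst   = 0xC0003C00
 *
 * which matches packHalf2x16(): v.x in bits 0..15, v.y in bits 16..31.
 */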
459
460 void
461 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
462 {
463 if (intel->gen < 7)
464 assert(!"ir_unop_unpack_half_2x16 should be lowered");
465
466 assert(dst.type == BRW_REGISTER_TYPE_F);
467 assert(src0.type == BRW_REGISTER_TYPE_UD);
468
469 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
470 *
471 * Because this instruction does not have a 16-bit floating-point type,
472 * the source data type must be Word (W). The destination type must be
473 * F (Float).
474 *
475 * To use W as the source data type, we must adjust horizontal strides,
476 * which is only possible in align1 mode. All my [chadv] attempts at
477 * emitting align1 instructions for unpackHalf2x16 failed to pass the
478 * Piglit tests, so I gave up.
479 *
480 * I've verified that, on gen7 hardware and the simulator, it is safe to
481 * emit f16to32 in align16 mode with UD as source data type.
482 */
483
484 dst_reg tmp_dst(this, glsl_type::uvec2_type);
485 src_reg tmp_src(tmp_dst);
486
487 tmp_dst.writemask = WRITEMASK_X;
488 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
489
490 tmp_dst.writemask = WRITEMASK_Y;
491 emit(SHR(tmp_dst, src0, src_reg(16u)));
492
493 dst.writemask = WRITEMASK_XY;
494 emit(F16TO32(dst, tmp_src));
495 }
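/* Worked example: for src0 = 0xC0003C00 (the packHalf2x16 result above), the
 * AND leaves 0x00003C00 in tmp.x, the SHR leaves 0x0000C000 in tmp.y, and
 * F16TO32 then yields dst.xy = (1.0, -2.0).
 */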
496
497 void
498 vec4_visitor::visit_instructions(const exec_list *list)
499 {
500 foreach_list(node, list) {
501 ir_instruction *ir = (ir_instruction *)node;
502
503 base_ir = ir;
504 ir->accept(this);
505 }
506 }
507
508
509 static int
510 type_size(const struct glsl_type *type)
511 {
512 unsigned int i;
513 int size;
514
515 switch (type->base_type) {
516 case GLSL_TYPE_UINT:
517 case GLSL_TYPE_INT:
518 case GLSL_TYPE_FLOAT:
519 case GLSL_TYPE_BOOL:
520 if (type->is_matrix()) {
521 return type->matrix_columns;
522 } else {
523 /* Regardless of size of vector, it gets a vec4. This is bad
524 * packing for things like floats, but otherwise arrays become a
525 * mess. Hopefully a later pass over the code can pack scalars
526 * down if appropriate.
527 */
528 return 1;
529 }
530 case GLSL_TYPE_ARRAY:
531 assert(type->length > 0);
532 return type_size(type->fields.array) * type->length;
533 case GLSL_TYPE_STRUCT:
534 size = 0;
535 for (i = 0; i < type->length; i++) {
536 size += type_size(type->fields.structure[i].type);
537 }
538 return size;
539 case GLSL_TYPE_SAMPLER:
540 /* Samplers take up one slot in UNIFORMS[], but they're baked in
541 * at link time.
542 */
543 return 1;
544 case GLSL_TYPE_VOID:
545 case GLSL_TYPE_ERROR:
546 case GLSL_TYPE_INTERFACE:
547 assert(0);
548 break;
549 }
550
551 return 0;
552 }
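/* For illustration of the vec4-slot accounting above, a hypothetical uniform
 *
 *    struct { vec3 a; mat3 b; float c[2]; }
 *
 * occupies 1 + 3 + 2 = 6 slots: the vec3 is padded to a full vec4, the mat3
 * takes one slot per column, and each array element gets its own slot.
 */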
553
554 int
555 vec4_visitor::virtual_grf_alloc(int size)
556 {
557 if (virtual_grf_array_size <= virtual_grf_count) {
558 if (virtual_grf_array_size == 0)
559 virtual_grf_array_size = 16;
560 else
561 virtual_grf_array_size *= 2;
562 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
563 virtual_grf_array_size);
564 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
565 virtual_grf_array_size);
566 }
567 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
568 virtual_grf_reg_count += size;
569 virtual_grf_sizes[virtual_grf_count] = size;
570 return virtual_grf_count++;
571 }
572
573 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
574 {
575 init();
576
577 this->file = GRF;
578 this->reg = v->virtual_grf_alloc(type_size(type));
579
580 if (type->is_array() || type->is_record()) {
581 this->swizzle = BRW_SWIZZLE_NOOP;
582 } else {
583 this->swizzle = swizzle_for_size(type->vector_elements);
584 }
585
586 this->type = brw_type_for_base_type(type);
587 }
588
589 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
590 {
591 init();
592
593 this->file = GRF;
594 this->reg = v->virtual_grf_alloc(type_size(type));
595
596 if (type->is_array() || type->is_record()) {
597 this->writemask = WRITEMASK_XYZW;
598 } else {
599 this->writemask = (1 << type->vector_elements) - 1;
600 }
601
602 this->type = brw_type_for_base_type(type);
603 }
604
605 /* Our support for uniforms is piggy-backed on the struct
606 * gl_fragment_program, because that's where the values actually
607 * get stored, rather than in some global gl_shader_program uniform
608 * store.
609 */
610 void
611 vec4_visitor::setup_uniform_values(ir_variable *ir)
612 {
613 int namelen = strlen(ir->name);
614
615 /* The data for our (non-builtin) uniforms is stored in a series of
616 * gl_uniform_driver_storage structs for each subcomponent that
617 * glGetUniformLocation() could name. We know it's been set up in the same
618 * order we'd walk the type, so walk the list of storage and find anything
619 * with our name, or the prefix of a component that starts with our name.
620 */
621 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
622 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
623
624 if (strncmp(ir->name, storage->name, namelen) != 0 ||
625 (storage->name[namelen] != 0 &&
626 storage->name[namelen] != '.' &&
627 storage->name[namelen] != '[')) {
628 continue;
629 }
630
631 gl_constant_value *components = storage->storage;
632 unsigned vector_count = (MAX2(storage->array_elements, 1) *
633 storage->type->matrix_columns);
634
635 for (unsigned s = 0; s < vector_count; s++) {
636 uniform_vector_size[uniforms] = storage->type->vector_elements;
637
638 int i;
639 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
640 prog_data->param[uniforms * 4 + i] = &components->f;
641 components++;
642 }
643 for (; i < 4; i++) {
644 static float zero = 0;
645 prog_data->param[uniforms * 4 + i] = &zero;
646 }
647
648 uniforms++;
649 }
650 }
651 }
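/* For illustration: a hypothetical "uniform vec3 color" produces a single
 * vector of size 3 here; param[0..2] point at the three floats in the
 * gl_uniform_storage backing store and param[3] is padded with the static
 * zero above.
 */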
652
653 void
654 vec4_visitor::setup_uniform_clipplane_values()
655 {
656 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
657
658 if (intel->gen < 6) {
659 /* Pre-Gen6, we compact clip planes. For example, if the user
660 * enables just clip planes 0, 1, and 3, we will enable clip planes
661 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
662 * plane 2. This simplifies the implementation of the Gen6 clip
663 * thread.
664 */
665 int compacted_clipplane_index = 0;
666 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
667 if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
668 continue;
669
670 this->uniform_vector_size[this->uniforms] = 4;
671 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
672 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
673 for (int j = 0; j < 4; ++j) {
674 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
675 }
676 ++compacted_clipplane_index;
677 ++this->uniforms;
678 }
679 } else {
680 /* In Gen6 and later, we don't compact clip planes, because this
681 * simplifies the implementation of gl_ClipDistance.
682 */
683 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
684 this->uniform_vector_size[this->uniforms] = 4;
685 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
686 this->userplane[i].type = BRW_REGISTER_TYPE_F;
687 for (int j = 0; j < 4; ++j) {
688 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
689 }
690 ++this->uniforms;
691 }
692 }
693 }
694
695 /* Our support for builtin uniforms is even scarier than non-builtin.
696 * It sits on top of the PROG_STATE_VAR parameters that are
697 * automatically updated from GL context state.
698 */
699 void
700 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
701 {
702 const ir_state_slot *const slots = ir->state_slots;
703 assert(ir->state_slots != NULL);
704
705 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
706 /* This state reference has already been setup by ir_to_mesa,
707 * but we'll get the same index back here. We can reference
708 * ParameterValues directly, since unlike brw_fs.cpp, we never
709 * add new state references during compile.
710 */
711 int index = _mesa_add_state_reference(this->prog->Parameters,
712 (gl_state_index *)slots[i].tokens);
713 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
714
715 this->uniform_vector_size[this->uniforms] = 0;
716 /* Add each of the unique swizzled channels of the element.
717 * This will end up matching the size of the glsl_type of this field.
718 */
719 int last_swiz = -1;
720 for (unsigned int j = 0; j < 4; j++) {
721 int swiz = GET_SWZ(slots[i].swizzle, j);
722 last_swiz = swiz;
723
724 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
725 if (swiz <= last_swiz)
726 this->uniform_vector_size[this->uniforms]++;
727 }
728 this->uniforms++;
729 }
730 }
731
732 dst_reg *
733 vec4_visitor::variable_storage(ir_variable *var)
734 {
735 return (dst_reg *)hash_table_find(this->variable_ht, var);
736 }
737
738 void
739 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
740 {
741 ir_expression *expr = ir->as_expression();
742
743 *predicate = BRW_PREDICATE_NORMAL;
744
745 if (expr) {
746 src_reg op[2];
747 vec4_instruction *inst;
748
749 assert(expr->get_num_operands() <= 2);
750 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
751 expr->operands[i]->accept(this);
752 op[i] = this->result;
753
754 resolve_ud_negate(&op[i]);
755 }
756
757 switch (expr->operation) {
758 case ir_unop_logic_not:
759 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
760 inst->conditional_mod = BRW_CONDITIONAL_Z;
761 break;
762
763 case ir_binop_logic_xor:
764 inst = emit(XOR(dst_null_d(), op[0], op[1]));
765 inst->conditional_mod = BRW_CONDITIONAL_NZ;
766 break;
767
768 case ir_binop_logic_or:
769 inst = emit(OR(dst_null_d(), op[0], op[1]));
770 inst->conditional_mod = BRW_CONDITIONAL_NZ;
771 break;
772
773 case ir_binop_logic_and:
774 inst = emit(AND(dst_null_d(), op[0], op[1]));
775 inst->conditional_mod = BRW_CONDITIONAL_NZ;
776 break;
777
778 case ir_unop_f2b:
779 if (intel->gen >= 6) {
780 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
781 } else {
782 inst = emit(MOV(dst_null_f(), op[0]));
783 inst->conditional_mod = BRW_CONDITIONAL_NZ;
784 }
785 break;
786
787 case ir_unop_i2b:
788 if (intel->gen >= 6) {
789 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
790 } else {
791 inst = emit(MOV(dst_null_d(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793 }
794 break;
795
796 case ir_binop_all_equal:
797 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
798 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
799 break;
800
801 case ir_binop_any_nequal:
802 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
803 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
804 break;
805
806 case ir_unop_any:
807 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
808 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
809 break;
810
811 case ir_binop_greater:
812 case ir_binop_gequal:
813 case ir_binop_less:
814 case ir_binop_lequal:
815 case ir_binop_equal:
816 case ir_binop_nequal:
817 emit(CMP(dst_null_d(), op[0], op[1],
818 brw_conditional_for_comparison(expr->operation)));
819 break;
820
821 default:
822 assert(!"not reached");
823 break;
824 }
825 return;
826 }
827
828 ir->accept(this);
829
830 resolve_ud_negate(&this->result);
831
832 if (intel->gen >= 6) {
833 vec4_instruction *inst = emit(AND(dst_null_d(),
834 this->result, src_reg(1)));
835 inst->conditional_mod = BRW_CONDITIONAL_NZ;
836 } else {
837 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
838 inst->conditional_mod = BRW_CONDITIONAL_NZ;
839 }
840 }
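/* For illustration: a condition like "a < b" becomes a CMP with the matching
 * conditional mod and the caller predicates on BRW_PREDICATE_NORMAL, while
 * "any(notEqual(v, u))" becomes a CMP.nz whose caller predicates on
 * BRW_PREDICATE_ALIGN16_ANY4H so that any set channel takes the branch.
 */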
841
842 /**
843 * Emit a gen6 IF statement with the comparison folded into the IF
844 * instruction.
845 */
846 void
847 vec4_visitor::emit_if_gen6(ir_if *ir)
848 {
849 ir_expression *expr = ir->condition->as_expression();
850
851 if (expr) {
852 src_reg op[2];
853 dst_reg temp;
854
855 assert(expr->get_num_operands() <= 2);
856 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
857 expr->operands[i]->accept(this);
858 op[i] = this->result;
859 }
860
861 switch (expr->operation) {
862 case ir_unop_logic_not:
863 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
864 return;
865
866 case ir_binop_logic_xor:
867 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
868 return;
869
870 case ir_binop_logic_or:
871 temp = dst_reg(this, glsl_type::bool_type);
872 emit(OR(temp, op[0], op[1]));
873 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
874 return;
875
876 case ir_binop_logic_and:
877 temp = dst_reg(this, glsl_type::bool_type);
878 emit(AND(temp, op[0], op[1]));
879 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
880 return;
881
882 case ir_unop_f2b:
883 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
884 return;
885
886 case ir_unop_i2b:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
888 return;
889
890 case ir_binop_greater:
891 case ir_binop_gequal:
892 case ir_binop_less:
893 case ir_binop_lequal:
894 case ir_binop_equal:
895 case ir_binop_nequal:
896 emit(IF(op[0], op[1],
897 brw_conditional_for_comparison(expr->operation)));
898 return;
899
900 case ir_binop_all_equal:
901 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
902 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
903 return;
904
905 case ir_binop_any_nequal:
906 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
907 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
908 return;
909
910 case ir_unop_any:
911 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
913 return;
914
915 default:
916 assert(!"not reached");
917 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
918 return;
919 }
920 return;
921 }
922
923 ir->condition->accept(this);
924
925 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
926 }
927
928 static dst_reg
929 with_writemask(dst_reg const & r, int mask)
930 {
931 dst_reg result = r;
932 result.writemask = mask;
933 return result;
934 }
935
936 void
937 vec4_vs_visitor::emit_prolog()
938 {
939 dst_reg sign_recovery_shift;
940 dst_reg normalize_factor;
941 dst_reg es3_normalize_factor;
942
943 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
944 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
945 uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
946 dst_reg reg(ATTR, i);
947 dst_reg reg_d = reg;
948 reg_d.type = BRW_REGISTER_TYPE_D;
949 dst_reg reg_ud = reg;
950 reg_ud.type = BRW_REGISTER_TYPE_UD;
951
952 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
953 * come in as floating point conversions of the integer values.
954 */
955 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
956 dst_reg dst = reg;
957 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
958 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
959 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
960 }
961
962 /* Do sign recovery for 2101010 formats if required. */
963 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
964 if (sign_recovery_shift.file == BAD_FILE) {
965 /* shift constant: <22,22,22,30> */
966 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
967 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
968 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
969 }
970
971 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
972 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
973 }
974
975 /* Apply BGRA swizzle if required. */
976 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
977 src_reg temp = src_reg(reg);
978 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
979 emit(MOV(reg, temp));
980 }
981
982 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
983 /* ES 3.0 has different rules for converting signed normalized
984 * fixed-point numbers than desktop GL.
985 */
986 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
987 /* According to equation 2.2 of the ES 3.0 specification,
988 * signed normalization conversion is done by:
989 *
990 * f = c / (2^(b-1)-1)
991 */
992 if (es3_normalize_factor.file == BAD_FILE) {
993 /* mul constant: 1 / (2^(b-1) - 1) */
994 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
995 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
996 src_reg(1.0f / ((1<<9) - 1))));
997 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
998 src_reg(1.0f / ((1<<1) - 1))));
999 }
1000
1001 dst_reg dst = reg;
1002 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1003 emit(MOV(dst, src_reg(reg_d)));
1004 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
1005 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
1006 } else {
1007 /* The following equations are from the OpenGL 3.2 specification:
1008 *
1009 * 2.1 unsigned normalization
1010 * f = c/(2^n-1)
1011 *
1012 * 2.2 signed normalization
1013 * f = (2c+1)/(2^n-1)
1014 *
1015 * Both of these share a common divisor, which is represented by
1016 * "normalize_factor" in the code below.
1017 */
1018 if (normalize_factor.file == BAD_FILE) {
1019 /* 1 / (2^b - 1) for b=<10,10,10,2> */
1020 normalize_factor = dst_reg(this, glsl_type::vec4_type);
1021 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1022 src_reg(1.0f / ((1<<10) - 1))));
1023 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1024 src_reg(1.0f / ((1<<2) - 1))));
1025 }
1026
1027 dst_reg dst = reg;
1028 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1029 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1030
1031 /* For signed normalization, we want the numerator to be 2c+1. */
1032 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1033 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1034 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1035 }
1036
1037 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1038 }
1039 }
1040
1041 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1042 dst_reg dst = reg;
1043 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1044 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1045 }
1046 }
1047 }
1048 }
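/* Worked example of the 2101010 sign recovery above: a 10-bit X/Y/Z component
 * holding the raw value 0x3FF is shifted left by 22 (giving 0xFFC00000) and
 * then arithmetically shifted right by 22, yielding -1 as a signed 32-bit
 * value. The W component uses a shift of 30 because it is only 2 bits wide.
 */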
1049
1050
1051 dst_reg *
1052 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1053 {
1054 /* VertexID is stored by the VF as the last vertex element, but
1055 * we don't represent it with a flag in inputs_read, so we call
1056 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1057 */
1058 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1059 vs_prog_data->uses_vertexid = true;
1060
1061 switch (ir->location) {
1062 case SYSTEM_VALUE_VERTEX_ID:
1063 reg->writemask = WRITEMASK_X;
1064 break;
1065 case SYSTEM_VALUE_INSTANCE_ID:
1066 reg->writemask = WRITEMASK_Y;
1067 break;
1068 default:
1069 assert(!"not reached");
1070 break;
1071 }
1072
1073 return reg;
1074 }
1075
1076
1077 void
1078 vec4_visitor::visit(ir_variable *ir)
1079 {
1080 dst_reg *reg = NULL;
1081
1082 if (variable_storage(ir))
1083 return;
1084
1085 switch (ir->mode) {
1086 case ir_var_shader_in:
1087 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1088 break;
1089
1090 case ir_var_shader_out:
1091 reg = new(mem_ctx) dst_reg(this, ir->type);
1092
1093 for (int i = 0; i < type_size(ir->type); i++) {
1094 output_reg[ir->location + i] = *reg;
1095 output_reg[ir->location + i].reg_offset = i;
1096 output_reg[ir->location + i].type =
1097 brw_type_for_base_type(ir->type->get_scalar_type());
1098 output_reg_annotation[ir->location + i] = ir->name;
1099 }
1100 break;
1101
1102 case ir_var_auto:
1103 case ir_var_temporary:
1104 reg = new(mem_ctx) dst_reg(this, ir->type);
1105 break;
1106
1107 case ir_var_uniform:
1108 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1109
1110 /* Thanks to the lower_ubo_reference pass, we will see only
1111 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1112 * variables, so no need for them to be in variable_ht.
1113 */
1114 if (ir->is_in_uniform_block())
1115 return;
1116
1117 /* Track how big the whole uniform variable is, in case we need to put a
1118 * copy of its data into pull constants for array access.
1119 */
1120 this->uniform_size[this->uniforms] = type_size(ir->type);
1121
1122 if (!strncmp(ir->name, "gl_", 3)) {
1123 setup_builtin_uniform_values(ir);
1124 } else {
1125 setup_uniform_values(ir);
1126 }
1127 break;
1128
1129 case ir_var_system_value:
1130 reg = make_reg_for_system_value(ir);
1131 break;
1132
1133 default:
1134 assert(!"not reached");
1135 }
1136
1137 reg->type = brw_type_for_base_type(ir->type);
1138 hash_table_insert(this->variable_ht, reg, ir);
1139 }
1140
1141 void
1142 vec4_visitor::visit(ir_loop *ir)
1143 {
1144 dst_reg counter;
1145
1146 /* We don't want debugging output to print the whole body of the
1147 * loop as the annotation.
1148 */
1149 this->base_ir = NULL;
1150
1151 if (ir->counter != NULL) {
1152 this->base_ir = ir->counter;
1153 ir->counter->accept(this);
1154 counter = *(variable_storage(ir->counter));
1155
1156 if (ir->from != NULL) {
1157 this->base_ir = ir->from;
1158 ir->from->accept(this);
1159
1160 emit(MOV(counter, this->result));
1161 }
1162 }
1163
1164 emit(BRW_OPCODE_DO);
1165
1166 if (ir->to) {
1167 this->base_ir = ir->to;
1168 ir->to->accept(this);
1169
1170 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1171 brw_conditional_for_comparison(ir->cmp)));
1172
1173 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1174 inst->predicate = BRW_PREDICATE_NORMAL;
1175 }
1176
1177 visit_instructions(&ir->body_instructions);
1178
1179
1180 if (ir->increment) {
1181 this->base_ir = ir->increment;
1182 ir->increment->accept(this);
1183 emit(ADD(counter, src_reg(counter), this->result));
1184 }
1185
1186 emit(BRW_OPCODE_WHILE);
1187 }
1188
1189 void
1190 vec4_visitor::visit(ir_loop_jump *ir)
1191 {
1192 switch (ir->mode) {
1193 case ir_loop_jump::jump_break:
1194 emit(BRW_OPCODE_BREAK);
1195 break;
1196 case ir_loop_jump::jump_continue:
1197 emit(BRW_OPCODE_CONTINUE);
1198 break;
1199 }
1200 }
1201
1202
1203 void
1204 vec4_visitor::visit(ir_function_signature *ir)
1205 {
1206 assert(0);
1207 (void)ir;
1208 }
1209
1210 void
1211 vec4_visitor::visit(ir_function *ir)
1212 {
1213 /* Ignore function bodies other than main() -- we shouldn't see calls to
1214 * them since they should all be inlined.
1215 */
1216 if (strcmp(ir->name, "main") == 0) {
1217 const ir_function_signature *sig;
1218 exec_list empty;
1219
1220 sig = ir->matching_signature(&empty);
1221
1222 assert(sig);
1223
1224 visit_instructions(&sig->body);
1225 }
1226 }
1227
1228 bool
1229 vec4_visitor::try_emit_sat(ir_expression *ir)
1230 {
1231 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1232 if (!sat_src)
1233 return false;
1234
1235 sat_src->accept(this);
1236 src_reg src = this->result;
1237
1238 this->result = src_reg(this, ir->type);
1239 vec4_instruction *inst;
1240 inst = emit(MOV(dst_reg(this->result), src));
1241 inst->saturate = true;
1242
1243 return true;
1244 }
1245
1246 void
1247 vec4_visitor::emit_bool_comparison(unsigned int op,
1248 dst_reg dst, src_reg src0, src_reg src1)
1249 {
1250 /* original gen4 does destination conversion before comparison. */
1251 if (intel->gen < 5)
1252 dst.type = src0.type;
1253
1254 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1255
1256 dst.type = BRW_REGISTER_TYPE_D;
1257 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1258 }
1259
1260 void
1261 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1262 src_reg src0, src_reg src1)
1263 {
1264 vec4_instruction *inst;
1265
1266 if (intel->gen >= 6) {
1267 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1268 inst->conditional_mod = conditionalmod;
1269 } else {
1270 emit(CMP(dst, src0, src1, conditionalmod));
1271
1272 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1273 inst->predicate = BRW_PREDICATE_NORMAL;
1274 }
1275 }
1276
1277 void
1278 vec4_visitor::visit(ir_expression *ir)
1279 {
1280 unsigned int operand;
1281 src_reg op[Elements(ir->operands)];
1282 src_reg result_src;
1283 dst_reg result_dst;
1284 vec4_instruction *inst;
1285
1286 if (try_emit_sat(ir))
1287 return;
1288
1289 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1290 this->result.file = BAD_FILE;
1291 ir->operands[operand]->accept(this);
1292 if (this->result.file == BAD_FILE) {
1293 printf("Failed to get tree for expression operand:\n");
1294 ir->operands[operand]->print();
1295 exit(1);
1296 }
1297 op[operand] = this->result;
1298
1299 /* Matrix expression operands should have been broken down to vector
1300 * operations already.
1301 */
1302 assert(!ir->operands[operand]->type->is_matrix());
1303 }
1304
1305 int vector_elements = ir->operands[0]->type->vector_elements;
1306 if (ir->operands[1]) {
1307 vector_elements = MAX2(vector_elements,
1308 ir->operands[1]->type->vector_elements);
1309 }
1310
1311 this->result.file = BAD_FILE;
1312
1313 /* Storage for our result. Ideally for an assignment we'd be using
1314 * the actual storage for the result here, instead.
1315 */
1316 result_src = src_reg(this, ir->type);
1317 /* convenience for the emit functions below. */
1318 result_dst = dst_reg(result_src);
1319 /* If nothing special happens, this is the result. */
1320 this->result = result_src;
1321 /* Limit writes to the channels that will be used by result_src later.
1322 * This does limit this temp's use as a temporary for multi-instruction
1323 * sequences.
1324 */
1325 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1326
1327 switch (ir->operation) {
1328 case ir_unop_logic_not:
1329 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1330       * the ones' complement of the whole register, not just bit 0.
1331 */
1332 emit(XOR(result_dst, op[0], src_reg(1)));
1333 break;
1334 case ir_unop_neg:
1335 op[0].negate = !op[0].negate;
1336 this->result = op[0];
1337 break;
1338 case ir_unop_abs:
1339 op[0].abs = true;
1340 op[0].negate = false;
1341 this->result = op[0];
1342 break;
1343
1344 case ir_unop_sign:
1345 emit(MOV(result_dst, src_reg(0.0f)));
1346
1347 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1348 inst = emit(MOV(result_dst, src_reg(1.0f)));
1349 inst->predicate = BRW_PREDICATE_NORMAL;
1350
1351 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1352 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1353 inst->predicate = BRW_PREDICATE_NORMAL;
1354
1355 break;
1356
1357 case ir_unop_rcp:
1358 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1359 break;
1360
1361 case ir_unop_exp2:
1362 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1363 break;
1364 case ir_unop_log2:
1365 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1366 break;
1367 case ir_unop_exp:
1368 case ir_unop_log:
1369 assert(!"not reached: should be handled by ir_explog_to_explog2");
1370 break;
1371 case ir_unop_sin:
1372 case ir_unop_sin_reduced:
1373 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1374 break;
1375 case ir_unop_cos:
1376 case ir_unop_cos_reduced:
1377 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1378 break;
1379
1380 case ir_unop_dFdx:
1381 case ir_unop_dFdy:
1382 assert(!"derivatives not valid in vertex shader");
1383 break;
1384
1385 case ir_unop_noise:
1386 assert(!"not reached: should be handled by lower_noise");
1387 break;
1388
1389 case ir_binop_add:
1390 emit(ADD(result_dst, op[0], op[1]));
1391 break;
1392 case ir_binop_sub:
1393 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1394 break;
1395
1396 case ir_binop_mul:
1397 if (ir->type->is_integer()) {
1398 /* For integer multiplication, the MUL uses the low 16 bits
1399 * of one of the operands (src0 on gen6, src1 on gen7). The
1400 * MACH accumulates in the contribution of the upper 16 bits
1401 * of that operand.
1402 *
1403 * FINISHME: Emit just the MUL if we know an operand is small
1404 * enough.
1405 */
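         /* Roughly: writing b = lo16(b) + (hi16(b) << 16), the 32-bit product
          * is a*lo16(b) + ((a*hi16(b)) << 16) (mod 2^32). The MUL forms the
          * first term in the accumulator, MACH folds in the second, and the
          * MOV from the accumulator below retrieves the low 32 bits.
          */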
1406 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1407
1408 emit(MUL(acc, op[0], op[1]));
1409 emit(MACH(dst_null_d(), op[0], op[1]));
1410 emit(MOV(result_dst, src_reg(acc)));
1411 } else {
1412 emit(MUL(result_dst, op[0], op[1]));
1413 }
1414 break;
1415 case ir_binop_div:
1416 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1417 assert(ir->type->is_integer());
1418 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1419 break;
1420 case ir_binop_mod:
1421 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1422 assert(ir->type->is_integer());
1423 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1424 break;
1425
1426 case ir_binop_less:
1427 case ir_binop_greater:
1428 case ir_binop_lequal:
1429 case ir_binop_gequal:
1430 case ir_binop_equal:
1431 case ir_binop_nequal: {
1432 emit(CMP(result_dst, op[0], op[1],
1433 brw_conditional_for_comparison(ir->operation)));
1434 emit(AND(result_dst, result_src, src_reg(0x1)));
1435 break;
1436 }
1437
1438 case ir_binop_all_equal:
1439 /* "==" operator producing a scalar boolean. */
1440 if (ir->operands[0]->type->is_vector() ||
1441 ir->operands[1]->type->is_vector()) {
1442 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1443 emit(MOV(result_dst, src_reg(0)));
1444 inst = emit(MOV(result_dst, src_reg(1)));
1445 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1446 } else {
1447 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1448 emit(AND(result_dst, result_src, src_reg(0x1)));
1449 }
1450 break;
1451 case ir_binop_any_nequal:
1452 /* "!=" operator producing a scalar boolean. */
1453 if (ir->operands[0]->type->is_vector() ||
1454 ir->operands[1]->type->is_vector()) {
1455 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1456
1457 emit(MOV(result_dst, src_reg(0)));
1458 inst = emit(MOV(result_dst, src_reg(1)));
1459 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1460 } else {
1461 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1462 emit(AND(result_dst, result_src, src_reg(0x1)));
1463 }
1464 break;
1465
1466 case ir_unop_any:
1467 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1468 emit(MOV(result_dst, src_reg(0)));
1469
1470 inst = emit(MOV(result_dst, src_reg(1)));
1471 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1472 break;
1473
1474 case ir_binop_logic_xor:
1475 emit(XOR(result_dst, op[0], op[1]));
1476 break;
1477
1478 case ir_binop_logic_or:
1479 emit(OR(result_dst, op[0], op[1]));
1480 break;
1481
1482 case ir_binop_logic_and:
1483 emit(AND(result_dst, op[0], op[1]));
1484 break;
1485
1486 case ir_binop_dot:
1487 assert(ir->operands[0]->type->is_vector());
1488 assert(ir->operands[0]->type == ir->operands[1]->type);
1489 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1490 break;
1491
1492 case ir_unop_sqrt:
1493 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1494 break;
1495 case ir_unop_rsq:
1496 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1497 break;
1498
1499 case ir_unop_bitcast_i2f:
1500 case ir_unop_bitcast_u2f:
1501 this->result = op[0];
1502 this->result.type = BRW_REGISTER_TYPE_F;
1503 break;
1504
1505 case ir_unop_bitcast_f2i:
1506 this->result = op[0];
1507 this->result.type = BRW_REGISTER_TYPE_D;
1508 break;
1509
1510 case ir_unop_bitcast_f2u:
1511 this->result = op[0];
1512 this->result.type = BRW_REGISTER_TYPE_UD;
1513 break;
1514
1515 case ir_unop_i2f:
1516 case ir_unop_i2u:
1517 case ir_unop_u2i:
1518 case ir_unop_u2f:
1519 case ir_unop_b2f:
1520 case ir_unop_b2i:
1521 case ir_unop_f2i:
1522 case ir_unop_f2u:
1523 emit(MOV(result_dst, op[0]));
1524 break;
1525 case ir_unop_f2b:
1526 case ir_unop_i2b: {
1527 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1528 emit(AND(result_dst, result_src, src_reg(1)));
1529 break;
1530 }
1531
1532 case ir_unop_trunc:
1533 emit(RNDZ(result_dst, op[0]));
1534 break;
1535 case ir_unop_ceil:
1536 op[0].negate = !op[0].negate;
1537 inst = emit(RNDD(result_dst, op[0]));
1538 this->result.negate = true;
1539 break;
1540 case ir_unop_floor:
1541 inst = emit(RNDD(result_dst, op[0]));
1542 break;
1543 case ir_unop_fract:
1544 inst = emit(FRC(result_dst, op[0]));
1545 break;
1546 case ir_unop_round_even:
1547 emit(RNDE(result_dst, op[0]));
1548 break;
1549
1550 case ir_binop_min:
1551 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1552 break;
1553 case ir_binop_max:
1554 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1555 break;
1556
1557 case ir_binop_pow:
1558 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1559 break;
1560
1561 case ir_unop_bit_not:
1562 inst = emit(NOT(result_dst, op[0]));
1563 break;
1564 case ir_binop_bit_and:
1565 inst = emit(AND(result_dst, op[0], op[1]));
1566 break;
1567 case ir_binop_bit_xor:
1568 inst = emit(XOR(result_dst, op[0], op[1]));
1569 break;
1570 case ir_binop_bit_or:
1571 inst = emit(OR(result_dst, op[0], op[1]));
1572 break;
1573
1574 case ir_binop_lshift:
1575 inst = emit(SHL(result_dst, op[0], op[1]));
1576 break;
1577
1578 case ir_binop_rshift:
1579 if (ir->type->base_type == GLSL_TYPE_INT)
1580 inst = emit(ASR(result_dst, op[0], op[1]));
1581 else
1582 inst = emit(SHR(result_dst, op[0], op[1]));
1583 break;
1584
1585 case ir_binop_ubo_load: {
1586 ir_constant *uniform_block = ir->operands[0]->as_constant();
1587 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1588 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1589 src_reg offset = op[1];
1590
1591 /* Now, load the vector from that offset. */
1592 assert(ir->type->is_vector() || ir->type->is_scalar());
1593
1594 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1595 packed_consts.type = result.type;
1596 src_reg surf_index =
1597 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1598 if (const_offset_ir) {
1599 offset = src_reg(const_offset / 16);
1600 } else {
1601 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1602 }
1603
1604 vec4_instruction *pull =
1605 emit(new(mem_ctx) vec4_instruction(this,
1606 VS_OPCODE_PULL_CONSTANT_LOAD,
1607 dst_reg(packed_consts),
1608 surf_index,
1609 offset));
1610 pull->base_mrf = 14;
1611 pull->mlen = 1;
1612
1613 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1614 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1615 const_offset % 16 / 4,
1616 const_offset % 16 / 4,
1617 const_offset % 16 / 4);
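      /* For example, a scalar UBO load at constant byte offset 20 reads
       * vec4 #1 (= 20 / 16) from the buffer, and the swizzle adjustment above
       * selects dword (20 % 16) / 4 = 1, i.e. the .y component of the pulled
       * vec4.
       */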
1618
1619 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1620 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1621 emit(CMP(result_dst, packed_consts, src_reg(0u),
1622 BRW_CONDITIONAL_NZ));
1623 emit(AND(result_dst, result, src_reg(0x1)));
1624 } else {
1625 emit(MOV(result_dst, packed_consts));
1626 }
1627 break;
1628 }
1629
1630 case ir_triop_lrp:
1631 op[0] = fix_3src_operand(op[0]);
1632 op[1] = fix_3src_operand(op[1]);
1633 op[2] = fix_3src_operand(op[2]);
1634 /* Note that the instruction's argument order is reversed from GLSL
1635 * and the IR.
1636 */
1637 emit(LRP(result_dst, op[2], op[1], op[0]));
1638 break;
1639
1640 case ir_quadop_vector:
1641 assert(!"not reached: should be handled by lower_quadop_vector");
1642 break;
1643
1644 case ir_unop_pack_half_2x16:
1645 emit_pack_half_2x16(result_dst, op[0]);
1646 break;
1647 case ir_unop_unpack_half_2x16:
1648 emit_unpack_half_2x16(result_dst, op[0]);
1649 break;
1650 case ir_unop_pack_snorm_2x16:
1651 case ir_unop_pack_snorm_4x8:
1652 case ir_unop_pack_unorm_2x16:
1653 case ir_unop_pack_unorm_4x8:
1654 case ir_unop_unpack_snorm_2x16:
1655 case ir_unop_unpack_snorm_4x8:
1656 case ir_unop_unpack_unorm_2x16:
1657 case ir_unop_unpack_unorm_4x8:
1658 assert(!"not reached: should be handled by lower_packing_builtins");
1659 break;
1660 case ir_unop_unpack_half_2x16_split_x:
1661 case ir_unop_unpack_half_2x16_split_y:
1662 case ir_binop_pack_half_2x16_split:
1663 assert(!"not reached: should not occur in vertex shader");
1664 break;
1665 }
1666 }
1667
1668
1669 void
1670 vec4_visitor::visit(ir_swizzle *ir)
1671 {
1672 src_reg src;
1673 int i = 0;
1674 int swizzle[4];
1675
1676 /* Note that this is only swizzles in expressions, not those on the left
1677 * hand side of an assignment, which do write masking. See ir_assignment
1678 * for that.
1679 */
1680
1681 ir->val->accept(this);
1682 src = this->result;
1683 assert(src.file != BAD_FILE);
1684
1685 for (i = 0; i < ir->type->vector_elements; i++) {
1686 switch (i) {
1687 case 0:
1688 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1689 break;
1690 case 1:
1691 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1692 break;
1693 case 2:
1694 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1695 break;
1696 case 3:
1697 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1698 break;
1699 }
1700 }
1701 for (; i < 4; i++) {
1702 /* Replicate the last channel out. */
1703 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1704 }
1705
1706 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1707
1708 this->result = src;
1709 }
1710
1711 void
1712 vec4_visitor::visit(ir_dereference_variable *ir)
1713 {
1714 const struct glsl_type *type = ir->type;
1715 dst_reg *reg = variable_storage(ir->var);
1716
1717 if (!reg) {
1718 fail("Failed to find variable storage for %s\n", ir->var->name);
1719 this->result = src_reg(brw_null_reg());
1720 return;
1721 }
1722
1723 this->result = src_reg(*reg);
1724
1725 /* System values get their swizzle from the dst_reg writemask */
1726 if (ir->var->mode == ir_var_system_value)
1727 return;
1728
1729 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1730 this->result.swizzle = swizzle_for_size(type->vector_elements);
1731 }
1732
1733
1734 int
1735 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1736 {
1737 /* Under normal circumstances array elements are stored consecutively, so
1738 * the stride is equal to the size of the array element.
1739 */
1740 return type_size(ir->type);
1741 }
1742
1743
1744 void
1745 vec4_visitor::visit(ir_dereference_array *ir)
1746 {
1747 ir_constant *constant_index;
1748 src_reg src;
1749 int array_stride = compute_array_stride(ir);
1750
1751 constant_index = ir->array_index->constant_expression_value();
1752
1753 ir->array->accept(this);
1754 src = this->result;
1755
1756 if (constant_index) {
1757 src.reg_offset += constant_index->value.i[0] * array_stride;
1758 } else {
1759 /* Variable index array dereference. It eats the "vec4" of the
1760 * base of the array and an index that offsets the Mesa register
1761 * index.
1762 */
1763 ir->array_index->accept(this);
1764
1765 src_reg index_reg;
1766
1767 if (array_stride == 1) {
1768 index_reg = this->result;
1769 } else {
1770 index_reg = src_reg(this, glsl_type::int_type);
1771
1772 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1773 }
1774
1775 if (src.reladdr) {
1776 src_reg temp = src_reg(this, glsl_type::int_type);
1777
1778 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1779
1780 index_reg = temp;
1781 }
1782
1783 src.reladdr = ralloc(mem_ctx, src_reg);
1784 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1785 }
1786
1787 /* If the type is smaller than a vec4, replicate the last channel out. */
1788 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1789 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1790 else
1791 src.swizzle = BRW_SWIZZLE_NOOP;
1792 src.type = brw_type_for_base_type(ir->type);
1793
1794 this->result = src;
1795 }
1796
1797 void
1798 vec4_visitor::visit(ir_dereference_record *ir)
1799 {
1800 unsigned int i;
1801 const glsl_type *struct_type = ir->record->type;
1802 int offset = 0;
1803
1804 ir->record->accept(this);
1805
1806 for (i = 0; i < struct_type->length; i++) {
1807 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1808 break;
1809 offset += type_size(struct_type->fields.structure[i].type);
1810 }
1811
1812 /* If the type is smaller than a vec4, replicate the last channel out. */
1813 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1814 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1815 else
1816 this->result.swizzle = BRW_SWIZZLE_NOOP;
1817 this->result.type = brw_type_for_base_type(ir->type);
1818
1819 this->result.reg_offset += offset;
1820 }
1821
1822 /**
1823 * We want to be careful in assignment setup to hit the actual storage
1824 * instead of potentially using a temporary like we might with the
1825 * ir_dereference handler.
1826 */
1827 static dst_reg
1828 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1829 {
1830 /* The LHS must be a dereference. If the LHS is a variable indexed array
1831  * access of a vector, it must be separated into a series of conditional moves
1832 * before reaching this point (see ir_vec_index_to_cond_assign).
1833 */
1834 assert(ir->as_dereference());
1835 ir_dereference_array *deref_array = ir->as_dereference_array();
1836 if (deref_array) {
1837 assert(!deref_array->array->type->is_vector());
1838 }
1839
1840 /* Use the rvalue deref handler for the most part. We'll ignore
1841 * swizzles in it and write swizzles using writemask, though.
1842 */
1843 ir->accept(v);
1844 return dst_reg(v->result);
1845 }
1846
1847 void
1848 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1849 const struct glsl_type *type, uint32_t predicate)
1850 {
1851 if (type->base_type == GLSL_TYPE_STRUCT) {
1852 for (unsigned int i = 0; i < type->length; i++) {
1853 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1854 }
1855 return;
1856 }
1857
1858 if (type->is_array()) {
1859 for (unsigned int i = 0; i < type->length; i++) {
1860 emit_block_move(dst, src, type->fields.array, predicate);
1861 }
1862 return;
1863 }
1864
1865 if (type->is_matrix()) {
1866 const struct glsl_type *vec_type;
1867
1868 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1869 type->vector_elements, 1);
1870
1871 for (int i = 0; i < type->matrix_columns; i++) {
1872 emit_block_move(dst, src, vec_type, predicate);
1873 }
1874 return;
1875 }
1876
1877 assert(type->is_scalar() || type->is_vector());
1878
1879 dst->type = brw_type_for_base_type(type);
1880 src->type = dst->type;
1881
1882 dst->writemask = (1 << type->vector_elements) - 1;
1883
1884 src->swizzle = swizzle_for_size(type->vector_elements);
1885
1886 vec4_instruction *inst = emit(MOV(*dst, *src));
1887 inst->predicate = predicate;
1888
1889 dst->reg_offset++;
1890 src->reg_offset++;
1891 }
1892
1893
1894 /* If the RHS processing resulted in an instruction generating a
1895 * temporary value, and it would be easy to rewrite the instruction to
1896 * generate its result right into the LHS instead, do so. This ends
1897 * up reliably removing instructions where it can be tricky to do so
1898 * later without real UD chain information.
1899 */
1900 bool
1901 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1902 dst_reg dst,
1903 src_reg src,
1904 vec4_instruction *pre_rhs_inst,
1905 vec4_instruction *last_rhs_inst)
1906 {
1907 /* This could be supported, but it would take more smarts. */
1908 if (ir->condition)
1909 return false;
1910
1911 if (pre_rhs_inst == last_rhs_inst)
1912 return false; /* No instructions generated to work with. */
1913
1914 /* Make sure the last instruction generated our source reg. */
1915 if (src.file != GRF ||
1916 src.file != last_rhs_inst->dst.file ||
1917 src.reg != last_rhs_inst->dst.reg ||
1918 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1919 src.reladdr ||
1920 src.abs ||
1921 src.negate ||
1922 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1923 return false;
1924
1925 /* Check that that last instruction fully initialized the channels
1926 * we want to use, in the order we want to use them. We could
1927 * potentially reswizzle the operands of many instructions so that
1928 * we could handle out of order channels, but don't yet.
1929 */
1930
1931 for (unsigned i = 0; i < 4; i++) {
1932 if (dst.writemask & (1 << i)) {
1933 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1934 return false;
1935
1936 if (BRW_GET_SWZ(src.swizzle, i) != i)
1937 return false;
1938 }
1939 }
1940
1941 /* Success! Rewrite the instruction. */
1942 last_rhs_inst->dst.file = dst.file;
1943 last_rhs_inst->dst.reg = dst.reg;
1944 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1945 last_rhs_inst->dst.reladdr = dst.reladdr;
1946 last_rhs_inst->dst.writemask &= dst.writemask;
1947
1948 return true;
1949 }
1950
1951 void
1952 vec4_visitor::visit(ir_assignment *ir)
1953 {
1954 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1955 uint32_t predicate = BRW_PREDICATE_NONE;
1956
1957 if (!ir->lhs->type->is_scalar() &&
1958 !ir->lhs->type->is_vector()) {
1959 ir->rhs->accept(this);
1960 src_reg src = this->result;
1961
1962 if (ir->condition) {
1963 emit_bool_to_cond_code(ir->condition, &predicate);
1964 }
1965
1966 /* emit_block_move doesn't account for swizzles in the source register.
1967 * This should be ok, since the source register is a structure or an
1968 * array, and those can't be swizzled. But double-check to be sure.
1969 */
1970 assert(src.swizzle ==
1971 (ir->rhs->type->is_matrix()
1972 ? swizzle_for_size(ir->rhs->type->vector_elements)
1973 : BRW_SWIZZLE_NOOP));
1974
1975 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1976 return;
1977 }
1978
1979 /* Now we're down to just a scalar/vector with writemasks. */
1980 int i;
1981
1982 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1983 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1984
1985 ir->rhs->accept(this);
1986
1987 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1988
1989 src_reg src = this->result;
1990
1991 int swizzles[4];
1992 int first_enabled_chan = 0;
1993 int src_chan = 0;
1994
1995 assert(ir->lhs->type->is_vector() ||
1996 ir->lhs->type->is_scalar());
1997 dst.writemask = ir->write_mask;
1998
1999 for (int i = 0; i < 4; i++) {
2000 if (dst.writemask & (1 << i)) {
2001 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2002 break;
2003 }
2004 }
2005
2006 /* Swizzle a small RHS vector into the channels being written.
2007 *
2008 * GLSL IR treats write_mask as dictating how many channels are
2009 * present on the RHS, while in our instructions we need those
2010 * channels to appear in the slots of the vec4 they're written to.
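* For example, for "v.zw = u" with a vec2 u, the RHS arrives with swizzle
* XYYY and we remap it to YYXY, so u.x lands in the Z slot and u.y in the
* W slot of the write below.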
2011 */
2012 for (int i = 0; i < 4; i++) {
2013 if (dst.writemask & (1 << i))
2014 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2015 else
2016 swizzles[i] = first_enabled_chan;
2017 }
2018 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2019 swizzles[2], swizzles[3]);
2020
2021 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2022 return;
2023 }
2024
2025 if (ir->condition) {
2026 emit_bool_to_cond_code(ir->condition, &predicate);
2027 }
2028
2029 for (i = 0; i < type_size(ir->lhs->type); i++) {
2030 vec4_instruction *inst = emit(MOV(dst, src));
2031 inst->predicate = predicate;
2032
2033 dst.reg_offset++;
2034 src.reg_offset++;
2035 }
2036 }
2037
2038 void
2039 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2040 {
2041 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2042 foreach_list(node, &ir->components) {
2043 ir_constant *field_value = (ir_constant *)node;
2044
2045 emit_constant_values(dst, field_value);
2046 }
2047 return;
2048 }
2049
2050 if (ir->type->is_array()) {
2051 for (unsigned int i = 0; i < ir->type->length; i++) {
2052 emit_constant_values(dst, ir->array_elements[i]);
2053 }
2054 return;
2055 }
2056
2057 if (ir->type->is_matrix()) {
2058 for (int i = 0; i < ir->type->matrix_columns; i++) {
2059 float *vec = &ir->value.f[i * ir->type->vector_elements];
2060
2061 for (int j = 0; j < ir->type->vector_elements; j++) {
2062 dst->writemask = 1 << j;
2063 dst->type = BRW_REGISTER_TYPE_F;
2064
2065 emit(MOV(*dst, src_reg(vec[j])));
2066 }
2067 dst->reg_offset++;
2068 }
2069 return;
2070 }
2071
2072 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2073
2074 for (int i = 0; i < ir->type->vector_elements; i++) {
2075 if (!(remaining_writemask & (1 << i)))
2076 continue;
2077
2078 dst->writemask = 1 << i;
2079 dst->type = brw_type_for_base_type(ir->type);
2080
2081 /* Find other components that match the one we're about to
2082 * write. Emits fewer instructions for things like vec4(0.5,
2083 * 1.5, 1.5, 1.5).
2084 */
2085 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2086 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2087 if (ir->value.b[i] == ir->value.b[j])
2088 dst->writemask |= (1 << j);
2089 } else {
2090 /* u, i, and f storage all line up, so no need for a
2091 * switch case for comparing each type.
2092 */
2093 if (ir->value.u[i] == ir->value.u[j])
2094 dst->writemask |= (1 << j);
2095 }
2096 }
2097
2098 switch (ir->type->base_type) {
2099 case GLSL_TYPE_FLOAT:
2100 emit(MOV(*dst, src_reg(ir->value.f[i])));
2101 break;
2102 case GLSL_TYPE_INT:
2103 emit(MOV(*dst, src_reg(ir->value.i[i])));
2104 break;
2105 case GLSL_TYPE_UINT:
2106 emit(MOV(*dst, src_reg(ir->value.u[i])));
2107 break;
2108 case GLSL_TYPE_BOOL:
2109 emit(MOV(*dst, src_reg(ir->value.b[i])));
2110 break;
2111 default:
2112 assert(!"Non-float/uint/int/bool constant");
2113 break;
2114 }
2115
2116 remaining_writemask &= ~dst->writemask;
2117 }
2118 dst->reg_offset++;
2119 }
2120
2121 void
2122 vec4_visitor::visit(ir_constant *ir)
2123 {
2124 dst_reg dst = dst_reg(this, ir->type);
2125 this->result = src_reg(dst);
2126
2127 emit_constant_values(&dst, ir);
2128 }
2129
2130 void
2131 vec4_visitor::visit(ir_call *ir)
2132 {
2133 assert(!"not reached");
2134 }
2135
2136 void
2137 vec4_visitor::visit(ir_texture *ir)
2138 {
2139 int sampler =
2140 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2141
2142 /* Should be lowered by do_lower_texture_projection */
2143 assert(!ir->projector);
2144
2145 /* Generate code to compute all the subexpression trees. This has to be
2146 * done before loading any values into MRFs for the sampler message since
2147 * generating these values may involve SEND messages that need the MRFs.
2148 */
2149 src_reg coordinate;
2150 if (ir->coordinate) {
2151 ir->coordinate->accept(this);
2152 coordinate = this->result;
2153 }
2154
2155 src_reg shadow_comparitor;
2156 if (ir->shadow_comparitor) {
2157 ir->shadow_comparitor->accept(this);
2158 shadow_comparitor = this->result;
2159 }
2160
2161 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2162 src_reg lod, dPdx, dPdy, sample_index;
2163 switch (ir->op) {
2164 case ir_tex:
2165 lod = src_reg(0.0f);
2166 lod_type = glsl_type::float_type;
2167 break;
2168 case ir_txf:
2169 case ir_txl:
2170 case ir_txs:
2171 ir->lod_info.lod->accept(this);
2172 lod = this->result;
2173 lod_type = ir->lod_info.lod->type;
2174 break;
2175 case ir_txf_ms:
2176 ir->lod_info.sample_index->accept(this);
2177 sample_index = this->result;
2178 sample_index_type = ir->lod_info.sample_index->type;
2179 break;
2180 case ir_txd:
2181 ir->lod_info.grad.dPdx->accept(this);
2182 dPdx = this->result;
2183
2184 ir->lod_info.grad.dPdy->accept(this);
2185 dPdy = this->result;
2186
2187 lod_type = ir->lod_info.grad.dPdx->type;
2188 break;
2189 case ir_txb:
2190 case ir_lod:
2191 break;
2192 }
2193
2194 vec4_instruction *inst = NULL;
2195 switch (ir->op) {
2196 case ir_tex:
2197 case ir_txl:
2198 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2199 break;
2200 case ir_txd:
2201 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2202 break;
2203 case ir_txf:
2204 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2205 break;
2206 case ir_txf_ms:
2207 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2208 break;
2209 case ir_txs:
2210 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2211 break;
2212 case ir_txb:
2213 assert(!"TXB is not valid for vertex shaders.");
2214 break;
2215 case ir_lod:
2216 assert(!"LOD is not valid for vertex shaders.");
2217 break;
2218 }
2219
2220 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2221
2222 /* Texel offsets go in the message header; Gen4 also requires headers. */
2223 inst->header_present = use_texture_offset || intel->gen < 5;
2224 inst->base_mrf = 2;
2225 inst->mlen = inst->header_present + 1; /* always at least one */
2226 inst->sampler = sampler;
2227 inst->dst = dst_reg(this, ir->type);
2228 inst->dst.writemask = WRITEMASK_XYZW;
2229 inst->shadow_compare = ir->shadow_comparitor != NULL;
2230
2231 if (use_texture_offset)
2232 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2233
2234 /* MRF for the first parameter */
2235 int param_base = inst->base_mrf + inst->header_present;
2236
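/* Each message parameter occupies one MRF: the texel coordinate goes in the
 * first, followed (depending on the opcode and generation) by the shadow
 * comparison value, LOD or sample index, and the gradients in later MRFs.
 */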
2237 if (ir->op == ir_txs) {
2238 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2239 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2240 } else {
2241 int i, coord_mask = 0, zero_mask = 0;
2242 /* Load the coordinate */
2243 /* FINISHME: gl_clamp_mask and saturate */
2244 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2245 coord_mask |= (1 << i);
2246 for (; i < 4; i++)
2247 zero_mask |= (1 << i);
2248
2249 if (ir->offset && ir->op == ir_txf) {
2250 /* It appears that the ld instruction used for txf does its
2251 * address bounds check before adding in the offset. To work
2252 * around this, just add the integer offset to the integer
2253 * texel coordinate, and don't put the offset in the header.
2254 */
2255 ir_constant *offset = ir->offset->as_constant();
2256 assert(offset);
2257
2258 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2259 src_reg src = coordinate;
2260 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2261 BRW_GET_SWZ(src.swizzle, j),
2262 BRW_GET_SWZ(src.swizzle, j),
2263 BRW_GET_SWZ(src.swizzle, j));
2264 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2265 src, offset->value.i[j]));
2266 }
2267 } else {
2268 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2269 coordinate));
2270 }
2271 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2272 src_reg(0)));
2273 /* Load the shadow comparitor */
2274 if (ir->shadow_comparitor && ir->op != ir_txd) {
2275 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2276 WRITEMASK_X),
2277 shadow_comparitor));
2278 inst->mlen++;
2279 }
2280
2281 /* Load the LOD info */
2282 if (ir->op == ir_tex || ir->op == ir_txl) {
2283 int mrf, writemask;
2284 if (intel->gen >= 5) {
2285 mrf = param_base + 1;
2286 if (ir->shadow_comparitor) {
2287 writemask = WRITEMASK_Y;
2288 /* mlen already incremented */
2289 } else {
2290 writemask = WRITEMASK_X;
2291 inst->mlen++;
2292 }
2293 } else /* intel->gen == 4 */ {
2294 mrf = param_base;
2295 writemask = WRITEMASK_Z;
2296 }
2297 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2298 } else if (ir->op == ir_txf) {
2299 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2300 } else if (ir->op == ir_txf_ms) {
2301 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2302 sample_index));
2303 inst->mlen++;
2304
2305 /* On Gen7, there is an additional MCS parameter here after SI,
2306 * but we don't bother to emit it since it's always zero. If
2307 * we start supporting texturing from CMS surfaces, this will have
2308 * to change.
2309 */
2310 } else if (ir->op == ir_txd) {
2311 const glsl_type *type = lod_type;
2312
2313 if (intel->gen >= 5) {
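/* On Gen5+ the gradients are packed two components per MRF:
 * (dPdx.x, dPdy.x, dPdx.y, dPdy.y) in the first gradient MRF, with
 * dPdx.z and dPdy.z (plus the shadow comparison value, if any) in the next.
 */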
2314 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2315 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2316 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2317 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2318 inst->mlen++;
2319
2320 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2321 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2322 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2323 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2324 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2325 inst->mlen++;
2326
2327 if (ir->shadow_comparitor) {
2328 emit(MOV(dst_reg(MRF, param_base + 2,
2329 ir->shadow_comparitor->type, WRITEMASK_Z),
2330 shadow_comparitor));
2331 }
2332 }
2333 } else /* intel->gen == 4 */ {
2334 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2335 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2336 inst->mlen += 2;
2337 }
2338 }
2339 }
2340
2341 emit(inst);
2342
2343 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2344 * faces * layers, but the spec requires layers, so divide Z by the 6 faces.
2345 */
2346 if (ir->op == ir_txs) {
2347 glsl_type const *type = ir->sampler->type;
2348 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2349 type->sampler_array) {
2350 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2351 with_writemask(inst->dst, WRITEMASK_Z),
2352 src_reg(inst->dst), src_reg(6));
2353 }
2354 }
2355
2356 swizzle_result(ir, src_reg(inst->dst), sampler);
2357 }
2358
2359 void
2360 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2361 {
2362 int s = key->tex.swizzles[sampler];
2363
2364 this->result = src_reg(this, ir->type);
2365 dst_reg swizzled_result(this->result);
2366
2367 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2368 || s == SWIZZLE_NOOP) {
2369 emit(MOV(swizzled_result, orig_val));
2370 return;
2371 }
2372
2373 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2374 int swizzle[4];
2375
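/* Split the requested texture swizzle (e.g. (ZERO, X, Y, ONE)) into channels
 * that copy from the sampler result, channels forced to 0.0, and channels
 * forced to 1.0; each group gets its own writemasked MOV below.
 */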
2376 for (int i = 0; i < 4; i++) {
2377 switch (GET_SWZ(s, i)) {
2378 case SWIZZLE_ZERO:
2379 zero_mask |= (1 << i);
2380 break;
2381 case SWIZZLE_ONE:
2382 one_mask |= (1 << i);
2383 break;
2384 default:
2385 copy_mask |= (1 << i);
2386 swizzle[i] = GET_SWZ(s, i);
2387 break;
2388 }
2389 }
2390
2391 if (copy_mask) {
2392 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2393 swizzled_result.writemask = copy_mask;
2394 emit(MOV(swizzled_result, orig_val));
2395 }
2396
2397 if (zero_mask) {
2398 swizzled_result.writemask = zero_mask;
2399 emit(MOV(swizzled_result, src_reg(0.0f)));
2400 }
2401
2402 if (one_mask) {
2403 swizzled_result.writemask = one_mask;
2404 emit(MOV(swizzled_result, src_reg(1.0f)));
2405 }
2406 }
2407
2408 void
2409 vec4_visitor::visit(ir_return *ir)
2410 {
2411 assert(!"not reached");
2412 }
2413
2414 void
2415 vec4_visitor::visit(ir_discard *ir)
2416 {
2417 assert(!"not reached");
2418 }
2419
2420 void
2421 vec4_visitor::visit(ir_if *ir)
2422 {
2423 /* Don't point the annotation at the if statement itself, because then the
2424 * annotation would print the if along with its entire then and else blocks.
2425 */
2426 this->base_ir = ir->condition;
2427
2428 if (intel->gen == 6) {
2429 emit_if_gen6(ir);
2430 } else {
2431 uint32_t predicate;
2432 emit_bool_to_cond_code(ir->condition, &predicate);
2433 emit(IF(predicate));
2434 }
2435
2436 visit_instructions(&ir->then_instructions);
2437
2438 if (!ir->else_instructions.is_empty()) {
2439 this->base_ir = ir->condition;
2440 emit(BRW_OPCODE_ELSE);
2441
2442 visit_instructions(&ir->else_instructions);
2443 }
2444
2445 this->base_ir = ir->condition;
2446 emit(BRW_OPCODE_ENDIF);
2447 }
2448
2449 void
2450 vec4_visitor::emit_ndc_computation()
2451 {
2452 /* Get the position */
2453 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2454
2455 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2456 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2457 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2458
2459 current_annotation = "NDC";
2460 dst_reg ndc_w = ndc;
2461 ndc_w.writemask = WRITEMASK_W;
2462 src_reg pos_w = pos;
2463 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2464 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2465
2466 dst_reg ndc_xyz = ndc;
2467 ndc_xyz.writemask = WRITEMASK_XYZ;
2468
2469 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2470 }
2471
2472 void
2473 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2474 {
2475 if (intel->gen < 6 &&
2476 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2477 key->userclip_active || brw->has_negative_rhw_bug)) {
2478 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2479 dst_reg header1_w = header1;
2480 header1_w.writemask = WRITEMASK_W;
2481 GLuint i;
2482
2483 emit(MOV(header1, 0u));
2484
2485 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2486 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2487
2488 current_annotation = "Point size";
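/* Scale by 2^11 and mask to bits 8..18, storing the point width as an
 * unsigned 8.3 fixed-point value in the header dword.
 */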
2489 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2490 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2491 }
2492
2493 current_annotation = "Clipping flags";
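/* For each user clip plane, compute dot(gl_Position, plane); if the result
 * is negative the vertex is outside that plane, so set the corresponding
 * clip-flag bit in the header with a predicated OR.
 */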
2494 for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2495 vec4_instruction *inst;
2496
2497 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2498 src_reg(this->userplane[i])));
2499 inst->conditional_mod = BRW_CONDITIONAL_L;
2500
2501 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2502 inst->predicate = BRW_PREDICATE_NORMAL;
2503 }
2504
2505 /* i965 clipping workaround:
2506 * 1) Test for -ve rhw
2507 * 2) If set,
2508 * set ndc = (0,0,0,0)
2509 * set ucp[6] = 1
2510 *
2511 * Later, clipping will detect ucp[6] and ensure the primitive is
2512 * clipped against all fixed planes.
2513 */
2514 if (brw->has_negative_rhw_bug) {
2515 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2516 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2517 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2518 vec4_instruction *inst;
2519 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2520 inst->predicate = BRW_PREDICATE_NORMAL;
2521 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2522 inst->predicate = BRW_PREDICATE_NORMAL;
2523 }
2524
2525 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2526 } else if (intel->gen < 6) {
2527 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2528 } else {
2529 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2530 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2531 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2532 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2533 }
2534 }
2535 }
2536
2537 void
2538 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2539 {
2540 if (intel->gen < 6) {
2541 /* Clip distance slots are set aside in gen5, but they are not used. It
2542 * is not clear whether we actually need to set aside space for them,
2543 * but the performance cost is negligible.
2544 */
2545 return;
2546 }
2547
2548 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2549 *
2550 * "If a linked set of shaders forming the vertex stage contains no
2551 * static write to gl_ClipVertex or gl_ClipDistance, but the
2552 * application has requested clipping against user clip planes through
2553 * the API, then the coordinate written to gl_Position is used for
2554 * comparison against the user clip planes."
2555 *
2556 * This function is only called if the shader didn't write to
2557 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2558 * if the user wrote to it; otherwise we use gl_Position.
2559 */
2560 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2561 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2562 clip_vertex = VARYING_SLOT_POS;
2563 }
2564
2565 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2566 ++i) {
2567 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2568 src_reg(output_reg[clip_vertex]),
2569 src_reg(this->userplane[i + offset])));
2570 }
2571 }
2572
2573 void
2574 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2575 {
2576 assert (varying < VARYING_SLOT_MAX);
2577 reg.type = output_reg[varying].type;
2578 current_annotation = output_reg_annotation[varying];
2579 /* Copy the register, saturating if necessary */
2580 vec4_instruction *inst = emit(MOV(reg,
2581 src_reg(output_reg[varying])));
2582 if ((varying == VARYING_SLOT_COL0 ||
2583 varying == VARYING_SLOT_COL1 ||
2584 varying == VARYING_SLOT_BFC0 ||
2585 varying == VARYING_SLOT_BFC1) &&
2586 key->clamp_vertex_color) {
2587 inst->saturate = true;
2588 }
2589 }
2590
2591 void
2592 vec4_visitor::emit_urb_slot(int mrf, int varying)
2593 {
2594 struct brw_reg hw_reg = brw_message_reg(mrf);
2595 dst_reg reg = dst_reg(MRF, mrf);
2596 reg.type = BRW_REGISTER_TYPE_F;
2597
2598 switch (varying) {
2599 case VARYING_SLOT_PSIZ:
2600 /* PSIZ is always in slot 0, and is coupled with other flags. */
2601 current_annotation = "indices, point width, clip flags";
2602 emit_psiz_and_flags(hw_reg);
2603 break;
2604 case BRW_VARYING_SLOT_NDC:
2605 current_annotation = "NDC";
2606 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2607 break;
2608 case BRW_VARYING_SLOT_POS_DUPLICATE:
2609 case VARYING_SLOT_POS:
2610 current_annotation = "gl_Position";
2611 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2612 break;
2613 case VARYING_SLOT_CLIP_DIST0:
2614 case VARYING_SLOT_CLIP_DIST1:
2615 if (this->key->uses_clip_distance) {
2616 emit_generic_urb_slot(reg, varying);
2617 } else {
2618 current_annotation = "user clip distances";
2619 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2620 }
2621 break;
2622 case VARYING_SLOT_EDGE:
2623 /* This is present when doing unfilled polygons. We're supposed to copy
2624 * the edge flag from the user-provided vertex array
2625 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2626 * of that attribute (starts as 1.0f). This is then used in clipping to
2627 * determine which edges should be drawn as wireframe.
2628 */
2629 current_annotation = "edge flag";
2630 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2631 glsl_type::float_type, WRITEMASK_XYZW))));
2632 break;
2633 case BRW_VARYING_SLOT_PAD:
2634 /* No need to write to this slot */
2635 break;
2636 default:
2637 emit_generic_urb_slot(reg, varying);
2638 break;
2639 }
2640 }
2641
2642 static int
2643 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2644 {
2645 struct intel_context *intel = &brw->intel;
2646
2647 if (intel->gen >= 6) {
2648 /* URB data written (does not include the message header reg) must
2649 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2650 * section 5.4.3.2.2: URB_INTERLEAVED.
2651 *
2652 * URB entries are allocated on a multiple of 1024 bits, so an
2653 * extra 128 bits written here to make the end align to 256 is
2654 * no problem.
2655 */
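/* mlen includes the single header register, so an odd total means an even,
 * 256-bit-aligned amount of URB data.
 */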
2656 if ((mlen % 2) != 1)
2657 mlen++;
2658 }
2659
2660 return mlen;
2661 }
2662
2663 void
2664 vec4_vs_visitor::emit_urb_write_header(int mrf)
2665 {
2666 /* No need to do anything for VS; an implied write to this MRF will be
2667 * performed by VS_OPCODE_URB_WRITE.
2668 */
2669 (void) mrf;
2670 }
2671
2672 vec4_instruction *
2673 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2674 {
2675 /* For VS, the URB writes end the thread. */
2676 if (complete) {
2677 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2678 emit_shader_time_end();
2679 }
2680
2681 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2682 inst->eot = complete;
2683
2684 return inst;
2685 }
2686
2687 /**
2688 * Generates the VUE payload plus the necessary URB write instructions to
2689 * output it.
2690 *
2691 * The VUE layout is documented in Volume 2a.
2692 */
2693 void
2694 vec4_visitor::emit_vertex()
2695 {
2696 /* MRF 0 is reserved for the debugger, so start with message header
2697 * in MRF 1.
2698 */
2699 int base_mrf = 1;
2700 int mrf = base_mrf;
2701 /* In the process of generating our URB write message contents, we
2702 * may need to unspill a register or load from an array. Those
2703 * reads would use MRFs 14-15.
2704 */
2705 int max_usable_mrf = 13;
2706
2707 /* The following assertion verifies that max_usable_mrf causes an
2708 * even-numbered amount of URB write data, which will meet gen6's
2709 * requirements for length alignment.
2710 */
2711 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2712
2713 /* First mrf is the g0-based message header containing URB handles and
2714 * such.
2715 */
2716 emit_urb_write_header(mrf++);
2717
2718 if (intel->gen < 6) {
2719 emit_ndc_computation();
2720 }
2721
2722 /* Set up the VUE data for the first URB write */
2723 int slot;
2724 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2725 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2726
2727 /* If this was max_usable_mrf, we can't fit anything more into this URB
2728 * WRITE.
2729 */
2730 if (mrf > max_usable_mrf) {
2731 slot++;
2732 break;
2733 }
2734 }
2735
2736 bool complete = slot >= prog_data->vue_map.num_slots;
2737 current_annotation = "URB write";
2738 vec4_instruction *inst = emit_urb_write_opcode(complete);
2739 inst->base_mrf = base_mrf;
2740 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2741
2742 /* Optional second URB write */
2743 if (!complete) {
2744 mrf = base_mrf + 1;
2745
2746 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2747 assert(mrf < max_usable_mrf);
2748
2749 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2750 }
2751
2752 current_annotation = "URB write";
2753 inst = emit_urb_write_opcode(true /* complete */);
2754 inst->base_mrf = base_mrf;
2755 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2756 /* URB destination offset. The previous write used MRFs 1-13; minus the
2757 * one header MRF, that's 12 data regs. The URB offset is in
2758 * URB row increments, and each of our MRFs is half of one of
2759 * those, since we're doing interleaved writes.
2760 */
2761 inst->offset = (max_usable_mrf - base_mrf) / 2;
2762 }
2763 }
2764
2765 void
2766 vec4_vs_visitor::emit_thread_end()
2767 {
2768 /* For VS, we always end the thread by emitting a single vertex.
2769 * emit_urb_write_opcode() will take care of setting the eot flag on the
2770 * SEND instruction.
2771 */
2772 emit_vertex();
2773 }
2774
2775 src_reg
2776 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2777 src_reg *reladdr, int reg_offset)
2778 {
2779 /* Because we store the values to scratch interleaved like our
2780 * vertex data, we need to scale the vec4 index by 2.
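* (reg_offset counts 32-byte registers while the gen6+ message offset is in
* 16-byte units, hence the factor of 2; both vertices of the interleaved
* SIMD4x2 pair share each register.)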
2781 */
2782 int message_header_scale = 2;
2783
2784 /* Pre-gen6, the message header uses byte offsets instead of vec4
2785 * (16-byte) offset units.
2786 */
2787 if (intel->gen < 6)
2788 message_header_scale *= 16;
2789
2790 if (reladdr) {
2791 src_reg index = src_reg(this, glsl_type::int_type);
2792
2793 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2794 emit_before(inst, MUL(dst_reg(index),
2795 index, src_reg(message_header_scale)));
2796
2797 return index;
2798 } else {
2799 return src_reg(reg_offset * message_header_scale);
2800 }
2801 }
2802
2803 src_reg
2804 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2805 src_reg *reladdr, int reg_offset)
2806 {
2807 if (reladdr) {
2808 src_reg index = src_reg(this, glsl_type::int_type);
2809
2810 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2811
2812 /* Pre-gen6, the message header uses byte offsets instead of vec4
2813 * (16-byte) offset units.
2814 */
2815 if (intel->gen < 6) {
2816 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2817 }
2818
2819 return index;
2820 } else {
2821 int message_header_scale = intel->gen < 6 ? 16 : 1;
2822 return src_reg(reg_offset * message_header_scale);
2823 }
2824 }
2825
2826 /**
2827 * Emits an instruction before @inst to load the value named by @orig_src
2828 * from scratch space at @base_offset to @temp.
2829 *
2830 * @base_offset is measured in 32-byte units (the size of a register).
2831 */
2832 void
2833 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2834 dst_reg temp, src_reg orig_src,
2835 int base_offset)
2836 {
2837 int reg_offset = base_offset + orig_src.reg_offset;
2838 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2839
2840 emit_before(inst, SCRATCH_READ(temp, index));
2841 }
2842
2843 /**
2844 * Emits an instruction after @inst to store the value to be written
2845 * to @orig_dst to scratch space at @base_offset, from @temp.
2846 *
2847 * @base_offset is measured in 32-byte units (the size of a register).
2848 */
2849 void
2850 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2851 {
2852 int reg_offset = base_offset + inst->dst.reg_offset;
2853 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2854
2855 /* Create a temporary register to store *inst's result in.
2856 *
2857 * We have to be careful in MOVing from our temporary result register in
2858 * the scratch write. If we swizzle from channels of the temporary that
2859 * weren't initialized, it will confuse live interval analysis, which will
2860 * make spilling fail to make progress.
2861 */
2862 src_reg temp = src_reg(this, glsl_type::vec4_type);
2863 temp.type = inst->dst.type;
2864 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2865 int swizzles[4];
2866 for (int i = 0; i < 4; i++)
2867 if (inst->dst.writemask & (1 << i))
2868 swizzles[i] = i;
2869 else
2870 swizzles[i] = first_writemask_chan;
2871 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2872 swizzles[2], swizzles[3]);
2873
2874 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2875 inst->dst.writemask));
2876 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2877 write->predicate = inst->predicate;
2878 write->ir = inst->ir;
2879 write->annotation = inst->annotation;
2880 inst->insert_after(write);
2881
2882 inst->dst.file = temp.file;
2883 inst->dst.reg = temp.reg;
2884 inst->dst.reg_offset = temp.reg_offset;
2885 inst->dst.reladdr = NULL;
2886 }
2887
2888 /**
2889 * We can't generally support array access in GRF space, because a
2890 * single instruction's destination can only span 2 contiguous
2891 * registers. So, we send all GRF arrays that get variable index
2892 * access to scratch space.
2893 */
2894 void
2895 vec4_visitor::move_grf_array_access_to_scratch()
2896 {
2897 int scratch_loc[this->virtual_grf_count];
2898
2899 for (int i = 0; i < this->virtual_grf_count; i++) {
2900 scratch_loc[i] = -1;
2901 }
2902
2903 /* First, calculate the set of virtual GRFs that need to be punted
2904 * to scratch due to having any array access on them, and where in
2905 * scratch.
2906 */
2907 foreach_list(node, &this->instructions) {
2908 vec4_instruction *inst = (vec4_instruction *)node;
2909
2910 if (inst->dst.file == GRF && inst->dst.reladdr &&
2911 scratch_loc[inst->dst.reg] == -1) {
2912 scratch_loc[inst->dst.reg] = c->last_scratch;
2913 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2914 }
2915
2916 for (int i = 0 ; i < 3; i++) {
2917 src_reg *src = &inst->src[i];
2918
2919 if (src->file == GRF && src->reladdr &&
2920 scratch_loc[src->reg] == -1) {
2921 scratch_loc[src->reg] = c->last_scratch;
2922 c->last_scratch += this->virtual_grf_sizes[src->reg];
2923 }
2924 }
2925 }
2926
2927 /* Now, for anything that will be accessed through scratch, rewrite
2928 * it to load/store. Note that this is a _safe list walk, because
2929 * we may generate a new scratch_write instruction after the one
2930 * we're processing.
2931 */
2932 foreach_list_safe(node, &this->instructions) {
2933 vec4_instruction *inst = (vec4_instruction *)node;
2934
2935 /* Set up the annotation tracking for new generated instructions. */
2936 base_ir = inst->ir;
2937 current_annotation = inst->annotation;
2938
2939 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2940 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2941 }
2942
2943 for (int i = 0 ; i < 3; i++) {
2944 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2945 continue;
2946
2947 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2948
2949 emit_scratch_read(inst, temp, inst->src[i],
2950 scratch_loc[inst->src[i].reg]);
2951
2952 inst->src[i].file = temp.file;
2953 inst->src[i].reg = temp.reg;
2954 inst->src[i].reg_offset = temp.reg_offset;
2955 inst->src[i].reladdr = NULL;
2956 }
2957 }
2958 }
2959
2960 /**
2961 * Emits an instruction before @inst to load the value named by @orig_src
2962 * from the pull constant buffer (surface) at @base_offset to @temp.
2963 */
2964 void
2965 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2966 dst_reg temp, src_reg orig_src,
2967 int base_offset)
2968 {
2969 int reg_offset = base_offset + orig_src.reg_offset;
2970 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2971 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2972 vec4_instruction *load;
2973
2974 if (intel->gen >= 7) {
2975 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2976 grf_offset.type = offset.type;
2977 emit_before(inst, MOV(grf_offset, offset));
2978
2979 load = new(mem_ctx) vec4_instruction(this,
2980 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2981 temp, index, src_reg(grf_offset));
2982 } else {
2983 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2984 temp, index, offset);
2985 load->base_mrf = 14;
2986 load->mlen = 1;
2987 }
2988 emit_before(inst, load);
2989 }
2990
2991 /**
2992 * Implements array access of uniforms by inserting a
2993 * PULL_CONSTANT_LOAD instruction.
2994 *
2995 * Unlike temporary GRF array access (where we don't support it due to
2996 * the difficulty of doing relative addressing on instruction
2997 * destinations), we could potentially do array access of uniforms
2998 * that were loaded in GRF space as push constants. In real-world
2999 * usage we've seen, though, the arrays being used are always larger
3000 * than we could load as push constants, so just always move all
3001 * uniform array access out to a pull constant buffer.
3002 */
3003 void
3004 vec4_visitor::move_uniform_array_access_to_pull_constants()
3005 {
3006 int pull_constant_loc[this->uniforms];
3007
3008 for (int i = 0; i < this->uniforms; i++) {
3009 pull_constant_loc[i] = -1;
3010 }
3011
3012 /* Walk through and find array access of uniforms. Put a copy of that
3013 * uniform in the pull constant buffer.
3014 *
3015 * Note that we don't move constant-indexed accesses to arrays. No
3016 * testing has been done of the performance impact of this choice.
3017 */
3018 foreach_list_safe(node, &this->instructions) {
3019 vec4_instruction *inst = (vec4_instruction *)node;
3020
3021 for (int i = 0 ; i < 3; i++) {
3022 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3023 continue;
3024
3025 int uniform = inst->src[i].reg;
3026
3027 /* If this array isn't already present in the pull constant buffer,
3028 * add it.
3029 */
3030 if (pull_constant_loc[uniform] == -1) {
3031 const float **values = &prog_data->param[uniform * 4];
3032
3033 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3034
3035 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3036 prog_data->pull_param[prog_data->nr_pull_params++]
3037 = values[j];
3038 }
3039 }
3040
3041 /* Set up the annotation tracking for new generated instructions. */
3042 base_ir = inst->ir;
3043 current_annotation = inst->annotation;
3044
3045 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3046
3047 emit_pull_constant_load(inst, temp, inst->src[i],
3048 pull_constant_loc[uniform]);
3049
3050 inst->src[i].file = temp.file;
3051 inst->src[i].reg = temp.reg;
3052 inst->src[i].reg_offset = temp.reg_offset;
3053 inst->src[i].reladdr = NULL;
3054 }
3055 }
3056
3057 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3058 * no need to track them as larger-than-vec4 objects. This will be
3059 * relied on in cutting out unused uniform vectors from push
3060 * constants.
3061 */
3062 split_uniform_registers();
3063 }
3064
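/* A negated UD-typed source can't be used directly everywhere, so resolve
 * the negation into a temporary with an explicit MOV and hand that back.
 */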
3065 void
3066 vec4_visitor::resolve_ud_negate(src_reg *reg)
3067 {
3068 if (reg->type != BRW_REGISTER_TYPE_UD ||
3069 !reg->negate)
3070 return;
3071
3072 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3073 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3074 *reg = temp;
3075 }
3076
3077 vec4_visitor::vec4_visitor(struct brw_context *brw,
3078 struct brw_vec4_compile *c,
3079 struct gl_program *prog,
3080 const struct brw_vec4_prog_key *key,
3081 struct brw_vec4_prog_data *prog_data,
3082 struct gl_shader_program *shader_prog,
3083 struct brw_shader *shader,
3084 void *mem_ctx,
3085 bool debug_flag)
3086 : debug_flag(debug_flag)
3087 {
3088 this->brw = brw;
3089 this->intel = &brw->intel;
3090 this->ctx = &intel->ctx;
3091 this->shader_prog = shader_prog;
3092 this->shader = shader;
3093
3094 this->mem_ctx = mem_ctx;
3095 this->failed = false;
3096
3097 this->base_ir = NULL;
3098 this->current_annotation = NULL;
3099 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3100
3101 this->c = c;
3102 this->prog = prog;
3103 this->key = key;
3104 this->prog_data = prog_data;
3105
3106 this->variable_ht = hash_table_ctor(0,
3107 hash_table_pointer_hash,
3108 hash_table_pointer_compare);
3109
3110 this->virtual_grf_def = NULL;
3111 this->virtual_grf_use = NULL;
3112 this->virtual_grf_sizes = NULL;
3113 this->virtual_grf_count = 0;
3114 this->virtual_grf_reg_map = NULL;
3115 this->virtual_grf_reg_count = 0;
3116 this->virtual_grf_array_size = 0;
3117 this->live_intervals_valid = false;
3118
3119 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3120
3121 this->uniforms = 0;
3122 }
3123
3124 vec4_visitor::~vec4_visitor()
3125 {
3126 hash_table_dtor(this->variable_ht);
3127 }
3128
3129
3130 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3131 struct brw_vs_compile *vs_compile,
3132 struct brw_vs_prog_data *vs_prog_data,
3133 struct gl_shader_program *prog,
3134 struct brw_shader *shader,
3135 void *mem_ctx)
3136 : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3137 &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3138 mem_ctx, INTEL_DEBUG & DEBUG_VS),
3139 vs_compile(vs_compile),
3140 vs_prog_data(vs_prog_data)
3141 {
3142 }
3143
3144
3145 void
3146 vec4_visitor::fail(const char *format, ...)
3147 {
3148 va_list va;
3149 char *msg;
3150
3151 if (failed)
3152 return;
3153
3154 failed = true;
3155
3156 va_start(va, format);
3157 msg = ralloc_vasprintf(mem_ctx, format, va);
3158 va_end(va);
3159 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3160
3161 this->fail_msg = msg;
3162
3163 if (debug_flag) {
3164 fprintf(stderr, "%s", msg);
3165 }
3166 }
3167
3168 } /* namespace brw */