i965: Skip allocating UNIFORM file storage for uniforms of size 0.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
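/* Convenience constructors for the common ALU opcodes.  These build the
 * vec4_instruction but do not add it to the instruction stream; callers
 * still pass the result to emit().
 */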
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
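/* Scratch (spill space) access on gen4-style hardware goes through a
 * message whose payload is assembled in fixed MRFs starting at base_mrf.
 */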
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
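/* Emit a dot product of 2, 3, or 4 components, mapping directly to the
 * DP2/DP3/DP4 opcodes.
 */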
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
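/* Emit a single-operand math instruction, dispatching to the
 * per-generation helper: gen8+ takes the operand directly, gen6/7 go
 * through emit_math1_gen6() and its operand fixups, and gen4/5 use the
 * message-based math unit via emit_math1_gen4().
 */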
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
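/* Allocate a new virtual GRF of the given size (in vec4 registers),
 * growing the size and reg-map arrays geometrically as needed, and
 * return its index.
 */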
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
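/* Set up one vec4 uniform per enabled user clip plane, pointing the
 * param[] entries at the clip plane values selected for this program.
 */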
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been set up by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 if (swiz == last_swiz)
755 break;
756 last_swiz = swiz;
757 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
758 assert(this->uniforms < uniform_array_size);
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
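/* Evaluate a boolean rvalue and leave the result in the flag register,
 * folding comparison expressions directly into CMP where possible.  The
 * predicate a caller should use to consume the flag is returned in
 * *predicate.
 */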
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr && expr->operation != ir_binop_ubo_load) {
780 src_reg op[3];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 3);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 case ir_triop_csel: {
856 /* Expand the boolean condition into the flag register. */
857 inst = emit(MOV(dst_null_d(), op[0]));
858 inst->conditional_mod = BRW_CONDITIONAL_NZ;
859
860 /* Select which boolean to return. */
861 dst_reg temp(this, expr->operands[1]->type);
862 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
863 inst->predicate = BRW_PREDICATE_NORMAL;
864
865 /* Expand the result to a condition code. */
866 inst = emit(MOV(dst_null_d(), src_reg(temp)));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
868 break;
869 }
870
871 default:
872 unreachable("not reached");
873 }
874 return;
875 }
876
877 ir->accept(this);
878
879 resolve_ud_negate(&this->result);
880
881 if (brw->gen >= 6) {
882 vec4_instruction *inst = emit(AND(dst_null_d(),
883 this->result, src_reg(1)));
884 inst->conditional_mod = BRW_CONDITIONAL_NZ;
885 } else {
886 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
887 inst->conditional_mod = BRW_CONDITIONAL_NZ;
888 }
889 }
890
891 /**
892 * Emit a gen6 IF statement with the comparison folded into the IF
893 * instruction.
894 */
895 void
896 vec4_visitor::emit_if_gen6(ir_if *ir)
897 {
898 ir_expression *expr = ir->condition->as_expression();
899
900 if (expr && expr->operation != ir_binop_ubo_load) {
901 src_reg op[3];
902 dst_reg temp;
903
904 assert(expr->get_num_operands() <= 3);
905 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
906 expr->operands[i]->accept(this);
907 op[i] = this->result;
908 }
909
910 switch (expr->operation) {
911 case ir_unop_logic_not:
912 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
913 return;
914
915 case ir_binop_logic_xor:
916 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_binop_logic_or:
920 temp = dst_reg(this, glsl_type::bool_type);
921 emit(OR(temp, op[0], op[1]));
922 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
923 return;
924
925 case ir_binop_logic_and:
926 temp = dst_reg(this, glsl_type::bool_type);
927 emit(AND(temp, op[0], op[1]));
928 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
929 return;
930
931 case ir_unop_f2b:
932 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_unop_i2b:
936 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
937 return;
938
939 case ir_binop_greater:
940 case ir_binop_gequal:
941 case ir_binop_less:
942 case ir_binop_lequal:
943 case ir_binop_equal:
944 case ir_binop_nequal:
945 emit(IF(op[0], op[1],
946 brw_conditional_for_comparison(expr->operation)));
947 return;
948
949 case ir_binop_all_equal:
950 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
951 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
952 return;
953
954 case ir_binop_any_nequal:
955 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
956 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
957 return;
958
959 case ir_unop_any:
960 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
961 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
962 return;
963
964 case ir_triop_csel: {
965 /* Expand the boolean condition into the flag register. */
966 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
967 inst->conditional_mod = BRW_CONDITIONAL_NZ;
968
969 /* Select which boolean to return. */
970 dst_reg temp(this, expr->operands[1]->type);
971 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
972 inst->predicate = BRW_PREDICATE_NORMAL;
973
974 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
975 return;
976 }
977
978 default:
979 unreachable("not reached");
980 }
981 return;
982 }
983
984 ir->condition->accept(this);
985
986 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
987 }
988
989 void
990 vec4_visitor::visit(ir_variable *ir)
991 {
992 dst_reg *reg = NULL;
993
994 if (variable_storage(ir))
995 return;
996
997 switch (ir->data.mode) {
998 case ir_var_shader_in:
999 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1000 break;
1001
1002 case ir_var_shader_out:
1003 reg = new(mem_ctx) dst_reg(this, ir->type);
1004
1005 for (int i = 0; i < type_size(ir->type); i++) {
1006 output_reg[ir->data.location + i] = *reg;
1007 output_reg[ir->data.location + i].reg_offset = i;
1008 output_reg[ir->data.location + i].type =
1009 brw_type_for_base_type(ir->type->get_scalar_type());
1010 output_reg_annotation[ir->data.location + i] = ir->name;
1011 }
1012 break;
1013
1014 case ir_var_auto:
1015 case ir_var_temporary:
1016 reg = new(mem_ctx) dst_reg(this, ir->type);
1017 break;
1018
1019 case ir_var_uniform:
1020 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1021
1022 /* Thanks to the lower_ubo_reference pass, we will see only
1023 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1024 * variables, so no need for them to be in variable_ht.
1025 *
1026 * Some uniforms, such as samplers and atomic counters, have no actual
1027 * storage, so we should ignore them.
1028 */
1029 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1030 return;
1031
1032 /* Track how big the whole uniform variable is, in case we need to put a
1033 * copy of its data into pull constants for array access.
1034 */
1035 assert(this->uniforms < uniform_array_size);
1036 this->uniform_size[this->uniforms] = type_size(ir->type);
1037
1038 if (!strncmp(ir->name, "gl_", 3)) {
1039 setup_builtin_uniform_values(ir);
1040 } else {
1041 setup_uniform_values(ir);
1042 }
1043 break;
1044
1045 case ir_var_system_value:
1046 reg = make_reg_for_system_value(ir);
1047 break;
1048
1049 default:
1050 unreachable("not reached");
1051 }
1052
1053 reg->type = brw_type_for_base_type(ir->type);
1054 hash_table_insert(this->variable_ht, reg, ir);
1055 }
1056
1057 void
1058 vec4_visitor::visit(ir_loop *ir)
1059 {
1060 /* We don't want debugging output to print the whole body of the
1061 * loop as the annotation.
1062 */
1063 this->base_ir = NULL;
1064
1065 emit(BRW_OPCODE_DO);
1066
1067 visit_instructions(&ir->body_instructions);
1068
1069 emit(BRW_OPCODE_WHILE);
1070 }
1071
1072 void
1073 vec4_visitor::visit(ir_loop_jump *ir)
1074 {
1075 switch (ir->mode) {
1076 case ir_loop_jump::jump_break:
1077 emit(BRW_OPCODE_BREAK);
1078 break;
1079 case ir_loop_jump::jump_continue:
1080 emit(BRW_OPCODE_CONTINUE);
1081 break;
1082 }
1083 }
1084
1085
1086 void
1087 vec4_visitor::visit(ir_function_signature *)
1088 {
1089 unreachable("not reached");
1090 }
1091
1092 void
1093 vec4_visitor::visit(ir_function *ir)
1094 {
1095 /* Ignore function bodies other than main() -- we shouldn't see calls to
1096 * them since they should all be inlined.
1097 */
1098 if (strcmp(ir->name, "main") == 0) {
1099 const ir_function_signature *sig;
1100 exec_list empty;
1101
1102 sig = ir->matching_signature(NULL, &empty, false);
1103
1104 assert(sig);
1105
1106 visit_instructions(&sig->body);
1107 }
1108 }
1109
1110 bool
1111 vec4_visitor::try_emit_mad(ir_expression *ir)
1112 {
1113 /* 3-src instructions were introduced in gen6. */
1114 if (brw->gen < 6)
1115 return false;
1116
1117 /* MAD can only handle floating-point data. */
1118 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1119 return false;
1120
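/* Look for a multiply in either operand of the add; the other operand
 * becomes the addend of the MAD.
 */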
1121 ir_rvalue *nonmul = ir->operands[1];
1122 ir_expression *mul = ir->operands[0]->as_expression();
1123
1124 if (!mul || mul->operation != ir_binop_mul) {
1125 nonmul = ir->operands[0];
1126 mul = ir->operands[1]->as_expression();
1127
1128 if (!mul || mul->operation != ir_binop_mul)
1129 return false;
1130 }
1131
1132 nonmul->accept(this);
1133 src_reg src0 = fix_3src_operand(this->result);
1134
1135 mul->operands[0]->accept(this);
1136 src_reg src1 = fix_3src_operand(this->result);
1137
1138 mul->operands[1]->accept(this);
1139 src_reg src2 = fix_3src_operand(this->result);
1140
1141 this->result = src_reg(this, ir->type);
1142 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1143
1144 return true;
1145 }
1146
1147 bool
1148 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1149 {
1150 /* This optimization relies on CMP setting the destination to 0 when
1151 * false. Early hardware only sets the least significant bit, and
1152 * leaves the other bits undefined. So we can't use it.
1153 */
1154 if (brw->gen < 6)
1155 return false;
1156
1157 ir_expression *const cmp = ir->operands[0]->as_expression();
1158
1159 if (cmp == NULL)
1160 return false;
1161
1162 switch (cmp->operation) {
1163 case ir_binop_less:
1164 case ir_binop_greater:
1165 case ir_binop_lequal:
1166 case ir_binop_gequal:
1167 case ir_binop_equal:
1168 case ir_binop_nequal:
1169 break;
1170
1171 default:
1172 return false;
1173 }
1174
1175 cmp->operands[0]->accept(this);
1176 const src_reg cmp_src0 = this->result;
1177
1178 cmp->operands[1]->accept(this);
1179 const src_reg cmp_src1 = this->result;
1180
1181 this->result = src_reg(this, ir->type);
1182
1183 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1184 brw_conditional_for_comparison(cmp->operation)));
1185
1186 /* If the comparison is false, this->result will just happen to be zero.
1187 */
1188 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1189 this->result, src_reg(1.0f));
1190 inst->predicate = BRW_PREDICATE_NORMAL;
1191 inst->predicate_inverse = true;
1192
1193 return true;
1194 }
1195
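/* Emit a MIN/MAX: on gen6+ this is a single conditional-mod SEL, while
 * older hardware needs a CMP followed by a predicated SEL.
 */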
1196 void
1197 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1198 src_reg src0, src_reg src1)
1199 {
1200 vec4_instruction *inst;
1201
1202 if (brw->gen >= 6) {
1203 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1204 inst->conditional_mod = conditionalmod;
1205 } else {
1206 emit(CMP(dst, src0, src1, conditionalmod));
1207
1208 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1209 inst->predicate = BRW_PREDICATE_NORMAL;
1210 }
1211 }
1212
1213 void
1214 vec4_visitor::emit_lrp(const dst_reg &dst,
1215 const src_reg &x, const src_reg &y, const src_reg &a)
1216 {
1217 if (brw->gen >= 6) {
1218 /* Note that the instruction's argument order is reversed from GLSL
1219 * and the IR.
1220 */
1221 emit(LRP(dst,
1222 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1223 } else {
1224 /* Earlier generations don't support three source operations, so we
1225 * need to emit x*(1-a) + y*a.
1226 */
1227 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1228 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1229 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1230 y_times_a.writemask = dst.writemask;
1231 one_minus_a.writemask = dst.writemask;
1232 x_times_one_minus_a.writemask = dst.writemask;
1233
1234 emit(MUL(y_times_a, y, a));
1235 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1236 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1237 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1238 }
1239 }
1240
1241 void
1242 vec4_visitor::visit(ir_expression *ir)
1243 {
1244 unsigned int operand;
1245 src_reg op[Elements(ir->operands)];
1246 src_reg result_src;
1247 dst_reg result_dst;
1248 vec4_instruction *inst;
1249
1250 if (ir->operation == ir_binop_add) {
1251 if (try_emit_mad(ir))
1252 return;
1253 }
1254
1255 if (ir->operation == ir_unop_b2f) {
1256 if (try_emit_b2f_of_compare(ir))
1257 return;
1258 }
1259
1260 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1261 this->result.file = BAD_FILE;
1262 ir->operands[operand]->accept(this);
1263 if (this->result.file == BAD_FILE) {
1264 fprintf(stderr, "Failed to get tree for expression operand:\n");
1265 ir->operands[operand]->fprint(stderr);
1266 exit(1);
1267 }
1268 op[operand] = this->result;
1269
1270 /* Matrix expression operands should have been broken down to vector
1271 * operations already.
1272 */
1273 assert(!ir->operands[operand]->type->is_matrix());
1274 }
1275
1276 int vector_elements = ir->operands[0]->type->vector_elements;
1277 if (ir->operands[1]) {
1278 vector_elements = MAX2(vector_elements,
1279 ir->operands[1]->type->vector_elements);
1280 }
1281
1282 this->result.file = BAD_FILE;
1283
1284 /* Storage for our result. Ideally for an assignment we'd be using
1285 * the actual storage for the result here, instead.
1286 */
1287 result_src = src_reg(this, ir->type);
1288 /* convenience for the emit functions below. */
1289 result_dst = dst_reg(result_src);
1290 /* If nothing special happens, this is the result. */
1291 this->result = result_src;
1292 /* Limit writes to the channels that will be used by result_src later.
1293 * This does limit this temp's use as a temporary for multi-instruction
1294 * sequences.
1295 */
1296 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1297
1298 switch (ir->operation) {
1299 case ir_unop_logic_not:
1300 if (ctx->Const.UniformBooleanTrue != 1) {
1301 emit(NOT(result_dst, op[0]));
1302 } else {
1303 emit(XOR(result_dst, op[0], src_reg(1)));
1304 }
1305 break;
1306 case ir_unop_neg:
1307 op[0].negate = !op[0].negate;
1308 emit(MOV(result_dst, op[0]));
1309 break;
1310 case ir_unop_abs:
1311 op[0].abs = true;
1312 op[0].negate = false;
1313 emit(MOV(result_dst, op[0]));
1314 break;
1315
1316 case ir_unop_sign:
1317 if (ir->type->is_float()) {
1318 /* AND(val, 0x80000000) gives the sign bit.
1319 *
1320 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1321 * zero.
1322 */
1323 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1324
1325 op[0].type = BRW_REGISTER_TYPE_UD;
1326 result_dst.type = BRW_REGISTER_TYPE_UD;
1327 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1328
1329 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1330 inst->predicate = BRW_PREDICATE_NORMAL;
1331
1332 this->result.type = BRW_REGISTER_TYPE_F;
1333 } else {
1334 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1335 * -> non-negative val generates 0x00000000.
1336 * Predicated OR sets 1 if val is positive.
1337 */
1338 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1339
1340 emit(ASR(result_dst, op[0], src_reg(31)));
1341
1342 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1343 inst->predicate = BRW_PREDICATE_NORMAL;
1344 }
1345 break;
1346
1347 case ir_unop_rcp:
1348 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1349 break;
1350
1351 case ir_unop_exp2:
1352 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1353 break;
1354 case ir_unop_log2:
1355 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1356 break;
1357 case ir_unop_exp:
1358 case ir_unop_log:
1359 unreachable("not reached: should be handled by ir_explog_to_explog2");
1360 case ir_unop_sin:
1361 case ir_unop_sin_reduced:
1362 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1363 break;
1364 case ir_unop_cos:
1365 case ir_unop_cos_reduced:
1366 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1367 break;
1368
1369 case ir_unop_dFdx:
1370 case ir_unop_dFdx_coarse:
1371 case ir_unop_dFdx_fine:
1372 case ir_unop_dFdy:
1373 case ir_unop_dFdy_coarse:
1374 case ir_unop_dFdy_fine:
1375 unreachable("derivatives not valid in vertex shader");
1376
1377 case ir_unop_bitfield_reverse:
1378 emit(BFREV(result_dst, op[0]));
1379 break;
1380 case ir_unop_bit_count:
1381 emit(CBIT(result_dst, op[0]));
1382 break;
1383 case ir_unop_find_msb: {
1384 src_reg temp = src_reg(this, glsl_type::uint_type);
1385
1386 inst = emit(FBH(dst_reg(temp), op[0]));
1387 inst->dst.writemask = WRITEMASK_XYZW;
1388
1389 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1390 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1391 * subtract the result from 31 to convert the MSB count into an LSB count.
1392 */
1393
1394 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1395 temp.swizzle = BRW_SWIZZLE_NOOP;
1396 emit(MOV(result_dst, temp));
1397
1398 src_reg src_tmp = src_reg(result_dst);
1399 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1400
1401 src_tmp.negate = true;
1402 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1403 inst->predicate = BRW_PREDICATE_NORMAL;
1404 break;
1405 }
1406 case ir_unop_find_lsb:
1407 emit(FBL(result_dst, op[0]));
1408 break;
1409 case ir_unop_saturate:
1410 inst = emit(MOV(result_dst, op[0]));
1411 inst->saturate = true;
1412 break;
1413
1414 case ir_unop_noise:
1415 unreachable("not reached: should be handled by lower_noise");
1416
1417 case ir_binop_add:
1418 emit(ADD(result_dst, op[0], op[1]));
1419 break;
1420 case ir_binop_sub:
1421 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1422
1423 case ir_binop_mul:
1424 if (brw->gen < 8 && ir->type->is_integer()) {
1425 /* For integer multiplication, the MUL uses the low 16 bits of one of
1426 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1427 * accumulates in the contribution of the upper 16 bits of that
1428 * operand. If we can determine that one of the args is in the low
1429 * 16 bits, though, we can just emit a single MUL.
1430 */
1431 if (ir->operands[0]->is_uint16_constant()) {
1432 if (brw->gen < 7)
1433 emit(MUL(result_dst, op[0], op[1]));
1434 else
1435 emit(MUL(result_dst, op[1], op[0]));
1436 } else if (ir->operands[1]->is_uint16_constant()) {
1437 if (brw->gen < 7)
1438 emit(MUL(result_dst, op[1], op[0]));
1439 else
1440 emit(MUL(result_dst, op[0], op[1]));
1441 } else {
1442 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1443
1444 emit(MUL(acc, op[0], op[1]));
1445 emit(MACH(dst_null_d(), op[0], op[1]));
1446 emit(MOV(result_dst, src_reg(acc)));
1447 }
1448 } else {
1449 emit(MUL(result_dst, op[0], op[1]));
1450 }
1451 break;
1452 case ir_binop_imul_high: {
1453 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1454
1455 emit(MUL(acc, op[0], op[1]));
1456 emit(MACH(result_dst, op[0], op[1]));
1457 break;
1458 }
1459 case ir_binop_div:
1460 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1461 assert(ir->type->is_integer());
1462 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1463 break;
1464 case ir_binop_carry: {
1465 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1466
1467 emit(ADDC(dst_null_ud(), op[0], op[1]));
1468 emit(MOV(result_dst, src_reg(acc)));
1469 break;
1470 }
1471 case ir_binop_borrow: {
1472 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1473
1474 emit(SUBB(dst_null_ud(), op[0], op[1]));
1475 emit(MOV(result_dst, src_reg(acc)));
1476 break;
1477 }
1478 case ir_binop_mod:
1479 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1480 assert(ir->type->is_integer());
1481 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1482 break;
1483
1484 case ir_binop_less:
1485 case ir_binop_greater:
1486 case ir_binop_lequal:
1487 case ir_binop_gequal:
1488 case ir_binop_equal:
1489 case ir_binop_nequal: {
1490 emit(CMP(result_dst, op[0], op[1],
1491 brw_conditional_for_comparison(ir->operation)));
1492 if (ctx->Const.UniformBooleanTrue == 1) {
1493 emit(AND(result_dst, result_src, src_reg(1)));
1494 }
1495 break;
1496 }
1497
1498 case ir_binop_all_equal:
1499 /* "==" operator producing a scalar boolean. */
1500 if (ir->operands[0]->type->is_vector() ||
1501 ir->operands[1]->type->is_vector()) {
1502 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1503 emit(MOV(result_dst, src_reg(0)));
1504 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1505 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1506 } else {
1507 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1508 if (ctx->Const.UniformBooleanTrue == 1) {
1509 emit(AND(result_dst, result_src, src_reg(1)));
1510 }
1511 }
1512 break;
1513 case ir_binop_any_nequal:
1514 /* "!=" operator producing a scalar boolean. */
1515 if (ir->operands[0]->type->is_vector() ||
1516 ir->operands[1]->type->is_vector()) {
1517 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1518
1519 emit(MOV(result_dst, src_reg(0)));
1520 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1521 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1522 } else {
1523 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1524 if (ctx->Const.UniformBooleanTrue == 1) {
1525 emit(AND(result_dst, result_src, src_reg(1)));
1526 }
1527 }
1528 break;
1529
1530 case ir_unop_any:
1531 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1532 emit(MOV(result_dst, src_reg(0)));
1533
1534 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1535 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1536 break;
1537
1538 case ir_binop_logic_xor:
1539 emit(XOR(result_dst, op[0], op[1]));
1540 break;
1541
1542 case ir_binop_logic_or:
1543 emit(OR(result_dst, op[0], op[1]));
1544 break;
1545
1546 case ir_binop_logic_and:
1547 emit(AND(result_dst, op[0], op[1]));
1548 break;
1549
1550 case ir_binop_dot:
1551 assert(ir->operands[0]->type->is_vector());
1552 assert(ir->operands[0]->type == ir->operands[1]->type);
1553 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1554 break;
1555
1556 case ir_unop_sqrt:
1557 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1558 break;
1559 case ir_unop_rsq:
1560 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1561 break;
1562
1563 case ir_unop_bitcast_i2f:
1564 case ir_unop_bitcast_u2f:
1565 this->result = op[0];
1566 this->result.type = BRW_REGISTER_TYPE_F;
1567 break;
1568
1569 case ir_unop_bitcast_f2i:
1570 this->result = op[0];
1571 this->result.type = BRW_REGISTER_TYPE_D;
1572 break;
1573
1574 case ir_unop_bitcast_f2u:
1575 this->result = op[0];
1576 this->result.type = BRW_REGISTER_TYPE_UD;
1577 break;
1578
1579 case ir_unop_i2f:
1580 case ir_unop_i2u:
1581 case ir_unop_u2i:
1582 case ir_unop_u2f:
1583 case ir_unop_f2i:
1584 case ir_unop_f2u:
1585 emit(MOV(result_dst, op[0]));
1586 break;
1587 case ir_unop_b2i:
1588 if (ctx->Const.UniformBooleanTrue != 1) {
1589 emit(AND(result_dst, op[0], src_reg(1)));
1590 } else {
1591 emit(MOV(result_dst, op[0]));
1592 }
1593 break;
1594 case ir_unop_b2f:
1595 if (ctx->Const.UniformBooleanTrue != 1) {
1596 op[0].type = BRW_REGISTER_TYPE_UD;
1597 result_dst.type = BRW_REGISTER_TYPE_UD;
1598 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1599 result_dst.type = BRW_REGISTER_TYPE_F;
1600 } else {
1601 emit(MOV(result_dst, op[0]));
1602 }
1603 break;
1604 case ir_unop_f2b:
1605 case ir_unop_i2b:
1606 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1607 if (ctx->Const.UniformBooleanTrue == 1) {
1608 emit(AND(result_dst, result_src, src_reg(1)));
1609 }
1610 break;
1611
1612 case ir_unop_trunc:
1613 emit(RNDZ(result_dst, op[0]));
1614 break;
1615 case ir_unop_ceil:
1616 op[0].negate = !op[0].negate;
1617 inst = emit(RNDD(result_dst, op[0]));
1618 this->result.negate = true;
1619 break;
1620 case ir_unop_floor:
1621 inst = emit(RNDD(result_dst, op[0]));
1622 break;
1623 case ir_unop_fract:
1624 inst = emit(FRC(result_dst, op[0]));
1625 break;
1626 case ir_unop_round_even:
1627 emit(RNDE(result_dst, op[0]));
1628 break;
1629
1630 case ir_binop_min:
1631 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1632 break;
1633 case ir_binop_max:
1634 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1635 break;
1636
1637 case ir_binop_pow:
1638 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1639 break;
1640
1641 case ir_unop_bit_not:
1642 inst = emit(NOT(result_dst, op[0]));
1643 break;
1644 case ir_binop_bit_and:
1645 inst = emit(AND(result_dst, op[0], op[1]));
1646 break;
1647 case ir_binop_bit_xor:
1648 inst = emit(XOR(result_dst, op[0], op[1]));
1649 break;
1650 case ir_binop_bit_or:
1651 inst = emit(OR(result_dst, op[0], op[1]));
1652 break;
1653
1654 case ir_binop_lshift:
1655 inst = emit(SHL(result_dst, op[0], op[1]));
1656 break;
1657
1658 case ir_binop_rshift:
1659 if (ir->type->base_type == GLSL_TYPE_INT)
1660 inst = emit(ASR(result_dst, op[0], op[1]));
1661 else
1662 inst = emit(SHR(result_dst, op[0], op[1]));
1663 break;
1664
1665 case ir_binop_bfm:
1666 emit(BFI1(result_dst, op[0], op[1]));
1667 break;
1668
1669 case ir_binop_ubo_load: {
1670 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1671 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1672 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1673 src_reg offset;
1674
1675 /* Now, load the vector from that offset. */
1676 assert(ir->type->is_vector() || ir->type->is_scalar());
1677
1678 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1679 packed_consts.type = result.type;
1680 src_reg surf_index;
1681
1682 if (const_uniform_block) {
1683 /* The block index is a constant, so just emit the binding table entry
1684 * as an immediate.
1685 */
1686 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1687 const_uniform_block->value.u[0]);
1688 } else {
1689 /* The block index is not a constant. Evaluate the index expression
1690 * per-channel and add the base UBO index; the generator will select
1691 * a value from any live channel.
1692 */
1693 surf_index = src_reg(this, glsl_type::uint_type);
1694 emit(ADD(dst_reg(surf_index), op[0],
1695 src_reg(prog_data->base.binding_table.ubo_start)));
1696
1697 /* Assume this may touch any UBO. It would be nice to provide
1698 * a tighter bound, but the array information is already lowered away.
1699 */
1700 brw_mark_surface_used(&prog_data->base,
1701 prog_data->base.binding_table.ubo_start +
1702 shader_prog->NumUniformBlocks - 1);
1703 }
1704
1705 if (const_offset_ir) {
1706 if (brw->gen >= 8) {
1707 /* Store the offset in a GRF so we can send-from-GRF. */
1708 offset = src_reg(this, glsl_type::int_type);
1709 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1710 } else {
1711 /* Immediates are fine on older generations since they'll be moved
1712 * to a (potentially fake) MRF at the generator level.
1713 */
1714 offset = src_reg(const_offset / 16);
1715 }
1716 } else {
1717 offset = src_reg(this, glsl_type::uint_type);
1718 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1719 }
1720
1721 if (brw->gen >= 7) {
1722 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1723 grf_offset.type = offset.type;
1724
1725 emit(MOV(grf_offset, offset));
1726
1727 emit(new(mem_ctx) vec4_instruction(this,
1728 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1729 dst_reg(packed_consts),
1730 surf_index,
1731 src_reg(grf_offset)));
1732 } else {
1733 vec4_instruction *pull =
1734 emit(new(mem_ctx) vec4_instruction(this,
1735 VS_OPCODE_PULL_CONSTANT_LOAD,
1736 dst_reg(packed_consts),
1737 surf_index,
1738 offset));
1739 pull->base_mrf = 14;
1740 pull->mlen = 1;
1741 }
1742
1743 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1744 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1745 const_offset % 16 / 4,
1746 const_offset % 16 / 4,
1747 const_offset % 16 / 4);
1748
1749 /* UBO bools are any nonzero int. We need to convert them to use the
1750 * value of true stored in ctx->Const.UniformBooleanTrue.
1751 */
1752 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1753 emit(CMP(result_dst, packed_consts, src_reg(0u),
1754 BRW_CONDITIONAL_NZ));
1755 if (ctx->Const.UniformBooleanTrue == 1) {
1756 emit(AND(result_dst, result, src_reg(1)));
1757 }
1758 } else {
1759 emit(MOV(result_dst, packed_consts));
1760 }
1761 break;
1762 }
1763
1764 case ir_binop_vector_extract:
1765 unreachable("should have been lowered by vec_index_to_cond_assign");
1766
1767 case ir_triop_fma:
1768 op[0] = fix_3src_operand(op[0]);
1769 op[1] = fix_3src_operand(op[1]);
1770 op[2] = fix_3src_operand(op[2]);
1771 /* Note that the instruction's argument order is reversed from GLSL
1772 * and the IR.
1773 */
1774 emit(MAD(result_dst, op[2], op[1], op[0]));
1775 break;
1776
1777 case ir_triop_lrp:
1778 emit_lrp(result_dst, op[0], op[1], op[2]);
1779 break;
1780
1781 case ir_triop_csel:
1782 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1783 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1784 inst->predicate = BRW_PREDICATE_NORMAL;
1785 break;
1786
1787 case ir_triop_bfi:
1788 op[0] = fix_3src_operand(op[0]);
1789 op[1] = fix_3src_operand(op[1]);
1790 op[2] = fix_3src_operand(op[2]);
1791 emit(BFI2(result_dst, op[0], op[1], op[2]));
1792 break;
1793
1794 case ir_triop_bitfield_extract:
1795 op[0] = fix_3src_operand(op[0]);
1796 op[1] = fix_3src_operand(op[1]);
1797 op[2] = fix_3src_operand(op[2]);
1798 /* Note that the instruction's argument order is reversed from GLSL
1799 * and the IR.
1800 */
1801 emit(BFE(result_dst, op[2], op[1], op[0]));
1802 break;
1803
1804 case ir_triop_vector_insert:
1805 unreachable("should have been lowered by lower_vector_insert");
1806
1807 case ir_quadop_bitfield_insert:
1808 unreachable("not reached: should be handled by "
1809 "bitfield_insert_to_bfm_bfi\n");
1810
1811 case ir_quadop_vector:
1812 unreachable("not reached: should be handled by lower_quadop_vector");
1813
1814 case ir_unop_pack_half_2x16:
1815 emit_pack_half_2x16(result_dst, op[0]);
1816 break;
1817 case ir_unop_unpack_half_2x16:
1818 emit_unpack_half_2x16(result_dst, op[0]);
1819 break;
1820 case ir_unop_pack_snorm_2x16:
1821 case ir_unop_pack_snorm_4x8:
1822 case ir_unop_pack_unorm_2x16:
1823 case ir_unop_pack_unorm_4x8:
1824 case ir_unop_unpack_snorm_2x16:
1825 case ir_unop_unpack_snorm_4x8:
1826 case ir_unop_unpack_unorm_2x16:
1827 case ir_unop_unpack_unorm_4x8:
1828 unreachable("not reached: should be handled by lower_packing_builtins");
1829 case ir_unop_unpack_half_2x16_split_x:
1830 case ir_unop_unpack_half_2x16_split_y:
1831 case ir_binop_pack_half_2x16_split:
1832 case ir_unop_interpolate_at_centroid:
1833 case ir_binop_interpolate_at_sample:
1834 case ir_binop_interpolate_at_offset:
1835 unreachable("not reached: should not occur in vertex shader");
1836 case ir_binop_ldexp:
1837 unreachable("not reached: should be handled by ldexp_to_arith()");
1838 }
1839 }
1840
1841
1842 void
1843 vec4_visitor::visit(ir_swizzle *ir)
1844 {
1845 src_reg src;
1846 int i = 0;
1847 int swizzle[4];
1848
1849 /* Note that this is only swizzles in expressions, not those on the left
1850 * hand side of an assignment, which do write masking. See ir_assignment
1851 * for that.
1852 */
1853
1854 ir->val->accept(this);
1855 src = this->result;
1856 assert(src.file != BAD_FILE);
1857
1858 for (i = 0; i < ir->type->vector_elements; i++) {
1859 switch (i) {
1860 case 0:
1861 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1862 break;
1863 case 1:
1864 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1865 break;
1866 case 2:
1867 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1868 break;
1869 case 3:
1870 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1871 break;
1872 }
1873 }
1874 for (; i < 4; i++) {
1875 /* Replicate the last channel out. */
1876 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1877 }
1878
1879 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1880
1881 this->result = src;
1882 }
1883
1884 void
1885 vec4_visitor::visit(ir_dereference_variable *ir)
1886 {
1887 const struct glsl_type *type = ir->type;
1888 dst_reg *reg = variable_storage(ir->var);
1889
1890 if (!reg) {
1891 fail("Failed to find variable storage for %s\n", ir->var->name);
1892 this->result = src_reg(brw_null_reg());
1893 return;
1894 }
1895
1896 this->result = src_reg(*reg);
1897
1898 /* System values get their swizzle from the dst_reg writemask */
1899 if (ir->var->data.mode == ir_var_system_value)
1900 return;
1901
1902 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1903 this->result.swizzle = swizzle_for_size(type->vector_elements);
1904 }
1905
1906
1907 int
1908 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1909 {
1910 /* Under normal circumstances array elements are stored consecutively, so
1911 * the stride is equal to the size of the array element.
1912 */
1913 return type_size(ir->type);
1914 }
1915
1916
1917 void
1918 vec4_visitor::visit(ir_dereference_array *ir)
1919 {
1920 ir_constant *constant_index;
1921 src_reg src;
1922 int array_stride = compute_array_stride(ir);
1923
1924 constant_index = ir->array_index->constant_expression_value();
1925
1926 ir->array->accept(this);
1927 src = this->result;
1928
1929 if (constant_index) {
1930 src.reg_offset += constant_index->value.i[0] * array_stride;
1931 } else {
1932 /* Variable index array dereference. It eats the "vec4" of the
1933 * base of the array and an index that offsets the Mesa register
1934 * index.
1935 */
1936 ir->array_index->accept(this);
1937
1938 src_reg index_reg;
1939
1940 if (array_stride == 1) {
1941 index_reg = this->result;
1942 } else {
1943 index_reg = src_reg(this, glsl_type::int_type);
1944
1945 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1946 }
1947
1948 if (src.reladdr) {
1949 src_reg temp = src_reg(this, glsl_type::int_type);
1950
1951 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1952
1953 index_reg = temp;
1954 }
1955
1956 src.reladdr = ralloc(mem_ctx, src_reg);
1957 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1958 }
1959
1960 /* If the type is smaller than a vec4, replicate the last channel out. */
1961 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1962 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1963 else
1964 src.swizzle = BRW_SWIZZLE_NOOP;
1965 src.type = brw_type_for_base_type(ir->type);
1966
1967 this->result = src;
1968 }
1969
1970 void
1971 vec4_visitor::visit(ir_dereference_record *ir)
1972 {
1973 unsigned int i;
1974 const glsl_type *struct_type = ir->record->type;
1975 int offset = 0;
1976
1977 ir->record->accept(this);
1978
1979 for (i = 0; i < struct_type->length; i++) {
1980 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1981 break;
1982 offset += type_size(struct_type->fields.structure[i].type);
1983 }
1984
1985 /* If the type is smaller than a vec4, replicate the last channel out. */
1986 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1987 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1988 else
1989 this->result.swizzle = BRW_SWIZZLE_NOOP;
1990 this->result.type = brw_type_for_base_type(ir->type);
1991
1992 this->result.reg_offset += offset;
1993 }
1994
1995 /**
1996 * We want to be careful in assignment setup to hit the actual storage
1997 * instead of potentially using a temporary like we might with the
1998 * ir_dereference handler.
1999 */
2000 static dst_reg
2001 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2002 {
2003 /* The LHS must be a dereference. If the LHS is a variable indexed array
2004 * access of a vector, it must be separated into a series of conditional moves
2005 * before reaching this point (see ir_vec_index_to_cond_assign).
2006 */
2007 assert(ir->as_dereference());
2008 ir_dereference_array *deref_array = ir->as_dereference_array();
2009 if (deref_array) {
2010 assert(!deref_array->array->type->is_vector());
2011 }
2012
2013 /* Use the rvalue deref handler for the most part. We'll ignore
2014 * swizzles in it and write swizzles using writemask, though.
2015 */
2016 ir->accept(v);
2017 return dst_reg(v->result);
2018 }
2019
2020 void
2021 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2022 const struct glsl_type *type,
2023 enum brw_predicate predicate)
2024 {
2025 if (type->base_type == GLSL_TYPE_STRUCT) {
2026 for (unsigned int i = 0; i < type->length; i++) {
2027 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2028 }
2029 return;
2030 }
2031
2032 if (type->is_array()) {
2033 for (unsigned int i = 0; i < type->length; i++) {
2034 emit_block_move(dst, src, type->fields.array, predicate);
2035 }
2036 return;
2037 }
2038
2039 if (type->is_matrix()) {
2040 const struct glsl_type *vec_type;
2041
2042 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2043 type->vector_elements, 1);
2044
2045 for (int i = 0; i < type->matrix_columns; i++) {
2046 emit_block_move(dst, src, vec_type, predicate);
2047 }
2048 return;
2049 }
2050
2051 assert(type->is_scalar() || type->is_vector());
2052
2053 dst->type = brw_type_for_base_type(type);
2054 src->type = dst->type;
2055
2056 dst->writemask = (1 << type->vector_elements) - 1;
2057
2058 src->swizzle = swizzle_for_size(type->vector_elements);
2059
2060 vec4_instruction *inst = emit(MOV(*dst, *src));
2061 inst->predicate = predicate;
2062
2063 dst->reg_offset++;
2064 src->reg_offset++;
2065 }
2066
2067
2068 /* If the RHS processing resulted in an instruction generating a
2069 * temporary value, and it would be easy to rewrite the instruction to
2070 * generate its result right into the LHS instead, do so. This ends
2071 * up reliably removing instructions where it can be tricky to do so
2072 * later without real UD chain information.
2073 */
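/* Added illustration: this is what lets a sequence like
 * "ADD tmp, a, b; MOV dst, tmp" collapse into "ADD dst, a, b" without
 * needing later copy propagation.
 */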
2074 bool
2075 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2076 dst_reg dst,
2077 src_reg src,
2078 vec4_instruction *pre_rhs_inst,
2079 vec4_instruction *last_rhs_inst)
2080 {
2081 /* This could be supported, but it would take more smarts. */
2082 if (ir->condition)
2083 return false;
2084
2085 if (pre_rhs_inst == last_rhs_inst)
2086 return false; /* No instructions generated to work with. */
2087
2088 /* Make sure the last instruction generated our source reg. */
2089 if (src.file != GRF ||
2090 src.file != last_rhs_inst->dst.file ||
2091 src.reg != last_rhs_inst->dst.reg ||
2092 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2093 src.reladdr ||
2094 src.abs ||
2095 src.negate ||
2096 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2097 return false;
2098
2099 /* Check that that last instruction fully initialized the channels
2100 * we want to use, in the order we want to use them. We could
2101 * potentially reswizzle the operands of many instructions so that
2102 * we could handle out of order channels, but don't yet.
2103 */
2104
2105 for (unsigned i = 0; i < 4; i++) {
2106 if (dst.writemask & (1 << i)) {
2107 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2108 return false;
2109
2110 if (BRW_GET_SWZ(src.swizzle, i) != i)
2111 return false;
2112 }
2113 }
2114
2115 /* Success! Rewrite the instruction. */
2116 last_rhs_inst->dst.file = dst.file;
2117 last_rhs_inst->dst.reg = dst.reg;
2118 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2119 last_rhs_inst->dst.reladdr = dst.reladdr;
2120 last_rhs_inst->dst.writemask &= dst.writemask;
2121
2122 return true;
2123 }
2124
2125 void
2126 vec4_visitor::visit(ir_assignment *ir)
2127 {
2128 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2129 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2130
2131 if (!ir->lhs->type->is_scalar() &&
2132 !ir->lhs->type->is_vector()) {
2133 ir->rhs->accept(this);
2134 src_reg src = this->result;
2135
2136 if (ir->condition) {
2137 emit_bool_to_cond_code(ir->condition, &predicate);
2138 }
2139
2140 /* emit_block_move doesn't account for swizzles in the source register.
2141 * This should be ok, since the source register is a structure or an
2142 * array, and those can't be swizzled. But double-check to be sure.
2143 */
2144 assert(src.swizzle ==
2145 (ir->rhs->type->is_matrix()
2146 ? swizzle_for_size(ir->rhs->type->vector_elements)
2147 : BRW_SWIZZLE_NOOP));
2148
2149 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2150 return;
2151 }
2152
2153 /* Now we're down to just a scalar/vector with writemasks. */
2154 int i;
2155
2156 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2157 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2158
2159 ir->rhs->accept(this);
2160
2161 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2162
2163 src_reg src = this->result;
2164
2165 int swizzles[4];
2166 int first_enabled_chan = 0;
2167 int src_chan = 0;
2168
2169 assert(ir->lhs->type->is_vector() ||
2170 ir->lhs->type->is_scalar());
2171 dst.writemask = ir->write_mask;
2172
2173 for (int i = 0; i < 4; i++) {
2174 if (dst.writemask & (1 << i)) {
2175 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2176 break;
2177 }
2178 }
2179
2180 /* Swizzle a small RHS vector into the channels being written.
2181 *
2182 * glsl ir treats write_mask as dictating how many channels are
2183 * present on the RHS while in our instructions we need to make
2184 * those channels appear in the slots of the vec4 they're written to.
2185 */
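/* Added worked example: for "v.zw = u" with u a vec2 arriving with the
 * usual XYYY small-vector swizzle, the loop below builds {Y, Y, X, Y}, so
 * the RHS's X and Y land in the destination's Z and W slots and the
 * unwritten channels just repeat an initialized component.
 */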
2186 for (int i = 0; i < 4; i++) {
2187 if (dst.writemask & (1 << i))
2188 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2189 else
2190 swizzles[i] = first_enabled_chan;
2191 }
2192 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2193 swizzles[2], swizzles[3]);
2194
2195 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2196 return;
2197 }
2198
2199 if (ir->condition) {
2200 emit_bool_to_cond_code(ir->condition, &predicate);
2201 }
2202
2203 for (i = 0; i < type_size(ir->lhs->type); i++) {
2204 vec4_instruction *inst = emit(MOV(dst, src));
2205 inst->predicate = predicate;
2206
2207 dst.reg_offset++;
2208 src.reg_offset++;
2209 }
2210 }
2211
2212 void
2213 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2214 {
2215 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2216 foreach_in_list(ir_constant, field_value, &ir->components) {
2217 emit_constant_values(dst, field_value);
2218 }
2219 return;
2220 }
2221
2222 if (ir->type->is_array()) {
2223 for (unsigned int i = 0; i < ir->type->length; i++) {
2224 emit_constant_values(dst, ir->array_elements[i]);
2225 }
2226 return;
2227 }
2228
2229 if (ir->type->is_matrix()) {
2230 for (int i = 0; i < ir->type->matrix_columns; i++) {
2231 float *vec = &ir->value.f[i * ir->type->vector_elements];
2232
2233 for (int j = 0; j < ir->type->vector_elements; j++) {
2234 dst->writemask = 1 << j;
2235 dst->type = BRW_REGISTER_TYPE_F;
2236
2237 emit(MOV(*dst, src_reg(vec[j])));
2238 }
2239 dst->reg_offset++;
2240 }
2241 return;
2242 }
2243
2244 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2245
2246 for (int i = 0; i < ir->type->vector_elements; i++) {
2247 if (!(remaining_writemask & (1 << i)))
2248 continue;
2249
2250 dst->writemask = 1 << i;
2251 dst->type = brw_type_for_base_type(ir->type);
2252
2253 /* Find other components that match the one we're about to
2254 * write. Emits fewer instructions for things like vec4(0.5,
2255 * 1.5, 1.5, 1.5).
2256 */
2257 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2258 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2259 if (ir->value.b[i] == ir->value.b[j])
2260 dst->writemask |= (1 << j);
2261 } else {
2262 /* u, i, and f storage all line up, so no need for a
2263 * switch case for comparing each type.
2264 */
2265 if (ir->value.u[i] == ir->value.u[j])
2266 dst->writemask |= (1 << j);
2267 }
2268 }
2269
2270 switch (ir->type->base_type) {
2271 case GLSL_TYPE_FLOAT:
2272 emit(MOV(*dst, src_reg(ir->value.f[i])));
2273 break;
2274 case GLSL_TYPE_INT:
2275 emit(MOV(*dst, src_reg(ir->value.i[i])));
2276 break;
2277 case GLSL_TYPE_UINT:
2278 emit(MOV(*dst, src_reg(ir->value.u[i])));
2279 break;
2280 case GLSL_TYPE_BOOL:
2281 emit(MOV(*dst,
2282 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2283 : 0)));
2284 break;
2285 default:
2286 unreachable("Non-float/uint/int/bool constant");
2287 }
2288
2289 remaining_writemask &= ~dst->writemask;
2290 }
2291 dst->reg_offset++;
2292 }
2293
2294 void
2295 vec4_visitor::visit(ir_constant *ir)
2296 {
2297 dst_reg dst = dst_reg(this, ir->type);
2298 this->result = src_reg(dst);
2299
2300 emit_constant_values(&dst, ir);
2301 }
2302
2303 void
2304 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2305 {
2306 ir_dereference *deref = static_cast<ir_dereference *>(
2307 ir->actual_parameters.get_head());
2308 ir_variable *location = deref->variable_referenced();
2309 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2310 location->data.binding);
2311
2312 /* Calculate the surface offset */
2313 src_reg offset(this, glsl_type::uint_type);
2314 ir_dereference_array *deref_array = deref->as_dereference_array();
2315 if (deref_array) {
2316 deref_array->array_index->accept(this);
2317
2318 src_reg tmp(this, glsl_type::uint_type);
2319 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2320 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2321 } else {
2322 offset = location->data.atomic.offset;
2323 }
2324
2325 /* Emit the appropriate machine instruction */
2326 const char *callee = ir->callee->function_name();
2327 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2328
2329 if (!strcmp("__intrinsic_atomic_read", callee)) {
2330 emit_untyped_surface_read(surf_index, dst, offset);
2331
2332 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2333 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2334 src_reg(), src_reg());
2335
2336 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2337 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2338 src_reg(), src_reg());
2339 }
2340 }
2341
2342 void
2343 vec4_visitor::visit(ir_call *ir)
2344 {
2345 const char *callee = ir->callee->function_name();
2346
2347 if (!strcmp("__intrinsic_atomic_read", callee) ||
2348 !strcmp("__intrinsic_atomic_increment", callee) ||
2349 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2350 visit_atomic_counter_intrinsic(ir);
2351 } else {
2352 unreachable("Unsupported intrinsic.");
2353 }
2354 }
2355
2356 src_reg
2357 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2358 {
2359 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2360 inst->base_mrf = 2;
2361 inst->mlen = 1;
2362 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2363 inst->dst.writemask = WRITEMASK_XYZW;
2364
2365 inst->src[1] = sampler;
2366
2367 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2368 int param_base = inst->base_mrf;
2369 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2370 int zero_mask = 0xf & ~coord_mask;
2371
2372 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2373 coordinate));
2374
2375 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2376 src_reg(0)));
2377
2378 emit(inst);
2379 return src_reg(inst->dst);
2380 }
2381
2382 static bool
2383 is_high_sampler(struct brw_context *brw, src_reg sampler)
2384 {
2385 if (brw->gen < 8 && !brw->is_haswell)
2386 return false;
2387
2388 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2389 }
2390
2391 void
2392 vec4_visitor::visit(ir_texture *ir)
2393 {
2394 uint32_t sampler =
2395 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2396
2397 ir_rvalue *nonconst_sampler_index =
2398 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2399
2400 /* Handle non-constant sampler array indexing */
2401 src_reg sampler_reg;
2402 if (nonconst_sampler_index) {
2403 /* The highest sampler which may be used by this operation is
2404 * the last element of the array. Mark it here, because the generator
2405 * doesn't have enough information to determine the bound.
2406 */
2407 uint32_t array_size = ir->sampler->as_dereference_array()
2408 ->array->type->array_size();
2409
2410 uint32_t max_used = sampler + array_size - 1;
2411 if (ir->op == ir_tg4 && brw->gen < 8) {
2412 max_used += prog_data->base.binding_table.gather_texture_start;
2413 } else {
2414 max_used += prog_data->base.binding_table.texture_start;
2415 }
2416
2417 brw_mark_surface_used(&prog_data->base, max_used);
2418
2419 /* Emit code to evaluate the actual indexing expression */
2420 nonconst_sampler_index->accept(this);
2421 dst_reg temp(this, glsl_type::uint_type);
2422 emit(ADD(temp, this->result, src_reg(sampler)))
2423 ->force_writemask_all = true;
2424 sampler_reg = src_reg(temp);
2425 } else {
2426 /* Single sampler, or constant array index; the indexing expression
2427 * is just an immediate.
2428 */
2429 sampler_reg = src_reg(sampler);
2430 }
2431
2432 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2433 * emitting anything other than setting up the constant result.
2434 */
2435 if (ir->op == ir_tg4) {
2436 ir_constant *chan = ir->lod_info.component->as_constant();
2437 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2438 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2439 dst_reg result(this, ir->type);
2440 this->result = src_reg(result);
2441 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2442 return;
2443 }
2444 }
2445
2446 /* Should be lowered by do_lower_texture_projection */
2447 assert(!ir->projector);
2448
2449 /* Should be lowered */
2450 assert(!ir->offset || !ir->offset->type->is_array());
2451
2452 /* Generate code to compute all the subexpression trees. This has to be
2453 * done before loading any values into MRFs for the sampler message since
2454 * generating these values may involve SEND messages that need the MRFs.
2455 */
2456 src_reg coordinate;
2457 if (ir->coordinate) {
2458 ir->coordinate->accept(this);
2459 coordinate = this->result;
2460 }
2461
2462 src_reg shadow_comparitor;
2463 if (ir->shadow_comparitor) {
2464 ir->shadow_comparitor->accept(this);
2465 shadow_comparitor = this->result;
2466 }
2467
2468 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2469 src_reg offset_value;
2470 if (has_nonconstant_offset) {
2471 ir->offset->accept(this);
2472 offset_value = src_reg(this->result);
2473 }
2474
2475 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2476 src_reg lod, dPdx, dPdy, sample_index, mcs;
2477 switch (ir->op) {
2478 case ir_tex:
2479 lod = src_reg(0.0f);
2480 lod_type = glsl_type::float_type;
2481 break;
2482 case ir_txf:
2483 case ir_txl:
2484 case ir_txs:
2485 ir->lod_info.lod->accept(this);
2486 lod = this->result;
2487 lod_type = ir->lod_info.lod->type;
2488 break;
2489 case ir_query_levels:
2490 lod = src_reg(0);
2491 lod_type = glsl_type::int_type;
2492 break;
2493 case ir_txf_ms:
2494 ir->lod_info.sample_index->accept(this);
2495 sample_index = this->result;
2496 sample_index_type = ir->lod_info.sample_index->type;
2497
2498 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2499 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2500 else
2501 mcs = src_reg(0u);
2502 break;
2503 case ir_txd:
2504 ir->lod_info.grad.dPdx->accept(this);
2505 dPdx = this->result;
2506
2507 ir->lod_info.grad.dPdy->accept(this);
2508 dPdy = this->result;
2509
2510 lod_type = ir->lod_info.grad.dPdx->type;
2511 break;
2512 case ir_txb:
2513 case ir_lod:
2514 case ir_tg4:
2515 break;
2516 }
2517
2518 enum opcode opcode;
2519 switch (ir->op) {
2520 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2521 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2522 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2523 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2524 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2525 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2526 case ir_tg4: opcode = has_nonconstant_offset
2527 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2528 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2529 case ir_txb:
2530 unreachable("TXB is not valid for vertex shaders.");
2531 case ir_lod:
2532 unreachable("LOD is not valid for vertex shaders.");
2533 default:
2534 unreachable("Unrecognized tex op");
2535 }
2536
2537 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2538
2539 if (ir->offset != NULL && ir->op != ir_txf)
2540 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2541
2542 /* Stuff the channel select bits in the top of the texture offset */
2543 if (ir->op == ir_tg4)
2544 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2545
2546 /* The message header is necessary for:
2547 * - Gen4 (always)
2548 * - Texel offsets
2549 * - Gather channel selection
2550 * - Sampler indices too large to fit in a 4-bit value.
2551 */
2552 inst->header_present =
2553 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2554 is_high_sampler(brw, sampler_reg);
2555 inst->base_mrf = 2;
2556 inst->mlen = inst->header_present + 1; /* always at least one */
2557 inst->dst = dst_reg(this, ir->type);
2558 inst->dst.writemask = WRITEMASK_XYZW;
2559 inst->shadow_compare = ir->shadow_comparitor != NULL;
2560
2561 inst->src[1] = sampler_reg;
2562
2563 /* MRF for the first parameter */
2564 int param_base = inst->base_mrf + inst->header_present;
2565
2566 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2567 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2568 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2569 } else {
2570 /* Load the coordinate */
2571 /* FINISHME: gl_clamp_mask and saturate */
2572 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2573 int zero_mask = 0xf & ~coord_mask;
2574
2575 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2576 coordinate));
2577
2578 if (zero_mask != 0) {
2579 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2580 src_reg(0)));
2581 }
2582 /* Load the shadow comparitor */
2583 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2584 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2585 WRITEMASK_X),
2586 shadow_comparitor));
2587 inst->mlen++;
2588 }
2589
2590 /* Load the LOD info */
2591 if (ir->op == ir_tex || ir->op == ir_txl) {
2592 int mrf, writemask;
2593 if (brw->gen >= 5) {
2594 mrf = param_base + 1;
2595 if (ir->shadow_comparitor) {
2596 writemask = WRITEMASK_Y;
2597 /* mlen already incremented */
2598 } else {
2599 writemask = WRITEMASK_X;
2600 inst->mlen++;
2601 }
2602 } else /* brw->gen == 4 */ {
2603 mrf = param_base;
2604 writemask = WRITEMASK_W;
2605 }
2606 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2607 } else if (ir->op == ir_txf) {
2608 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2609 } else if (ir->op == ir_txf_ms) {
2610 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2611 sample_index));
2612 if (brw->gen >= 7) {
2613 /* MCS data is in the first channel of `mcs`, but we need to get it into
2614 * the .y channel of the second vec4 of params, so replicate .x across
2615 * the whole vec4 and then mask off everything except .y
2616 */
2617 mcs.swizzle = BRW_SWIZZLE_XXXX;
2618 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2619 mcs));
2620 }
2621 inst->mlen++;
2622 } else if (ir->op == ir_txd) {
2623 const glsl_type *type = lod_type;
2624
2625 if (brw->gen >= 5) {
2626 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2627 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2628 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2629 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2630 inst->mlen++;
2631
2632 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2633 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2634 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2635 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2636 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2637 inst->mlen++;
2638
2639 if (ir->shadow_comparitor) {
2640 emit(MOV(dst_reg(MRF, param_base + 2,
2641 ir->shadow_comparitor->type, WRITEMASK_Z),
2642 shadow_comparitor));
2643 }
2644 }
2645 } else /* brw->gen == 4 */ {
2646 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2647 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2648 inst->mlen += 2;
2649 }
2650 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2651 if (ir->shadow_comparitor) {
2652 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2653 shadow_comparitor));
2654 }
2655
2656 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2657 offset_value));
2658 inst->mlen++;
2659 }
2660 }
2661
2662 emit(inst);
2663
2664 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2665 * spec requires layers.
2666 */
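/* Added example: a cube map array with 4 layers makes the hardware report
 * 24 (6 faces * 4 layers) in .z; the INT_QUOTIENT by 6 below restores 4.
 */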
2667 if (ir->op == ir_txs) {
2668 glsl_type const *type = ir->sampler->type;
2669 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2670 type->sampler_array) {
2671 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2672 writemask(inst->dst, WRITEMASK_Z),
2673 src_reg(inst->dst), src_reg(6));
2674 }
2675 }
2676
2677 if (brw->gen == 6 && ir->op == ir_tg4) {
2678 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2679 }
2680
2681 swizzle_result(ir, src_reg(inst->dst), sampler);
2682 }
2683
2684 /**
2685 * Apply workarounds for Gen6 gather with UINT/SINT
2686 */
2687 void
2688 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2689 {
2690 if (!wa)
2691 return;
2692
2693 int width = (wa & WA_8BIT) ? 8 : 16;
2694 dst_reg dst_f = dst;
2695 dst_f.type = BRW_REGISTER_TYPE_F;
2696
2697 /* Convert from UNORM to UINT */
2698 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2699 emit(MOV(dst, src_reg(dst_f)));
2700
2701 if (wa & WA_SIGN) {
2702 /* Reinterpret the UINT value as a signed INT value by
2703 * shifting the sign bit into place, then shifting back
2704 * preserving sign.
2705 */
2706 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2707 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
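/* Added example: an 8-bit SINT texel of 0xff comes back as UNORM 1.0, is
 * scaled to 255 above, and the SHL/ASR by 24 re-sign-extends it to -1.
 */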
2708 }
2709 }
2710
2711 /**
2712 * Set up the gather channel based on the swizzle, for gather4.
2713 */
2714 uint32_t
2715 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2716 {
2717 ir_constant *chan = ir->lod_info.component->as_constant();
2718 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2719 switch (swiz) {
2720 case SWIZZLE_X: return 0;
2721 case SWIZZLE_Y:
2722 /* gather4 sampler is broken for green channel on RG32F --
2723 * we must ask for blue instead.
2724 */
2725 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2726 return 2;
2727 return 1;
2728 case SWIZZLE_Z: return 2;
2729 case SWIZZLE_W: return 3;
2730 default:
2731 unreachable("Not reached"); /* zero, one swizzles handled already */
2732 }
2733 }
2734
2735 void
2736 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2737 {
2738 int s = key->tex.swizzles[sampler];
2739
2740 this->result = src_reg(this, ir->type);
2741 dst_reg swizzled_result(this->result);
2742
2743 if (ir->op == ir_query_levels) {
2744 /* # levels is in .w */
2745 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2746 emit(MOV(swizzled_result, orig_val));
2747 return;
2748 }
2749
2750 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2751 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2752 emit(MOV(swizzled_result, orig_val));
2753 return;
2754 }
2755
2756
2757 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2758 int swizzle[4] = {0};
2759
2760 for (int i = 0; i < 4; i++) {
2761 switch (GET_SWZ(s, i)) {
2762 case SWIZZLE_ZERO:
2763 zero_mask |= (1 << i);
2764 break;
2765 case SWIZZLE_ONE:
2766 one_mask |= (1 << i);
2767 break;
2768 default:
2769 copy_mask |= (1 << i);
2770 swizzle[i] = GET_SWZ(s, i);
2771 break;
2772 }
2773 }
2774
2775 if (copy_mask) {
2776 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2777 swizzled_result.writemask = copy_mask;
2778 emit(MOV(swizzled_result, orig_val));
2779 }
2780
2781 if (zero_mask) {
2782 swizzled_result.writemask = zero_mask;
2783 emit(MOV(swizzled_result, src_reg(0.0f)));
2784 }
2785
2786 if (one_mask) {
2787 swizzled_result.writemask = one_mask;
2788 emit(MOV(swizzled_result, src_reg(1.0f)));
2789 }
2790 }
2791
2792 void
2793 vec4_visitor::visit(ir_return *)
2794 {
2795 unreachable("not reached");
2796 }
2797
2798 void
2799 vec4_visitor::visit(ir_discard *)
2800 {
2801 unreachable("not reached");
2802 }
2803
2804 void
2805 vec4_visitor::visit(ir_if *ir)
2806 {
2807 /* Don't point the annotation at the if statement, because then it plus
2808 * the then and else blocks get printed.
2809 */
2810 this->base_ir = ir->condition;
2811
2812 if (brw->gen == 6) {
2813 emit_if_gen6(ir);
2814 } else {
2815 enum brw_predicate predicate;
2816 emit_bool_to_cond_code(ir->condition, &predicate);
2817 emit(IF(predicate));
2818 }
2819
2820 visit_instructions(&ir->then_instructions);
2821
2822 if (!ir->else_instructions.is_empty()) {
2823 this->base_ir = ir->condition;
2824 emit(BRW_OPCODE_ELSE);
2825
2826 visit_instructions(&ir->else_instructions);
2827 }
2828
2829 this->base_ir = ir->condition;
2830 emit(BRW_OPCODE_ENDIF);
2831 }
2832
2833 void
2834 vec4_visitor::visit(ir_emit_vertex *)
2835 {
2836 unreachable("not reached");
2837 }
2838
2839 void
2840 vec4_visitor::visit(ir_end_primitive *)
2841 {
2842 unreachable("not reached");
2843 }
2844
2845 void
2846 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2847 dst_reg dst, src_reg offset,
2848 src_reg src0, src_reg src1)
2849 {
2850 unsigned mlen = 0;
2851
2852 /* Set the atomic operation offset. */
2853 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2854 mlen++;
2855
2856 /* Set the atomic operation arguments. */
2857 if (src0.file != BAD_FILE) {
2858 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2859 mlen++;
2860 }
2861
2862 if (src1.file != BAD_FILE) {
2863 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2864 mlen++;
2865 }
2866
2867 /* Emit the instruction. Note that this maps to the normal SIMD8
2868 * untyped atomic message on Ivy Bridge, but that's OK because
2869 * unused channels will be masked out.
2870 */
2871 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2872 src_reg(atomic_op), src_reg(surf_index));
2873 inst->base_mrf = 0;
2874 inst->mlen = mlen;
2875 }
2876
2877 void
2878 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2879 src_reg offset)
2880 {
2881 /* Set the surface read offset. */
2882 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2883
2884 /* Emit the instruction. Note that this maps to the normal SIMD8
2885 * untyped surface read message, but that's OK because unused
2886 * channels will be masked out.
2887 */
2888 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2889 dst, src_reg(surf_index));
2890 inst->base_mrf = 0;
2891 inst->mlen = 1;
2892 }
2893
2894 void
2895 vec4_visitor::emit_ndc_computation()
2896 {
2897 /* Get the position */
2898 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2899
2900 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2901 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2902 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2903
2904 current_annotation = "NDC";
2905 dst_reg ndc_w = ndc;
2906 ndc_w.writemask = WRITEMASK_W;
2907 src_reg pos_w = pos;
2908 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2909 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2910
2911 dst_reg ndc_xyz = ndc;
2912 ndc_xyz.writemask = WRITEMASK_XYZ;
2913
2914 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2915 }
2916
2917 void
2918 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2919 {
2920 if (brw->gen < 6 &&
2921 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2922 key->userclip_active || brw->has_negative_rhw_bug)) {
2923 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2924 dst_reg header1_w = header1;
2925 header1_w.writemask = WRITEMASK_W;
2926
2927 emit(MOV(header1, 0u));
2928
2929 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2930 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2931
2932 current_annotation = "Point size";
2933 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2934 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2935 }
2936
2937 if (key->userclip_active) {
2938 current_annotation = "Clipping flags";
2939 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2940 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2941
2942 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2943 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2944 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2945
2946 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2947 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2948 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2949 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2950 }
2951
2952 /* i965 clipping workaround:
2953 * 1) Test for -ve rhw
2954 * 2) If set,
2955 * set ndc = (0,0,0,0)
2956 * set ucp[6] = 1
2957 *
2958 * Later, clipping will detect ucp[6] and ensure the primitive is
2959 * clipped against all fixed planes.
2960 */
2961 if (brw->has_negative_rhw_bug) {
2962 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2963 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2964 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2965 vec4_instruction *inst;
2966 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2967 inst->predicate = BRW_PREDICATE_NORMAL;
2968 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2969 inst->predicate = BRW_PREDICATE_NORMAL;
2970 }
2971
2972 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2973 } else if (brw->gen < 6) {
2974 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2975 } else {
2976 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2977 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2978 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2979 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2980 }
2981 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2982 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2983 src_reg(output_reg[VARYING_SLOT_LAYER])));
2984 }
2985 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2986 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2987 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2988 }
2989 }
2990 }
2991
2992 void
2993 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2994 {
2995 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2996 *
2997 * "If a linked set of shaders forming the vertex stage contains no
2998 * static write to gl_ClipVertex or gl_ClipDistance, but the
2999 * application has requested clipping against user clip planes through
3000 * the API, then the coordinate written to gl_Position is used for
3001 * comparison against the user clip planes."
3002 *
3003 * This function is only called if the shader didn't write to
3004 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3005 * if the user wrote to it; otherwise we use gl_Position.
3006 */
3007 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3008 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3009 clip_vertex = VARYING_SLOT_POS;
3010 }
3011
3012 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3013 ++i) {
3014 reg.writemask = 1 << i;
3015 emit(DP4(reg,
3016 src_reg(output_reg[clip_vertex]),
3017 src_reg(this->userplane[i + offset])));
3018 }
3019 }
3020
3021 void
3022 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3023 {
3024 assert (varying < VARYING_SLOT_MAX);
3025 reg.type = output_reg[varying].type;
3026 current_annotation = output_reg_annotation[varying];
3027 /* Copy the register, saturating if necessary */
3028 vec4_instruction *inst = emit(MOV(reg,
3029 src_reg(output_reg[varying])));
3030 if ((varying == VARYING_SLOT_COL0 ||
3031 varying == VARYING_SLOT_COL1 ||
3032 varying == VARYING_SLOT_BFC0 ||
3033 varying == VARYING_SLOT_BFC1) &&
3034 key->clamp_vertex_color) {
3035 inst->saturate = true;
3036 }
3037 }
3038
3039 void
3040 vec4_visitor::emit_urb_slot(int mrf, int varying)
3041 {
3042 struct brw_reg hw_reg = brw_message_reg(mrf);
3043 dst_reg reg = dst_reg(MRF, mrf);
3044 reg.type = BRW_REGISTER_TYPE_F;
3045
3046 switch (varying) {
3047 case VARYING_SLOT_PSIZ:
3048 /* PSIZ is always in slot 0, and is coupled with other flags. */
3049 current_annotation = "indices, point width, clip flags";
3050 emit_psiz_and_flags(hw_reg);
3051 break;
3052 case BRW_VARYING_SLOT_NDC:
3053 current_annotation = "NDC";
3054 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3055 break;
3056 case VARYING_SLOT_POS:
3057 current_annotation = "gl_Position";
3058 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3059 break;
3060 case VARYING_SLOT_EDGE:
3061 /* This is present when doing unfilled polygons. We're supposed to copy
3062 * the edge flag from the user-provided vertex array
3063 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3064 * of that attribute (starts as 1.0f). This is then used in clipping to
3065 * determine which edges should be drawn as wireframe.
3066 */
3067 current_annotation = "edge flag";
3068 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3069 glsl_type::float_type, WRITEMASK_XYZW))));
3070 break;
3071 case BRW_VARYING_SLOT_PAD:
3072 /* No need to write to this slot */
3073 break;
3074 default:
3075 emit_generic_urb_slot(reg, varying);
3076 break;
3077 }
3078 }
3079
3080 static int
3081 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3082 {
3083 if (brw->gen >= 6) {
3084 /* URB data written (does not include the message header reg) must
3085 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3086 * section 5.4.3.2.2: URB_INTERLEAVED.
3087 *
3088 * URB entries are allocated on a multiple of 1024 bits, so an
3089 * extra 128 bits written here to make the end align to 256 is
3090 * no problem.
3091 */
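/* Added note: mlen still includes the message header register here, so the
 * data payload is mlen - 1 regs; forcing mlen odd makes that payload even.
 * E.g. a header plus 3 slot registers (mlen 4) is padded to mlen 5.
 */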
3092 if ((mlen % 2) != 1)
3093 mlen++;
3094 }
3095
3096 return mlen;
3097 }
3098
3099
3100 /**
3101 * Generates the VUE payload plus the necessary URB write instructions to
3102 * output it.
3103 *
3104 * The VUE layout is documented in Volume 2a.
3105 */
3106 void
3107 vec4_visitor::emit_vertex()
3108 {
3109 /* MRF 0 is reserved for the debugger, so start with message header
3110 * in MRF 1.
3111 */
3112 int base_mrf = 1;
3113 int mrf = base_mrf;
3114 /* In the process of generating our URB write message contents, we
3115 * may need to unspill a register or load from an array. Those
3116 * reads would use MRFs 14-15.
3117 */
3118 int max_usable_mrf = 13;
3119
3120 /* The following assertion verifies that max_usable_mrf causes an
3121 * even-numbered amount of URB write data, which will meet gen6's
3122 * requirements for length alignment.
3123 */
3124 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3125
3126 /* First mrf is the g0-based message header containing URB handles and
3127 * such.
3128 */
3129 emit_urb_write_header(mrf++);
3130
3131 if (brw->gen < 6) {
3132 emit_ndc_computation();
3133 }
3134
3135 /* Lower legacy ff and ClipVertex clipping to clip distances */
3136 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3137 current_annotation = "user clip distances";
3138
3139 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3140 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3141
3142 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3143 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3144 }
3145
3146 /* We may need to split this up into several URB writes, so do them in a
3147 * loop.
3148 */
3149 int slot = 0;
3150 bool complete = false;
3151 do {
3152 /* URB offset is in URB row increments, and each of our MRFs is half of
3153 * one of those, since we're doing interleaved writes.
3154 */
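/* Added example: if the first write filled MRFs 2..13 with 12 slots, the
 * next pass resumes at slot 12, i.e. URB row offset 6, since two slots
 * share a row when interleaved.
 */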
3155 int offset = slot / 2;
3156
3157 mrf = base_mrf + 1;
3158 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3159 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3160
3161 /* If this was max_usable_mrf, we can't fit anything more into this
3162 * URB WRITE.
3163 */
3164 if (mrf > max_usable_mrf) {
3165 slot++;
3166 break;
3167 }
3168 }
3169
3170 complete = slot >= prog_data->vue_map.num_slots;
3171 current_annotation = "URB write";
3172 vec4_instruction *inst = emit_urb_write_opcode(complete);
3173 inst->base_mrf = base_mrf;
3174 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3175 inst->offset += offset;
3176 } while(!complete);
3177 }
3178
3179
3180 src_reg
3181 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3182 src_reg *reladdr, int reg_offset)
3183 {
3184 /* Because we store the values to scratch interleaved like our
3185 * vertex data, we need to scale the vec4 index by 2.
3186 */
3187 int message_header_scale = 2;
3188
3189 /* Pre-gen6, the message header uses byte offsets instead of vec4
3190 * (16-byte) offset units.
3191 */
3192 if (brw->gen < 6)
3193 message_header_scale *= 16;
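/* Added example: reg_offset 3 therefore becomes 6 (pairs of vec4s) on Gen6+,
 * or 96 bytes on earlier generations.
 */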
3194
3195 if (reladdr) {
3196 src_reg index = src_reg(this, glsl_type::int_type);
3197
3198 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3199 emit_before(inst, MUL(dst_reg(index),
3200 index, src_reg(message_header_scale)));
3201
3202 return index;
3203 } else {
3204 return src_reg(reg_offset * message_header_scale);
3205 }
3206 }
3207
3208 src_reg
3209 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3210 src_reg *reladdr, int reg_offset)
3211 {
3212 if (reladdr) {
3213 src_reg index = src_reg(this, glsl_type::int_type);
3214
3215 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3216
3217 /* Pre-gen6, the message header uses byte offsets instead of vec4
3218 * (16-byte) offset units.
3219 */
3220 if (brw->gen < 6) {
3221 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3222 }
3223
3224 return index;
3225 } else if (brw->gen >= 8) {
3226 /* Store the offset in a GRF so we can send-from-GRF. */
3227 src_reg offset = src_reg(this, glsl_type::int_type);
3228 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3229 return offset;
3230 } else {
3231 int message_header_scale = brw->gen < 6 ? 16 : 1;
3232 return src_reg(reg_offset * message_header_scale);
3233 }
3234 }
3235
3236 /**
3237 * Emits an instruction before @inst to load the value named by @orig_src
3238 * from scratch space at @base_offset to @temp.
3239 *
3240 * @base_offset is measured in 32-byte units (the size of a register).
3241 */
3242 void
3243 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3244 dst_reg temp, src_reg orig_src,
3245 int base_offset)
3246 {
3247 int reg_offset = base_offset + orig_src.reg_offset;
3248 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3249
3250 emit_before(inst, SCRATCH_READ(temp, index));
3251 }
3252
3253 /**
3254 * Emits an instruction after @inst to store the value to be written
3255 * to @orig_dst to scratch space at @base_offset, from @temp.
3256 *
3257 * @base_offset is measured in 32-byte units (the size of a register).
3258 */
3259 void
3260 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3261 {
3262 int reg_offset = base_offset + inst->dst.reg_offset;
3263 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3264
3265 /* Create a temporary register to store *inst's result in.
3266 *
3267 * We have to be careful in MOVing from our temporary result register in
3268 * the scratch write. If we swizzle from channels of the temporary that
3269 * weren't initialized, it will confuse live interval analysis, which will
3270 * make spilling fail to make progress.
3271 */
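/* Added example: if only .z of the destination was written, the swizzle
 * built below reads the temporary as ZZZZ, so no uninitialized channel is
 * referenced.
 */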
3272 src_reg temp = src_reg(this, glsl_type::vec4_type);
3273 temp.type = inst->dst.type;
3274 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3275 int swizzles[4];
3276 for (int i = 0; i < 4; i++)
3277 if (inst->dst.writemask & (1 << i))
3278 swizzles[i] = i;
3279 else
3280 swizzles[i] = first_writemask_chan;
3281 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3282 swizzles[2], swizzles[3]);
3283
3284 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3285 inst->dst.writemask));
3286 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3287 write->predicate = inst->predicate;
3288 write->ir = inst->ir;
3289 write->annotation = inst->annotation;
3290 inst->insert_after(write);
3291
3292 inst->dst.file = temp.file;
3293 inst->dst.reg = temp.reg;
3294 inst->dst.reg_offset = temp.reg_offset;
3295 inst->dst.reladdr = NULL;
3296 }
3297
3298 /**
3299 * We can't generally support array access in GRF space, because a
3300 * single instruction's destination can only span 2 contiguous
3301 * registers. So, we send all GRF arrays that get variable index
3302 * access to scratch space.
3303 */
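/* Added illustration: a local array such as a hypothetical "vec4 arr[8]"
 * indexed by a loop counter ends up here; every store becomes a scratch
 * write and every load a scratch read, trading message latency for
 * correctness.
 */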
3304 void
3305 vec4_visitor::move_grf_array_access_to_scratch()
3306 {
3307 int scratch_loc[this->virtual_grf_count];
3308
3309 for (int i = 0; i < this->virtual_grf_count; i++) {
3310 scratch_loc[i] = -1;
3311 }
3312
3313 /* First, calculate the set of virtual GRFs that need to be punted
3314 * to scratch due to having any array access on them, and where in
3315 * scratch.
3316 */
3317 foreach_in_list(vec4_instruction, inst, &instructions) {
3318 if (inst->dst.file == GRF && inst->dst.reladdr &&
3319 scratch_loc[inst->dst.reg] == -1) {
3320 scratch_loc[inst->dst.reg] = c->last_scratch;
3321 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3322 }
3323
3324 for (int i = 0 ; i < 3; i++) {
3325 src_reg *src = &inst->src[i];
3326
3327 if (src->file == GRF && src->reladdr &&
3328 scratch_loc[src->reg] == -1) {
3329 scratch_loc[src->reg] = c->last_scratch;
3330 c->last_scratch += this->virtual_grf_sizes[src->reg];
3331 }
3332 }
3333 }
3334
3335 /* Now, for anything that will be accessed through scratch, rewrite
3336 * it to load/store. Note that this is a _safe list walk, because
3337 * we may generate a new scratch_write instruction after the one
3338 * we're processing.
3339 */
3340 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3341 /* Set up the annotation tracking for newly generated instructions. */
3342 base_ir = inst->ir;
3343 current_annotation = inst->annotation;
3344
3345 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3346 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3347 }
3348
3349 for (int i = 0 ; i < 3; i++) {
3350 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3351 continue;
3352
3353 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3354
3355 emit_scratch_read(inst, temp, inst->src[i],
3356 scratch_loc[inst->src[i].reg]);
3357
3358 inst->src[i].file = temp.file;
3359 inst->src[i].reg = temp.reg;
3360 inst->src[i].reg_offset = temp.reg_offset;
3361 inst->src[i].reladdr = NULL;
3362 }
3363 }
3364 }
3365
3366 /**
3367 * Emits an instruction before @inst to load the value named by @orig_src
3368 * from the pull constant buffer (surface) at @base_offset to @temp.
3369 */
3370 void
3371 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3372 dst_reg temp, src_reg orig_src,
3373 int base_offset)
3374 {
3375 int reg_offset = base_offset + orig_src.reg_offset;
3376 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3377 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3378 vec4_instruction *load;
3379
3380 if (brw->gen >= 7) {
3381 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3382 grf_offset.type = offset.type;
3383 emit_before(inst, MOV(grf_offset, offset));
3384
3385 load = new(mem_ctx) vec4_instruction(this,
3386 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3387 temp, index, src_reg(grf_offset));
3388 } else {
3389 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3390 temp, index, offset);
3391 load->base_mrf = 14;
3392 load->mlen = 1;
3393 }
3394 emit_before(inst, load);
3395 }
3396
3397 /**
3398 * Implements array access of uniforms by inserting a
3399 * PULL_CONSTANT_LOAD instruction.
3400 *
3401 * Unlike temporary GRF array access (where we don't support it due to
3402 * the difficulty of doing relative addressing on instruction
3403 * destinations), we could potentially do array access of uniforms
3404 * that were loaded in GRF space as push constants. In real-world
3405 * usage we've seen, though, the arrays being used are always larger
3406 * than we could load as push constants, so just always move all
3407 * uniform array access out to a pull constant buffer.
3408 */
3409 void
3410 vec4_visitor::move_uniform_array_access_to_pull_constants()
3411 {
3412 int pull_constant_loc[this->uniforms];
3413
3414 for (int i = 0; i < this->uniforms; i++) {
3415 pull_constant_loc[i] = -1;
3416 }
3417
3418 /* Walk through and find array access of uniforms. Put a copy of that
3419 * uniform in the pull constant buffer.
3420 *
3421 * Note that we don't move constant-indexed accesses to arrays. No
3422 * testing has been done of the performance impact of this choice.
3423 */
3424 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3425 for (int i = 0 ; i < 3; i++) {
3426 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3427 continue;
3428
3429 int uniform = inst->src[i].reg;
3430
3431 /* If this array isn't already present in the pull constant buffer,
3432 * add it.
3433 */
3434 if (pull_constant_loc[uniform] == -1) {
3435 const gl_constant_value **values =
3436 &stage_prog_data->param[uniform * 4];
3437
3438 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3439
3440 assert(uniform < uniform_array_size);
3441 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3442 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3443 = values[j];
3444 }
3445 }
3446
3447 /* Set up the annotation tracking for newly generated instructions. */
3448 base_ir = inst->ir;
3449 current_annotation = inst->annotation;
3450
3451 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3452
3453 emit_pull_constant_load(inst, temp, inst->src[i],
3454 pull_constant_loc[uniform]);
3455
3456 inst->src[i].file = temp.file;
3457 inst->src[i].reg = temp.reg;
3458 inst->src[i].reg_offset = temp.reg_offset;
3459 inst->src[i].reladdr = NULL;
3460 }
3461 }
3462
3463 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3464 * no need to track them as larger-than-vec4 objects. This will be
3465 * relied on in cutting out unused uniform vectors from push
3466 * constants.
3467 */
3468 split_uniform_registers();
3469 }
3470
3471 void
3472 vec4_visitor::resolve_ud_negate(src_reg *reg)
3473 {
3474 if (reg->type != BRW_REGISTER_TYPE_UD ||
3475 !reg->negate)
3476 return;
3477
3478 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3479 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3480 *reg = temp;
3481 }
3482
3483 vec4_visitor::vec4_visitor(struct brw_context *brw,
3484 struct brw_vec4_compile *c,
3485 struct gl_program *prog,
3486 const struct brw_vec4_prog_key *key,
3487 struct brw_vec4_prog_data *prog_data,
3488 struct gl_shader_program *shader_prog,
3489 gl_shader_stage stage,
3490 void *mem_ctx,
3491 bool debug_flag,
3492 bool no_spills,
3493 shader_time_shader_type st_base,
3494 shader_time_shader_type st_written,
3495 shader_time_shader_type st_reset)
3496 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3497 c(c),
3498 key(key),
3499 prog_data(prog_data),
3500 sanity_param_count(0),
3501 fail_msg(NULL),
3502 first_non_payload_grf(0),
3503 need_all_constants_in_pull_buffer(false),
3504 debug_flag(debug_flag),
3505 no_spills(no_spills),
3506 st_base(st_base),
3507 st_written(st_written),
3508 st_reset(st_reset)
3509 {
3510 this->mem_ctx = mem_ctx;
3511 this->failed = false;
3512
3513 this->base_ir = NULL;
3514 this->current_annotation = NULL;
3515 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3516
3517 this->variable_ht = hash_table_ctor(0,
3518 hash_table_pointer_hash,
3519 hash_table_pointer_compare);
3520
3521 this->virtual_grf_start = NULL;
3522 this->virtual_grf_end = NULL;
3523 this->virtual_grf_sizes = NULL;
3524 this->virtual_grf_count = 0;
3525 this->virtual_grf_reg_map = NULL;
3526 this->virtual_grf_reg_count = 0;
3527 this->virtual_grf_array_size = 0;
3528 this->live_intervals_valid = false;
3529
3530 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3531
3532 this->uniforms = 0;
3533
3534 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3535 * at least one. See setup_uniforms() in brw_vec4.cpp.
3536 */
3537 this->uniform_array_size = 1;
3538 if (prog_data) {
3539 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3540 }
3541
3542 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3543 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3544 }
3545
3546 vec4_visitor::~vec4_visitor()
3547 {
3548 hash_table_dtor(this->variable_ht);
3549 }
3550
3551
3552 void
3553 vec4_visitor::fail(const char *format, ...)
3554 {
3555 va_list va;
3556 char *msg;
3557
3558 if (failed)
3559 return;
3560
3561 failed = true;
3562
3563 va_start(va, format);
3564 msg = ralloc_vasprintf(mem_ctx, format, va);
3565 va_end(va);
3566 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3567
3568 this->fail_msg = msg;
3569
3570 if (debug_flag) {
3571 fprintf(stderr, "%s", msg);
3572 }
3573 }
3574
3575 } /* namespace brw */