i965/vec4: Add support for nonconst sampler indexing in VS visitor
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
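/* For illustration, ALU2(ADD) above expands to roughly the following
 * factory method (a sketch of the macro expansion, not extra code):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that these helpers only construct the instruction; callers still
 * wrap them in emit() to append them to the instruction stream.
 */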
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
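/* Helpers for scratch (spill) access: build gen4-style scratch block
 * read/write messages. base_mrf and mlen reserve the MRF range used for
 * the message payload; as with the ALU helpers above, the caller is
 * expected to emit() the returned instruction.
 */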
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
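/* Emit a dot product of the given width. elements must be 2, 3 or 4, so
 * e.g. a dot of two vec3s maps to DP3 and a dot of two vec4s to DP4.
 */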
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
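/* At the GLSL level, the sequence below implements something like:
 *
 *    uint u = packHalf2x16(vec2(x, y));
 *    // u == (f32to16(y) << 16) | f32to16(x)
 *
 * i.e. the Y half lands in the high 16 bits and the X half in the low 16
 * bits. This is only an illustration; the instructions below operate on
 * all channels of the vec4 execution at once.
 */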
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
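/* Illustrative GLSL-level equivalent of the sequence below:
 *
 *    vec2 v = unpackHalf2x16(u);
 *    // v.x == f16to32(u & 0xffff), v.y == f16to32(u >> 16)
 *
 * i.e. the low 16 bits produce the X component and the high 16 bits the
 * Y component.
 */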
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
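/* Return the size of a glsl_type in vec4 slots. For example: float, vec2
 * and vec4 all take 1 slot; mat3 takes 3 (one per column); vec4[8] takes
 * 8; struct { vec3 a; float b[2]; } takes 3.
 */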
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been set up by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
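/* Evaluate a boolean rvalue and leave its result in the flag register so
 * that a following predicated instruction can consume it. *predicate is
 * set to the predicate mode the caller should use: BRW_PREDICATE_NORMAL
 * in the common case, or an ALIGN16 ANY4H/ALL4H reduction for the vector
 * comparisons (ir_binop_any_nequal, ir_binop_all_equal, ir_unop_any).
 */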
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_sat(ir_expression *ir)
1082 {
1083 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1084 if (!sat_src)
1085 return false;
1086
1087 sat_src->accept(this);
1088 src_reg src = this->result;
1089
1090 this->result = src_reg(this, ir->type);
1091 vec4_instruction *inst;
1092 inst = emit(MOV(dst_reg(this->result), src));
1093 inst->saturate = true;
1094
1095 return true;
1096 }
1097
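/* Try to fuse an add of a multiply into a single MAD. For example,
 * x * y + z (or z + x * y) becomes MAD(dst, z, x, y), which computes
 * x * y + z -- the hardware argument order puts the addend first.
 */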
1098 bool
1099 vec4_visitor::try_emit_mad(ir_expression *ir)
1100 {
1101 /* 3-src instructions were introduced in gen6. */
1102 if (brw->gen < 6)
1103 return false;
1104
1105 /* MAD can only handle floating-point data. */
1106 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1107 return false;
1108
1109 ir_rvalue *nonmul = ir->operands[1];
1110 ir_expression *mul = ir->operands[0]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul) {
1113 nonmul = ir->operands[0];
1114 mul = ir->operands[1]->as_expression();
1115
1116 if (!mul || mul->operation != ir_binop_mul)
1117 return false;
1118 }
1119
1120 nonmul->accept(this);
1121 src_reg src0 = fix_3src_operand(this->result);
1122
1123 mul->operands[0]->accept(this);
1124 src_reg src1 = fix_3src_operand(this->result);
1125
1126 mul->operands[1]->accept(this);
1127 src_reg src2 = fix_3src_operand(this->result);
1128
1129 this->result = src_reg(this, ir->type);
1130 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1131
1132 return true;
1133 }
1134
1135 bool
1136 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1137 {
1138 ir_expression *const cmp = ir->operands[0]->as_expression();
1139
1140 if (cmp == NULL)
1141 return false;
1142
1143 switch (cmp->operation) {
1144 case ir_binop_less:
1145 case ir_binop_greater:
1146 case ir_binop_lequal:
1147 case ir_binop_gequal:
1148 case ir_binop_equal:
1149 case ir_binop_nequal:
1150 break;
1151
1152 default:
1153 return false;
1154 }
1155
1156 cmp->operands[0]->accept(this);
1157 const src_reg cmp_src0 = this->result;
1158
1159 cmp->operands[1]->accept(this);
1160 const src_reg cmp_src1 = this->result;
1161
1162 this->result = src_reg(this, ir->type);
1163
1164 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1165 brw_conditional_for_comparison(cmp->operation)));
1166
1167 /* If the comparison is false, this->result will just happen to be zero.
1168 */
1169 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1170 this->result, src_reg(1.0f));
1171 inst->predicate = BRW_PREDICATE_NORMAL;
1172 inst->predicate_inverse = true;
1173
1174 return true;
1175 }
1176
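/* Emit a MIN or MAX. On gen6+ a single SEL with a conditional mod (L for
 * min, G for max) is enough; older hardware needs a CMP followed by a
 * predicated SEL.
 */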
1177 void
1178 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1179 src_reg src0, src_reg src1)
1180 {
1181 vec4_instruction *inst;
1182
1183 if (brw->gen >= 6) {
1184 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1185 inst->conditional_mod = conditionalmod;
1186 } else {
1187 emit(CMP(dst, src0, src1, conditionalmod));
1188
1189 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1190 inst->predicate = BRW_PREDICATE_NORMAL;
1191 }
1192 }
1193
1194 void
1195 vec4_visitor::emit_lrp(const dst_reg &dst,
1196 const src_reg &x, const src_reg &y, const src_reg &a)
1197 {
1198 if (brw->gen >= 6) {
1199 /* Note that the instruction's argument order is reversed from GLSL
1200 * and the IR.
1201 */
1202 emit(LRP(dst,
1203 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1204 } else {
1205 /* Earlier generations don't support three source operations, so we
1206 * need to emit x*(1-a) + y*a.
1207 */
1208 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1209 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1210 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1211 y_times_a.writemask = dst.writemask;
1212 one_minus_a.writemask = dst.writemask;
1213 x_times_one_minus_a.writemask = dst.writemask;
1214
1215 emit(MUL(y_times_a, y, a));
1216 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1217 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1218 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1219 }
1220 }
1221
1222 void
1223 vec4_visitor::visit(ir_expression *ir)
1224 {
1225 unsigned int operand;
1226 src_reg op[Elements(ir->operands)];
1227 src_reg result_src;
1228 dst_reg result_dst;
1229 vec4_instruction *inst;
1230
1231 if (try_emit_sat(ir))
1232 return;
1233
1234 if (ir->operation == ir_binop_add) {
1235 if (try_emit_mad(ir))
1236 return;
1237 }
1238
1239 if (ir->operation == ir_unop_b2f) {
1240 if (try_emit_b2f_of_compare(ir))
1241 return;
1242 }
1243
1244 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1245 this->result.file = BAD_FILE;
1246 ir->operands[operand]->accept(this);
1247 if (this->result.file == BAD_FILE) {
1248 fprintf(stderr, "Failed to get tree for expression operand:\n");
1249 ir->operands[operand]->fprint(stderr);
1250 exit(1);
1251 }
1252 op[operand] = this->result;
1253
1254 /* Matrix expression operands should have been broken down to vector
1255 * operations already.
1256 */
1257 assert(!ir->operands[operand]->type->is_matrix());
1258 }
1259
1260 int vector_elements = ir->operands[0]->type->vector_elements;
1261 if (ir->operands[1]) {
1262 vector_elements = MAX2(vector_elements,
1263 ir->operands[1]->type->vector_elements);
1264 }
1265
1266 this->result.file = BAD_FILE;
1267
1268 /* Storage for our result. Ideally for an assignment we'd be using
1269 * the actual storage for the result here, instead.
1270 */
1271 result_src = src_reg(this, ir->type);
1272 /* convenience for the emit functions below. */
1273 result_dst = dst_reg(result_src);
1274 /* If nothing special happens, this is the result. */
1275 this->result = result_src;
1276 /* Limit writes to the channels that will be used by result_src later.
1277 * This does limit this temp's use as a temporary for multi-instruction
1278 * sequences.
1279 */
1280 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1281
1282 switch (ir->operation) {
1283 case ir_unop_logic_not:
1284 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1285 * the ones' complement of the whole register, not just bit 0.
1286 */
1287 emit(XOR(result_dst, op[0], src_reg(1)));
1288 break;
1289 case ir_unop_neg:
1290 op[0].negate = !op[0].negate;
1291 emit(MOV(result_dst, op[0]));
1292 break;
1293 case ir_unop_abs:
1294 op[0].abs = true;
1295 op[0].negate = false;
1296 emit(MOV(result_dst, op[0]));
1297 break;
1298
1299 case ir_unop_sign:
1300 if (ir->type->is_float()) {
1301 /* AND(val, 0x80000000) gives the sign bit.
1302 *
1303 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1304 * zero.
1305 */
1306 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1307
1308 op[0].type = BRW_REGISTER_TYPE_UD;
1309 result_dst.type = BRW_REGISTER_TYPE_UD;
1310 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1311
1312 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1313 inst->predicate = BRW_PREDICATE_NORMAL;
1314
1315 this->result.type = BRW_REGISTER_TYPE_F;
1316 } else {
1317 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1318 * -> non-negative val generates 0x00000000.
1319 * Predicated OR sets 1 if val is positive.
1320 */
1321 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1322
1323 emit(ASR(result_dst, op[0], src_reg(31)));
1324
1325 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 }
1328 break;
1329
1330 case ir_unop_rcp:
1331 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1332 break;
1333
1334 case ir_unop_exp2:
1335 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1336 break;
1337 case ir_unop_log2:
1338 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1339 break;
1340 case ir_unop_exp:
1341 case ir_unop_log:
1342 unreachable("not reached: should be handled by ir_explog_to_explog2");
1343 case ir_unop_sin:
1344 case ir_unop_sin_reduced:
1345 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1346 break;
1347 case ir_unop_cos:
1348 case ir_unop_cos_reduced:
1349 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1350 break;
1351
1352 case ir_unop_dFdx:
1353 case ir_unop_dFdx_coarse:
1354 case ir_unop_dFdx_fine:
1355 case ir_unop_dFdy:
1356 case ir_unop_dFdy_coarse:
1357 case ir_unop_dFdy_fine:
1358 unreachable("derivatives not valid in vertex shader");
1359
1360 case ir_unop_bitfield_reverse:
1361 emit(BFREV(result_dst, op[0]));
1362 break;
1363 case ir_unop_bit_count:
1364 emit(CBIT(result_dst, op[0]));
1365 break;
1366 case ir_unop_find_msb: {
1367 src_reg temp = src_reg(this, glsl_type::uint_type);
1368
1369 inst = emit(FBH(dst_reg(temp), op[0]));
1370 inst->dst.writemask = WRITEMASK_XYZW;
1371
1372 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1373 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1374 * subtract the result from 31 to convert the MSB count into an LSB count.
1375 */
1376
1377 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1378 temp.swizzle = BRW_SWIZZLE_NOOP;
1379 emit(MOV(result_dst, temp));
1380
1381 src_reg src_tmp = src_reg(result_dst);
1382 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1383
1384 src_tmp.negate = true;
1385 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 break;
1388 }
1389 case ir_unop_find_lsb:
1390 emit(FBL(result_dst, op[0]));
1391 break;
1392
1393 case ir_unop_noise:
1394 unreachable("not reached: should be handled by lower_noise");
1395
1396 case ir_binop_add:
1397 emit(ADD(result_dst, op[0], op[1]));
1398 break;
1399 case ir_binop_sub:
1400 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1401
1402 case ir_binop_mul:
1403 if (brw->gen < 8 && ir->type->is_integer()) {
1404 /* For integer multiplication, the MUL uses the low 16 bits of one of
1405 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1406 * accumulates the contribution of the upper 16 bits of that
1407 * operand. If we can determine that one of the args is in the low
1408 * 16 bits, though, we can just emit a single MUL.
1409 */
1410 if (ir->operands[0]->is_uint16_constant()) {
1411 if (brw->gen < 7)
1412 emit(MUL(result_dst, op[0], op[1]));
1413 else
1414 emit(MUL(result_dst, op[1], op[0]));
1415 } else if (ir->operands[1]->is_uint16_constant()) {
1416 if (brw->gen < 7)
1417 emit(MUL(result_dst, op[1], op[0]));
1418 else
1419 emit(MUL(result_dst, op[0], op[1]));
1420 } else {
1421 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1422
1423 emit(MUL(acc, op[0], op[1]));
1424 emit(MACH(dst_null_d(), op[0], op[1]));
1425 emit(MOV(result_dst, src_reg(acc)));
1426 }
1427 } else {
1428 emit(MUL(result_dst, op[0], op[1]));
1429 }
1430 break;
1431 case ir_binop_imul_high: {
1432 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1433
1434 emit(MUL(acc, op[0], op[1]));
1435 emit(MACH(result_dst, op[0], op[1]));
1436 break;
1437 }
1438 case ir_binop_div:
1439 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1440 assert(ir->type->is_integer());
1441 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1442 break;
1443 case ir_binop_carry: {
1444 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1445
1446 emit(ADDC(dst_null_ud(), op[0], op[1]));
1447 emit(MOV(result_dst, src_reg(acc)));
1448 break;
1449 }
1450 case ir_binop_borrow: {
1451 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1452
1453 emit(SUBB(dst_null_ud(), op[0], op[1]));
1454 emit(MOV(result_dst, src_reg(acc)));
1455 break;
1456 }
1457 case ir_binop_mod:
1458 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1459 assert(ir->type->is_integer());
1460 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1461 break;
1462
1463 case ir_binop_less:
1464 case ir_binop_greater:
1465 case ir_binop_lequal:
1466 case ir_binop_gequal:
1467 case ir_binop_equal:
1468 case ir_binop_nequal: {
1469 emit(CMP(result_dst, op[0], op[1],
1470 brw_conditional_for_comparison(ir->operation)));
1471 emit(AND(result_dst, result_src, src_reg(0x1)));
1472 break;
1473 }
1474
1475 case ir_binop_all_equal:
1476 /* "==" operator producing a scalar boolean. */
1477 if (ir->operands[0]->type->is_vector() ||
1478 ir->operands[1]->type->is_vector()) {
1479 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1480 emit(MOV(result_dst, src_reg(0)));
1481 inst = emit(MOV(result_dst, src_reg(1)));
1482 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1483 } else {
1484 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1485 emit(AND(result_dst, result_src, src_reg(0x1)));
1486 }
1487 break;
1488 case ir_binop_any_nequal:
1489 /* "!=" operator producing a scalar boolean. */
1490 if (ir->operands[0]->type->is_vector() ||
1491 ir->operands[1]->type->is_vector()) {
1492 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1493
1494 emit(MOV(result_dst, src_reg(0)));
1495 inst = emit(MOV(result_dst, src_reg(1)));
1496 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1497 } else {
1498 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1499 emit(AND(result_dst, result_src, src_reg(0x1)));
1500 }
1501 break;
1502
1503 case ir_unop_any:
1504 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1505 emit(MOV(result_dst, src_reg(0)));
1506
1507 inst = emit(MOV(result_dst, src_reg(1)));
1508 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1509 break;
1510
1511 case ir_binop_logic_xor:
1512 emit(XOR(result_dst, op[0], op[1]));
1513 break;
1514
1515 case ir_binop_logic_or:
1516 emit(OR(result_dst, op[0], op[1]));
1517 break;
1518
1519 case ir_binop_logic_and:
1520 emit(AND(result_dst, op[0], op[1]));
1521 break;
1522
1523 case ir_binop_dot:
1524 assert(ir->operands[0]->type->is_vector());
1525 assert(ir->operands[0]->type == ir->operands[1]->type);
1526 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1527 break;
1528
1529 case ir_unop_sqrt:
1530 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1531 break;
1532 case ir_unop_rsq:
1533 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1534 break;
1535
1536 case ir_unop_bitcast_i2f:
1537 case ir_unop_bitcast_u2f:
1538 this->result = op[0];
1539 this->result.type = BRW_REGISTER_TYPE_F;
1540 break;
1541
1542 case ir_unop_bitcast_f2i:
1543 this->result = op[0];
1544 this->result.type = BRW_REGISTER_TYPE_D;
1545 break;
1546
1547 case ir_unop_bitcast_f2u:
1548 this->result = op[0];
1549 this->result.type = BRW_REGISTER_TYPE_UD;
1550 break;
1551
1552 case ir_unop_i2f:
1553 case ir_unop_i2u:
1554 case ir_unop_u2i:
1555 case ir_unop_u2f:
1556 case ir_unop_b2f:
1557 case ir_unop_b2i:
1558 case ir_unop_f2i:
1559 case ir_unop_f2u:
1560 emit(MOV(result_dst, op[0]));
1561 break;
1562 case ir_unop_f2b:
1563 case ir_unop_i2b: {
1564 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1565 emit(AND(result_dst, result_src, src_reg(1)));
1566 break;
1567 }
1568
1569 case ir_unop_trunc:
1570 emit(RNDZ(result_dst, op[0]));
1571 break;
1572 case ir_unop_ceil:
1573 op[0].negate = !op[0].negate;
1574 inst = emit(RNDD(result_dst, op[0]));
1575 this->result.negate = true;
1576 break;
1577 case ir_unop_floor:
1578 inst = emit(RNDD(result_dst, op[0]));
1579 break;
1580 case ir_unop_fract:
1581 inst = emit(FRC(result_dst, op[0]));
1582 break;
1583 case ir_unop_round_even:
1584 emit(RNDE(result_dst, op[0]));
1585 break;
1586
1587 case ir_binop_min:
1588 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1589 break;
1590 case ir_binop_max:
1591 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1592 break;
1593
1594 case ir_binop_pow:
1595 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1596 break;
1597
1598 case ir_unop_bit_not:
1599 inst = emit(NOT(result_dst, op[0]));
1600 break;
1601 case ir_binop_bit_and:
1602 inst = emit(AND(result_dst, op[0], op[1]));
1603 break;
1604 case ir_binop_bit_xor:
1605 inst = emit(XOR(result_dst, op[0], op[1]));
1606 break;
1607 case ir_binop_bit_or:
1608 inst = emit(OR(result_dst, op[0], op[1]));
1609 break;
1610
1611 case ir_binop_lshift:
1612 inst = emit(SHL(result_dst, op[0], op[1]));
1613 break;
1614
1615 case ir_binop_rshift:
1616 if (ir->type->base_type == GLSL_TYPE_INT)
1617 inst = emit(ASR(result_dst, op[0], op[1]));
1618 else
1619 inst = emit(SHR(result_dst, op[0], op[1]));
1620 break;
1621
1622 case ir_binop_bfm:
1623 emit(BFI1(result_dst, op[0], op[1]));
1624 break;
1625
1626 case ir_binop_ubo_load: {
1627 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1628 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1629 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1630 src_reg offset;
1631
1632 /* Now, load the vector from that offset. */
1633 assert(ir->type->is_vector() || ir->type->is_scalar());
1634
1635 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1636 packed_consts.type = result.type;
1637 src_reg surf_index;
1638
1639 if (const_uniform_block) {
1640 /* The block index is a constant, so just emit the binding table entry
1641 * as an immediate.
1642 */
1643 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1644 const_uniform_block->value.u[0]);
1645 } else {
1646 /* The block index is not a constant. Evaluate the index expression
1647 * per-channel and add the base UBO index; the generator will select
1648 * a value from any live channel.
1649 */
1650 surf_index = src_reg(this, glsl_type::uint_type);
1651 emit(ADD(dst_reg(surf_index), op[0],
1652 src_reg(prog_data->base.binding_table.ubo_start)));
1653
1654 /* Assume this may touch any UBO. It would be nice to provide
1655 * a tighter bound, but the array information is already lowered away.
1656 */
1657 brw_mark_surface_used(&prog_data->base,
1658 prog_data->base.binding_table.ubo_start +
1659 shader_prog->NumUniformBlocks - 1);
1660 }
1661
1662 if (const_offset_ir) {
1663 if (brw->gen >= 8) {
1664 /* Store the offset in a GRF so we can send-from-GRF. */
1665 offset = src_reg(this, glsl_type::int_type);
1666 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1667 } else {
1668 /* Immediates are fine on older generations since they'll be moved
1669 * to a (potentially fake) MRF at the generator level.
1670 */
1671 offset = src_reg(const_offset / 16);
1672 }
1673 } else {
1674 offset = src_reg(this, glsl_type::uint_type);
1675 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1676 }
1677
1678 if (brw->gen >= 7) {
1679 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1680 grf_offset.type = offset.type;
1681
1682 emit(MOV(grf_offset, offset));
1683
1684 emit(new(mem_ctx) vec4_instruction(this,
1685 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1686 dst_reg(packed_consts),
1687 surf_index,
1688 src_reg(grf_offset)));
1689 } else {
1690 vec4_instruction *pull =
1691 emit(new(mem_ctx) vec4_instruction(this,
1692 VS_OPCODE_PULL_CONSTANT_LOAD,
1693 dst_reg(packed_consts),
1694 surf_index,
1695 offset));
1696 pull->base_mrf = 14;
1697 pull->mlen = 1;
1698 }
1699
1700 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1701 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1702 const_offset % 16 / 4,
1703 const_offset % 16 / 4,
1704 const_offset % 16 / 4);
1705
1706 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1707 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1708 emit(CMP(result_dst, packed_consts, src_reg(0u),
1709 BRW_CONDITIONAL_NZ));
1710 emit(AND(result_dst, result, src_reg(0x1)));
1711 } else {
1712 emit(MOV(result_dst, packed_consts));
1713 }
1714 break;
1715 }
1716
1717 case ir_binop_vector_extract:
1718 unreachable("should have been lowered by vec_index_to_cond_assign");
1719
1720 case ir_triop_fma:
1721 op[0] = fix_3src_operand(op[0]);
1722 op[1] = fix_3src_operand(op[1]);
1723 op[2] = fix_3src_operand(op[2]);
1724 /* Note that the instruction's argument order is reversed from GLSL
1725 * and the IR.
1726 */
1727 emit(MAD(result_dst, op[2], op[1], op[0]));
1728 break;
1729
1730 case ir_triop_lrp:
1731 emit_lrp(result_dst, op[0], op[1], op[2]);
1732 break;
1733
1734 case ir_triop_csel:
1735 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1736 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1737 inst->predicate = BRW_PREDICATE_NORMAL;
1738 break;
1739
1740 case ir_triop_bfi:
1741 op[0] = fix_3src_operand(op[0]);
1742 op[1] = fix_3src_operand(op[1]);
1743 op[2] = fix_3src_operand(op[2]);
1744 emit(BFI2(result_dst, op[0], op[1], op[2]));
1745 break;
1746
1747 case ir_triop_bitfield_extract:
1748 op[0] = fix_3src_operand(op[0]);
1749 op[1] = fix_3src_operand(op[1]);
1750 op[2] = fix_3src_operand(op[2]);
1751 /* Note that the instruction's argument order is reversed from GLSL
1752 * and the IR.
1753 */
1754 emit(BFE(result_dst, op[2], op[1], op[0]));
1755 break;
1756
1757 case ir_triop_vector_insert:
1758 unreachable("should have been lowered by lower_vector_insert");
1759
1760 case ir_quadop_bitfield_insert:
1761 unreachable("not reached: should be handled by "
1762 "bitfield_insert_to_bfm_bfi\n");
1763
1764 case ir_quadop_vector:
1765 unreachable("not reached: should be handled by lower_quadop_vector");
1766
1767 case ir_unop_pack_half_2x16:
1768 emit_pack_half_2x16(result_dst, op[0]);
1769 break;
1770 case ir_unop_unpack_half_2x16:
1771 emit_unpack_half_2x16(result_dst, op[0]);
1772 break;
1773 case ir_unop_pack_snorm_2x16:
1774 case ir_unop_pack_snorm_4x8:
1775 case ir_unop_pack_unorm_2x16:
1776 case ir_unop_pack_unorm_4x8:
1777 case ir_unop_unpack_snorm_2x16:
1778 case ir_unop_unpack_snorm_4x8:
1779 case ir_unop_unpack_unorm_2x16:
1780 case ir_unop_unpack_unorm_4x8:
1781 unreachable("not reached: should be handled by lower_packing_builtins");
1782 case ir_unop_unpack_half_2x16_split_x:
1783 case ir_unop_unpack_half_2x16_split_y:
1784 case ir_binop_pack_half_2x16_split:
1785 case ir_unop_interpolate_at_centroid:
1786 case ir_binop_interpolate_at_sample:
1787 case ir_binop_interpolate_at_offset:
1788 unreachable("not reached: should not occur in vertex shader");
1789 case ir_binop_ldexp:
1790 unreachable("not reached: should be handled by ldexp_to_arith()");
1791 }
1792 }
1793
1794
1795 void
1796 vec4_visitor::visit(ir_swizzle *ir)
1797 {
1798 src_reg src;
1799 int i = 0;
1800 int swizzle[4];
1801
1802 /* Note that this is only swizzles in expressions, not those on the left
1803 * hand side of an assignment, which do write masking. See ir_assignment
1804 * for that.
1805 */
1806
1807 ir->val->accept(this);
1808 src = this->result;
1809 assert(src.file != BAD_FILE);
1810
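/* The new swizzle is composed with whatever swizzle src already carries.
 * For example, if src already reads .wzyx and this ir is a .zy swizzle,
 * the picked channels are (w z y x)[2] = y and (w z y x)[1] = z, and the
 * last one is replicated, giving a final swizzle of .yzzz.
 */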
1811 for (i = 0; i < ir->type->vector_elements; i++) {
1812 switch (i) {
1813 case 0:
1814 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1815 break;
1816 case 1:
1817 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1818 break;
1819 case 2:
1820 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1821 break;
1822 case 3:
1823 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1824 break;
1825 }
1826 }
1827 for (; i < 4; i++) {
1828 /* Replicate the last channel out. */
1829 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1830 }
1831
1832 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1833
1834 this->result = src;
1835 }
1836
1837 void
1838 vec4_visitor::visit(ir_dereference_variable *ir)
1839 {
1840 const struct glsl_type *type = ir->type;
1841 dst_reg *reg = variable_storage(ir->var);
1842
1843 if (!reg) {
1844 fail("Failed to find variable storage for %s\n", ir->var->name);
1845 this->result = src_reg(brw_null_reg());
1846 return;
1847 }
1848
1849 this->result = src_reg(*reg);
1850
1851 /* System values get their swizzle from the dst_reg writemask */
1852 if (ir->var->data.mode == ir_var_system_value)
1853 return;
1854
1855 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1856 this->result.swizzle = swizzle_for_size(type->vector_elements);
1857 }
1858
1859
1860 int
1861 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1862 {
1863 /* Under normal circumstances array elements are stored consecutively, so
1864 * the stride is equal to the size of the array element.
1865 */
1866 return type_size(ir->type);
1867 }
1868
1869
1870 void
1871 vec4_visitor::visit(ir_dereference_array *ir)
1872 {
1873 ir_constant *constant_index;
1874 src_reg src;
1875 int array_stride = compute_array_stride(ir);
1876
1877 constant_index = ir->array_index->constant_expression_value();
1878
1879 ir->array->accept(this);
1880 src = this->result;
1881
1882 if (constant_index) {
1883 src.reg_offset += constant_index->value.i[0] * array_stride;
1884 } else {
1885 /* Variable index array dereference. It eats the "vec4" of the
1886 * base of the array and an index that offsets the Mesa register
1887 * index.
1888 */
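/* For example, for arr[i] where each element occupies two vec4 slots
 * (array_stride == 2), index_reg ends up holding i * 2 and is attached to
 * src as a reladdr, so later passes can lower the access (e.g. to scratch
 * or pull-constant reads).
 */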
1889 ir->array_index->accept(this);
1890
1891 src_reg index_reg;
1892
1893 if (array_stride == 1) {
1894 index_reg = this->result;
1895 } else {
1896 index_reg = src_reg(this, glsl_type::int_type);
1897
1898 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1899 }
1900
1901 if (src.reladdr) {
1902 src_reg temp = src_reg(this, glsl_type::int_type);
1903
1904 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1905
1906 index_reg = temp;
1907 }
1908
1909 src.reladdr = ralloc(mem_ctx, src_reg);
1910 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1911 }
1912
1913 /* If the type is smaller than a vec4, replicate the last channel out. */
1914 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1915 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1916 else
1917 src.swizzle = BRW_SWIZZLE_NOOP;
1918 src.type = brw_type_for_base_type(ir->type);
1919
1920 this->result = src;
1921 }
1922
1923 void
1924 vec4_visitor::visit(ir_dereference_record *ir)
1925 {
1926 unsigned int i;
1927 const glsl_type *struct_type = ir->record->type;
1928 int offset = 0;
1929
1930 ir->record->accept(this);
1931
1932 for (i = 0; i < struct_type->length; i++) {
1933 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1934 break;
1935 offset += type_size(struct_type->fields.structure[i].type);
1936 }
1937
1938 /* If the type is smaller than a vec4, replicate the last channel out. */
1939 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1940 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1941 else
1942 this->result.swizzle = BRW_SWIZZLE_NOOP;
1943 this->result.type = brw_type_for_base_type(ir->type);
1944
1945 this->result.reg_offset += offset;
1946 }
1947
1948 /**
1949 * We want to be careful in assignment setup to hit the actual storage
1950 * instead of potentially using a temporary like we might with the
1951 * ir_dereference handler.
1952 */
1953 static dst_reg
1954 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1955 {
1956 /* The LHS must be a dereference. If the LHS is a variable indexed array
1957 * access of a vector, it must be separated into a series of conditional moves
1958 * before reaching this point (see ir_vec_index_to_cond_assign).
1959 */
1960 assert(ir->as_dereference());
1961 ir_dereference_array *deref_array = ir->as_dereference_array();
1962 if (deref_array) {
1963 assert(!deref_array->array->type->is_vector());
1964 }
1965
1966 /* Use the rvalue deref handler for the most part. We'll ignore
1967 * swizzles in it and write swizzles using writemask, though.
1968 */
1969 ir->accept(v);
1970 return dst_reg(v->result);
1971 }
1972
1973 void
1974 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1975 const struct glsl_type *type,
1976 enum brw_predicate predicate)
1977 {
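/* Descriptive summary: recursively decompose structs, arrays and matrices
 * into per-vec4 MOVs, advancing dst and src by one register per
 * scalar/vector copied; `predicate` is applied to every MOV emitted.
 */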
1978 if (type->base_type == GLSL_TYPE_STRUCT) {
1979 for (unsigned int i = 0; i < type->length; i++) {
1980 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1981 }
1982 return;
1983 }
1984
1985 if (type->is_array()) {
1986 for (unsigned int i = 0; i < type->length; i++) {
1987 emit_block_move(dst, src, type->fields.array, predicate);
1988 }
1989 return;
1990 }
1991
1992 if (type->is_matrix()) {
1993 const struct glsl_type *vec_type;
1994
1995 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1996 type->vector_elements, 1);
1997
1998 for (int i = 0; i < type->matrix_columns; i++) {
1999 emit_block_move(dst, src, vec_type, predicate);
2000 }
2001 return;
2002 }
2003
2004 assert(type->is_scalar() || type->is_vector());
2005
2006 dst->type = brw_type_for_base_type(type);
2007 src->type = dst->type;
2008
2009 dst->writemask = (1 << type->vector_elements) - 1;
2010
2011 src->swizzle = swizzle_for_size(type->vector_elements);
2012
2013 vec4_instruction *inst = emit(MOV(*dst, *src));
2014 inst->predicate = predicate;
2015
2016 dst->reg_offset++;
2017 src->reg_offset++;
2018 }
2019
2020
2021 /* If the RHS processing resulted in an instruction generating a
2022 * temporary value, and it would be easy to rewrite the instruction to
2023 * generate its result right into the LHS instead, do so. This ends
2024 * up reliably removing instructions where it can be tricky to do so
2025 * later without real UD chain information.
2026 */
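/* Illustrative case (hypothetical names): for `v.xyz = a + b;`, the ADD that
 * produced the RHS temporary can have its destination retargeted to v and its
 * writemask narrowed to .xyz, so the trailing MOV never has to be emitted.
 */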
2027 bool
2028 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2029 dst_reg dst,
2030 src_reg src,
2031 vec4_instruction *pre_rhs_inst,
2032 vec4_instruction *last_rhs_inst)
2033 {
2034 /* This could be supported, but it would take more smarts. */
2035 if (ir->condition)
2036 return false;
2037
2038 if (pre_rhs_inst == last_rhs_inst)
2039 return false; /* No instructions generated to work with. */
2040
2041 /* Make sure the last instruction generated our source reg. */
2042 if (src.file != GRF ||
2043 src.file != last_rhs_inst->dst.file ||
2044 src.reg != last_rhs_inst->dst.reg ||
2045 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2046 src.reladdr ||
2047 src.abs ||
2048 src.negate ||
2049 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2050 return false;
2051
2052 /* Check that the last instruction fully initialized the channels
2053 * we want to use, in the order we want to use them. We could
2054 * potentially reswizzle the operands of many instructions so that
2055 * we could handle out of order channels, but don't yet.
2056 */
2057
2058 for (unsigned i = 0; i < 4; i++) {
2059 if (dst.writemask & (1 << i)) {
2060 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2061 return false;
2062
2063 if (BRW_GET_SWZ(src.swizzle, i) != i)
2064 return false;
2065 }
2066 }
2067
2068 /* Success! Rewrite the instruction. */
2069 last_rhs_inst->dst.file = dst.file;
2070 last_rhs_inst->dst.reg = dst.reg;
2071 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2072 last_rhs_inst->dst.reladdr = dst.reladdr;
2073 last_rhs_inst->dst.writemask &= dst.writemask;
2074
2075 return true;
2076 }
2077
2078 void
2079 vec4_visitor::visit(ir_assignment *ir)
2080 {
2081 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2082 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2083
2084 if (!ir->lhs->type->is_scalar() &&
2085 !ir->lhs->type->is_vector()) {
2086 ir->rhs->accept(this);
2087 src_reg src = this->result;
2088
2089 if (ir->condition) {
2090 emit_bool_to_cond_code(ir->condition, &predicate);
2091 }
2092
2093 /* emit_block_move doesn't account for swizzles in the source register.
2094 * This should be ok, since the source register is a structure or an
2095 * array, and those can't be swizzled. But double-check to be sure.
2096 */
2097 assert(src.swizzle ==
2098 (ir->rhs->type->is_matrix()
2099 ? swizzle_for_size(ir->rhs->type->vector_elements)
2100 : BRW_SWIZZLE_NOOP));
2101
2102 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2103 return;
2104 }
2105
2106 /* Now we're down to just a scalar/vector with writemasks. */
2107 int i;
2108
2109 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2110 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2111
2112 ir->rhs->accept(this);
2113
2114 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2115
2116 src_reg src = this->result;
2117
2118 int swizzles[4];
2119 int first_enabled_chan = 0;
2120 int src_chan = 0;
2121
2122 assert(ir->lhs->type->is_vector() ||
2123 ir->lhs->type->is_scalar());
2124 dst.writemask = ir->write_mask;
2125
2126 for (int i = 0; i < 4; i++) {
2127 if (dst.writemask & (1 << i)) {
2128 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2129 break;
2130 }
2131 }
2132
2133 /* Swizzle a small RHS vector into the channels being written.
2134 *
2135 * GLSL IR treats write_mask as dictating how many channels are
2136 * present on the RHS, while in our instructions we need to make
2137 * those channels appear in the slots of the vec4 they're written to.
2138 */
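/* Illustrative: `v.yz = some_vec2;` ends up reading the RHS with roughly a
 * .yxyy swizzle, so the RHS's x lands in the y slot and its y in the z slot;
 * the unwritten slots are padded with the first enabled channel.
 */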
2139 for (int i = 0; i < 4; i++) {
2140 if (dst.writemask & (1 << i))
2141 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2142 else
2143 swizzles[i] = first_enabled_chan;
2144 }
2145 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2146 swizzles[2], swizzles[3]);
2147
2148 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2149 return;
2150 }
2151
2152 if (ir->condition) {
2153 emit_bool_to_cond_code(ir->condition, &predicate);
2154 }
2155
2156 for (i = 0; i < type_size(ir->lhs->type); i++) {
2157 vec4_instruction *inst = emit(MOV(dst, src));
2158 inst->predicate = predicate;
2159
2160 dst.reg_offset++;
2161 src.reg_offset++;
2162 }
2163 }
2164
2165 void
2166 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2167 {
2168 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2169 foreach_in_list(ir_constant, field_value, &ir->components) {
2170 emit_constant_values(dst, field_value);
2171 }
2172 return;
2173 }
2174
2175 if (ir->type->is_array()) {
2176 for (unsigned int i = 0; i < ir->type->length; i++) {
2177 emit_constant_values(dst, ir->array_elements[i]);
2178 }
2179 return;
2180 }
2181
2182 if (ir->type->is_matrix()) {
2183 for (int i = 0; i < ir->type->matrix_columns; i++) {
2184 float *vec = &ir->value.f[i * ir->type->vector_elements];
2185
2186 for (int j = 0; j < ir->type->vector_elements; j++) {
2187 dst->writemask = 1 << j;
2188 dst->type = BRW_REGISTER_TYPE_F;
2189
2190 emit(MOV(*dst, src_reg(vec[j])));
2191 }
2192 dst->reg_offset++;
2193 }
2194 return;
2195 }
2196
2197 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2198
2199 for (int i = 0; i < ir->type->vector_elements; i++) {
2200 if (!(remaining_writemask & (1 << i)))
2201 continue;
2202
2203 dst->writemask = 1 << i;
2204 dst->type = brw_type_for_base_type(ir->type);
2205
2206 /* Find other components that match the one we're about to
2207 * write. Emits fewer instructions for things like vec4(0.5,
2208 * 1.5, 1.5, 1.5).
2209 */
2210 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2211 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2212 if (ir->value.b[i] == ir->value.b[j])
2213 dst->writemask |= (1 << j);
2214 } else {
2215 /* u, i, and f storage all line up, so no need for a
2216 * switch case for comparing each type.
2217 */
2218 if (ir->value.u[i] == ir->value.u[j])
2219 dst->writemask |= (1 << j);
2220 }
2221 }
2222
2223 switch (ir->type->base_type) {
2224 case GLSL_TYPE_FLOAT:
2225 emit(MOV(*dst, src_reg(ir->value.f[i])));
2226 break;
2227 case GLSL_TYPE_INT:
2228 emit(MOV(*dst, src_reg(ir->value.i[i])));
2229 break;
2230 case GLSL_TYPE_UINT:
2231 emit(MOV(*dst, src_reg(ir->value.u[i])));
2232 break;
2233 case GLSL_TYPE_BOOL:
2234 emit(MOV(*dst, src_reg(ir->value.b[i])));
2235 break;
2236 default:
2237 unreachable("Non-float/uint/int/bool constant");
2238 }
2239
2240 remaining_writemask &= ~dst->writemask;
2241 }
2242 dst->reg_offset++;
2243 }
2244
2245 void
2246 vec4_visitor::visit(ir_constant *ir)
2247 {
2248 dst_reg dst = dst_reg(this, ir->type);
2249 this->result = src_reg(dst);
2250
2251 emit_constant_values(&dst, ir);
2252 }
2253
2254 void
2255 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2256 {
2257 ir_dereference *deref = static_cast<ir_dereference *>(
2258 ir->actual_parameters.get_head());
2259 ir_variable *location = deref->variable_referenced();
2260 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2261 location->data.atomic.buffer_index);
2262
2263 /* Calculate the surface offset */
2264 src_reg offset(this, glsl_type::uint_type);
2265 ir_dereference_array *deref_array = deref->as_dereference_array();
2266 if (deref_array) {
2267 deref_array->array_index->accept(this);
2268
2269 src_reg tmp(this, glsl_type::uint_type);
2270 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2271 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2272 } else {
2273 offset = location->data.atomic.offset;
2274 }
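/* Illustrative (hypothetical declaration): for
 * `layout(binding = 0, offset = 4) uniform atomic_uint c[4];`, accessing c[i]
 * yields offset = i * ATOMIC_COUNTER_SIZE + 4 bytes into the buffer.
 */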
2275
2276 /* Emit the appropriate machine instruction */
2277 const char *callee = ir->callee->function_name();
2278 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2279
2280 if (!strcmp("__intrinsic_atomic_read", callee)) {
2281 emit_untyped_surface_read(surf_index, dst, offset);
2282
2283 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2284 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2285 src_reg(), src_reg());
2286
2287 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2288 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2289 src_reg(), src_reg());
2290 }
2291 }
2292
2293 void
2294 vec4_visitor::visit(ir_call *ir)
2295 {
2296 const char *callee = ir->callee->function_name();
2297
2298 if (!strcmp("__intrinsic_atomic_read", callee) ||
2299 !strcmp("__intrinsic_atomic_increment", callee) ||
2300 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2301 visit_atomic_counter_intrinsic(ir);
2302 } else {
2303 unreachable("Unsupported intrinsic.");
2304 }
2305 }
2306
2307 src_reg
2308 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2309 {
2310 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2311 inst->base_mrf = 2;
2312 inst->mlen = 1;
2313 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2314 inst->dst.writemask = WRITEMASK_XYZW;
2315
2316 inst->src[1] = sampler;
2317
2318 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2319 int param_base = inst->base_mrf;
2320 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2321 int zero_mask = 0xf & ~coord_mask;
2322
2323 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2324 coordinate));
2325
2326 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2327 src_reg(0)));
2328
2329 emit(inst);
2330 return src_reg(inst->dst);
2331 }
2332
2333 static bool
2334 is_high_sampler(struct brw_context *brw, src_reg sampler)
2335 {
2336 if (brw->gen < 8 && !brw->is_haswell)
2337 return false;
2338
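/* Descriptive note (my reading of the message descriptor layout): a sampler
 * index of 16 or more, or one not known at compile time, cannot be encoded
 * in the 4-bit sampler field of the SEND descriptor, so the generator has to
 * supply it via the message header instead; visit(ir_texture) uses this to
 * force header_present.
 */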
2339 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2340 }
2341
2342 void
2343 vec4_visitor::visit(ir_texture *ir)
2344 {
2345 uint32_t sampler =
2346 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2347
2348 ir_rvalue *nonconst_sampler_index =
2349 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2350
2351 /* Handle non-constant sampler array indexing */
2352 src_reg sampler_reg;
2353 if (nonconst_sampler_index) {
2354 /* The highest sampler which may be used by this operation is
2355 * the last element of the array. Mark it here, because the generator
2356 * doesn't have enough information to determine the bound.
2357 */
2358 uint32_t array_size = ir->sampler->as_dereference_array()
2359 ->array->type->array_size();
2360
2361 uint32_t max_used = sampler + array_size - 1;
2362 if (ir->op == ir_tg4 && brw->gen < 8) {
2363 max_used += prog_data->base.binding_table.gather_texture_start;
2364 } else {
2365 max_used += prog_data->base.binding_table.texture_start;
2366 }
2367
2368 brw_mark_surface_used(&prog_data->base, max_used);
2369
2370 /* Emit code to evaluate the actual indexing expression */
2371 nonconst_sampler_index->accept(this);
2372 dst_reg temp(this, glsl_type::uint_type);
2373 emit(ADD(temp, this->result, src_reg(sampler)))
2374 ->force_writemask_all = true;
2375 sampler_reg = src_reg(temp);
2376 } else {
2377 /* Single sampler, or constant array index; the indexing expression
2378 * is just an immediate.
2379 */
2380 sampler_reg = src_reg(sampler);
2381 }
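/* Illustrative GLSL (hypothetical names): `texture(samplers[idx], tc)` in a
 * vertex shader evaluates `idx` above, ADDs it to the base sampler number,
 * and passes the resulting register as the sampler source of the sample
 * message; brw_mark_surface_used() conservatively marks every surface the
 * array could reach.
 */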
2382
2383 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2384 * emitting anything other than setting up the constant result.
2385 */
2386 if (ir->op == ir_tg4) {
2387 ir_constant *chan = ir->lod_info.component->as_constant();
2388 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2389 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2390 dst_reg result(this, ir->type);
2391 this->result = src_reg(result);
2392 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2393 return;
2394 }
2395 }
2396
2397 /* Should be lowered by do_lower_texture_projection */
2398 assert(!ir->projector);
2399
2400 /* Should be lowered */
2401 assert(!ir->offset || !ir->offset->type->is_array());
2402
2403 /* Generate code to compute all the subexpression trees. This has to be
2404 * done before loading any values into MRFs for the sampler message since
2405 * generating these values may involve SEND messages that need the MRFs.
2406 */
2407 src_reg coordinate;
2408 if (ir->coordinate) {
2409 ir->coordinate->accept(this);
2410 coordinate = this->result;
2411 }
2412
2413 src_reg shadow_comparitor;
2414 if (ir->shadow_comparitor) {
2415 ir->shadow_comparitor->accept(this);
2416 shadow_comparitor = this->result;
2417 }
2418
2419 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2420 src_reg offset_value;
2421 if (has_nonconstant_offset) {
2422 ir->offset->accept(this);
2423 offset_value = src_reg(this->result);
2424 }
2425
2426 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2427 src_reg lod, dPdx, dPdy, sample_index, mcs;
2428 switch (ir->op) {
2429 case ir_tex:
2430 lod = src_reg(0.0f);
2431 lod_type = glsl_type::float_type;
2432 break;
2433 case ir_txf:
2434 case ir_txl:
2435 case ir_txs:
2436 ir->lod_info.lod->accept(this);
2437 lod = this->result;
2438 lod_type = ir->lod_info.lod->type;
2439 break;
2440 case ir_query_levels:
2441 lod = src_reg(0);
2442 lod_type = glsl_type::int_type;
2443 break;
2444 case ir_txf_ms:
2445 ir->lod_info.sample_index->accept(this);
2446 sample_index = this->result;
2447 sample_index_type = ir->lod_info.sample_index->type;
2448
2449 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2450 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2451 else
2452 mcs = src_reg(0u);
2453 break;
2454 case ir_txd:
2455 ir->lod_info.grad.dPdx->accept(this);
2456 dPdx = this->result;
2457
2458 ir->lod_info.grad.dPdy->accept(this);
2459 dPdy = this->result;
2460
2461 lod_type = ir->lod_info.grad.dPdx->type;
2462 break;
2463 case ir_txb:
2464 case ir_lod:
2465 case ir_tg4:
2466 break;
2467 }
2468
2469 enum opcode opcode;
2470 switch (ir->op) {
2471 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2472 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2473 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2474 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2475 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2476 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2477 case ir_tg4: opcode = has_nonconstant_offset
2478 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2479 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2480 case ir_txb:
2481 unreachable("TXB is not valid for vertex shaders.");
2482 case ir_lod:
2483 unreachable("LOD is not valid for vertex shaders.");
2484 default:
2485 unreachable("Unrecognized tex op");
2486 }
2487
2488 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2489
2490 if (ir->offset != NULL && ir->op != ir_txf)
2491 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2492
2493 /* Stuff the channel select bits in the top of the texture offset */
2494 if (ir->op == ir_tg4)
2495 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2496
2497 /* The message header is necessary for:
2498 * - Gen4 (always)
2499 * - Texel offsets
2500 * - Gather channel selection
2501 * - Sampler indices too large to fit in a 4-bit value.
2502 */
2503 inst->header_present =
2504 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2505 is_high_sampler(brw, sampler_reg);
2506 inst->base_mrf = 2;
2507 inst->mlen = inst->header_present + 1; /* always at least one */
2508 inst->dst = dst_reg(this, ir->type);
2509 inst->dst.writemask = WRITEMASK_XYZW;
2510 inst->shadow_compare = ir->shadow_comparitor != NULL;
2511
2512 inst->src[1] = sampler_reg;
2513
2514 /* MRF for the first parameter */
2515 int param_base = inst->base_mrf + inst->header_present;
2516
2517 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2518 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2519 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2520 } else {
2521 /* Load the coordinate */
2522 /* FINISHME: gl_clamp_mask and saturate */
2523 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2524 int zero_mask = 0xf & ~coord_mask;
2525
2526 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2527 coordinate));
2528
2529 if (zero_mask != 0) {
2530 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2531 src_reg(0)));
2532 }
2533 /* Load the shadow comparitor */
2534 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2535 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2536 WRITEMASK_X),
2537 shadow_comparitor));
2538 inst->mlen++;
2539 }
2540
2541 /* Load the LOD info */
2542 if (ir->op == ir_tex || ir->op == ir_txl) {
2543 int mrf, writemask;
2544 if (brw->gen >= 5) {
2545 mrf = param_base + 1;
2546 if (ir->shadow_comparitor) {
2547 writemask = WRITEMASK_Y;
2548 /* mlen already incremented */
2549 } else {
2550 writemask = WRITEMASK_X;
2551 inst->mlen++;
2552 }
2553 } else /* brw->gen == 4 */ {
2554 mrf = param_base;
2555 writemask = WRITEMASK_W;
2556 }
2557 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2558 } else if (ir->op == ir_txf) {
2559 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2560 } else if (ir->op == ir_txf_ms) {
2561 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2562 sample_index));
2563 if (brw->gen >= 7) {
2564 /* MCS data is in the first channel of `mcs`, but we need to get it into
2565 * the .y channel of the second vec4 of params, so replicate .x across
2566 * the whole vec4 and then mask off everything except .y.
2567 */
2568 mcs.swizzle = BRW_SWIZZLE_XXXX;
2569 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2570 mcs));
}
2571 inst->mlen++;
2572 } else if (ir->op == ir_txd) {
2573 const glsl_type *type = lod_type;
2574
2575 if (brw->gen >= 5) {
2576 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2577 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2578 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2579 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2580 inst->mlen++;
2581
2582 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2583 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2584 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2585 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2586 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2587 inst->mlen++;
2588
2589 if (ir->shadow_comparitor) {
2590 emit(MOV(dst_reg(MRF, param_base + 2,
2591 ir->shadow_comparitor->type, WRITEMASK_Z),
2592 shadow_comparitor));
2593 }
2594 }
2595 } else /* brw->gen == 4 */ {
2596 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2597 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2598 inst->mlen += 2;
2599 }
2600 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2601 if (ir->shadow_comparitor) {
2602 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2603 shadow_comparitor));
2604 }
2605
2606 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2607 offset_value));
2608 inst->mlen++;
2609 }
2610 }
2611
2612 emit(inst);
2613
2614 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2615 * faces * layers, but the spec requires just layers.
2616 */
2617 if (ir->op == ir_txs) {
2618 glsl_type const *type = ir->sampler->type;
2619 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2620 type->sampler_array) {
2621 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2622 writemask(inst->dst, WRITEMASK_Z),
2623 src_reg(inst->dst), src_reg(6));
2624 }
2625 }
2626
2627 if (brw->gen == 6 && ir->op == ir_tg4) {
2628 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2629 }
2630
2631 swizzle_result(ir, src_reg(inst->dst), sampler);
2632 }
2633
2634 /**
2635 * Apply workarounds for Gen6 gather with UINT/SINT
2636 */
2637 void
2638 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2639 {
2640 if (!wa)
2641 return;
2642
2643 int width = (wa & WA_8BIT) ? 8 : 16;
2644 dst_reg dst_f = dst;
2645 dst_f.type = BRW_REGISTER_TYPE_F;
2646
2647 /* Convert from UNORM to UINT */
2648 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2649 emit(MOV(dst, src_reg(dst_f)));
2650
2651 if (wa & WA_SIGN) {
2652 /* Reinterpret the UINT value as a signed INT value by
2653 * shifting the sign bit into place, then shifting back
2654 * preserving sign.
2655 */
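/* E.g. for an 8-bit format this is a shift left by 24 followed by an
 * arithmetic shift right by 24, sign-extending bits 7:0 (illustrative).
 */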
2656 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2657 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2658 }
2659 }
2660
2661 /**
2662 * Set up the gather channel based on the swizzle, for gather4.
2663 */
2664 uint32_t
2665 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2666 {
2667 ir_constant *chan = ir->lod_info.component->as_constant();
2668 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2669 switch (swiz) {
2670 case SWIZZLE_X: return 0;
2671 case SWIZZLE_Y:
2672 /* gather4 sampler is broken for green channel on RG32F --
2673 * we must ask for blue instead.
2674 */
2675 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2676 return 2;
2677 return 1;
2678 case SWIZZLE_Z: return 2;
2679 case SWIZZLE_W: return 3;
2680 default:
2681 unreachable("Not reached"); /* zero, one swizzles handled already */
2682 }
2683 }
2684
2685 void
2686 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2687 {
2688 int s = key->tex.swizzles[sampler];
2689
2690 this->result = src_reg(this, ir->type);
2691 dst_reg swizzled_result(this->result);
2692
2693 if (ir->op == ir_query_levels) {
2694 /* # levels is in .w */
2695 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2696 emit(MOV(swizzled_result, orig_val));
2697 return;
2698 }
2699
2700 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2701 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2702 emit(MOV(swizzled_result, orig_val));
2703 return;
2704 }
2705
2706
2707 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2708 int swizzle[4] = {0};
2709
2710 for (int i = 0; i < 4; i++) {
2711 switch (GET_SWZ(s, i)) {
2712 case SWIZZLE_ZERO:
2713 zero_mask |= (1 << i);
2714 break;
2715 case SWIZZLE_ONE:
2716 one_mask |= (1 << i);
2717 break;
2718 default:
2719 copy_mask |= (1 << i);
2720 swizzle[i] = GET_SWZ(s, i);
2721 break;
2722 }
2723 }
2724
2725 if (copy_mask) {
2726 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2727 swizzled_result.writemask = copy_mask;
2728 emit(MOV(swizzled_result, orig_val));
2729 }
2730
2731 if (zero_mask) {
2732 swizzled_result.writemask = zero_mask;
2733 emit(MOV(swizzled_result, src_reg(0.0f)));
2734 }
2735
2736 if (one_mask) {
2737 swizzled_result.writemask = one_mask;
2738 emit(MOV(swizzled_result, src_reg(1.0f)));
2739 }
2740 }
2741
2742 void
2743 vec4_visitor::visit(ir_return *)
2744 {
2745 unreachable("not reached");
2746 }
2747
2748 void
2749 vec4_visitor::visit(ir_discard *)
2750 {
2751 unreachable("not reached");
2752 }
2753
2754 void
2755 vec4_visitor::visit(ir_if *ir)
2756 {
2757 /* Don't point the annotation at the if statement, because then it plus
2758 * the then and else blocks get printed.
2759 */
2760 this->base_ir = ir->condition;
2761
2762 if (brw->gen == 6) {
2763 emit_if_gen6(ir);
2764 } else {
2765 enum brw_predicate predicate;
2766 emit_bool_to_cond_code(ir->condition, &predicate);
2767 emit(IF(predicate));
2768 }
2769
2770 visit_instructions(&ir->then_instructions);
2771
2772 if (!ir->else_instructions.is_empty()) {
2773 this->base_ir = ir->condition;
2774 emit(BRW_OPCODE_ELSE);
2775
2776 visit_instructions(&ir->else_instructions);
2777 }
2778
2779 this->base_ir = ir->condition;
2780 emit(BRW_OPCODE_ENDIF);
2781 }
2782
2783 void
2784 vec4_visitor::visit(ir_emit_vertex *)
2785 {
2786 unreachable("not reached");
2787 }
2788
2789 void
2790 vec4_visitor::visit(ir_end_primitive *)
2791 {
2792 unreachable("not reached");
2793 }
2794
2795 void
2796 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2797 dst_reg dst, src_reg offset,
2798 src_reg src0, src_reg src1)
2799 {
2800 unsigned mlen = 0;
2801
2802 /* Set the atomic operation offset. */
2803 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2804 mlen++;
2805
2806 /* Set the atomic operation arguments. */
2807 if (src0.file != BAD_FILE) {
2808 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2809 mlen++;
2810 }
2811
2812 if (src1.file != BAD_FILE) {
2813 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2814 mlen++;
2815 }
2816
2817 /* Emit the instruction. Note that this maps to the normal SIMD8
2818 * untyped atomic message on Ivy Bridge, but that's OK because
2819 * unused channels will be masked out.
2820 */
2821 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2822 src_reg(atomic_op), src_reg(surf_index));
2823 inst->base_mrf = 0;
2824 inst->mlen = mlen;
2825 }
2826
2827 void
2828 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2829 src_reg offset)
2830 {
2831 /* Set the surface read offset. */
2832 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2833
2834 /* Emit the instruction. Note that this maps to the normal SIMD8
2835 * untyped surface read message, but that's OK because unused
2836 * channels will be masked out.
2837 */
2838 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2839 dst, src_reg(surf_index));
2840 inst->base_mrf = 0;
2841 inst->mlen = 1;
2842 }
2843
2844 void
2845 vec4_visitor::emit_ndc_computation()
2846 {
2847 /* Get the position */
2848 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2849
2850 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2851 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2852 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2853
2854 current_annotation = "NDC";
2855 dst_reg ndc_w = ndc;
2856 ndc_w.writemask = WRITEMASK_W;
2857 src_reg pos_w = pos;
2858 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2859 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2860
2861 dst_reg ndc_xyz = ndc;
2862 ndc_xyz.writemask = WRITEMASK_XYZ;
2863
2864 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2865 }
2866
2867 void
2868 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2869 {
2870 if (brw->gen < 6 &&
2871 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2872 key->userclip_active || brw->has_negative_rhw_bug)) {
2873 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2874 dst_reg header1_w = header1;
2875 header1_w.writemask = WRITEMASK_W;
2876
2877 emit(MOV(header1, 0u));
2878
2879 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2880 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2881
2882 current_annotation = "Point size";
2883 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2884 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2885 }
2886
2887 if (key->userclip_active) {
2888 current_annotation = "Clipping flags";
2889 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2890 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2891
2892 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2893 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2894 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2895
2896 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2897 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2898 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2899 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2900 }
2901
2902 /* i965 clipping workaround:
2903 * 1) Test for -ve rhw
2904 * 2) If set,
2905 * set ndc = (0,0,0,0)
2906 * set ucp[6] = 1
2907 *
2908 * Later, clipping will detect ucp[6] and ensure the primitive is
2909 * clipped against all fixed planes.
2910 */
2911 if (brw->has_negative_rhw_bug) {
2912 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2913 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2914 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2915 vec4_instruction *inst;
2916 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2917 inst->predicate = BRW_PREDICATE_NORMAL;
2918 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2919 inst->predicate = BRW_PREDICATE_NORMAL;
2920 }
2921
2922 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2923 } else if (brw->gen < 6) {
2924 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2925 } else {
2926 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2927 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2928 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2929 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2930 }
2931 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2932 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2933 src_reg(output_reg[VARYING_SLOT_LAYER])));
2934 }
2935 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2936 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2937 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2938 }
2939 }
2940 }
2941
2942 void
2943 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2944 {
2945 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2946 *
2947 * "If a linked set of shaders forming the vertex stage contains no
2948 * static write to gl_ClipVertex or gl_ClipDistance, but the
2949 * application has requested clipping against user clip planes through
2950 * the API, then the coordinate written to gl_Position is used for
2951 * comparison against the user clip planes."
2952 *
2953 * This function is only called if the shader didn't write to
2954 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2955 * if the user wrote to it; otherwise we use gl_Position.
2956 */
2957 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2958 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2959 clip_vertex = VARYING_SLOT_POS;
2960 }
2961
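/* Descriptive note: each iteration below computes one clip distance as a DP4
 * of gl_ClipVertex (or gl_Position) against one user clip plane, written to
 * a single channel of `reg`.
 */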
2962 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2963 ++i) {
2964 reg.writemask = 1 << i;
2965 emit(DP4(reg,
2966 src_reg(output_reg[clip_vertex]),
2967 src_reg(this->userplane[i + offset])));
2968 }
2969 }
2970
2971 void
2972 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2973 {
2974 assert (varying < VARYING_SLOT_MAX);
2975 reg.type = output_reg[varying].type;
2976 current_annotation = output_reg_annotation[varying];
2977 /* Copy the register, saturating if necessary */
2978 vec4_instruction *inst = emit(MOV(reg,
2979 src_reg(output_reg[varying])));
2980 if ((varying == VARYING_SLOT_COL0 ||
2981 varying == VARYING_SLOT_COL1 ||
2982 varying == VARYING_SLOT_BFC0 ||
2983 varying == VARYING_SLOT_BFC1) &&
2984 key->clamp_vertex_color) {
2985 inst->saturate = true;
2986 }
2987 }
2988
2989 void
2990 vec4_visitor::emit_urb_slot(int mrf, int varying)
2991 {
2992 struct brw_reg hw_reg = brw_message_reg(mrf);
2993 dst_reg reg = dst_reg(MRF, mrf);
2994 reg.type = BRW_REGISTER_TYPE_F;
2995
2996 switch (varying) {
2997 case VARYING_SLOT_PSIZ:
2998 /* PSIZ is always in slot 0, and is coupled with other flags. */
2999 current_annotation = "indices, point width, clip flags";
3000 emit_psiz_and_flags(hw_reg);
3001 break;
3002 case BRW_VARYING_SLOT_NDC:
3003 current_annotation = "NDC";
3004 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3005 break;
3006 case VARYING_SLOT_POS:
3007 current_annotation = "gl_Position";
3008 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3009 break;
3010 case VARYING_SLOT_EDGE:
3011 /* This is present when doing unfilled polygons. We're supposed to copy
3012 * the edge flag from the user-provided vertex array
3013 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3014 * of that attribute (starts as 1.0f). This is then used in clipping to
3015 * determine which edges should be drawn as wireframe.
3016 */
3017 current_annotation = "edge flag";
3018 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3019 glsl_type::float_type, WRITEMASK_XYZW))));
3020 break;
3021 case BRW_VARYING_SLOT_PAD:
3022 /* No need to write to this slot */
3023 break;
3024 default:
3025 emit_generic_urb_slot(reg, varying);
3026 break;
3027 }
3028 }
3029
3030 static int
3031 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3032 {
3033 if (brw->gen >= 6) {
3034 /* URB data written (does not include the message header reg) must
3035 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3036 * section 5.4.3.2.2: URB_INTERLEAVED.
3037 *
3038 * URB entries are allocated on a multiple of 1024 bits, so an
3039 * extra 128 bits written here to make the end align to 256 is
3040 * no problem.
3041 */
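/* Illustrative: a header plus three data registers (mlen == 4) is padded to
 * mlen == 5 so the four data registers form two full interleaved URB rows.
 */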
3042 if ((mlen % 2) != 1)
3043 mlen++;
3044 }
3045
3046 return mlen;
3047 }
3048
3049
3050 /**
3051 * Generates the VUE payload plus the necessary URB write instructions to
3052 * output it.
3053 *
3054 * The VUE layout is documented in Volume 2a.
3055 */
3056 void
3057 vec4_visitor::emit_vertex()
3058 {
3059 /* MRF 0 is reserved for the debugger, so start with message header
3060 * in MRF 1.
3061 */
3062 int base_mrf = 1;
3063 int mrf = base_mrf;
3064 /* In the process of generating our URB write message contents, we
3065 * may need to unspill a register or load from an array. Those
3066 * reads would use MRFs 14-15.
3067 */
3068 int max_usable_mrf = 13;
3069
3070 /* The following assertion verifies that max_usable_mrf causes an
3071 * even-numbered amount of URB write data, which will meet gen6's
3072 * requirements for length alignment.
3073 */
3074 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3075
3076 /* First mrf is the g0-based message header containing URB handles and
3077 * such.
3078 */
3079 emit_urb_write_header(mrf++);
3080
3081 if (brw->gen < 6) {
3082 emit_ndc_computation();
3083 }
3084
3085 /* Lower legacy ff and ClipVertex clipping to clip distances */
3086 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3087 current_annotation = "user clip distances";
3088
3089 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3090 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3091
3092 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3093 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3094 }
3095
3096 /* We may need to split this up into several URB writes, so do them in a
3097 * loop.
3098 */
3099 int slot = 0;
3100 bool complete = false;
3101 do {
3102 /* URB offset is in URB row increments, and each of our MRFs is half of
3103 * one of those, since we're doing interleaved writes.
3104 */
3105 int offset = slot / 2;
3106
3107 mrf = base_mrf + 1;
3108 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3109 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3110
3111 /* If this was max_usable_mrf, we can't fit anything more into this
3112 * URB WRITE.
3113 */
3114 if (mrf > max_usable_mrf) {
3115 slot++;
3116 break;
3117 }
3118 }
3119
3120 complete = slot >= prog_data->vue_map.num_slots;
3121 current_annotation = "URB write";
3122 vec4_instruction *inst = emit_urb_write_opcode(complete);
3123 inst->base_mrf = base_mrf;
3124 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3125 inst->offset += offset;
3126 } while(!complete);
3127 }
3128
3129
3130 src_reg
3131 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3132 src_reg *reladdr, int reg_offset)
3133 {
3134 /* Because we store the values to scratch interleaved like our
3135 * vertex data, we need to scale the vec4 index by 2.
3136 */
3137 int message_header_scale = 2;
3138
3139 /* Pre-gen6, the message header uses byte offsets instead of vec4
3140 * (16-byte) offset units.
3141 */
3142 if (brw->gen < 6)
3143 message_header_scale *= 16;
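/* Illustrative: reg_offset 3 becomes a message offset of 6 on gen6+
 * (interleaved vec4 rows) or 96 on older gens (bytes).
 */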
3144
3145 if (reladdr) {
3146 src_reg index = src_reg(this, glsl_type::int_type);
3147
3148 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3149 emit_before(inst, MUL(dst_reg(index),
3150 index, src_reg(message_header_scale)));
3151
3152 return index;
3153 } else {
3154 return src_reg(reg_offset * message_header_scale);
3155 }
3156 }
3157
3158 src_reg
3159 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3160 src_reg *reladdr, int reg_offset)
3161 {
3162 if (reladdr) {
3163 src_reg index = src_reg(this, glsl_type::int_type);
3164
3165 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3166
3167 /* Pre-gen6, the message header uses byte offsets instead of vec4
3168 * (16-byte) offset units.
3169 */
3170 if (brw->gen < 6) {
3171 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3172 }
3173
3174 return index;
3175 } else if (brw->gen >= 8) {
3176 /* Store the offset in a GRF so we can send-from-GRF. */
3177 src_reg offset = src_reg(this, glsl_type::int_type);
3178 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3179 return offset;
3180 } else {
3181 int message_header_scale = brw->gen < 6 ? 16 : 1;
3182 return src_reg(reg_offset * message_header_scale);
3183 }
3184 }
3185
3186 /**
3187 * Emits an instruction before @inst to load the value named by @orig_src
3188 * from scratch space at @base_offset to @temp.
3189 *
3190 * @base_offset is measured in 32-byte units (the size of a register).
3191 */
3192 void
3193 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3194 dst_reg temp, src_reg orig_src,
3195 int base_offset)
3196 {
3197 int reg_offset = base_offset + orig_src.reg_offset;
3198 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3199
3200 emit_before(inst, SCRATCH_READ(temp, index));
3201 }
3202
3203 /**
3204 * Emits an instruction after @inst to store the value to be written
3205 * to @orig_dst to scratch space at @base_offset, from @temp.
3206 *
3207 * @base_offset is measured in 32-byte units (the size of a register).
3208 */
3209 void
3210 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3211 {
3212 int reg_offset = base_offset + inst->dst.reg_offset;
3213 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3214
3215 /* Create a temporary register to store *inst's result in.
3216 *
3217 * We have to be careful in MOVing from our temporary result register in
3218 * the scratch write. If we swizzle from channels of the temporary that
3219 * weren't initialized, it will confuse live interval analysis, which will
3220 * make spilling fail to make progress.
3221 */
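/* Illustrative: if only .y of the destination is written, temp is read back
 * as .yyyy so no uninitialized channel is ever sourced.
 */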
3222 src_reg temp = src_reg(this, glsl_type::vec4_type);
3223 temp.type = inst->dst.type;
3224 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3225 int swizzles[4];
3226 for (int i = 0; i < 4; i++)
3227 if (inst->dst.writemask & (1 << i))
3228 swizzles[i] = i;
3229 else
3230 swizzles[i] = first_writemask_chan;
3231 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3232 swizzles[2], swizzles[3]);
3233
3234 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3235 inst->dst.writemask));
3236 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3237 write->predicate = inst->predicate;
3238 write->ir = inst->ir;
3239 write->annotation = inst->annotation;
3240 inst->insert_after(write);
3241
3242 inst->dst.file = temp.file;
3243 inst->dst.reg = temp.reg;
3244 inst->dst.reg_offset = temp.reg_offset;
3245 inst->dst.reladdr = NULL;
3246 }
3247
3248 /**
3249 * We can't generally support array access in GRF space, because a
3250 * single instruction's destination can only span 2 contiguous
3251 * registers. So, we send all GRF arrays that get variable index
3252 * access to scratch space.
3253 */
3254 void
3255 vec4_visitor::move_grf_array_access_to_scratch()
3256 {
3257 int scratch_loc[this->virtual_grf_count];
3258
3259 for (int i = 0; i < this->virtual_grf_count; i++) {
3260 scratch_loc[i] = -1;
3261 }
3262
3263 /* First, calculate the set of virtual GRFs that need to be punted
3264 * to scratch due to having any array access on them, and where in
3265 * scratch.
3266 */
3267 foreach_in_list(vec4_instruction, inst, &instructions) {
3268 if (inst->dst.file == GRF && inst->dst.reladdr &&
3269 scratch_loc[inst->dst.reg] == -1) {
3270 scratch_loc[inst->dst.reg] = c->last_scratch;
3271 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3272 }
3273
3274 for (int i = 0 ; i < 3; i++) {
3275 src_reg *src = &inst->src[i];
3276
3277 if (src->file == GRF && src->reladdr &&
3278 scratch_loc[src->reg] == -1) {
3279 scratch_loc[src->reg] = c->last_scratch;
3280 c->last_scratch += this->virtual_grf_sizes[src->reg];
3281 }
3282 }
3283 }
3284
3285 /* Now, for anything that will be accessed through scratch, rewrite
3286 * it to load/store. Note that this is a _safe list walk, because
3287 * we may generate a new scratch_write instruction after the one
3288 * we're processing.
3289 */
3290 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3291 /* Set up the annotation tracking for newly generated instructions. */
3292 base_ir = inst->ir;
3293 current_annotation = inst->annotation;
3294
3295 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3296 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3297 }
3298
3299 for (int i = 0 ; i < 3; i++) {
3300 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3301 continue;
3302
3303 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3304
3305 emit_scratch_read(inst, temp, inst->src[i],
3306 scratch_loc[inst->src[i].reg]);
3307
3308 inst->src[i].file = temp.file;
3309 inst->src[i].reg = temp.reg;
3310 inst->src[i].reg_offset = temp.reg_offset;
3311 inst->src[i].reladdr = NULL;
3312 }
3313 }
3314 }
3315
3316 /**
3317 * Emits an instruction before @inst to load the value named by @orig_src
3318 * from the pull constant buffer (surface) at @base_offset to @temp.
3319 */
3320 void
3321 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3322 dst_reg temp, src_reg orig_src,
3323 int base_offset)
3324 {
3325 int reg_offset = base_offset + orig_src.reg_offset;
3326 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3327 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3328 vec4_instruction *load;
3329
3330 if (brw->gen >= 7) {
3331 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3332 grf_offset.type = offset.type;
3333 emit_before(inst, MOV(grf_offset, offset));
3334
3335 load = new(mem_ctx) vec4_instruction(this,
3336 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3337 temp, index, src_reg(grf_offset));
3338 } else {
3339 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3340 temp, index, offset);
3341 load->base_mrf = 14;
3342 load->mlen = 1;
3343 }
3344 emit_before(inst, load);
3345 }
3346
3347 /**
3348 * Implements array access of uniforms by inserting a
3349 * PULL_CONSTANT_LOAD instruction.
3350 *
3351 * Unlike temporary GRF array access (where we don't support it due to
3352 * the difficulty of doing relative addressing on instruction
3353 * destinations), we could potentially do array access of uniforms
3354 * that were loaded in GRF space as push constants. In real-world
3355 * usage we've seen, though, the arrays being used are always larger
3356 * than we could load as push constants, so just always move all
3357 * uniform array access out to a pull constant buffer.
3358 */
3359 void
3360 vec4_visitor::move_uniform_array_access_to_pull_constants()
3361 {
3362 int pull_constant_loc[this->uniforms];
3363
3364 for (int i = 0; i < this->uniforms; i++) {
3365 pull_constant_loc[i] = -1;
3366 }
3367
3368 /* Walk through and find array access of uniforms. Put a copy of that
3369 * uniform in the pull constant buffer.
3370 *
3371 * Note that we don't move constant-indexed accesses to arrays. No
3372 * testing has been done of the performance impact of this choice.
3373 */
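/* Illustrative (hypothetical names): for an indirectly indexed uniform array
 * access `u[i]`, every element of u is appended to pull_param[] and the
 * access is rewritten to read a fresh temporary filled by a pull constant
 * load.
 */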
3374 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3375 for (int i = 0 ; i < 3; i++) {
3376 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3377 continue;
3378
3379 int uniform = inst->src[i].reg;
3380
3381 /* If this array isn't already present in the pull constant buffer,
3382 * add it.
3383 */
3384 if (pull_constant_loc[uniform] == -1) {
3385 const gl_constant_value **values =
3386 &stage_prog_data->param[uniform * 4];
3387
3388 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3389
3390 assert(uniform < uniform_array_size);
3391 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3392 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3393 = values[j];
3394 }
3395 }
3396
3397 /* Set up the annotation tracking for newly generated instructions. */
3398 base_ir = inst->ir;
3399 current_annotation = inst->annotation;
3400
3401 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3402
3403 emit_pull_constant_load(inst, temp, inst->src[i],
3404 pull_constant_loc[uniform]);
3405
3406 inst->src[i].file = temp.file;
3407 inst->src[i].reg = temp.reg;
3408 inst->src[i].reg_offset = temp.reg_offset;
3409 inst->src[i].reladdr = NULL;
3410 }
3411 }
3412
3413 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3414 * no need to track them as larger-than-vec4 objects. This will be
3415 * relied on in cutting out unused uniform vectors from push
3416 * constants.
3417 */
3418 split_uniform_registers();
3419 }
3420
3421 void
3422 vec4_visitor::resolve_ud_negate(src_reg *reg)
3423 {
3424 if (reg->type != BRW_REGISTER_TYPE_UD ||
3425 !reg->negate)
3426 return;
3427
3428 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3429 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3430 *reg = temp;
3431 }
3432
3433 vec4_visitor::vec4_visitor(struct brw_context *brw,
3434 struct brw_vec4_compile *c,
3435 struct gl_program *prog,
3436 const struct brw_vec4_prog_key *key,
3437 struct brw_vec4_prog_data *prog_data,
3438 struct gl_shader_program *shader_prog,
3439 gl_shader_stage stage,
3440 void *mem_ctx,
3441 bool debug_flag,
3442 bool no_spills,
3443 shader_time_shader_type st_base,
3444 shader_time_shader_type st_written,
3445 shader_time_shader_type st_reset)
3446 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3447 c(c),
3448 key(key),
3449 prog_data(prog_data),
3450 sanity_param_count(0),
3451 fail_msg(NULL),
3452 first_non_payload_grf(0),
3453 need_all_constants_in_pull_buffer(false),
3454 debug_flag(debug_flag),
3455 no_spills(no_spills),
3456 st_base(st_base),
3457 st_written(st_written),
3458 st_reset(st_reset)
3459 {
3460 this->mem_ctx = mem_ctx;
3461 this->failed = false;
3462
3463 this->base_ir = NULL;
3464 this->current_annotation = NULL;
3465 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3466
3467 this->variable_ht = hash_table_ctor(0,
3468 hash_table_pointer_hash,
3469 hash_table_pointer_compare);
3470
3471 this->virtual_grf_start = NULL;
3472 this->virtual_grf_end = NULL;
3473 this->virtual_grf_sizes = NULL;
3474 this->virtual_grf_count = 0;
3475 this->virtual_grf_reg_map = NULL;
3476 this->virtual_grf_reg_count = 0;
3477 this->virtual_grf_array_size = 0;
3478 this->live_intervals_valid = false;
3479
3480 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3481
3482 this->uniforms = 0;
3483
3484 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3485 * at least one. See setup_uniforms() in brw_vec4.cpp.
3486 */
3487 this->uniform_array_size = 1;
3488 if (prog_data) {
3489 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3490 }
3491
3492 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3493 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3494 }
3495
3496 vec4_visitor::~vec4_visitor()
3497 {
3498 hash_table_dtor(this->variable_ht);
3499 }
3500
3501
3502 void
3503 vec4_visitor::fail(const char *format, ...)
3504 {
3505 va_list va;
3506 char *msg;
3507
3508 if (failed)
3509 return;
3510
3511 failed = true;
3512
3513 va_start(va, format);
3514 msg = ralloc_vasprintf(mem_ctx, format, va);
3515 va_end(va);
3516 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3517
3518 this->fail_msg = msg;
3519
3520 if (debug_flag) {
3521 fprintf(stderr, "%s", msg);
3522 }
3523 }
3524
3525 } /* namespace brw */