ce94db858f9478d4023074d5889f3a7a8d64bb02
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
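/**
 * Append an already-built instruction to the end of the instruction stream.
 */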
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
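/**
 * Insert new_inst immediately before inst, inheriting inst's source IR
 * pointer and annotation for debug output.
 */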
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226  * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
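/**
 * Build a gen4-style scratch (spill space) read of one vec4 from the
 * location given by index.  The message occupies two MRFs starting at MRF 14.
 */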
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
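/**
 * Build a gen4-style scratch (spill space) write of one vec4 to the
 * location given by index.  The message occupies three MRFs starting at MRF 13.
 */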
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
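/**
 * Emit a DP2, DP3 or DP4 according to the number of components being
 * dotted together (2-4).
 */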
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
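/**
 * Emit a single-operand math instruction (RCP, RSQ, SQRT, EXP2, LOG2,
 * SIN or COS), choosing the emission strategy appropriate for the
 * hardware generation.
 */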
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
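/**
 * Emit a two-operand math instruction (POW, INT_QUOTIENT or
 * INT_REMAINDER), choosing the emission strategy appropriate for the
 * hardware generation.
 */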
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
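/**
 * Return the number of vec4 slots (virtual GRF registers) occupied by a
 * value of the given GLSL type.
 */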
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574          /* Regardless of the size of the vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
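/**
 * Allocate a new virtual GRF of the given size in vec4 registers, growing
 * the bookkeeping arrays if necessary, and return its index.
 */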
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
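/**
 * Construct a source register backed by a newly allocated virtual GRF
 * large enough to hold the given GLSL type, with a swizzle appropriate
 * for that type.
 */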
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
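/**
 * Set up one vec4 uniform per enabled user clip plane, pointing the
 * parameter slots at the current clip plane values.
 */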
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736       /* This state reference has already been set up by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
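/**
 * Evaluate a boolean rvalue and leave its result in the flag register so
 * that a following instruction can be predicated on it.  *predicate is set
 * to the predicate mode the caller should use (normal, or an ALL4H/ANY4H
 * mode for vector comparisons).
 */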
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
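/**
 * Try to fuse an add of a multiply into a single MAD.  Returns false
 * (emitting nothing) if the pattern doesn't match or MAD can't be used,
 * e.g. on gen4/5 or for non-float types.
 */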
1080 bool
1081 vec4_visitor::try_emit_mad(ir_expression *ir)
1082 {
1083 /* 3-src instructions were introduced in gen6. */
1084 if (brw->gen < 6)
1085 return false;
1086
1087 /* MAD can only handle floating-point data. */
1088 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1089 return false;
1090
1091 ir_rvalue *nonmul = ir->operands[1];
1092 ir_expression *mul = ir->operands[0]->as_expression();
1093
1094 if (!mul || mul->operation != ir_binop_mul) {
1095 nonmul = ir->operands[0];
1096 mul = ir->operands[1]->as_expression();
1097
1098 if (!mul || mul->operation != ir_binop_mul)
1099 return false;
1100 }
1101
1102 nonmul->accept(this);
1103 src_reg src0 = fix_3src_operand(this->result);
1104
1105 mul->operands[0]->accept(this);
1106 src_reg src1 = fix_3src_operand(this->result);
1107
1108 mul->operands[1]->accept(this);
1109 src_reg src2 = fix_3src_operand(this->result);
1110
1111 this->result = src_reg(this, ir->type);
1112 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1113
1114 return true;
1115 }
1116
1117 bool
1118 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1119 {
1120 /* This optimization relies on CMP setting the destination to 0 when
1121 * false. Early hardware only sets the least significant bit, and
1122 * leaves the other bits undefined. So we can't use it.
1123 */
1124 if (brw->gen < 6)
1125 return false;
1126
1127 ir_expression *const cmp = ir->operands[0]->as_expression();
1128
1129 if (cmp == NULL)
1130 return false;
1131
1132 switch (cmp->operation) {
1133 case ir_binop_less:
1134 case ir_binop_greater:
1135 case ir_binop_lequal:
1136 case ir_binop_gequal:
1137 case ir_binop_equal:
1138 case ir_binop_nequal:
1139 break;
1140
1141 default:
1142 return false;
1143 }
1144
1145 cmp->operands[0]->accept(this);
1146 const src_reg cmp_src0 = this->result;
1147
1148 cmp->operands[1]->accept(this);
1149 const src_reg cmp_src1 = this->result;
1150
1151 this->result = src_reg(this, ir->type);
1152
1153 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1154 brw_conditional_for_comparison(cmp->operation)));
1155
1156 /* If the comparison is false, this->result will just happen to be zero.
1157 */
1158 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1159 this->result, src_reg(1.0f));
1160 inst->predicate = BRW_PREDICATE_NORMAL;
1161 inst->predicate_inverse = true;
1162
1163 return true;
1164 }
1165
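/**
 * Emit a MIN or MAX (selected by the conditional mod): a conditional SEL
 * on gen6+, or a CMP followed by a predicated SEL on earlier generations.
 */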
1166 void
1167 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1168 src_reg src0, src_reg src1)
1169 {
1170 vec4_instruction *inst;
1171
1172 if (brw->gen >= 6) {
1173 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1174 inst->conditional_mod = conditionalmod;
1175 } else {
1176 emit(CMP(dst, src0, src1, conditionalmod));
1177
1178 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1179 inst->predicate = BRW_PREDICATE_NORMAL;
1180 }
1181 }
1182
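/**
 * Emit a linear interpolation of x and y by a: a single LRP on gen6+, or
 * the expanded x*(1-a) + y*a sequence on older hardware.
 */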
1183 void
1184 vec4_visitor::emit_lrp(const dst_reg &dst,
1185 const src_reg &x, const src_reg &y, const src_reg &a)
1186 {
1187 if (brw->gen >= 6) {
1188 /* Note that the instruction's argument order is reversed from GLSL
1189 * and the IR.
1190 */
1191 emit(LRP(dst,
1192 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1193 } else {
1194 /* Earlier generations don't support three source operations, so we
1195 * need to emit x*(1-a) + y*a.
1196 */
1197 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1198 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1199 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1200 y_times_a.writemask = dst.writemask;
1201 one_minus_a.writemask = dst.writemask;
1202 x_times_one_minus_a.writemask = dst.writemask;
1203
1204 emit(MUL(y_times_a, y, a));
1205 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1206 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1207 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1208 }
1209 }
1210
1211 void
1212 vec4_visitor::visit(ir_expression *ir)
1213 {
1214 unsigned int operand;
1215 src_reg op[Elements(ir->operands)];
1216 src_reg result_src;
1217 dst_reg result_dst;
1218 vec4_instruction *inst;
1219
1220 if (ir->operation == ir_binop_add) {
1221 if (try_emit_mad(ir))
1222 return;
1223 }
1224
1225 if (ir->operation == ir_unop_b2f) {
1226 if (try_emit_b2f_of_compare(ir))
1227 return;
1228 }
1229
1230 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1231 this->result.file = BAD_FILE;
1232 ir->operands[operand]->accept(this);
1233 if (this->result.file == BAD_FILE) {
1234 fprintf(stderr, "Failed to get tree for expression operand:\n");
1235 ir->operands[operand]->fprint(stderr);
1236 exit(1);
1237 }
1238 op[operand] = this->result;
1239
1240 /* Matrix expression operands should have been broken down to vector
1241 * operations already.
1242 */
1243 assert(!ir->operands[operand]->type->is_matrix());
1244 }
1245
1246 int vector_elements = ir->operands[0]->type->vector_elements;
1247 if (ir->operands[1]) {
1248 vector_elements = MAX2(vector_elements,
1249 ir->operands[1]->type->vector_elements);
1250 }
1251
1252 this->result.file = BAD_FILE;
1253
1254 /* Storage for our result. Ideally for an assignment we'd be using
1255 * the actual storage for the result here, instead.
1256 */
1257 result_src = src_reg(this, ir->type);
1258 /* convenience for the emit functions below. */
1259 result_dst = dst_reg(result_src);
1260 /* If nothing special happens, this is the result. */
1261 this->result = result_src;
1262 /* Limit writes to the channels that will be used by result_src later.
1263 * This does limit this temp's use as a temporary for multi-instruction
1264 * sequences.
1265 */
1266 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1267
1268 switch (ir->operation) {
1269 case ir_unop_logic_not:
1270 if (ctx->Const.UniformBooleanTrue != 1) {
1271 emit(NOT(result_dst, op[0]));
1272 } else {
1273 emit(XOR(result_dst, op[0], src_reg(1)));
1274 }
1275 break;
1276 case ir_unop_neg:
1277 op[0].negate = !op[0].negate;
1278 emit(MOV(result_dst, op[0]));
1279 break;
1280 case ir_unop_abs:
1281 op[0].abs = true;
1282 op[0].negate = false;
1283 emit(MOV(result_dst, op[0]));
1284 break;
1285
1286 case ir_unop_sign:
1287 if (ir->type->is_float()) {
1288 /* AND(val, 0x80000000) gives the sign bit.
1289 *
1290 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1291 * zero.
1292 */
1293 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1294
1295 op[0].type = BRW_REGISTER_TYPE_UD;
1296 result_dst.type = BRW_REGISTER_TYPE_UD;
1297 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1298
1299 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1300 inst->predicate = BRW_PREDICATE_NORMAL;
1301
1302 this->result.type = BRW_REGISTER_TYPE_F;
1303 } else {
1304 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1305 * -> non-negative val generates 0x00000000.
1306 * Predicated OR sets 1 if val is positive.
1307 */
1308 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1309
1310 emit(ASR(result_dst, op[0], src_reg(31)));
1311
1312 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1313 inst->predicate = BRW_PREDICATE_NORMAL;
1314 }
1315 break;
1316
1317 case ir_unop_rcp:
1318 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1319 break;
1320
1321 case ir_unop_exp2:
1322 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1323 break;
1324 case ir_unop_log2:
1325 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1326 break;
1327 case ir_unop_exp:
1328 case ir_unop_log:
1329 unreachable("not reached: should be handled by ir_explog_to_explog2");
1330 case ir_unop_sin:
1331 case ir_unop_sin_reduced:
1332 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1333 break;
1334 case ir_unop_cos:
1335 case ir_unop_cos_reduced:
1336 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1337 break;
1338
1339 case ir_unop_dFdx:
1340 case ir_unop_dFdx_coarse:
1341 case ir_unop_dFdx_fine:
1342 case ir_unop_dFdy:
1343 case ir_unop_dFdy_coarse:
1344 case ir_unop_dFdy_fine:
1345 unreachable("derivatives not valid in vertex shader");
1346
1347 case ir_unop_bitfield_reverse:
1348 emit(BFREV(result_dst, op[0]));
1349 break;
1350 case ir_unop_bit_count:
1351 emit(CBIT(result_dst, op[0]));
1352 break;
1353 case ir_unop_find_msb: {
1354 src_reg temp = src_reg(this, glsl_type::uint_type);
1355
1356 inst = emit(FBH(dst_reg(temp), op[0]));
1357 inst->dst.writemask = WRITEMASK_XYZW;
1358
1359 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1360 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1361 * subtract the result from 31 to convert the MSB count into an LSB count.
1362 */
1363
1364 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1365 temp.swizzle = BRW_SWIZZLE_NOOP;
1366 emit(MOV(result_dst, temp));
1367
1368 src_reg src_tmp = src_reg(result_dst);
1369 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1370
1371 src_tmp.negate = true;
1372 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1373 inst->predicate = BRW_PREDICATE_NORMAL;
1374 break;
1375 }
1376 case ir_unop_find_lsb:
1377 emit(FBL(result_dst, op[0]));
1378 break;
1379 case ir_unop_saturate:
1380 inst = emit(MOV(result_dst, op[0]));
1381 inst->saturate = true;
1382 break;
1383
1384 case ir_unop_noise:
1385 unreachable("not reached: should be handled by lower_noise");
1386
1387 case ir_binop_add:
1388 emit(ADD(result_dst, op[0], op[1]));
1389 break;
1390 case ir_binop_sub:
1391 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1392
1393 case ir_binop_mul:
1394 if (brw->gen < 8 && ir->type->is_integer()) {
1395 /* For integer multiplication, the MUL uses the low 16 bits of one of
1396 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1397          * accumulates the contribution of the upper 16 bits of that
1398 * operand. If we can determine that one of the args is in the low
1399 * 16 bits, though, we can just emit a single MUL.
1400 */
1401 if (ir->operands[0]->is_uint16_constant()) {
1402 if (brw->gen < 7)
1403 emit(MUL(result_dst, op[0], op[1]));
1404 else
1405 emit(MUL(result_dst, op[1], op[0]));
1406 } else if (ir->operands[1]->is_uint16_constant()) {
1407 if (brw->gen < 7)
1408 emit(MUL(result_dst, op[1], op[0]));
1409 else
1410 emit(MUL(result_dst, op[0], op[1]));
1411 } else {
1412 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1413
1414 emit(MUL(acc, op[0], op[1]));
1415 emit(MACH(dst_null_d(), op[0], op[1]));
1416 emit(MOV(result_dst, src_reg(acc)));
1417 }
1418 } else {
1419 emit(MUL(result_dst, op[0], op[1]));
1420 }
1421 break;
1422 case ir_binop_imul_high: {
1423 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1424
1425 emit(MUL(acc, op[0], op[1]));
1426 emit(MACH(result_dst, op[0], op[1]));
1427 break;
1428 }
1429 case ir_binop_div:
1430 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1431 assert(ir->type->is_integer());
1432 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1433 break;
1434 case ir_binop_carry: {
1435 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1436
1437 emit(ADDC(dst_null_ud(), op[0], op[1]));
1438 emit(MOV(result_dst, src_reg(acc)));
1439 break;
1440 }
1441 case ir_binop_borrow: {
1442 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1443
1444 emit(SUBB(dst_null_ud(), op[0], op[1]));
1445 emit(MOV(result_dst, src_reg(acc)));
1446 break;
1447 }
1448 case ir_binop_mod:
1449 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1450 assert(ir->type->is_integer());
1451 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1452 break;
1453
1454 case ir_binop_less:
1455 case ir_binop_greater:
1456 case ir_binop_lequal:
1457 case ir_binop_gequal:
1458 case ir_binop_equal:
1459 case ir_binop_nequal: {
1460 emit(CMP(result_dst, op[0], op[1],
1461 brw_conditional_for_comparison(ir->operation)));
1462 if (ctx->Const.UniformBooleanTrue == 1) {
1463 emit(AND(result_dst, result_src, src_reg(1)));
1464 }
1465 break;
1466 }
1467
1468 case ir_binop_all_equal:
1469 /* "==" operator producing a scalar boolean. */
1470 if (ir->operands[0]->type->is_vector() ||
1471 ir->operands[1]->type->is_vector()) {
1472 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1473 emit(MOV(result_dst, src_reg(0)));
1474 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1475 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1476 } else {
1477 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1478 if (ctx->Const.UniformBooleanTrue == 1) {
1479 emit(AND(result_dst, result_src, src_reg(1)));
1480 }
1481 }
1482 break;
1483 case ir_binop_any_nequal:
1484 /* "!=" operator producing a scalar boolean. */
1485 if (ir->operands[0]->type->is_vector() ||
1486 ir->operands[1]->type->is_vector()) {
1487 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1488
1489 emit(MOV(result_dst, src_reg(0)));
1490 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1491 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1492 } else {
1493 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1494 if (ctx->Const.UniformBooleanTrue == 1) {
1495 emit(AND(result_dst, result_src, src_reg(1)));
1496 }
1497 }
1498 break;
1499
1500 case ir_unop_any:
1501 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1502 emit(MOV(result_dst, src_reg(0)));
1503
1504 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1505 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1506 break;
1507
1508 case ir_binop_logic_xor:
1509 emit(XOR(result_dst, op[0], op[1]));
1510 break;
1511
1512 case ir_binop_logic_or:
1513 emit(OR(result_dst, op[0], op[1]));
1514 break;
1515
1516 case ir_binop_logic_and:
1517 emit(AND(result_dst, op[0], op[1]));
1518 break;
1519
1520 case ir_binop_dot:
1521 assert(ir->operands[0]->type->is_vector());
1522 assert(ir->operands[0]->type == ir->operands[1]->type);
1523 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1524 break;
1525
1526 case ir_unop_sqrt:
1527 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1528 break;
1529 case ir_unop_rsq:
1530 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1531 break;
1532
1533 case ir_unop_bitcast_i2f:
1534 case ir_unop_bitcast_u2f:
1535 this->result = op[0];
1536 this->result.type = BRW_REGISTER_TYPE_F;
1537 break;
1538
1539 case ir_unop_bitcast_f2i:
1540 this->result = op[0];
1541 this->result.type = BRW_REGISTER_TYPE_D;
1542 break;
1543
1544 case ir_unop_bitcast_f2u:
1545 this->result = op[0];
1546 this->result.type = BRW_REGISTER_TYPE_UD;
1547 break;
1548
1549 case ir_unop_i2f:
1550 case ir_unop_i2u:
1551 case ir_unop_u2i:
1552 case ir_unop_u2f:
1553 case ir_unop_f2i:
1554 case ir_unop_f2u:
1555 emit(MOV(result_dst, op[0]));
1556 break;
1557 case ir_unop_b2i:
1558 if (ctx->Const.UniformBooleanTrue != 1) {
1559 emit(AND(result_dst, op[0], src_reg(1)));
1560 } else {
1561 emit(MOV(result_dst, op[0]));
1562 }
1563 break;
1564 case ir_unop_b2f:
1565 if (ctx->Const.UniformBooleanTrue != 1) {
1566 op[0].type = BRW_REGISTER_TYPE_UD;
1567 result_dst.type = BRW_REGISTER_TYPE_UD;
1568 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1569 result_dst.type = BRW_REGISTER_TYPE_F;
1570 } else {
1571 emit(MOV(result_dst, op[0]));
1572 }
1573 break;
1574 case ir_unop_f2b:
1575 case ir_unop_i2b:
1576 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1577 if (ctx->Const.UniformBooleanTrue == 1) {
1578 emit(AND(result_dst, result_src, src_reg(1)));
1579 }
1580 break;
1581
1582 case ir_unop_trunc:
1583 emit(RNDZ(result_dst, op[0]));
1584 break;
1585 case ir_unop_ceil:
1586 op[0].negate = !op[0].negate;
1587 inst = emit(RNDD(result_dst, op[0]));
1588 this->result.negate = true;
1589 break;
1590 case ir_unop_floor:
1591 inst = emit(RNDD(result_dst, op[0]));
1592 break;
1593 case ir_unop_fract:
1594 inst = emit(FRC(result_dst, op[0]));
1595 break;
1596 case ir_unop_round_even:
1597 emit(RNDE(result_dst, op[0]));
1598 break;
1599
1600 case ir_binop_min:
1601 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1602 break;
1603 case ir_binop_max:
1604 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1605 break;
1606
1607 case ir_binop_pow:
1608 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1609 break;
1610
1611 case ir_unop_bit_not:
1612 inst = emit(NOT(result_dst, op[0]));
1613 break;
1614 case ir_binop_bit_and:
1615 inst = emit(AND(result_dst, op[0], op[1]));
1616 break;
1617 case ir_binop_bit_xor:
1618 inst = emit(XOR(result_dst, op[0], op[1]));
1619 break;
1620 case ir_binop_bit_or:
1621 inst = emit(OR(result_dst, op[0], op[1]));
1622 break;
1623
1624 case ir_binop_lshift:
1625 inst = emit(SHL(result_dst, op[0], op[1]));
1626 break;
1627
1628 case ir_binop_rshift:
1629 if (ir->type->base_type == GLSL_TYPE_INT)
1630 inst = emit(ASR(result_dst, op[0], op[1]));
1631 else
1632 inst = emit(SHR(result_dst, op[0], op[1]));
1633 break;
1634
1635 case ir_binop_bfm:
1636 emit(BFI1(result_dst, op[0], op[1]));
1637 break;
1638
1639 case ir_binop_ubo_load: {
1640 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1641 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1642 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1643 src_reg offset;
1644
1645 /* Now, load the vector from that offset. */
1646 assert(ir->type->is_vector() || ir->type->is_scalar());
1647
1648 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1649 packed_consts.type = result.type;
1650 src_reg surf_index;
1651
1652 if (const_uniform_block) {
1653 /* The block index is a constant, so just emit the binding table entry
1654 * as an immediate.
1655 */
1656 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1657 const_uniform_block->value.u[0]);
1658 } else {
1659 /* The block index is not a constant. Evaluate the index expression
1660 * per-channel and add the base UBO index; the generator will select
1661 * a value from any live channel.
1662 */
1663 surf_index = src_reg(this, glsl_type::uint_type);
1664 emit(ADD(dst_reg(surf_index), op[0],
1665 src_reg(prog_data->base.binding_table.ubo_start)));
1666
1667 /* Assume this may touch any UBO. It would be nice to provide
1668 * a tighter bound, but the array information is already lowered away.
1669 */
1670 brw_mark_surface_used(&prog_data->base,
1671 prog_data->base.binding_table.ubo_start +
1672 shader_prog->NumUniformBlocks - 1);
1673 }
1674
1675 if (const_offset_ir) {
1676 if (brw->gen >= 8) {
1677 /* Store the offset in a GRF so we can send-from-GRF. */
1678 offset = src_reg(this, glsl_type::int_type);
1679 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1680 } else {
1681 /* Immediates are fine on older generations since they'll be moved
1682 * to a (potentially fake) MRF at the generator level.
1683 */
1684 offset = src_reg(const_offset / 16);
1685 }
1686 } else {
1687 offset = src_reg(this, glsl_type::uint_type);
1688 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1689 }
1690
1691 if (brw->gen >= 7) {
1692 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1693 grf_offset.type = offset.type;
1694
1695 emit(MOV(grf_offset, offset));
1696
1697 emit(new(mem_ctx) vec4_instruction(this,
1698 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1699 dst_reg(packed_consts),
1700 surf_index,
1701 src_reg(grf_offset)));
1702 } else {
1703 vec4_instruction *pull =
1704 emit(new(mem_ctx) vec4_instruction(this,
1705 VS_OPCODE_PULL_CONSTANT_LOAD,
1706 dst_reg(packed_consts),
1707 surf_index,
1708 offset));
1709 pull->base_mrf = 14;
1710 pull->mlen = 1;
1711 }
1712
1713 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1714 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1715 const_offset % 16 / 4,
1716 const_offset % 16 / 4,
1717 const_offset % 16 / 4);
1718
1719 /* UBO bools are any nonzero int. We need to convert them to use the
1720 * value of true stored in ctx->Const.UniformBooleanTrue.
1721 */
1722 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1723 emit(CMP(result_dst, packed_consts, src_reg(0u),
1724 BRW_CONDITIONAL_NZ));
1725 if (ctx->Const.UniformBooleanTrue == 1) {
1726 emit(AND(result_dst, result, src_reg(1)));
1727 }
1728 } else {
1729 emit(MOV(result_dst, packed_consts));
1730 }
1731 break;
1732 }
1733
1734 case ir_binop_vector_extract:
1735 unreachable("should have been lowered by vec_index_to_cond_assign");
1736
1737 case ir_triop_fma:
1738 op[0] = fix_3src_operand(op[0]);
1739 op[1] = fix_3src_operand(op[1]);
1740 op[2] = fix_3src_operand(op[2]);
1741 /* Note that the instruction's argument order is reversed from GLSL
1742 * and the IR.
1743 */
1744 emit(MAD(result_dst, op[2], op[1], op[0]));
1745 break;
1746
1747 case ir_triop_lrp:
1748 emit_lrp(result_dst, op[0], op[1], op[2]);
1749 break;
1750
1751 case ir_triop_csel:
1752 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1753 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1754 inst->predicate = BRW_PREDICATE_NORMAL;
1755 break;
1756
1757 case ir_triop_bfi:
1758 op[0] = fix_3src_operand(op[0]);
1759 op[1] = fix_3src_operand(op[1]);
1760 op[2] = fix_3src_operand(op[2]);
1761 emit(BFI2(result_dst, op[0], op[1], op[2]));
1762 break;
1763
1764 case ir_triop_bitfield_extract:
1765 op[0] = fix_3src_operand(op[0]);
1766 op[1] = fix_3src_operand(op[1]);
1767 op[2] = fix_3src_operand(op[2]);
1768 /* Note that the instruction's argument order is reversed from GLSL
1769 * and the IR.
1770 */
1771 emit(BFE(result_dst, op[2], op[1], op[0]));
1772 break;
1773
1774 case ir_triop_vector_insert:
1775 unreachable("should have been lowered by lower_vector_insert");
1776
1777 case ir_quadop_bitfield_insert:
1778 unreachable("not reached: should be handled by "
1779 "bitfield_insert_to_bfm_bfi\n");
1780
1781 case ir_quadop_vector:
1782 unreachable("not reached: should be handled by lower_quadop_vector");
1783
1784 case ir_unop_pack_half_2x16:
1785 emit_pack_half_2x16(result_dst, op[0]);
1786 break;
1787 case ir_unop_unpack_half_2x16:
1788 emit_unpack_half_2x16(result_dst, op[0]);
1789 break;
1790 case ir_unop_pack_snorm_2x16:
1791 case ir_unop_pack_snorm_4x8:
1792 case ir_unop_pack_unorm_2x16:
1793 case ir_unop_pack_unorm_4x8:
1794 case ir_unop_unpack_snorm_2x16:
1795 case ir_unop_unpack_snorm_4x8:
1796 case ir_unop_unpack_unorm_2x16:
1797 case ir_unop_unpack_unorm_4x8:
1798 unreachable("not reached: should be handled by lower_packing_builtins");
1799 case ir_unop_unpack_half_2x16_split_x:
1800 case ir_unop_unpack_half_2x16_split_y:
1801 case ir_binop_pack_half_2x16_split:
1802 case ir_unop_interpolate_at_centroid:
1803 case ir_binop_interpolate_at_sample:
1804 case ir_binop_interpolate_at_offset:
1805 unreachable("not reached: should not occur in vertex shader");
1806 case ir_binop_ldexp:
1807 unreachable("not reached: should be handled by ldexp_to_arith()");
1808 }
1809 }
1810
1811
1812 void
1813 vec4_visitor::visit(ir_swizzle *ir)
1814 {
1815 src_reg src;
1816 int i = 0;
1817 int swizzle[4];
1818
1819    /* Note that this handles only swizzles in expressions, not those on the left
1820 * hand side of an assignment, which do write masking. See ir_assignment
1821 * for that.
1822 */
1823
1824 ir->val->accept(this);
1825 src = this->result;
1826 assert(src.file != BAD_FILE);
1827
1828 for (i = 0; i < ir->type->vector_elements; i++) {
1829 switch (i) {
1830 case 0:
1831 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1832 break;
1833 case 1:
1834 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1835 break;
1836 case 2:
1837 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1838 break;
1839 case 3:
1840 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1841 break;
1842 }
1843 }
1844 for (; i < 4; i++) {
1845 /* Replicate the last channel out. */
1846 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1847 }
1848
1849 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1850
1851 this->result = src;
1852 }
1853
1854 void
1855 vec4_visitor::visit(ir_dereference_variable *ir)
1856 {
1857 const struct glsl_type *type = ir->type;
1858 dst_reg *reg = variable_storage(ir->var);
1859
1860 if (!reg) {
1861 fail("Failed to find variable storage for %s\n", ir->var->name);
1862 this->result = src_reg(brw_null_reg());
1863 return;
1864 }
1865
1866 this->result = src_reg(*reg);
1867
1868 /* System values get their swizzle from the dst_reg writemask */
1869 if (ir->var->data.mode == ir_var_system_value)
1870 return;
1871
1872 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1873 this->result.swizzle = swizzle_for_size(type->vector_elements);
1874 }
1875
1876
1877 int
1878 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1879 {
1880 /* Under normal circumstances array elements are stored consecutively, so
1881 * the stride is equal to the size of the array element.
1882 */
1883 return type_size(ir->type);
1884 }
1885
1886
1887 void
1888 vec4_visitor::visit(ir_dereference_array *ir)
1889 {
1890 ir_constant *constant_index;
1891 src_reg src;
1892 int array_stride = compute_array_stride(ir);
1893
1894 constant_index = ir->array_index->constant_expression_value();
1895
1896 ir->array->accept(this);
1897 src = this->result;
1898
1899 if (constant_index) {
1900 src.reg_offset += constant_index->value.i[0] * array_stride;
1901 } else {
1902 /* Variable index array dereference. It eats the "vec4" of the
1903 * base of the array and an index that offsets the Mesa register
1904 * index.
1905 */
1906 ir->array_index->accept(this);
1907
1908 src_reg index_reg;
1909
1910 if (array_stride == 1) {
1911 index_reg = this->result;
1912 } else {
1913 index_reg = src_reg(this, glsl_type::int_type);
1914
1915 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1916 }
1917
1918 if (src.reladdr) {
1919 src_reg temp = src_reg(this, glsl_type::int_type);
1920
1921 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1922
1923 index_reg = temp;
1924 }
1925
1926 src.reladdr = ralloc(mem_ctx, src_reg);
1927 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1928 }
1929
1930 /* If the type is smaller than a vec4, replicate the last channel out. */
1931 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1932 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1933 else
1934 src.swizzle = BRW_SWIZZLE_NOOP;
1935 src.type = brw_type_for_base_type(ir->type);
1936
1937 this->result = src;
1938 }
1939
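/**
 * Handle record (struct) dereferences by summing the sizes of the fields
 * that precede the named field and adding the total to reg_offset.
 */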
1940 void
1941 vec4_visitor::visit(ir_dereference_record *ir)
1942 {
1943 unsigned int i;
1944 const glsl_type *struct_type = ir->record->type;
1945 int offset = 0;
1946
1947 ir->record->accept(this);
1948
1949 for (i = 0; i < struct_type->length; i++) {
1950 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1951 break;
1952 offset += type_size(struct_type->fields.structure[i].type);
1953 }
1954
1955 /* If the type is smaller than a vec4, replicate the last channel out. */
1956 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1957 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1958 else
1959 this->result.swizzle = BRW_SWIZZLE_NOOP;
1960 this->result.type = brw_type_for_base_type(ir->type);
1961
1962 this->result.reg_offset += offset;
1963 }
1964
1965 /**
1966 * We want to be careful in assignment setup to hit the actual storage
1967 * instead of potentially using a temporary like we might with the
1968 * ir_dereference handler.
1969 */
1970 static dst_reg
1971 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1972 {
1973 /* The LHS must be a dereference. If the LHS is a variable indexed array
1974    * access of a vector, it must be separated into a series of conditional moves
1975 * before reaching this point (see ir_vec_index_to_cond_assign).
1976 */
1977 assert(ir->as_dereference());
1978 ir_dereference_array *deref_array = ir->as_dereference_array();
1979 if (deref_array) {
1980 assert(!deref_array->array->type->is_vector());
1981 }
1982
1983 /* Use the rvalue deref handler for the most part. We'll ignore
1984 * swizzles in it and write swizzles using writemask, though.
1985 */
1986 ir->accept(v);
1987 return dst_reg(v->result);
1988 }
1989
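/**
 * Copy a whole aggregate from *src to *dst, one scalar/vector at a time.
 *
 * Structs, arrays and matrices are decomposed recursively; reg_offset is
 * advanced on both registers as each piece is moved, and every MOV gets
 * the given predicate so conditional assignments work.
 */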
1990 void
1991 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1992 const struct glsl_type *type,
1993 enum brw_predicate predicate)
1994 {
1995 if (type->base_type == GLSL_TYPE_STRUCT) {
1996 for (unsigned int i = 0; i < type->length; i++) {
1997 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1998 }
1999 return;
2000 }
2001
2002 if (type->is_array()) {
2003 for (unsigned int i = 0; i < type->length; i++) {
2004 emit_block_move(dst, src, type->fields.array, predicate);
2005 }
2006 return;
2007 }
2008
2009 if (type->is_matrix()) {
2010 const struct glsl_type *vec_type;
2011
2012 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2013 type->vector_elements, 1);
2014
2015 for (int i = 0; i < type->matrix_columns; i++) {
2016 emit_block_move(dst, src, vec_type, predicate);
2017 }
2018 return;
2019 }
2020
2021 assert(type->is_scalar() || type->is_vector());
2022
2023 dst->type = brw_type_for_base_type(type);
2024 src->type = dst->type;
2025
2026 dst->writemask = (1 << type->vector_elements) - 1;
2027
2028 src->swizzle = swizzle_for_size(type->vector_elements);
2029
2030 vec4_instruction *inst = emit(MOV(*dst, *src));
2031 inst->predicate = predicate;
2032
2033 dst->reg_offset++;
2034 src->reg_offset++;
2035 }
2036
2037
2038 /* If the RHS processing resulted in an instruction generating a
2039 * temporary value, and it would be easy to rewrite the instruction to
2040 * generate its result right into the LHS instead, do so. This ends
2041 * up reliably removing instructions where it can be tricky to do so
2042 * later without real UD chain information.
2043 */
2044 bool
2045 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2046 dst_reg dst,
2047 src_reg src,
2048 vec4_instruction *pre_rhs_inst,
2049 vec4_instruction *last_rhs_inst)
2050 {
2051 /* This could be supported, but it would take more smarts. */
2052 if (ir->condition)
2053 return false;
2054
2055 if (pre_rhs_inst == last_rhs_inst)
2056 return false; /* No instructions generated to work with. */
2057
2058 /* Make sure the last instruction generated our source reg. */
2059 if (src.file != GRF ||
2060 src.file != last_rhs_inst->dst.file ||
2061 src.reg != last_rhs_inst->dst.reg ||
2062 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2063 src.reladdr ||
2064 src.abs ||
2065 src.negate ||
2066 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2067 return false;
2068
2069    /* Check that the last instruction fully initialized the channels
2070 * we want to use, in the order we want to use them. We could
2071 * potentially reswizzle the operands of many instructions so that
2072 * we could handle out of order channels, but don't yet.
2073 */
2074
2075 for (unsigned i = 0; i < 4; i++) {
2076 if (dst.writemask & (1 << i)) {
2077 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2078 return false;
2079
2080 if (BRW_GET_SWZ(src.swizzle, i) != i)
2081 return false;
2082 }
2083 }
2084
2085 /* Success! Rewrite the instruction. */
2086 last_rhs_inst->dst.file = dst.file;
2087 last_rhs_inst->dst.reg = dst.reg;
2088 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2089 last_rhs_inst->dst.reladdr = dst.reladdr;
2090 last_rhs_inst->dst.writemask &= dst.writemask;
2091
2092 return true;
2093 }
2094
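/**
 * Handle assignments.  Aggregate LHS types go through emit_block_move();
 * scalar/vector assignments swizzle the RHS into the channels named by the
 * write mask and, when possible, fold the move into the instruction that
 * produced the RHS via try_rewrite_rhs_to_dst().
 */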
2095 void
2096 vec4_visitor::visit(ir_assignment *ir)
2097 {
2098 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2099 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2100
2101 if (!ir->lhs->type->is_scalar() &&
2102 !ir->lhs->type->is_vector()) {
2103 ir->rhs->accept(this);
2104 src_reg src = this->result;
2105
2106 if (ir->condition) {
2107 emit_bool_to_cond_code(ir->condition, &predicate);
2108 }
2109
2110 /* emit_block_move doesn't account for swizzles in the source register.
2111 * This should be ok, since the source register is a structure or an
2112 * array, and those can't be swizzled. But double-check to be sure.
2113 */
2114 assert(src.swizzle ==
2115 (ir->rhs->type->is_matrix()
2116 ? swizzle_for_size(ir->rhs->type->vector_elements)
2117 : BRW_SWIZZLE_NOOP));
2118
2119 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2120 return;
2121 }
2122
2123 /* Now we're down to just a scalar/vector with writemasks. */
2124 int i;
2125
2126 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2127 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2128
2129 ir->rhs->accept(this);
2130
2131 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2132
2133 src_reg src = this->result;
2134
2135 int swizzles[4];
2136 int first_enabled_chan = 0;
2137 int src_chan = 0;
2138
2139 assert(ir->lhs->type->is_vector() ||
2140 ir->lhs->type->is_scalar());
2141 dst.writemask = ir->write_mask;
2142
2143 for (int i = 0; i < 4; i++) {
2144 if (dst.writemask & (1 << i)) {
2145 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2146 break;
2147 }
2148 }
2149
2150 /* Swizzle a small RHS vector into the channels being written.
2151 *
2152 * glsl ir treats write_mask as dictating how many channels are
2153 * present on the RHS while in our instructions we need to make
2154 * those channels appear in the slots of the vec4 they're written to.
2155 */
2156 for (int i = 0; i < 4; i++) {
2157 if (dst.writemask & (1 << i))
2158 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2159 else
2160 swizzles[i] = first_enabled_chan;
2161 }
2162 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2163 swizzles[2], swizzles[3]);
2164
2165 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2166 return;
2167 }
2168
2169 if (ir->condition) {
2170 emit_bool_to_cond_code(ir->condition, &predicate);
2171 }
2172
2173 for (i = 0; i < type_size(ir->lhs->type); i++) {
2174 vec4_instruction *inst = emit(MOV(dst, src));
2175 inst->predicate = predicate;
2176
2177 dst.reg_offset++;
2178 src.reg_offset++;
2179 }
2180 }
2181
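/**
 * Write an ir_constant into consecutive registers starting at *dst,
 * recursing through aggregate types and coalescing channels that hold the
 * same value into a single writemasked MOV.
 */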
2182 void
2183 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2184 {
2185 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2186 foreach_in_list(ir_constant, field_value, &ir->components) {
2187 emit_constant_values(dst, field_value);
2188 }
2189 return;
2190 }
2191
2192 if (ir->type->is_array()) {
2193 for (unsigned int i = 0; i < ir->type->length; i++) {
2194 emit_constant_values(dst, ir->array_elements[i]);
2195 }
2196 return;
2197 }
2198
2199 if (ir->type->is_matrix()) {
2200 for (int i = 0; i < ir->type->matrix_columns; i++) {
2201 float *vec = &ir->value.f[i * ir->type->vector_elements];
2202
2203 for (int j = 0; j < ir->type->vector_elements; j++) {
2204 dst->writemask = 1 << j;
2205 dst->type = BRW_REGISTER_TYPE_F;
2206
2207 emit(MOV(*dst, src_reg(vec[j])));
2208 }
2209 dst->reg_offset++;
2210 }
2211 return;
2212 }
2213
2214 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2215
2216 for (int i = 0; i < ir->type->vector_elements; i++) {
2217 if (!(remaining_writemask & (1 << i)))
2218 continue;
2219
2220 dst->writemask = 1 << i;
2221 dst->type = brw_type_for_base_type(ir->type);
2222
2223 /* Find other components that match the one we're about to
2224 * write. Emits fewer instructions for things like vec4(0.5,
2225 * 1.5, 1.5, 1.5).
2226 */
2227 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2228 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2229 if (ir->value.b[i] == ir->value.b[j])
2230 dst->writemask |= (1 << j);
2231 } else {
2232 /* u, i, and f storage all line up, so no need for a
2233 * switch case for comparing each type.
2234 */
2235 if (ir->value.u[i] == ir->value.u[j])
2236 dst->writemask |= (1 << j);
2237 }
2238 }
2239
2240 switch (ir->type->base_type) {
2241 case GLSL_TYPE_FLOAT:
2242 emit(MOV(*dst, src_reg(ir->value.f[i])));
2243 break;
2244 case GLSL_TYPE_INT:
2245 emit(MOV(*dst, src_reg(ir->value.i[i])));
2246 break;
2247 case GLSL_TYPE_UINT:
2248 emit(MOV(*dst, src_reg(ir->value.u[i])));
2249 break;
2250 case GLSL_TYPE_BOOL:
2251 emit(MOV(*dst,
2252 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2253 : 0)));
2254 break;
2255 default:
2256 unreachable("Non-float/uint/int/bool constant");
2257 }
2258
2259 remaining_writemask &= ~dst->writemask;
2260 }
2261 dst->reg_offset++;
2262 }
2263
2264 void
2265 vec4_visitor::visit(ir_constant *ir)
2266 {
2267 dst_reg dst = dst_reg(this, ir->type);
2268 this->result = src_reg(dst);
2269
2270 emit_constant_values(&dst, ir);
2271 }
2272
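/**
 * Lower the atomic counter intrinsics (read, increment, predecrement) to
 * untyped surface read / untyped atomic messages aimed at the counter
 * buffer's binding table entry.
 */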
2273 void
2274 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2275 {
2276 ir_dereference *deref = static_cast<ir_dereference *>(
2277 ir->actual_parameters.get_head());
2278 ir_variable *location = deref->variable_referenced();
2279 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2280 location->data.binding);
2281
2282 /* Calculate the surface offset */
2283 src_reg offset(this, glsl_type::uint_type);
2284 ir_dereference_array *deref_array = deref->as_dereference_array();
2285 if (deref_array) {
2286 deref_array->array_index->accept(this);
2287
2288 src_reg tmp(this, glsl_type::uint_type);
2289 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2290 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2291 } else {
2292 offset = location->data.atomic.offset;
2293 }
2294
2295 /* Emit the appropriate machine instruction */
2296 const char *callee = ir->callee->function_name();
2297 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2298
2299 if (!strcmp("__intrinsic_atomic_read", callee)) {
2300 emit_untyped_surface_read(surf_index, dst, offset);
2301
2302 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2303 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2304 src_reg(), src_reg());
2305
2306 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2307 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2308 src_reg(), src_reg());
2309 }
2310 }
2311
2312 void
2313 vec4_visitor::visit(ir_call *ir)
2314 {
2315 const char *callee = ir->callee->function_name();
2316
2317 if (!strcmp("__intrinsic_atomic_read", callee) ||
2318 !strcmp("__intrinsic_atomic_increment", callee) ||
2319 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2320 visit_atomic_counter_intrinsic(ir);
2321 } else {
2322 unreachable("Unsupported intrinsic.");
2323 }
2324 }
2325
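/**
 * Fetch the MCS (compressed multisample layout) data for the given
 * coordinate with a TXF_MCS message; the result is fed into the payload
 * of the following TXF_CMS lookup.
 */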
2326 src_reg
2327 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2328 {
2329 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2330 inst->base_mrf = 2;
2331 inst->mlen = 1;
2332 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2333 inst->dst.writemask = WRITEMASK_XYZW;
2334
2335 inst->src[1] = sampler;
2336
2337    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2338 int param_base = inst->base_mrf;
2339 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2340 int zero_mask = 0xf & ~coord_mask;
2341
2342 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2343 coordinate));
2344
2345 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2346 src_reg(0)));
2347
2348 emit(inst);
2349 return src_reg(inst->dst);
2350 }
2351
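/**
 * Returns true if the sampler index may not fit in the 4-bit field of the
 * sampler message descriptor (it is dynamically indexed or >= 16).  Only
 * relevant on Haswell and gen8+; the caller then sets up a message header.
 */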
2352 static bool
2353 is_high_sampler(struct brw_context *brw, src_reg sampler)
2354 {
2355 if (brw->gen < 8 && !brw->is_haswell)
2356 return false;
2357
2358 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2359 }
2360
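/**
 * Translate an ir_texture operation: evaluate its operands, load the MRF
 * payload for the matching sampler message, emit the SEND, and then apply
 * any required result swizzling and workarounds.
 */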
2361 void
2362 vec4_visitor::visit(ir_texture *ir)
2363 {
2364 uint32_t sampler =
2365 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2366
2367 ir_rvalue *nonconst_sampler_index =
2368 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2369
2370 /* Handle non-constant sampler array indexing */
2371 src_reg sampler_reg;
2372 if (nonconst_sampler_index) {
2373 /* The highest sampler which may be used by this operation is
2374 * the last element of the array. Mark it here, because the generator
2375 * doesn't have enough information to determine the bound.
2376 */
2377 uint32_t array_size = ir->sampler->as_dereference_array()
2378 ->array->type->array_size();
2379
2380 uint32_t max_used = sampler + array_size - 1;
2381 if (ir->op == ir_tg4 && brw->gen < 8) {
2382 max_used += prog_data->base.binding_table.gather_texture_start;
2383 } else {
2384 max_used += prog_data->base.binding_table.texture_start;
2385 }
2386
2387 brw_mark_surface_used(&prog_data->base, max_used);
2388
2389 /* Emit code to evaluate the actual indexing expression */
2390 nonconst_sampler_index->accept(this);
2391 dst_reg temp(this, glsl_type::uint_type);
2392 emit(ADD(temp, this->result, src_reg(sampler)))
2393 ->force_writemask_all = true;
2394 sampler_reg = src_reg(temp);
2395 } else {
2396 /* Single sampler, or constant array index; the indexing expression
2397 * is just an immediate.
2398 */
2399 sampler_reg = src_reg(sampler);
2400 }
2401
2402 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2403 * emitting anything other than setting up the constant result.
2404 */
2405 if (ir->op == ir_tg4) {
2406 ir_constant *chan = ir->lod_info.component->as_constant();
2407 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2408 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2409 dst_reg result(this, ir->type);
2410 this->result = src_reg(result);
2411 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2412 return;
2413 }
2414 }
2415
2416 /* Should be lowered by do_lower_texture_projection */
2417 assert(!ir->projector);
2418
2419 /* Should be lowered */
2420 assert(!ir->offset || !ir->offset->type->is_array());
2421
2422 /* Generate code to compute all the subexpression trees. This has to be
2423 * done before loading any values into MRFs for the sampler message since
2424 * generating these values may involve SEND messages that need the MRFs.
2425 */
2426 src_reg coordinate;
2427 if (ir->coordinate) {
2428 ir->coordinate->accept(this);
2429 coordinate = this->result;
2430 }
2431
2432 src_reg shadow_comparitor;
2433 if (ir->shadow_comparitor) {
2434 ir->shadow_comparitor->accept(this);
2435 shadow_comparitor = this->result;
2436 }
2437
2438 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2439 src_reg offset_value;
2440 if (has_nonconstant_offset) {
2441 ir->offset->accept(this);
2442 offset_value = src_reg(this->result);
2443 }
2444
2445 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2446 src_reg lod, dPdx, dPdy, sample_index, mcs;
2447 switch (ir->op) {
2448 case ir_tex:
2449 lod = src_reg(0.0f);
2450 lod_type = glsl_type::float_type;
2451 break;
2452 case ir_txf:
2453 case ir_txl:
2454 case ir_txs:
2455 ir->lod_info.lod->accept(this);
2456 lod = this->result;
2457 lod_type = ir->lod_info.lod->type;
2458 break;
2459 case ir_query_levels:
2460 lod = src_reg(0);
2461 lod_type = glsl_type::int_type;
2462 break;
2463 case ir_txf_ms:
2464 ir->lod_info.sample_index->accept(this);
2465 sample_index = this->result;
2466 sample_index_type = ir->lod_info.sample_index->type;
2467
2468 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2469 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2470 else
2471 mcs = src_reg(0u);
2472 break;
2473 case ir_txd:
2474 ir->lod_info.grad.dPdx->accept(this);
2475 dPdx = this->result;
2476
2477 ir->lod_info.grad.dPdy->accept(this);
2478 dPdy = this->result;
2479
2480 lod_type = ir->lod_info.grad.dPdx->type;
2481 break;
2482 case ir_txb:
2483 case ir_lod:
2484 case ir_tg4:
2485 break;
2486 }
2487
2488 enum opcode opcode;
2489 switch (ir->op) {
2490 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2491 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2492 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2493 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2494 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2495 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2496 case ir_tg4: opcode = has_nonconstant_offset
2497 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2498 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2499 case ir_txb:
2500 unreachable("TXB is not valid for vertex shaders.");
2501 case ir_lod:
2502 unreachable("LOD is not valid for vertex shaders.");
2503 default:
2504 unreachable("Unrecognized tex op");
2505 }
2506
2507 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2508
2509 if (ir->offset != NULL && ir->op != ir_txf)
2510 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2511
2512 /* Stuff the channel select bits in the top of the texture offset */
2513 if (ir->op == ir_tg4)
2514 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2515
2516 /* The message header is necessary for:
2517 * - Gen4 (always)
2518 * - Texel offsets
2519 * - Gather channel selection
2520 * - Sampler indices too large to fit in a 4-bit value.
2521 */
2522 inst->header_present =
2523 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2524 is_high_sampler(brw, sampler_reg);
2525 inst->base_mrf = 2;
2526 inst->mlen = inst->header_present + 1; /* always at least one */
2527 inst->dst = dst_reg(this, ir->type);
2528 inst->dst.writemask = WRITEMASK_XYZW;
2529 inst->shadow_compare = ir->shadow_comparitor != NULL;
2530
2531 inst->src[1] = sampler_reg;
2532
2533 /* MRF for the first parameter */
2534 int param_base = inst->base_mrf + inst->header_present;
2535
2536 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2537 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2538 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2539 } else {
2540 /* Load the coordinate */
2541 /* FINISHME: gl_clamp_mask and saturate */
2542 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2543 int zero_mask = 0xf & ~coord_mask;
2544
2545 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2546 coordinate));
2547
2548 if (zero_mask != 0) {
2549 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2550 src_reg(0)));
2551 }
2552 /* Load the shadow comparitor */
2553 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2554 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2555 WRITEMASK_X),
2556 shadow_comparitor));
2557 inst->mlen++;
2558 }
2559
2560 /* Load the LOD info */
2561 if (ir->op == ir_tex || ir->op == ir_txl) {
2562 int mrf, writemask;
2563 if (brw->gen >= 5) {
2564 mrf = param_base + 1;
2565 if (ir->shadow_comparitor) {
2566 writemask = WRITEMASK_Y;
2567 /* mlen already incremented */
2568 } else {
2569 writemask = WRITEMASK_X;
2570 inst->mlen++;
2571 }
2572 } else /* brw->gen == 4 */ {
2573 mrf = param_base;
2574 writemask = WRITEMASK_W;
2575 }
2576 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2577 } else if (ir->op == ir_txf) {
2578 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2579 } else if (ir->op == ir_txf_ms) {
2580 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2581 sample_index));
2582 if (brw->gen >= 7)
2583 /* MCS data is in the first channel of `mcs`, but we need to get it into
2584 * the .y channel of the second vec4 of params, so replicate .x across
2585 * the whole vec4 and then mask off everything except .y
2586 */
2587 mcs.swizzle = BRW_SWIZZLE_XXXX;
2588 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2589 mcs));
2590 inst->mlen++;
2591 } else if (ir->op == ir_txd) {
2592 const glsl_type *type = lod_type;
2593
2594 if (brw->gen >= 5) {
2595 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2596 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2597 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2598 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2599 inst->mlen++;
2600
2601 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2602 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2603 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2604 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2605 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2606 inst->mlen++;
2607
2608 if (ir->shadow_comparitor) {
2609 emit(MOV(dst_reg(MRF, param_base + 2,
2610 ir->shadow_comparitor->type, WRITEMASK_Z),
2611 shadow_comparitor));
2612 }
2613 }
2614 } else /* brw->gen == 4 */ {
2615 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2616 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2617 inst->mlen += 2;
2618 }
2619 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2620 if (ir->shadow_comparitor) {
2621 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2622 shadow_comparitor));
2623 }
2624
2625 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2626 offset_value));
2627 inst->mlen++;
2628 }
2629 }
2630
2631 emit(inst);
2632
2633 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2634 * spec requires layers.
2635 */
2636 if (ir->op == ir_txs) {
2637 glsl_type const *type = ir->sampler->type;
2638 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2639 type->sampler_array) {
2640 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2641 writemask(inst->dst, WRITEMASK_Z),
2642 src_reg(inst->dst), src_reg(6));
2643 }
2644 }
2645
2646 if (brw->gen == 6 && ir->op == ir_tg4) {
2647 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2648 }
2649
2650 swizzle_result(ir, src_reg(inst->dst), sampler);
2651 }
2652
2653 /**
2654 * Apply workarounds for Gen6 gather with UINT/SINT
2655 */
2656 void
2657 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2658 {
2659 if (!wa)
2660 return;
2661
2662 int width = (wa & WA_8BIT) ? 8 : 16;
2663 dst_reg dst_f = dst;
2664 dst_f.type = BRW_REGISTER_TYPE_F;
2665
2666 /* Convert from UNORM to UINT */
2667 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2668 emit(MOV(dst, src_reg(dst_f)));
2669
2670 if (wa & WA_SIGN) {
2671 /* Reinterpret the UINT value as a signed INT value by
2672 * shifting the sign bit into place, then shifting back
2673 * preserving sign.
2674 */
2675 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2676 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2677 }
2678 }
2679
2680 /**
2681 * Set up the gather channel based on the swizzle, for gather4.
2682 */
2683 uint32_t
2684 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2685 {
2686 ir_constant *chan = ir->lod_info.component->as_constant();
2687 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2688 switch (swiz) {
2689 case SWIZZLE_X: return 0;
2690 case SWIZZLE_Y:
2691 /* gather4 sampler is broken for green channel on RG32F --
2692 * we must ask for blue instead.
2693 */
2694 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2695 return 2;
2696 return 1;
2697 case SWIZZLE_Z: return 2;
2698 case SWIZZLE_W: return 3;
2699 default:
2700 unreachable("Not reached"); /* zero, one swizzles handled already */
2701 }
2702 }
2703
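/**
 * Apply the GL texture swizzle state (including ZERO/ONE channels) to the
 * raw sampler result and store it in this->result, with special handling
 * for query_levels and for cases that need no swizzling at all.
 */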
2704 void
2705 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2706 {
2707 int s = key->tex.swizzles[sampler];
2708
2709 this->result = src_reg(this, ir->type);
2710 dst_reg swizzled_result(this->result);
2711
2712 if (ir->op == ir_query_levels) {
2713 /* # levels is in .w */
2714 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2715 emit(MOV(swizzled_result, orig_val));
2716 return;
2717 }
2718
2719 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2720 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2721 emit(MOV(swizzled_result, orig_val));
2722 return;
2723 }
2724
2725
2726 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2727 int swizzle[4] = {0};
2728
2729 for (int i = 0; i < 4; i++) {
2730 switch (GET_SWZ(s, i)) {
2731 case SWIZZLE_ZERO:
2732 zero_mask |= (1 << i);
2733 break;
2734 case SWIZZLE_ONE:
2735 one_mask |= (1 << i);
2736 break;
2737 default:
2738 copy_mask |= (1 << i);
2739 swizzle[i] = GET_SWZ(s, i);
2740 break;
2741 }
2742 }
2743
2744 if (copy_mask) {
2745 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2746 swizzled_result.writemask = copy_mask;
2747 emit(MOV(swizzled_result, orig_val));
2748 }
2749
2750 if (zero_mask) {
2751 swizzled_result.writemask = zero_mask;
2752 emit(MOV(swizzled_result, src_reg(0.0f)));
2753 }
2754
2755 if (one_mask) {
2756 swizzled_result.writemask = one_mask;
2757 emit(MOV(swizzled_result, src_reg(1.0f)));
2758 }
2759 }
2760
2761 void
2762 vec4_visitor::visit(ir_return *)
2763 {
2764 unreachable("not reached");
2765 }
2766
2767 void
2768 vec4_visitor::visit(ir_discard *)
2769 {
2770 unreachable("not reached");
2771 }
2772
2773 void
2774 vec4_visitor::visit(ir_if *ir)
2775 {
2776 /* Don't point the annotation at the if statement, because then it plus
2777 * the then and else blocks get printed.
2778 */
2779 this->base_ir = ir->condition;
2780
2781 if (brw->gen == 6) {
2782 emit_if_gen6(ir);
2783 } else {
2784 enum brw_predicate predicate;
2785 emit_bool_to_cond_code(ir->condition, &predicate);
2786 emit(IF(predicate));
2787 }
2788
2789 visit_instructions(&ir->then_instructions);
2790
2791 if (!ir->else_instructions.is_empty()) {
2792 this->base_ir = ir->condition;
2793 emit(BRW_OPCODE_ELSE);
2794
2795 visit_instructions(&ir->else_instructions);
2796 }
2797
2798 this->base_ir = ir->condition;
2799 emit(BRW_OPCODE_ENDIF);
2800 }
2801
2802 void
2803 vec4_visitor::visit(ir_emit_vertex *)
2804 {
2805 unreachable("not reached");
2806 }
2807
2808 void
2809 vec4_visitor::visit(ir_end_primitive *)
2810 {
2811 unreachable("not reached");
2812 }
2813
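/**
 * Build the MRF payload (the offset plus up to two operands, one channel
 * each) and emit an untyped atomic message to the given surface.
 */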
2814 void
2815 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2816 dst_reg dst, src_reg offset,
2817 src_reg src0, src_reg src1)
2818 {
2819 unsigned mlen = 0;
2820
2821 /* Set the atomic operation offset. */
2822 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2823 mlen++;
2824
2825 /* Set the atomic operation arguments. */
2826 if (src0.file != BAD_FILE) {
2827 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2828 mlen++;
2829 }
2830
2831 if (src1.file != BAD_FILE) {
2832 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2833 mlen++;
2834 }
2835
2836 /* Emit the instruction. Note that this maps to the normal SIMD8
2837 * untyped atomic message on Ivy Bridge, but that's OK because
2838 * unused channels will be masked out.
2839 */
2840 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2841 src_reg(atomic_op), src_reg(surf_index));
2842 inst->base_mrf = 0;
2843 inst->mlen = mlen;
2844 }
2845
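/**
 * Emit an untyped surface read from the given surface at the given offset.
 */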
2846 void
2847 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2848 src_reg offset)
2849 {
2850 /* Set the surface read offset. */
2851 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2852
2853 /* Emit the instruction. Note that this maps to the normal SIMD8
2854 * untyped surface read message, but that's OK because unused
2855 * channels will be masked out.
2856 */
2857 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2858 dst, src_reg(surf_index));
2859 inst->base_mrf = 0;
2860 inst->mlen = 1;
2861 }
2862
2863 void
2864 vec4_visitor::emit_ndc_computation()
2865 {
2866 /* Get the position */
2867 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2868
2869 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2870 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2871 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2872
2873 current_annotation = "NDC";
2874 dst_reg ndc_w = ndc;
2875 ndc_w.writemask = WRITEMASK_W;
2876 src_reg pos_w = pos;
2877 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2878 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2879
2880 dst_reg ndc_xyz = ndc;
2881 ndc_xyz.writemask = WRITEMASK_XYZ;
2882
2883 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2884 }
2885
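/**
 * Write the PSIZ/flags VUE header dword: pre-gen6 this packs point size,
 * user clip flags and the negative-rhw workaround bit into one register;
 * on gen6+ it writes point size, layer and viewport index into their own
 * channels.
 */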
2886 void
2887 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2888 {
2889 if (brw->gen < 6 &&
2890 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2891 key->userclip_active || brw->has_negative_rhw_bug)) {
2892 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2893 dst_reg header1_w = header1;
2894 header1_w.writemask = WRITEMASK_W;
2895
2896 emit(MOV(header1, 0u));
2897
2898 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2899 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2900
2901 current_annotation = "Point size";
2902 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2903 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2904 }
2905
2906 if (key->userclip_active) {
2907 current_annotation = "Clipping flags";
2908 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2909 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2910
2911 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2912 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2913 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2914
2915 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2916 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2917 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2918 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2919 }
2920
2921 /* i965 clipping workaround:
2922 * 1) Test for -ve rhw
2923 * 2) If set,
2924 * set ndc = (0,0,0,0)
2925 * set ucp[6] = 1
2926 *
2927 * Later, clipping will detect ucp[6] and ensure the primitive is
2928 * clipped against all fixed planes.
2929 */
2930 if (brw->has_negative_rhw_bug) {
2931 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2932 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2933 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2934 vec4_instruction *inst;
2935 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2936 inst->predicate = BRW_PREDICATE_NORMAL;
2937 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2938 inst->predicate = BRW_PREDICATE_NORMAL;
2939 }
2940
2941 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2942 } else if (brw->gen < 6) {
2943 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2944 } else {
2945 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2946 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2947 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2948 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2949 }
2950 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2951 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2952 src_reg(output_reg[VARYING_SLOT_LAYER])));
2953 }
2954 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2955 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2956 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2957 }
2958 }
2959 }
2960
2961 void
2962 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2963 {
2964 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2965 *
2966 * "If a linked set of shaders forming the vertex stage contains no
2967 * static write to gl_ClipVertex or gl_ClipDistance, but the
2968 * application has requested clipping against user clip planes through
2969 * the API, then the coordinate written to gl_Position is used for
2970 * comparison against the user clip planes."
2971 *
2972 * This function is only called if the shader didn't write to
2973 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2974 * if the user wrote to it; otherwise we use gl_Position.
2975 */
2976 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2977 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2978 clip_vertex = VARYING_SLOT_POS;
2979 }
2980
2981 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2982 ++i) {
2983 reg.writemask = 1 << i;
2984 emit(DP4(reg,
2985 src_reg(output_reg[clip_vertex]),
2986 src_reg(this->userplane[i + offset])));
2987 }
2988 }
2989
2990 void
2991 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2992 {
2993 assert (varying < VARYING_SLOT_MAX);
2994 reg.type = output_reg[varying].type;
2995 current_annotation = output_reg_annotation[varying];
2996 /* Copy the register, saturating if necessary */
2997 vec4_instruction *inst = emit(MOV(reg,
2998 src_reg(output_reg[varying])));
2999 if ((varying == VARYING_SLOT_COL0 ||
3000 varying == VARYING_SLOT_COL1 ||
3001 varying == VARYING_SLOT_BFC0 ||
3002 varying == VARYING_SLOT_BFC1) &&
3003 key->clamp_vertex_color) {
3004 inst->saturate = true;
3005 }
3006 }
3007
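/**
 * Write one VUE slot's worth of data into the given MRF, handling the
 * special slots (PSIZ/flags, NDC, position, edge flag, padding) and
 * falling back to emit_generic_urb_slot() for ordinary varyings.
 */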
3008 void
3009 vec4_visitor::emit_urb_slot(int mrf, int varying)
3010 {
3011 struct brw_reg hw_reg = brw_message_reg(mrf);
3012 dst_reg reg = dst_reg(MRF, mrf);
3013 reg.type = BRW_REGISTER_TYPE_F;
3014
3015 switch (varying) {
3016 case VARYING_SLOT_PSIZ:
3017 /* PSIZ is always in slot 0, and is coupled with other flags. */
3018 current_annotation = "indices, point width, clip flags";
3019 emit_psiz_and_flags(hw_reg);
3020 break;
3021 case BRW_VARYING_SLOT_NDC:
3022 current_annotation = "NDC";
3023 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3024 break;
3025 case VARYING_SLOT_POS:
3026 current_annotation = "gl_Position";
3027 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3028 break;
3029 case VARYING_SLOT_EDGE:
3030 /* This is present when doing unfilled polygons. We're supposed to copy
3031 * the edge flag from the user-provided vertex array
3032 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3033 * of that attribute (starts as 1.0f). This is then used in clipping to
3034 * determine which edges should be drawn as wireframe.
3035 */
3036 current_annotation = "edge flag";
3037 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3038 glsl_type::float_type, WRITEMASK_XYZW))));
3039 break;
3040 case BRW_VARYING_SLOT_PAD:
3041 /* No need to write to this slot */
3042 break;
3043 default:
3044 emit_generic_urb_slot(reg, varying);
3045 break;
3046 }
3047 }
3048
3049 static int
3050 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3051 {
3052 if (brw->gen >= 6) {
3053 /* URB data written (does not include the message header reg) must
3054 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3055 * section 5.4.3.2.2: URB_INTERLEAVED.
3056 *
3057 * URB entries are allocated on a multiple of 1024 bits, so an
3058 * extra 128 bits written here to make the end align to 256 is
3059 * no problem.
3060 */
3061 if ((mlen % 2) != 1)
3062 mlen++;
3063 }
3064
3065 return mlen;
3066 }
3067
3068
3069 /**
3070 * Generates the VUE payload plus the necessary URB write instructions to
3071 * output it.
3072 *
3073 * The VUE layout is documented in Volume 2a.
3074 */
3075 void
3076 vec4_visitor::emit_vertex()
3077 {
3078 /* MRF 0 is reserved for the debugger, so start with message header
3079 * in MRF 1.
3080 */
3081 int base_mrf = 1;
3082 int mrf = base_mrf;
3083 /* In the process of generating our URB write message contents, we
3084 * may need to unspill a register or load from an array. Those
3085 * reads would use MRFs 14-15.
3086 */
3087 int max_usable_mrf = 13;
3088
3089 /* The following assertion verifies that max_usable_mrf causes an
3090 * even-numbered amount of URB write data, which will meet gen6's
3091 * requirements for length alignment.
3092 */
3093 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3094
3095 /* First mrf is the g0-based message header containing URB handles and
3096 * such.
3097 */
3098 emit_urb_write_header(mrf++);
3099
3100 if (brw->gen < 6) {
3101 emit_ndc_computation();
3102 }
3103
3104 /* Lower legacy ff and ClipVertex clipping to clip distances */
3105 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3106 current_annotation = "user clip distances";
3107
3108 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3109 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3110
3111 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3112 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3113 }
3114
3115 /* We may need to split this up into several URB writes, so do them in a
3116 * loop.
3117 */
3118 int slot = 0;
3119 bool complete = false;
3120 do {
3121 /* URB offset is in URB row increments, and each of our MRFs is half of
3122 * one of those, since we're doing interleaved writes.
3123 */
3124 int offset = slot / 2;
3125
3126 mrf = base_mrf + 1;
3127 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3128 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3129
3130 /* If this was max_usable_mrf, we can't fit anything more into this
3131 * URB WRITE.
3132 */
3133 if (mrf > max_usable_mrf) {
3134 slot++;
3135 break;
3136 }
3137 }
3138
3139 complete = slot >= prog_data->vue_map.num_slots;
3140 current_annotation = "URB write";
3141 vec4_instruction *inst = emit_urb_write_opcode(complete);
3142 inst->base_mrf = base_mrf;
3143 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3144 inst->offset += offset;
3145 } while(!complete);
3146 }
3147
3148
3149 src_reg
3150 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3151 src_reg *reladdr, int reg_offset)
3152 {
3153 /* Because we store the values to scratch interleaved like our
3154 * vertex data, we need to scale the vec4 index by 2.
3155 */
3156 int message_header_scale = 2;
3157
3158 /* Pre-gen6, the message header uses byte offsets instead of vec4
3159 * (16-byte) offset units.
3160 */
3161 if (brw->gen < 6)
3162 message_header_scale *= 16;
3163
3164 if (reladdr) {
3165 src_reg index = src_reg(this, glsl_type::int_type);
3166
3167 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3168 emit_before(inst, MUL(dst_reg(index),
3169 index, src_reg(message_header_scale)));
3170
3171 return index;
3172 } else {
3173 return src_reg(reg_offset * message_header_scale);
3174 }
3175 }
3176
3177 src_reg
3178 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3179 src_reg *reladdr, int reg_offset)
3180 {
3181 if (reladdr) {
3182 src_reg index = src_reg(this, glsl_type::int_type);
3183
3184 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3185
3186 /* Pre-gen6, the message header uses byte offsets instead of vec4
3187 * (16-byte) offset units.
3188 */
3189 if (brw->gen < 6) {
3190 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3191 }
3192
3193 return index;
3194 } else if (brw->gen >= 8) {
3195 /* Store the offset in a GRF so we can send-from-GRF. */
3196 src_reg offset = src_reg(this, glsl_type::int_type);
3197 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3198 return offset;
3199 } else {
3200 int message_header_scale = brw->gen < 6 ? 16 : 1;
3201 return src_reg(reg_offset * message_header_scale);
3202 }
3203 }
3204
3205 /**
3206 * Emits an instruction before @inst to load the value named by @orig_src
3207 * from scratch space at @base_offset to @temp.
3208 *
3209 * @base_offset is measured in 32-byte units (the size of a register).
3210 */
3211 void
3212 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3213 dst_reg temp, src_reg orig_src,
3214 int base_offset)
3215 {
3216 int reg_offset = base_offset + orig_src.reg_offset;
3217 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3218
3219 emit_before(inst, SCRATCH_READ(temp, index));
3220 }
3221
3222 /**
3223 * Emits an instruction after @inst to store the value to be written
3224 * to @orig_dst to scratch space at @base_offset, from @temp.
3225 *
3226 * @base_offset is measured in 32-byte units (the size of a register).
3227 */
3228 void
3229 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3230 {
3231 int reg_offset = base_offset + inst->dst.reg_offset;
3232 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3233
3234 /* Create a temporary register to store *inst's result in.
3235 *
3236 * We have to be careful in MOVing from our temporary result register in
3237 * the scratch write. If we swizzle from channels of the temporary that
3238 * weren't initialized, it will confuse live interval analysis, which will
3239 * make spilling fail to make progress.
3240 */
3241 src_reg temp = src_reg(this, glsl_type::vec4_type);
3242 temp.type = inst->dst.type;
3243 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3244 int swizzles[4];
3245 for (int i = 0; i < 4; i++)
3246 if (inst->dst.writemask & (1 << i))
3247 swizzles[i] = i;
3248 else
3249 swizzles[i] = first_writemask_chan;
3250 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3251 swizzles[2], swizzles[3]);
3252
3253 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3254 inst->dst.writemask));
3255 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3256 write->predicate = inst->predicate;
3257 write->ir = inst->ir;
3258 write->annotation = inst->annotation;
3259 inst->insert_after(write);
3260
3261 inst->dst.file = temp.file;
3262 inst->dst.reg = temp.reg;
3263 inst->dst.reg_offset = temp.reg_offset;
3264 inst->dst.reladdr = NULL;
3265 }
3266
3267 /**
3268 * We can't generally support array access in GRF space, because a
3269 * single instruction's destination can only span 2 contiguous
3270 * registers. So, we send all GRF arrays that get variable index
3271 * access to scratch space.
3272 */
3273 void
3274 vec4_visitor::move_grf_array_access_to_scratch()
3275 {
3276 int scratch_loc[this->virtual_grf_count];
3277
3278 for (int i = 0; i < this->virtual_grf_count; i++) {
3279 scratch_loc[i] = -1;
3280 }
3281
3282 /* First, calculate the set of virtual GRFs that need to be punted
3283 * to scratch due to having any array access on them, and where in
3284 * scratch.
3285 */
3286 foreach_in_list(vec4_instruction, inst, &instructions) {
3287 if (inst->dst.file == GRF && inst->dst.reladdr &&
3288 scratch_loc[inst->dst.reg] == -1) {
3289 scratch_loc[inst->dst.reg] = c->last_scratch;
3290 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3291 }
3292
3293 for (int i = 0 ; i < 3; i++) {
3294 src_reg *src = &inst->src[i];
3295
3296 if (src->file == GRF && src->reladdr &&
3297 scratch_loc[src->reg] == -1) {
3298 scratch_loc[src->reg] = c->last_scratch;
3299 c->last_scratch += this->virtual_grf_sizes[src->reg];
3300 }
3301 }
3302 }
3303
3304 /* Now, for anything that will be accessed through scratch, rewrite
3305 * it to load/store. Note that this is a _safe list walk, because
3306 * we may generate a new scratch_write instruction after the one
3307 * we're processing.
3308 */
3309 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3310 /* Set up the annotation tracking for new generated instructions. */
3311 base_ir = inst->ir;
3312 current_annotation = inst->annotation;
3313
3314 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3315 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3316 }
3317
3318 for (int i = 0 ; i < 3; i++) {
3319 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3320 continue;
3321
3322 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3323
3324 emit_scratch_read(inst, temp, inst->src[i],
3325 scratch_loc[inst->src[i].reg]);
3326
3327 inst->src[i].file = temp.file;
3328 inst->src[i].reg = temp.reg;
3329 inst->src[i].reg_offset = temp.reg_offset;
3330 inst->src[i].reladdr = NULL;
3331 }
3332 }
3333 }
3334
3335 /**
3336 * Emits an instruction before @inst to load the value named by @orig_src
3337 * from the pull constant buffer (surface) at @base_offset to @temp.
3338 */
3339 void
3340 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3341 dst_reg temp, src_reg orig_src,
3342 int base_offset)
3343 {
3344 int reg_offset = base_offset + orig_src.reg_offset;
3345 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3346 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3347 vec4_instruction *load;
3348
3349 if (brw->gen >= 7) {
3350 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3351 grf_offset.type = offset.type;
3352 emit_before(inst, MOV(grf_offset, offset));
3353
3354 load = new(mem_ctx) vec4_instruction(this,
3355 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3356 temp, index, src_reg(grf_offset));
3357 } else {
3358 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3359 temp, index, offset);
3360 load->base_mrf = 14;
3361 load->mlen = 1;
3362 }
3363 emit_before(inst, load);
3364 }
3365
3366 /**
3367 * Implements array access of uniforms by inserting a
3368 * PULL_CONSTANT_LOAD instruction.
3369 *
3370 * Unlike temporary GRF array access (where we don't support it due to
3371 * the difficulty of doing relative addressing on instruction
3372 * destinations), we could potentially do array access of uniforms
3373 * that were loaded in GRF space as push constants. In real-world
3374 * usage we've seen, though, the arrays being used are always larger
3375 * than we could load as push constants, so just always move all
3376 * uniform array access out to a pull constant buffer.
3377 */
3378 void
3379 vec4_visitor::move_uniform_array_access_to_pull_constants()
3380 {
3381 int pull_constant_loc[this->uniforms];
3382
3383 for (int i = 0; i < this->uniforms; i++) {
3384 pull_constant_loc[i] = -1;
3385 }
3386
3387 /* Walk through and find array access of uniforms. Put a copy of that
3388 * uniform in the pull constant buffer.
3389 *
3390 * Note that we don't move constant-indexed accesses to arrays. No
3391 * testing has been done of the performance impact of this choice.
3392 */
3393 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3394 for (int i = 0 ; i < 3; i++) {
3395 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3396 continue;
3397
3398 int uniform = inst->src[i].reg;
3399
3400 /* If this array isn't already present in the pull constant buffer,
3401 * add it.
3402 */
3403 if (pull_constant_loc[uniform] == -1) {
3404 const gl_constant_value **values =
3405 &stage_prog_data->param[uniform * 4];
3406
3407 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3408
3409 assert(uniform < uniform_array_size);
3410 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3411 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3412 = values[j];
3413 }
3414 }
3415
3416 /* Set up the annotation tracking for new generated instructions. */
3417 base_ir = inst->ir;
3418 current_annotation = inst->annotation;
3419
3420 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3421
3422 emit_pull_constant_load(inst, temp, inst->src[i],
3423 pull_constant_loc[uniform]);
3424
3425 inst->src[i].file = temp.file;
3426 inst->src[i].reg = temp.reg;
3427 inst->src[i].reg_offset = temp.reg_offset;
3428 inst->src[i].reladdr = NULL;
3429 }
3430 }
3431
3432 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3433 * no need to track them as larger-than-vec4 objects. This will be
3434 * relied on in cutting out unused uniform vectors from push
3435 * constants.
3436 */
3437 split_uniform_registers();
3438 }
3439
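/**
 * If @reg is an unsigned (UD) register with the negate flag set, copy it
 * through a temporary MOV and point @reg at the temporary so that later
 * instructions see a source without the modifier.
 */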
3440 void
3441 vec4_visitor::resolve_ud_negate(src_reg *reg)
3442 {
3443 if (reg->type != BRW_REGISTER_TYPE_UD ||
3444 !reg->negate)
3445 return;
3446
3447 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3448 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3449 *reg = temp;
3450 }
3451
3452 vec4_visitor::vec4_visitor(struct brw_context *brw,
3453 struct brw_vec4_compile *c,
3454 struct gl_program *prog,
3455 const struct brw_vec4_prog_key *key,
3456 struct brw_vec4_prog_data *prog_data,
3457 struct gl_shader_program *shader_prog,
3458 gl_shader_stage stage,
3459 void *mem_ctx,
3460 bool debug_flag,
3461 bool no_spills,
3462 shader_time_shader_type st_base,
3463 shader_time_shader_type st_written,
3464 shader_time_shader_type st_reset)
3465 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3466 c(c),
3467 key(key),
3468 prog_data(prog_data),
3469 sanity_param_count(0),
3470 fail_msg(NULL),
3471 first_non_payload_grf(0),
3472 need_all_constants_in_pull_buffer(false),
3473 debug_flag(debug_flag),
3474 no_spills(no_spills),
3475 st_base(st_base),
3476 st_written(st_written),
3477 st_reset(st_reset)
3478 {
3479 this->mem_ctx = mem_ctx;
3480 this->failed = false;
3481
3482 this->base_ir = NULL;
3483 this->current_annotation = NULL;
3484 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3485
3486 this->variable_ht = hash_table_ctor(0,
3487 hash_table_pointer_hash,
3488 hash_table_pointer_compare);
3489
3490 this->virtual_grf_start = NULL;
3491 this->virtual_grf_end = NULL;
3492 this->virtual_grf_sizes = NULL;
3493 this->virtual_grf_count = 0;
3494 this->virtual_grf_reg_map = NULL;
3495 this->virtual_grf_reg_count = 0;
3496 this->virtual_grf_array_size = 0;
3497 this->live_intervals_valid = false;
3498
3499 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3500
3501 this->uniforms = 0;
3502
3503 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3504 * at least one. See setup_uniforms() in brw_vec4.cpp.
3505 */
3506 this->uniform_array_size = 1;
3507 if (prog_data) {
3508 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3509 }
3510
3511 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3512 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3513 }
3514
3515 vec4_visitor::~vec4_visitor()
3516 {
3517 hash_table_dtor(this->variable_ht);
3518 }
3519
3520
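/**
 * Record the first failure message for this compile, mark the visitor as
 * failed, and print the message when the debug flag is set.
 */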
3521 void
3522 vec4_visitor::fail(const char *format, ...)
3523 {
3524 va_list va;
3525 char *msg;
3526
3527 if (failed)
3528 return;
3529
3530 failed = true;
3531
3532 va_start(va, format);
3533 msg = ralloc_vasprintf(mem_ctx, format, va);
3534 va_end(va);
3535 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3536
3537 this->fail_msg = msg;
3538
3539 if (debug_flag) {
3540 fprintf(stderr, "%s", msg);
3541 }
3542 }
3543
3544 } /* namespace brw */