/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "glsl/ir_uniform.h"
extern "C" {
#include "program/sampler.h"
}

namespace brw {

vec4_instruction::vec4_instruction(vec4_visitor *v,
                                   enum opcode opcode, const dst_reg &dst,
                                   const src_reg &src0, const src_reg &src1,
                                   const src_reg &src2)
{
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;
   this->saturate = false;
   this->force_writemask_all = false;
   this->no_dd_clear = false;
   this->no_dd_check = false;
   this->writes_accumulator = false;
   this->conditional_mod = BRW_CONDITIONAL_NONE;
   this->texture_offset = 0;
   this->target = 0;
   this->shadow_compare = false;
   this->ir = v->base_ir;
   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
   this->header_present = false;
   this->mlen = 0;
   this->base_mrf = 0;
   this->offset = 0;
   this->annotation = v->current_annotation;
}

vec4_instruction *
vec4_visitor::emit(vec4_instruction *inst)
{
   this->instructions.push_tail(inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
{
   new_inst->ir = inst->ir;
   new_inst->annotation = inst->annotation;

   inst->insert_before(new_inst);

   return inst;
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst,
                   src_reg src0, src_reg src1, src_reg src2)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
                                             src0, src1, src2));
}


vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode, dst_reg dst)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
}

vec4_instruction *
vec4_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
}

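/* The ALU* macros below stamp out thin factory methods for the common
 * one-, two-, and three-source opcodes.  For example, ALU2(ADD) expands to
 * roughly:
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 *
 * Note that these only allocate the instruction; callers hand the result
 * to emit(), as in emit(ADD(dst, a, b)).
 */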
#define ALU1(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0)            \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0);                       \
   }

#define ALU2(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0, src1);                 \
   }

#define ALU2_ACC(op)                                                    \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1)                                \
   {                                                                    \
      vec4_instruction *inst = new(mem_ctx) vec4_instruction(this,      \
                                  BRW_OPCODE_##op, dst, src0, src1);    \
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   vec4_instruction *                                                   \
   vec4_visitor::op(const dst_reg &dst, const src_reg &src0,            \
                    const src_reg &src1, const src_reg &src2)           \
   {                                                                    \
      assert(brw->gen >= 6);                                            \
      return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst,  \
                                           src0, src1, src2);           \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU1(F32TO16)
ALU1(F16TO32)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(DP3)
ALU2(DP4)
ALU2(DPH)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(MAC)

/** Gen4 predicated IF. */
vec4_instruction *
vec4_visitor::IF(enum brw_predicate predicate)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
   inst->predicate = predicate;

   return inst;
}

/** Gen6 IF with embedded comparison. */
vec4_instruction *
vec4_visitor::IF(src_reg src0, src_reg src1,
                 enum brw_conditional_mod condition)
{
   assert(brw->gen == 6);

   vec4_instruction *inst;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
                                        src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
vec4_instruction *
vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
                  enum brw_conditional_mod condition)
{
   vec4_instruction *inst;

   /* Original gen4 does type conversion to the destination type
    * before comparison, producing garbage results for floating
    * point comparisons.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

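/* Scratch reads and writes become data-port messages at code-generation
 * time; base_mrf and mlen describe where the message payload starts and
 * how many MRF registers it occupies.
 */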
vec4_instruction *
vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
                                        dst, index);
   inst->base_mrf = 14;
   inst->mlen = 2;

   return inst;
}

vec4_instruction *
vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
                            const src_reg &index)
{
   vec4_instruction *inst;

   inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                                        dst, src, index);
   inst->base_mrf = 13;
   inst->mlen = 3;

   return inst;
}

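/* Emit a dot product; "elements" selects DP2/DP3/DP4 and must be in the
 * range [2, 4].
 */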
void
vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
{
   static enum opcode dot_opcodes[] = {
      BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
   };

   emit(dot_opcodes[elements - 2], dst, src0, src1);
}

src_reg
vec4_visitor::fix_3src_operand(src_reg src)
{
   /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
    * able to use vertical stride of zero to replicate the vec4 uniform, like
    *
    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
    *
    * But you can't, since vertical stride is always four in three-source
    * instructions. Instead, insert a MOV instruction to do the replication so
    * that the three-source instruction can consume it.
    */

   /* The MOV is only needed if the source is a uniform or immediate. */
   if (src.file != UNIFORM && src.file != IMM)
      return src;

   if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

src_reg
vec4_visitor::fix_math_operand(src_reg src)
{
   /* The gen6 math instruction ignores the source modifiers --
    * swizzle, abs, negate, and at least some parts of the register
    * region description.
    *
    * Rather than trying to enumerate all these cases, *always* expand the
    * operand to a temp GRF for gen6.
    *
    * For gen7, keep the operand as-is, except if immediate, which gen7 still
    * can't use.
    */

   if (brw->gen == 7 && src.file != IMM)
      return src;

   dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
   expanded.type = src.type;
   emit(MOV(expanded, src));
   return src_reg(expanded);
}

void
vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
{
   src = fix_math_operand(src);

   if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);

      emit(opcode, temp_dst, src);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src);
   }
}

void
vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
{
   vec4_instruction *inst = emit(opcode, dst, src);
   inst->base_mrf = 1;
   inst->mlen = 1;
}

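/* Emit a unary math instruction, dispatching on hardware generation:
 * gen8+ can take the operand as-is, gen6/7 need the operand fixups in
 * emit_math1_gen6(), and gen4/5 use a message with an MRF payload.
 */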
void
vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      unreachable("not reached: bad math opcode");
   }

   if (brw->gen >= 8) {
      emit(opcode, dst, src);
   } else if (brw->gen >= 6) {
      emit_math1_gen6(opcode, dst, src);
   } else {
      emit_math1_gen4(opcode, dst, src);
   }
}

void
vec4_visitor::emit_math2_gen6(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   src0 = fix_math_operand(src0);
   src1 = fix_math_operand(src1);

   if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
      /* The gen6 math instruction must be align1, so we can't do
       * writemasks.
       */
      dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
      temp_dst.type = dst.type;

      emit(opcode, temp_dst, src0, src1);

      emit(MOV(dst, src_reg(temp_dst)));
   } else {
      emit(opcode, dst, src0, src1);
   }
}

void
vec4_visitor::emit_math2_gen4(enum opcode opcode,
                              dst_reg dst, src_reg src0, src_reg src1)
{
   vec4_instruction *inst = emit(opcode, dst, src0, src1);
   inst->base_mrf = 1;
   inst->mlen = 2;
}

void
vec4_visitor::emit_math(enum opcode opcode,
                        dst_reg dst, src_reg src0, src_reg src1)
{
   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      unreachable("not reached: unsupported binary math opcode");
   }

   if (brw->gen >= 8) {
      emit(opcode, dst, src0, src1);
   } else if (brw->gen >= 6) {
      emit_math2_gen6(opcode, dst, src0, src1);
   } else {
      emit_math2_gen4(opcode, dst, src0, src1);
   }
}

void
vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
{
   if (brw->gen < 7) {
      unreachable("ir_unop_pack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    *
    * The above restriction implies that the f32to16 instruction must use
    * align1 mode, because only in align1 mode is it possible to specify
    * horizontal stride.  We choose here to defy the hardware docs and emit
    * align16 instructions.
    *
    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
    * instructions. I was partially successful in that the code passed all
    * tests.  However, the code was dubiously correct and fragile, and the
    * tests were not harsh enough to probe that frailty. Not trusting the
    * code, I chose instead to remain in align16 mode in defiance of the hw
    * docs).
    *
    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
    * simulator, emitting a f32to16 in align16 mode with UD as destination
    * data type is safe. The behavior differs from that specified in the PRM
    * in that the upper word of each destination channel is cleared to 0.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

#if 0
   /* Verify the undocumented behavior on which the following instructions
    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
    * then the result of the bit-or instruction below will be incorrect.
    *
    * You should inspect the disasm output in order to verify that the MOV is
    * not optimized away.
    */
   emit(MOV(tmp_dst, src_reg(0x12345678u)));
#endif

   /* Give tmp the form below, where "." means untouched.
    *
    *     w z          y          x w z          y          x
    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
    *
    * That the upper word of each write-channel be 0 is required for the
    * following bit-shift and bit-or instructions to work. Note that this
    * relies on the undocumented hardware behavior mentioned above.
    */
   tmp_dst.writemask = WRITEMASK_XY;
   emit(F32TO16(tmp_dst, src0));

   /* Give the write-channels of dst the form:
    *   0xhhhh0000
    */
   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
   emit(SHL(dst, tmp_src, src_reg(16u)));

   /* Finally, give the write-channels of dst the form of packHalf2x16's
    * output:
    *   0xhhhhllll
    */
   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
   emit(OR(dst, src_reg(dst), tmp_src));
}

void
vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
{
   if (brw->gen < 7) {
      unreachable("ir_unop_unpack_half_2x16 should be lowered");
   }

   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src0.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    *
    * To use W as the source data type, we must adjust horizontal strides,
    * which is only possible in align1 mode. All my [chadv] attempts at
    * emitting align1 instructions for unpackHalf2x16 failed to pass the
    * Piglit tests, so I gave up.
    *
    * I've verified that, on gen7 hardware and the simulator, it is safe to
    * emit f16to32 in align16 mode with UD as source data type.
    */

   dst_reg tmp_dst(this, glsl_type::uvec2_type);
   src_reg tmp_src(tmp_dst);

   tmp_dst.writemask = WRITEMASK_X;
   emit(AND(tmp_dst, src0, src_reg(0xffffu)));

   tmp_dst.writemask = WRITEMASK_Y;
   emit(SHR(tmp_dst, src0, src_reg(16u)));

   dst.writemask = WRITEMASK_XY;
   emit(F16TO32(dst, tmp_src));
}

void
vec4_visitor::visit_instructions(const exec_list *list)
{
   foreach_in_list(ir_instruction, ir, list) {
      base_ir = ir;
      ir->accept(this);
   }
}


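/* Return how many vec4 slots a value of the given GLSL type occupies in
 * the vec4 backend.  Scalars and vectors each take a whole vec4; matrices
 * take one vec4 per column.
 */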
static int
type_size(const struct glsl_type *type)
{
   unsigned int i;
   int size;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      if (type->is_matrix()) {
         return type->matrix_columns;
      } else {
         /* Regardless of size of vector, it gets a vec4. This is bad
          * packing for things like floats, but otherwise arrays become a
          * mess.  Hopefully a later pass over the code can pack scalars
          * down if appropriate.
          */
         return 1;
      }
   case GLSL_TYPE_ARRAY:
      assert(type->length > 0);
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up one slot in UNIFORMS[], but they're baked in
       * at link time.
       */
      return 1;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      unreachable("not reached");
   }

   return 0;
}

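/* Allocate "size" consecutive vec4 slots in the virtual register file,
 * growing the bookkeeping arrays geometrically as needed.  Returns the new
 * virtual GRF's index; virtual_grf_reg_map[] records its first slot.
 */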
int
vec4_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
      virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
                                     virtual_grf_array_size);
   }
   virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
   virtual_grf_reg_count += size;
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->swizzle = BRW_SWIZZLE_NOOP;
   } else {
      this->swizzle = swizzle_for_size(type->vector_elements);
   }

   this->type = brw_type_for_base_type(type);
}

dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));

   if (type->is_array() || type->is_record()) {
      this->writemask = WRITEMASK_XYZW;
   } else {
      this->writemask = (1 << type->vector_elements) - 1;
   }

   this->type = brw_type_for_base_type(type);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
vec4_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      gl_constant_value *components = storage->storage;
      unsigned vector_count = (MAX2(storage->array_elements, 1) *
                               storage->type->matrix_columns);

      for (unsigned s = 0; s < vector_count; s++) {
         assert(uniforms < uniform_array_size);
         uniform_vector_size[uniforms] = storage->type->vector_elements;

         int i;
         for (i = 0; i < uniform_vector_size[uniforms]; i++) {
            stage_prog_data->param[uniforms * 4 + i] = components;
            components++;
         }
         for (; i < 4; i++) {
            static gl_constant_value zero = { 0.0 };
            stage_prog_data->param[uniforms * 4 + i] = &zero;
         }

         uniforms++;
      }
   }
}

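/* Lay out the active user clip planes as a contiguous run of vec4 float
 * uniforms, recording each plane's location in userplane[] for later use.
 */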
void
vec4_visitor::setup_uniform_clipplane_values()
{
   gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);

   for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
      assert(this->uniforms < uniform_array_size);
      this->uniform_vector_size[this->uniforms] = 4;
      this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
      this->userplane[i].type = BRW_REGISTER_TYPE_F;
      for (int j = 0; j < 4; ++j) {
         stage_prog_data->param[this->uniforms * 4 + j] =
            (gl_constant_value *) &clip_planes[i][j];
      }
      ++this->uniforms;
   }
}

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa,
       * but we'll get the same index back here.  We can reference
       * ParameterValues directly, since unlike brw_fs.cpp, we never
       * add new state references during compile.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);
      gl_constant_value *values =
         &this->prog->Parameters->ParameterValues[index][0];

      assert(this->uniforms < uniform_array_size);
      this->uniform_vector_size[this->uniforms] = 0;
      /* Add each of the unique swizzled channels of the element.
       * This will end up matching the size of the glsl_type of this field.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         last_swiz = swiz;

         stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
         assert(this->uniforms < uniform_array_size);
         if (swiz <= last_swiz)
            this->uniform_vector_size[this->uniforms]++;
      }
      this->uniforms++;
   }
}

dst_reg *
vec4_visitor::variable_storage(ir_variable *var)
{
   return (dst_reg *)hash_table_find(this->variable_ht, var);
}

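/* Evaluate a boolean rvalue and leave its value in the flag register,
 * returning through *predicate the predication mode a consumer should
 * use.  Comparisons and logic ops are folded straight into flag-setting
 * instructions rather than materializing a boolean temporary first.
 */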
void
vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
                                     enum brw_predicate *predicate)
{
   ir_expression *expr = ir->as_expression();

   *predicate = BRW_PREDICATE_NORMAL;

   if (expr) {
      src_reg op[2];
      vec4_instruction *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(XOR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(OR(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(AND(dst_null_d(), op[0], op[1]));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (brw->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_f(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_unop_i2b:
         if (brw->gen >= 6) {
            emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(dst_null_d(), op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_binop_all_equal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
         break;

      case ir_binop_any_nequal:
         inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_unop_any:
         inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(CMP(dst_null_d(), op[0], op[1],
                  brw_conditional_for_comparison(expr->operation)));
         break;

      default:
         unreachable("not reached");
      }
      return;
   }

   ir->accept(this);

   resolve_ud_negate(&this->result);

   if (brw->gen >= 6) {
      vec4_instruction *inst = emit(AND(dst_null_d(),
                                        this->result, src_reg(1)));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
vec4_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      src_reg op[2];
      dst_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
         return;

      case ir_binop_logic_xor:
         emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_or:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(OR(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_logic_and:
         temp = dst_reg(this, glsl_type::bool_type);
         emit(AND(temp, op[0], op[1]));
         emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_f2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_unop_i2b:
         emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_nequal:
         emit(IF(op[0], op[1],
                 brw_conditional_for_comparison(expr->operation)));
         return;

      case ir_binop_all_equal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
         return;

      case ir_binop_any_nequal:
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      case ir_unop_any:
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
         emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
         return;

      default:
         unreachable("not reached");
      }
      return;
   }

   ir->condition->accept(this);

   emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
}

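/* Allocate backing storage for a variable declaration, keyed by mode:
 * vertex attributes and shader outputs map to fixed locations, temporaries
 * get fresh GRFs, and uniforms go through the setup_*_uniform_values
 * helpers above.
 */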
void
vec4_visitor::visit(ir_variable *ir)
{
   dst_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   switch (ir->data.mode) {
   case ir_var_shader_in:
      reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
      break;

   case ir_var_shader_out:
      reg = new(mem_ctx) dst_reg(this, ir->type);

      for (int i = 0; i < type_size(ir->type); i++) {
         output_reg[ir->data.location + i] = *reg;
         output_reg[ir->data.location + i].reg_offset = i;
         output_reg[ir->data.location + i].type =
            brw_type_for_base_type(ir->type->get_scalar_type());
         output_reg_annotation[ir->data.location + i] = ir->name;
      }
      break;

   case ir_var_auto:
   case ir_var_temporary:
      reg = new(mem_ctx) dst_reg(this, ir->type);
      break;

   case ir_var_uniform:
      reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       *
       * Atomic counters take no uniform storage, no need to do
       * anything here.
       */
      if (ir->is_in_uniform_block() || ir->type->contains_atomic())
         return;

      /* Track how big the whole uniform variable is, in case we need to put a
       * copy of its data into pull constants for array access.
       */
      assert(this->uniforms < uniform_array_size);
      this->uniform_size[this->uniforms] = type_size(ir->type);

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir);
      }
      break;

   case ir_var_system_value:
      reg = make_reg_for_system_value(ir);
      break;

   default:
      unreachable("not reached");
   }

   reg->type = brw_type_for_base_type(ir->type);
   hash_table_insert(this->variable_ht, reg, ir);
}

void
vec4_visitor::visit(ir_loop *ir)
{
   /* We don't want debugging output to print the whole body of the
    * loop as the annotation.
    */
   this->base_ir = NULL;

   emit(BRW_OPCODE_DO);

   visit_instructions(&ir->body_instructions);

   emit(BRW_OPCODE_WHILE);
}

void
vec4_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}


void
vec4_visitor::visit(ir_function_signature *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(NULL, &empty, false);

      assert(sig);

      visit_instructions(&sig->body);
   }
}

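/* If this expression is just a saturate of another rvalue, emit it as a
 * single saturating MOV instead of computing the value and clamping it
 * afterward.
 */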
bool
vec4_visitor::try_emit_sat(ir_expression *ir)
{
   ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
   if (!sat_src)
      return false;

   sat_src->accept(this);
   src_reg src = this->result;

   this->result = src_reg(this, ir->type);
   vec4_instruction *inst;
   inst = emit(MOV(dst_reg(this->result), src));
   inst->saturate = true;

   return true;
}

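/* Try to fuse a float add-of-multiply into a single MAD.  Fails when
 * neither operand of the add is a multiply, the type isn't float, or the
 * hardware (gen < 6) lacks 3-source instructions.
 */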
bool
vec4_visitor::try_emit_mad(ir_expression *ir)
{
   /* 3-src instructions were introduced in gen6. */
   if (brw->gen < 6)
      return false;

   /* MAD can only handle floating-point data. */
   if (ir->type->base_type != GLSL_TYPE_FLOAT)
      return false;

   ir_rvalue *nonmul = ir->operands[1];
   ir_expression *mul = ir->operands[0]->as_expression();

   if (!mul || mul->operation != ir_binop_mul) {
      nonmul = ir->operands[0];
      mul = ir->operands[1]->as_expression();

      if (!mul || mul->operation != ir_binop_mul)
         return false;
   }

   nonmul->accept(this);
   src_reg src0 = fix_3src_operand(this->result);

   mul->operands[0]->accept(this);
   src_reg src1 = fix_3src_operand(this->result);

   mul->operands[1]->accept(this);
   src_reg src2 = fix_3src_operand(this->result);

   this->result = src_reg(this, ir->type);
   emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);

   return true;
}

bool
vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
{
   /* This optimization relies on CMP setting the destination to 0 when
    * false.  Early hardware only sets the least significant bit, and
    * leaves the other bits undefined.  So we can't use it.
    */
   if (brw->gen < 6)
      return false;

   ir_expression *const cmp = ir->operands[0]->as_expression();

   if (cmp == NULL)
      return false;

   switch (cmp->operation) {
   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal:
      break;

   default:
      return false;
   }

   cmp->operands[0]->accept(this);
   const src_reg cmp_src0 = this->result;

   cmp->operands[1]->accept(this);
   const src_reg cmp_src1 = this->result;

   this->result = src_reg(this, ir->type);

   emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
            brw_conditional_for_comparison(cmp->operation)));

   /* If the comparison is false, this->result will just happen to be zero.
    */
   vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
                                       this->result, src_reg(1.0f));
   inst->predicate = BRW_PREDICATE_NORMAL;
   inst->predicate_inverse = true;

   return true;
}

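/* Emit a MIN or MAX as a conditional SEL.  Gen6+ supports a conditional
 * modifier on SEL directly; earlier hardware needs an explicit CMP to set
 * the flag register first.
 */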
void
vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
                          src_reg src0, src_reg src1)
{
   vec4_instruction *inst;

   if (brw->gen >= 6) {
      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->conditional_mod = conditionalmod;
   } else {
      emit(CMP(dst, src0, src1, conditionalmod));

      inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
      inst->predicate = BRW_PREDICATE_NORMAL;
   }
}

void
vec4_visitor::emit_lrp(const dst_reg &dst,
                       const src_reg &x, const src_reg &y, const src_reg &a)
{
   if (brw->gen >= 6) {
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(LRP(dst,
               fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
   } else {
      /* Earlier generations don't support three source operations, so we
       * need to emit x*(1-a) + y*a.
       */
      dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
      dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
      y_times_a.writemask = dst.writemask;
      one_minus_a.writemask = dst.writemask;
      x_times_one_minus_a.writemask = dst.writemask;

      emit(MUL(y_times_a, y, a));
      emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
      emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
      emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
   }
}

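/* Generate code for an expression tree.  Operands are evaluated first
 * (matrix operations must already have been lowered to vector ones), then
 * the result is computed into a fresh temporary whose writemask is limited
 * to the channels the expression's type actually uses.
 */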
void
vec4_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   src_reg op[Elements(ir->operands)];
   src_reg result_src;
   dst_reg result_dst;
   vec4_instruction *inst;

   if (try_emit_sat(ir))
      return;

   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir))
         return;
   }

   if (ir->operation == ir_unop_b2f) {
      if (try_emit_b2f_of_compare(ir))
         return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      this->result.file = BAD_FILE;
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         fprintf(stderr, "Failed to get tree for expression operand:\n");
         ir->operands[operand]->fprint(stderr);
         exit(1);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
   }

   int vector_elements = ir->operands[0]->type->vector_elements;
   if (ir->operands[1]) {
      vector_elements = MAX2(vector_elements,
                             ir->operands[1]->type->vector_elements);
   }

   this->result.file = BAD_FILE;

   /* Storage for our result.  Ideally for an assignment we'd be using
    * the actual storage for the result here, instead.
    */
   result_src = src_reg(this, ir->type);
   /* convenience for the emit functions below. */
   result_dst = dst_reg(result_src);
   /* If nothing special happens, this is the result. */
   this->result = result_src;
   /* Limit writes to the channels that will be used by result_src later.
    * This does limit this temp's use as a temporary for multi-instruction
    * sequences.
    */
   result_dst.writemask = (1 << ir->type->vector_elements) - 1;

   switch (ir->operation) {
   case ir_unop_logic_not:
      if (ctx->Const.UniformBooleanTrue != 1) {
         emit(NOT(result_dst, op[0]));
      } else {
         emit(XOR(result_dst, op[0], src_reg(1)));
      }
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      emit(MOV(result_dst, op[0]));
      break;

   case ir_unop_sign:
      if (ir->type->is_float()) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
         emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         result_dst.type = BRW_REGISTER_TYPE_UD;
         emit(AND(result_dst, op[0], src_reg(0x80000000u)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         this->result.type = BRW_REGISTER_TYPE_F;
      } else {
         /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
          *               -> non-negative val generates 0x00000000.
          * Predicated OR sets 1 if val is positive.
          */
         emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));

         emit(ASR(result_dst, op[0], src_reg(31)));

         inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }
      break;

   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      unreachable("not reached: should be handled by ir_explog_to_explog2");
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
      break;

   case ir_unop_dFdx:
   case ir_unop_dFdx_coarse:
   case ir_unop_dFdx_fine:
   case ir_unop_dFdy:
   case ir_unop_dFdy_coarse:
   case ir_unop_dFdy_fine:
      unreachable("derivatives not valid in vertex shader");

   case ir_unop_bitfield_reverse:
      emit(BFREV(result_dst, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(result_dst, op[0]));
      break;
   case ir_unop_find_msb: {
      src_reg temp = src_reg(this, glsl_type::uint_type);

      inst = emit(FBH(dst_reg(temp), op[0]));
      inst->dst.writemask = WRITEMASK_XYZW;

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side.  If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      temp.swizzle = BRW_SWIZZLE_NOOP;
      emit(MOV(result_dst, temp));

      src_reg src_tmp = src_reg(result_dst);
      emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));

      src_tmp.negate = true;
      inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }
   case ir_unop_find_lsb:
      emit(FBL(result_dst, op[0]));
      break;

   case ir_unop_noise:
      unreachable("not reached: should be handled by lower_noise");

   case ir_binop_add:
      emit(ADD(result_dst, op[0], op[1]));
      break;
   case ir_binop_sub:
      unreachable("not reached: should be handled by ir_sub_to_add_neg");

   case ir_binop_mul:
      if (brw->gen < 8 && ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits of one
          * of the operands (src0 through SNB, src1 on IVB and later).  The
          * MACH accumulates the contribution of the upper 16 bits of that
          * operand.  If we can determine that one of the args is in the low
          * 16 bits, though, we can just emit a single MUL.
          */
         if (ir->operands[0]->is_uint16_constant()) {
            if (brw->gen < 7)
               emit(MUL(result_dst, op[0], op[1]));
            else
               emit(MUL(result_dst, op[1], op[0]));
         } else if (ir->operands[1]->is_uint16_constant()) {
            if (brw->gen < 7)
               emit(MUL(result_dst, op[1], op[0]));
            else
               emit(MUL(result_dst, op[0], op[1]));
         } else {
            struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

            emit(MUL(acc, op[0], op[1]));
            emit(MACH(dst_null_d(), op[0], op[1]));
            emit(MOV(result_dst, src_reg(acc)));
         }
      } else {
         emit(MUL(result_dst, op[0], op[1]));
      }
      break;
   case ir_binop_imul_high: {
      struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);

      emit(MUL(acc, op[0], op[1]));
      emit(MACH(result_dst, op[0], op[1]));
      break;
   }
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
      break;
   case ir_binop_carry: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(ADDC(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_borrow: {
      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(SUBB(dst_null_ud(), op[0], op[1]));
      emit(MOV(result_dst, src_reg(acc)));
      break;
   }
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_nequal: {
      emit(CMP(result_dst, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      if (ctx->Const.UniformBooleanTrue == 1) {
         emit(AND(result_dst, result_src, src_reg(1)));
      }
      break;
   }

   case ir_binop_all_equal:
      /* "==" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
         if (ctx->Const.UniformBooleanTrue == 1) {
            emit(AND(result_dst, result_src, src_reg(1)));
         }
      }
      break;
   case ir_binop_any_nequal:
      /* "!=" operator producing a scalar boolean. */
      if (ir->operands[0]->type->is_vector() ||
          ir->operands[1]->type->is_vector()) {
         emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));

         emit(MOV(result_dst, src_reg(0)));
         inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
         inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      } else {
         emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
         if (ctx->Const.UniformBooleanTrue == 1) {
            emit(AND(result_dst, result_src, src_reg(1)));
         }
      }
      break;

   case ir_unop_any:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      emit(MOV(result_dst, src_reg(0)));

      inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
      break;

   case ir_binop_logic_xor:
      emit(XOR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(result_dst, op[0], op[1]));
      break;

   case ir_binop_dot:
      assert(ir->operands[0]->type->is_vector());
      assert(ir->operands[0]->type == ir->operands[1]->type);
      emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
      break;
   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
      break;

   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_F;
      break;

   case ir_unop_bitcast_f2i:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_D;
      break;

   case ir_unop_bitcast_f2u:
      this->result = op[0];
      this->result.type = BRW_REGISTER_TYPE_UD;
      break;

   case ir_unop_i2f:
   case ir_unop_i2u:
   case ir_unop_u2i:
   case ir_unop_u2f:
   case ir_unop_f2i:
   case ir_unop_f2u:
      emit(MOV(result_dst, op[0]));
      break;
   case ir_unop_b2i:
      if (ctx->Const.UniformBooleanTrue != 1) {
         emit(AND(result_dst, op[0], src_reg(1)));
      } else {
         emit(MOV(result_dst, op[0]));
      }
      break;
   case ir_unop_b2f:
      if (ctx->Const.UniformBooleanTrue != 1) {
         op[0].type = BRW_REGISTER_TYPE_UD;
         result_dst.type = BRW_REGISTER_TYPE_UD;
         emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
         result_dst.type = BRW_REGISTER_TYPE_F;
      } else {
         emit(MOV(result_dst, op[0]));
      }
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
      if (ctx->Const.UniformBooleanTrue == 1) {
         emit(AND(result_dst, result_src, src_reg(1)));
      }
      break;

   case ir_unop_trunc:
      emit(RNDZ(result_dst, op[0]));
      break;
   case ir_unop_ceil:
      op[0].negate = !op[0].negate;
      inst = emit(RNDD(result_dst, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(RNDD(result_dst, op[0]));
      break;
   case ir_unop_fract:
      inst = emit(FRC(result_dst, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(result_dst, op[0]));
      break;

   case ir_binop_min:
      emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
      break;
   case ir_binop_max:
      emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
      break;

   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(NOT(result_dst, op[0]));
      break;
   case ir_binop_bit_and:
      inst = emit(AND(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      inst = emit(XOR(result_dst, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      inst = emit(OR(result_dst, op[0], op[1]));
      break;

   case ir_binop_lshift:
      inst = emit(SHL(result_dst, op[0], op[1]));
      break;

   case ir_binop_rshift:
      if (ir->type->base_type == GLSL_TYPE_INT)
         inst = emit(ASR(result_dst, op[0], op[1]));
      else
         inst = emit(SHR(result_dst, op[0], op[1]));
      break;

   case ir_binop_bfm:
      emit(BFI1(result_dst, op[0], op[1]));
      break;

   case ir_binop_ubo_load: {
      ir_constant *const_uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset_ir = ir->operands[1]->as_constant();
      unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
      src_reg offset;

      /* Now, load the vector from that offset. */
      assert(ir->type->is_vector() || ir->type->is_scalar());

      src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
      packed_consts.type = result.type;
      src_reg surf_index;

      if (const_uniform_block) {
         /* The block index is a constant, so just emit the binding table entry
          * as an immediate.
          */
         surf_index = src_reg(prog_data->base.binding_table.ubo_start +
                              const_uniform_block->value.u[0]);
      } else {
         /* The block index is not a constant.  Evaluate the index expression
          * per-channel and add the base UBO index; the generator will select
          * a value from any live channel.
          */
         surf_index = src_reg(this, glsl_type::uint_type);
         emit(ADD(dst_reg(surf_index), op[0],
                  src_reg(prog_data->base.binding_table.ubo_start)));

         /* Assume this may touch any UBO.  It would be nice to provide
          * a tighter bound, but the array information is already lowered away.
          */
         brw_mark_surface_used(&prog_data->base,
                               prog_data->base.binding_table.ubo_start +
                               shader_prog->NumUniformBlocks - 1);
      }

      if (const_offset_ir) {
         if (brw->gen >= 8) {
            /* Store the offset in a GRF so we can send-from-GRF. */
            offset = src_reg(this, glsl_type::int_type);
            emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
         } else {
            /* Immediates are fine on older generations since they'll be moved
             * to a (potentially fake) MRF at the generator level.
             */
            offset = src_reg(const_offset / 16);
         }
      } else {
         offset = src_reg(this, glsl_type::uint_type);
         emit(SHR(dst_reg(offset), op[1], src_reg(4)));
      }

      if (brw->gen >= 7) {
         dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
         grf_offset.type = offset.type;

         emit(MOV(grf_offset, offset));

         emit(new(mem_ctx) vec4_instruction(this,
                                            VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                            dst_reg(packed_consts),
                                            surf_index,
                                            src_reg(grf_offset)));
      } else {
         vec4_instruction *pull =
            emit(new(mem_ctx) vec4_instruction(this,
                                               VS_OPCODE_PULL_CONSTANT_LOAD,
                                               dst_reg(packed_consts),
                                               surf_index,
                                               offset));
         pull->base_mrf = 14;
         pull->mlen = 1;
      }

      packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
      packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4,
                                            const_offset % 16 / 4);

      /* UBO bools are any nonzero int.  We need to convert them to use the
       * value of true stored in ctx->Const.UniformBooleanTrue.
       */
      if (ir->type->base_type == GLSL_TYPE_BOOL) {
         emit(CMP(result_dst, packed_consts, src_reg(0u),
                  BRW_CONDITIONAL_NZ));
         if (ctx->Const.UniformBooleanTrue == 1) {
            emit(AND(result_dst, result, src_reg(1)));
         }
      } else {
         emit(MOV(result_dst, packed_consts));
      }
      break;
   }

   case ir_binop_vector_extract:
      unreachable("should have been lowered by vec_index_to_cond_assign");

   case ir_triop_fma:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(MAD(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_lrp:
      emit_lrp(result_dst, op[0], op[1], op[2]);
      break;

   case ir_triop_csel:
      emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
      inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;

   case ir_triop_bfi:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      emit(BFI2(result_dst, op[0], op[1], op[2]));
      break;

   case ir_triop_bitfield_extract:
      op[0] = fix_3src_operand(op[0]);
      op[1] = fix_3src_operand(op[1]);
      op[2] = fix_3src_operand(op[2]);
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(result_dst, op[2], op[1], op[0]));
      break;

   case ir_triop_vector_insert:
      unreachable("should have been lowered by lower_vector_insert");

   case ir_quadop_bitfield_insert:
      unreachable("not reached: should be handled by "
                  "bitfield_insert_to_bfm_bfi\n");

   case ir_quadop_vector:
      unreachable("not reached: should be handled by lower_quadop_vector");

   case ir_unop_pack_half_2x16:
      emit_pack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_unpack_half_2x16:
      emit_unpack_half_2x16(result_dst, op[0]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
      unreachable("not reached: should be handled by lower_packing_builtins");
   case ir_unop_unpack_half_2x16_split_x:
   case ir_unop_unpack_half_2x16_split_y:
   case ir_binop_pack_half_2x16_split:
   case ir_unop_interpolate_at_centroid:
   case ir_binop_interpolate_at_sample:
   case ir_binop_interpolate_at_offset:
      unreachable("not reached: should not occur in vertex shader");
   case ir_binop_ldexp:
      unreachable("not reached: should be handled by ldexp_to_arith()");
   }
}


void
vec4_visitor::visit(ir_swizzle *ir)
{
   src_reg src;
   int i = 0;
   int swizzle[4];

   /* Note that this is only swizzles in expressions, not those on the left
    * hand side of an assignment, which do write masking.  See ir_assignment
    * for that.
    */

   ir->val->accept(this);
   src = this->result;
   assert(src.file != BAD_FILE);

   for (i = 0; i < ir->type->vector_elements; i++) {
      switch (i) {
      case 0:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
         break;
      case 1:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
         break;
      case 2:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
         break;
      case 3:
         swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
         break;
      }
   }
   for (; i < 4; i++) {
      /* Replicate the last channel out. */
      swizzle[i] = swizzle[ir->type->vector_elements - 1];
   }

   src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);

   this->result = src;
}

void
vec4_visitor::visit(ir_dereference_variable *ir)
{
   const struct glsl_type *type = ir->type;
   dst_reg *reg = variable_storage(ir->var);

   if (!reg) {
      fail("Failed to find variable storage for %s\n", ir->var->name);
      this->result = src_reg(brw_null_reg());
      return;
   }

   this->result = src_reg(*reg);

   /* System values get their swizzle from the dst_reg writemask */
   if (ir->var->data.mode == ir_var_system_value)
      return;

   if (type->is_scalar() || type->is_vector() || type->is_matrix())
      this->result.swizzle = swizzle_for_size(type->vector_elements);
}


int
vec4_visitor::compute_array_stride(ir_dereference_array *ir)
{
   /* Under normal circumstances array elements are stored consecutively, so
    * the stride is equal to the size of the array element.
    */
   return type_size(ir->type);
}


void
vec4_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   src_reg src;
   int array_stride = compute_array_stride(ir);

   constant_index = ir->array_index->constant_expression_value();

   ir->array->accept(this);
   src = this->result;

   if (constant_index) {
      src.reg_offset += constant_index->value.i[0] * array_stride;
   } else {
      /* Variable index array dereference.  It eats the "vec4" of the
       * base of the array and an index that offsets the Mesa register
       * index.
       */
      ir->array_index->accept(this);

      src_reg index_reg;

      if (array_stride == 1) {
         index_reg = this->result;
      } else {
         index_reg = src_reg(this, glsl_type::int_type);

         emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
      }

      if (src.reladdr) {
         src_reg temp = src_reg(this, glsl_type::int_type);

         emit(ADD(dst_reg(temp), *src.reladdr, index_reg));

         index_reg = temp;
      }

      src.reladdr = ralloc(mem_ctx, src_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      src.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      src.swizzle = BRW_SWIZZLE_NOOP;
   src.type = brw_type_for_base_type(ir->type);

   this->result = src;
}

void
vec4_visitor::visit(ir_dereference_record *ir)
{
   unsigned int i;
   const glsl_type *struct_type = ir->record->type;
   int offset = 0;

   ir->record->accept(this);

   for (i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }

   /* If the type is smaller than a vec4, replicate the last channel out. */
   if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
      this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
   else
      this->result.swizzle = BRW_SWIZZLE_NOOP;
   this->result.type = brw_type_for_base_type(ir->type);

   this->result.reg_offset += offset;
}

/**
 * We want to be careful in assignment setup to hit the actual storage
 * instead of potentially using a temporary like we might with the
 * ir_dereference handler.
 */
static dst_reg
get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
{
   /* The LHS must be a dereference.  If the LHS is a variable indexed array
    * access of a vector, it must be separated into a series of conditional
    * moves before reaching this point (see ir_vec_index_to_cond_assign).
    */
   assert(ir->as_dereference());
   ir_dereference_array *deref_array = ir->as_dereference_array();
   if (deref_array) {
      assert(!deref_array->array->type->is_vector());
   }

   /* Use the rvalue deref handler for the most part.  We'll ignore
    * swizzles in it and write swizzles using writemask, though.
    */
   ir->accept(v);
   return dst_reg(v->result);
}

void
vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
                              const struct glsl_type *type,
                              enum brw_predicate predicate)
{
   if (type->base_type == GLSL_TYPE_STRUCT) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.structure[i].type, predicate);
      }
      return;
   }

   if (type->is_array()) {
      for (unsigned int i = 0; i < type->length; i++) {
         emit_block_move(dst, src, type->fields.array, predicate);
      }
      return;
   }

   if (type->is_matrix()) {
      const struct glsl_type *vec_type;

      vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                         type->vector_elements, 1);

      for (int i = 0; i < type->matrix_columns; i++) {
         emit_block_move(dst, src, vec_type, predicate);
      }
      return;
   }

   assert(type->is_scalar() || type->is_vector());

   dst->type = brw_type_for_base_type(type);
   src->type = dst->type;

   dst->writemask = (1 << type->vector_elements) - 1;

   src->swizzle = swizzle_for_size(type->vector_elements);

   vec4_instruction *inst = emit(MOV(*dst, *src));
   inst->predicate = predicate;

   dst->reg_offset++;
   src->reg_offset++;
}
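
/* Illustrative sketch (not in the original source): for a mat2 source, the
 * matrix branch above recurses once per column with a vec2 vec_type, so the
 * emitted sequence is two predicated MOVs with writemask .xy, each one
 * advancing dst->reg_offset and src->reg_offset by a full vec4 register.
 */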


/* If the RHS processing resulted in an instruction generating a
 * temporary value, and it would be easy to rewrite the instruction to
 * generate its result right into the LHS instead, do so.  This ends
 * up reliably removing instructions where it can be tricky to do so
 * later without real UD chain information.
 */
bool
vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
                                     dst_reg dst,
                                     src_reg src,
                                     vec4_instruction *pre_rhs_inst,
                                     vec4_instruction *last_rhs_inst)
{
   /* This could be supported, but it would take more smarts. */
   if (ir->condition)
      return false;

   if (pre_rhs_inst == last_rhs_inst)
      return false; /* No instructions generated to work with. */

   /* Make sure the last instruction generated our source reg. */
   if (src.file != GRF ||
       src.file != last_rhs_inst->dst.file ||
       src.reg != last_rhs_inst->dst.reg ||
       src.reg_offset != last_rhs_inst->dst.reg_offset ||
       src.reladdr ||
       src.abs ||
       src.negate ||
       last_rhs_inst->predicate != BRW_PREDICATE_NONE)
      return false;

   /* Check that the last instruction fully initialized the channels
    * we want to use, in the order we want to use them.  We could
    * potentially reswizzle the operands of many instructions so that
    * we could handle out of order channels, but don't yet.
    */

   for (unsigned i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         if (!(last_rhs_inst->dst.writemask & (1 << i)))
            return false;

         if (BRW_GET_SWZ(src.swizzle, i) != i)
            return false;
      }
   }

   /* Success!  Rewrite the instruction. */
   last_rhs_inst->dst.file = dst.file;
   last_rhs_inst->dst.reg = dst.reg;
   last_rhs_inst->dst.reg_offset = dst.reg_offset;
   last_rhs_inst->dst.reladdr = dst.reladdr;
   last_rhs_inst->dst.writemask &= dst.writemask;

   return true;
}
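
/* Worked example (illustrative): for GLSL `v.xy = a + b;` with vec2
 * operands, the RHS produces `ADD tmp, a, b` followed by what would be
 * `MOV v.xy, tmp`.  Since the ADD wrote exactly the channels the MOV reads,
 * in order, the rewrite retargets it to `ADD v.xy, a, b` and the copy is
 * never emitted.
 */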

void
vec4_visitor::visit(ir_assignment *ir)
{
   dst_reg dst = get_assignment_lhs(ir->lhs, this);
   enum brw_predicate predicate = BRW_PREDICATE_NONE;

   if (!ir->lhs->type->is_scalar() &&
       !ir->lhs->type->is_vector()) {
      ir->rhs->accept(this);
      src_reg src = this->result;

      if (ir->condition) {
         emit_bool_to_cond_code(ir->condition, &predicate);
      }

      /* emit_block_move doesn't account for swizzles in the source register.
       * This should be ok, since the source register is a structure or an
       * array, and those can't be swizzled.  But double-check to be sure.
       */
      assert(src.swizzle ==
             (ir->rhs->type->is_matrix()
              ? swizzle_for_size(ir->rhs->type->vector_elements)
              : BRW_SWIZZLE_NOOP));

      emit_block_move(&dst, &src, ir->rhs->type, predicate);
      return;
   }

   /* Now we're down to just a scalar/vector with writemasks. */
   int i;

   vec4_instruction *pre_rhs_inst, *last_rhs_inst;
   pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   ir->rhs->accept(this);

   last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();

   src_reg src = this->result;

   int swizzles[4];
   int first_enabled_chan = 0;
   int src_chan = 0;

   assert(ir->lhs->type->is_vector() ||
          ir->lhs->type->is_scalar());
   dst.writemask = ir->write_mask;

   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i)) {
         first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
         break;
      }
   }

   /* Swizzle a small RHS vector into the channels being written.
    *
    * glsl ir treats write_mask as dictating how many channels are
    * present on the RHS while in our instructions we need to make
    * those channels appear in the slots of the vec4 they're written to.
    */
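   /* Worked example (illustrative): for `v.xz = u.yw;` write_mask is .xz and
    * the RHS swizzle holds y and w in its first two slots.  The loop below
    * produces swizzles[] = {y, y, w, y}: enabled channels consume RHS slots
    * in order, and disabled channels just replicate the first enabled one.
    */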
   for (int i = 0; i < 4; i++) {
      if (dst.writemask & (1 << i))
         swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
      else
         swizzles[i] = first_enabled_chan;
   }
   src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                              swizzles[2], swizzles[3]);

   if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
      return;
   }

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition, &predicate);
   }

   for (i = 0; i < type_size(ir->lhs->type); i++) {
      vec4_instruction *inst = emit(MOV(dst, src));
      inst->predicate = predicate;

      dst.reg_offset++;
      src.reg_offset++;
   }
}

void
vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
{
   if (ir->type->base_type == GLSL_TYPE_STRUCT) {
      foreach_in_list(ir_constant, field_value, &ir->components) {
         emit_constant_values(dst, field_value);
      }
      return;
   }

   if (ir->type->is_array()) {
      for (unsigned int i = 0; i < ir->type->length; i++) {
         emit_constant_values(dst, ir->array_elements[i]);
      }
      return;
   }

   if (ir->type->is_matrix()) {
      for (int i = 0; i < ir->type->matrix_columns; i++) {
         float *vec = &ir->value.f[i * ir->type->vector_elements];

         for (int j = 0; j < ir->type->vector_elements; j++) {
            dst->writemask = 1 << j;
            dst->type = BRW_REGISTER_TYPE_F;

            emit(MOV(*dst, src_reg(vec[j])));
         }
         dst->reg_offset++;
      }
      return;
   }

   int remaining_writemask = (1 << ir->type->vector_elements) - 1;

   for (int i = 0; i < ir->type->vector_elements; i++) {
      if (!(remaining_writemask & (1 << i)))
         continue;

      dst->writemask = 1 << i;
      dst->type = brw_type_for_base_type(ir->type);

      /* Find other components that match the one we're about to
       * write.  Emits fewer instructions for things like vec4(0.5,
       * 1.5, 1.5, 1.5).
       */
      for (int j = i + 1; j < ir->type->vector_elements; j++) {
         if (ir->type->base_type == GLSL_TYPE_BOOL) {
            if (ir->value.b[i] == ir->value.b[j])
               dst->writemask |= (1 << j);
         } else {
            /* u, i, and f storage all line up, so no need for a
             * switch case for comparing each type.
             */
            if (ir->value.u[i] == ir->value.u[j])
               dst->writemask |= (1 << j);
         }
      }

      switch (ir->type->base_type) {
      case GLSL_TYPE_FLOAT:
         emit(MOV(*dst, src_reg(ir->value.f[i])));
         break;
      case GLSL_TYPE_INT:
         emit(MOV(*dst, src_reg(ir->value.i[i])));
         break;
      case GLSL_TYPE_UINT:
         emit(MOV(*dst, src_reg(ir->value.u[i])));
         break;
      case GLSL_TYPE_BOOL:
         emit(MOV(*dst,
                  src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
                                              : 0)));
         break;
      default:
         unreachable("Non-float/uint/int/bool constant");
      }

      remaining_writemask &= ~dst->writemask;
   }
   dst->reg_offset++;
}
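
/* Illustrative sketch of the compaction mentioned above: for
 * vec4(0.5, 1.5, 1.5, 1.5), the inner scan folds the three matching
 * components into one writemask, so only two MOVs are emitted -- one with
 * writemask .x for 0.5 and one with .yzw for 1.5 -- instead of four
 * single-channel moves.
 */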

void
vec4_visitor::visit(ir_constant *ir)
{
   dst_reg dst = dst_reg(this, ir->type);
   this->result = src_reg(dst);

   emit_constant_values(&dst, ir);
}

void
vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
{
   ir_dereference *deref = static_cast<ir_dereference *>(
      ir->actual_parameters.get_head());
   ir_variable *location = deref->variable_referenced();
   unsigned surf_index = (prog_data->base.binding_table.abo_start +
                          location->data.atomic.buffer_index);

   /* Calculate the surface offset */
   src_reg offset(this, glsl_type::uint_type);
   ir_dereference_array *deref_array = deref->as_dereference_array();
   if (deref_array) {
      deref_array->array_index->accept(this);

      src_reg tmp(this, glsl_type::uint_type);
      emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
      emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
   } else {
      offset = location->data.atomic.offset;
   }

   /* Emit the appropriate machine instruction */
   const char *callee = ir->callee->function_name();
   dst_reg dst = get_assignment_lhs(ir->return_deref, this);

   if (!strcmp("__intrinsic_atomic_read", callee)) {
      emit_untyped_surface_read(surf_index, dst, offset);

   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
                          src_reg(), src_reg());

   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
                          src_reg(), src_reg());
   }
}

void
vec4_visitor::visit(ir_call *ir)
{
   const char *callee = ir->callee->function_name();

   if (!strcmp("__intrinsic_atomic_read", callee) ||
       !strcmp("__intrinsic_atomic_increment", callee) ||
       !strcmp("__intrinsic_atomic_predecrement", callee)) {
      visit_atomic_counter_intrinsic(ir);
   } else {
      unreachable("Unsupported intrinsic.");
   }
}

src_reg
vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
{
   vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
   inst->base_mrf = 2;
   inst->mlen = 1;
   inst->dst = dst_reg(this, glsl_type::uvec4_type);
   inst->dst.writemask = WRITEMASK_XYZW;

   inst->src[1] = sampler;

   /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
   int param_base = inst->base_mrf;
   int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
   int zero_mask = 0xf & ~coord_mask;

   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
            coordinate));

   emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
            src_reg(0)));

   emit(inst);
   return src_reg(inst->dst);
}

static bool
is_high_sampler(struct brw_context *brw, src_reg sampler)
{
   if (brw->gen < 8 && !brw->is_haswell)
      return false;

   return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
}

void
vec4_visitor::visit(ir_texture *ir)
{
   uint32_t sampler =
      _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);

   ir_rvalue *nonconst_sampler_index =
      _mesa_get_sampler_array_nonconst_index(ir->sampler);

   /* Handle non-constant sampler array indexing */
   src_reg sampler_reg;
   if (nonconst_sampler_index) {
      /* The highest sampler which may be used by this operation is
       * the last element of the array.  Mark it here, because the generator
       * doesn't have enough information to determine the bound.
       */
      uint32_t array_size = ir->sampler->as_dereference_array()
         ->array->type->array_size();

      uint32_t max_used = sampler + array_size - 1;
      if (ir->op == ir_tg4 && brw->gen < 8) {
         max_used += prog_data->base.binding_table.gather_texture_start;
      } else {
         max_used += prog_data->base.binding_table.texture_start;
      }

      brw_mark_surface_used(&prog_data->base, max_used);

      /* Emit code to evaluate the actual indexing expression */
      nonconst_sampler_index->accept(this);
      dst_reg temp(this, glsl_type::uint_type);
      emit(ADD(temp, this->result, src_reg(sampler)))
         ->force_writemask_all = true;
      sampler_reg = src_reg(temp);
   } else {
      /* Single sampler, or constant array index; the indexing expression
       * is just an immediate.
       */
      sampler_reg = src_reg(sampler);
   }

   /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
    * emitting anything other than setting up the constant result.
    */
   if (ir->op == ir_tg4) {
      ir_constant *chan = ir->lod_info.component->as_constant();
      int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
      if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
         dst_reg result(this, ir->type);
         this->result = src_reg(result);
         emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
         return;
      }
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   /* Should be lowered */
   assert(!ir->offset || !ir->offset->type->is_array());

   /* Generate code to compute all the subexpression trees.  This has to be
    * done before loading any values into MRFs for the sampler message since
    * generating these values may involve SEND messages that need the MRFs.
    */
   src_reg coordinate;
   if (ir->coordinate) {
      ir->coordinate->accept(this);
      coordinate = this->result;
   }

   src_reg shadow_comparitor;
   if (ir->shadow_comparitor) {
      ir->shadow_comparitor->accept(this);
      shadow_comparitor = this->result;
   }

   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
   src_reg offset_value;
   if (has_nonconstant_offset) {
      ir->offset->accept(this);
      offset_value = src_reg(this->result);
   }

   const glsl_type *lod_type = NULL, *sample_index_type = NULL;
   src_reg lod, dPdx, dPdy, sample_index, mcs;
   switch (ir->op) {
   case ir_tex:
      lod = src_reg(0.0f);
      lod_type = glsl_type::float_type;
      break;
   case ir_txf:
   case ir_txl:
   case ir_txs:
      ir->lod_info.lod->accept(this);
      lod = this->result;
      lod_type = ir->lod_info.lod->type;
      break;
   case ir_query_levels:
      lod = src_reg(0);
      lod_type = glsl_type::int_type;
      break;
   case ir_txf_ms:
      ir->lod_info.sample_index->accept(this);
      sample_index = this->result;
      sample_index_type = ir->lod_info.sample_index->type;

      if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
         mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
      else
         mcs = src_reg(0u);
      break;
   case ir_txd:
      ir->lod_info.grad.dPdx->accept(this);
      dPdx = this->result;

      ir->lod_info.grad.dPdy->accept(this);
      dPdy = this->result;

      lod_type = ir->lod_info.grad.dPdx->type;
      break;
   case ir_txb:
   case ir_lod:
   case ir_tg4:
      break;
   }

   enum opcode opcode;
   switch (ir->op) {
   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
   case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
   case ir_tg4: opcode = has_nonconstant_offset
                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
   case ir_txb:
      unreachable("TXB is not valid for vertex shaders.");
   case ir_lod:
      unreachable("LOD is not valid for vertex shaders.");
   default:
      unreachable("Unrecognized tex op");
   }

   vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);

   if (ir->offset != NULL && ir->op != ir_txf)
      inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());

   /* Stuff the channel select bits in the top of the texture offset */
   if (ir->op == ir_tg4)
      inst->texture_offset |= gather_channel(ir, sampler) << 16;

   /* The message header is necessary for:
    * - Gen4 (always)
    * - Texel offsets
    * - Gather channel selection
    * - Sampler indices too large to fit in a 4-bit value.
    */
   inst->header_present =
      brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
      is_high_sampler(brw, sampler_reg);
   inst->base_mrf = 2;
   inst->mlen = inst->header_present + 1; /* always at least one */
   inst->dst = dst_reg(this, ir->type);
   inst->dst.writemask = WRITEMASK_XYZW;
   inst->shadow_compare = ir->shadow_comparitor != NULL;

   inst->src[1] = sampler_reg;

   /* MRF for the first parameter */
   int param_base = inst->base_mrf + inst->header_present;

   if (ir->op == ir_txs || ir->op == ir_query_levels) {
      int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
      emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
   } else {
      /* Load the coordinate */
      /* FINISHME: gl_clamp_mask and saturate */
      int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
      int zero_mask = 0xf & ~coord_mask;

      emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
               coordinate));

      if (zero_mask != 0) {
         emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
                  src_reg(0)));
      }
      /* Load the shadow comparitor */
      if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
         emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
                          WRITEMASK_X),
                  shadow_comparitor));
         inst->mlen++;
      }

      /* Load the LOD info */
      if (ir->op == ir_tex || ir->op == ir_txl) {
         int mrf, writemask;
         if (brw->gen >= 5) {
            mrf = param_base + 1;
            if (ir->shadow_comparitor) {
               writemask = WRITEMASK_Y;
               /* mlen already incremented */
            } else {
               writemask = WRITEMASK_X;
               inst->mlen++;
            }
         } else /* brw->gen == 4 */ {
            mrf = param_base;
            writemask = WRITEMASK_W;
         }
         emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
      } else if (ir->op == ir_txf) {
         emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
      } else if (ir->op == ir_txf_ms) {
         emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
                  sample_index));
         if (brw->gen >= 7) {
            /* MCS data is in the first channel of `mcs`, but we need to get
             * it into the .y channel of the second vec4 of params, so
             * replicate .x across the whole vec4 and then mask off
             * everything except .y.
             */
            mcs.swizzle = BRW_SWIZZLE_XXXX;
            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type,
                             WRITEMASK_Y),
                     mcs));
         }
         inst->mlen++;
      } else if (ir->op == ir_txd) {
         const glsl_type *type = lod_type;

         if (brw->gen >= 5) {
            dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
            inst->mlen++;

            if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
               dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
               dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
               emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
               inst->mlen++;

               if (ir->shadow_comparitor) {
                  emit(MOV(dst_reg(MRF, param_base + 2,
                                   ir->shadow_comparitor->type, WRITEMASK_Z),
                           shadow_comparitor));
               }
            }
         } else /* brw->gen == 4 */ {
            emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
            emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
            inst->mlen += 2;
         }
      } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
         if (ir->shadow_comparitor) {
            emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
                     shadow_comparitor));
         }

         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
                  offset_value));
         inst->mlen++;
      }
   }

   emit(inst);

   /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
    * spec requires layers.
    */
   if (ir->op == ir_txs) {
      glsl_type const *type = ir->sampler->type;
      if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
          type->sampler_array) {
         emit_math(SHADER_OPCODE_INT_QUOTIENT,
                   writemask(inst->dst, WRITEMASK_Z),
                   src_reg(inst->dst), src_reg(6));
      }
   }

   if (brw->gen == 6 && ir->op == ir_tg4) {
      emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
   }

   swizzle_result(ir, src_reg(inst->dst), sampler);
}

/**
 * Apply workarounds for Gen6 gather with UINT/SINT
 */
void
vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
{
   if (!wa)
      return;

   int width = (wa & WA_8BIT) ? 8 : 16;
   dst_reg dst_f = dst;
   dst_f.type = BRW_REGISTER_TYPE_F;

   /* Convert from UNORM to UINT */
   emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
   emit(MOV(dst, src_reg(dst_f)));

   if (wa & WA_SIGN) {
      /* Reinterpret the UINT value as a signed INT value by
       * shifting the sign bit into place, then shifting back
       * preserving sign.
       */
      emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
      emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
   }
}
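
/* Worked example (illustrative): with an 8-bit format, width is 8, so a
 * gathered UNORM value of 1.0 becomes 255 after the MUL/MOV conversion.
 * In the signed case, SHL by 24 moves bit 7 into the sign position
 * (0xff000000) and the arithmetic ASR by 24 sign-extends it back down,
 * yielding -1 as expected for SINT data.
 */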

/**
 * Set up the gather channel based on the swizzle, for gather4.
 */
uint32_t
vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
{
   ir_constant *chan = ir->lod_info.component->as_constant();
   int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
   switch (swiz) {
   case SWIZZLE_X: return 0;
   case SWIZZLE_Y:
      /* gather4 sampler is broken for green channel on RG32F --
       * we must ask for blue instead.
       */
      if (key->tex.gather_channel_quirk_mask & (1<<sampler))
         return 2;
      return 1;
   case SWIZZLE_Z: return 2;
   case SWIZZLE_W: return 3;
   default:
      unreachable("Not reached"); /* zero, one swizzles handled already */
   }
}

void
vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
{
   int s = key->tex.swizzles[sampler];

   this->result = src_reg(this, ir->type);
   dst_reg swizzled_result(this->result);

   if (ir->op == ir_query_levels) {
      /* # levels is in .w */
      orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
      emit(MOV(swizzled_result, orig_val));
      return;
   }

   if (ir->op == ir_txs || ir->type == glsl_type::float_type
       || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
      emit(MOV(swizzled_result, orig_val));
      return;
   }

   int zero_mask = 0, one_mask = 0, copy_mask = 0;
   int swizzle[4] = {0};

   for (int i = 0; i < 4; i++) {
      switch (GET_SWZ(s, i)) {
      case SWIZZLE_ZERO:
         zero_mask |= (1 << i);
         break;
      case SWIZZLE_ONE:
         one_mask |= (1 << i);
         break;
      default:
         copy_mask |= (1 << i);
         swizzle[i] = GET_SWZ(s, i);
         break;
      }
   }

   if (copy_mask) {
      orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
      swizzled_result.writemask = copy_mask;
      emit(MOV(swizzled_result, orig_val));
   }

   if (zero_mask) {
      swizzled_result.writemask = zero_mask;
      emit(MOV(swizzled_result, src_reg(0.0f)));
   }

   if (one_mask) {
      swizzled_result.writemask = one_mask;
      emit(MOV(swizzled_result, src_reg(1.0f)));
   }
}
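
/* Worked example (illustrative): a texture swizzle of (R, ZERO, ONE, G)
 * decomposes into copy_mask .xw with source swizzle .xxxy (unused slots
 * default to x), zero_mask .y, and one_mask .z, so at most three MOVs are
 * emitted: the channel copy, a 0.0f fill, and a 1.0f fill.
 */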

void
vec4_visitor::visit(ir_return *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_discard *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_if *ir)
{
   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (brw->gen == 6) {
      emit_if_gen6(ir);
   } else {
      enum brw_predicate predicate;
      emit_bool_to_cond_code(ir->condition, &predicate);
      emit(IF(predicate));
   }

   visit_instructions(&ir->then_instructions);

   if (!ir->else_instructions.is_empty()) {
      this->base_ir = ir->condition;
      emit(BRW_OPCODE_ELSE);

      visit_instructions(&ir->else_instructions);
   }

   this->base_ir = ir->condition;
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::visit(ir_emit_vertex *)
{
   unreachable("not reached");
}

void
vec4_visitor::visit(ir_end_primitive *)
{
   unreachable("not reached");
}

void
vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                  dst_reg dst, src_reg offset,
                                  src_reg src0, src_reg src1)
{
   unsigned mlen = 0;

   /* Set the atomic operation offset. */
   emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
   mlen++;

   /* Set the atomic operation arguments. */
   if (src0.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
      mlen++;
   }

   if (src1.file != BAD_FILE) {
      emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
      mlen++;
   }

   /* Emit the instruction.  Note that this maps to the normal SIMD8
    * untyped atomic message on Ivy Bridge, but that's OK because
    * unused channels will be masked out.
    */
   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
                                 src_reg(atomic_op), src_reg(surf_index));
   inst->base_mrf = 0;
   inst->mlen = mlen;
}

void
vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
                                        src_reg offset)
{
   /* Set the surface read offset. */
   emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));

   /* Emit the instruction.  Note that this maps to the normal SIMD8
    * untyped surface read message, but that's OK because unused
    * channels will be masked out.
    */
   vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
                                 dst, src_reg(surf_index));
   inst->base_mrf = 0;
   inst->mlen = 1;
}

void
vec4_visitor::emit_ndc_computation()
{
   /* Get the position */
   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);

   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
   output_reg[BRW_VARYING_SLOT_NDC] = ndc;

   current_annotation = "NDC";
   dst_reg ndc_w = ndc;
   ndc_w.writemask = WRITEMASK_W;
   src_reg pos_w = pos;
   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);

   dst_reg ndc_xyz = ndc;
   ndc_xyz.writemask = WRITEMASK_XYZ;

   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
}
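
/* A sketch of the sequence this emits (illustrative):
 *
 *    RCP ndc.w,   pos.wwww    // 1/w via the hardware math unit
 *    MUL ndc.xyz, pos, ndc.w  // (x/w, y/w, z/w)
 *
 * so the NDC register ends up holding (x/w, y/w, z/w, 1/w), as the comment
 * above describes.
 */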

void
vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
{
   if (brw->gen < 6 &&
       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
        key->userclip_active || brw->has_negative_rhw_bug)) {
      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
      dst_reg header1_w = header1;
      header1_w.writemask = WRITEMASK_W;

      emit(MOV(header1, 0u));

      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);

         current_annotation = "Point size";
         emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
         emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
      }

      if (key->userclip_active) {
         current_annotation = "Clipping flags";
         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));

         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
         emit(SHL(flags1, src_reg(flags1), src_reg(4)));
         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
      }

      /* i965 clipping workaround:
       * 1) Test for -ve rhw
       * 2) If set,
       *      set ndc = (0,0,0,0)
       *      set ucp[6] = 1
       *
       * Later, clipping will detect ucp[6] and ensure the primitive is
       * clipped against all fixed planes.
       */
      if (brw->has_negative_rhw_bug) {
         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
         emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
         vec4_instruction *inst;
         inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
         inst->predicate = BRW_PREDICATE_NORMAL;
         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }

      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
   } else if (brw->gen < 6) {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
   } else {
      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
         emit(MOV(brw_writemask(reg, WRITEMASK_W),
                  src_reg(output_reg[VARYING_SLOT_PSIZ])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_LAYER])));
      }
      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
         emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
                  src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
      }
   }
}

void
vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
{
   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
    *
    *     "If a linked set of shaders forming the vertex stage contains no
    *     static write to gl_ClipVertex or gl_ClipDistance, but the
    *     application has requested clipping against user clip planes through
    *     the API, then the coordinate written to gl_Position is used for
    *     comparison against the user clip planes."
    *
    * This function is only called if the shader didn't write to
    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
    * if the user wrote to it; otherwise we use gl_Position.
    */
   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
      clip_vertex = VARYING_SLOT_POS;
   }

   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
        ++i) {
      reg.writemask = 1 << i;
      emit(DP4(reg,
               src_reg(output_reg[clip_vertex]),
               src_reg(this->userplane[i + offset])));
   }
}

void
vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
{
   assert(varying < VARYING_SLOT_MAX);
   reg.type = output_reg[varying].type;
   current_annotation = output_reg_annotation[varying];
   /* Copy the register, saturating if necessary */
   vec4_instruction *inst = emit(MOV(reg,
                                     src_reg(output_reg[varying])));
   if ((varying == VARYING_SLOT_COL0 ||
        varying == VARYING_SLOT_COL1 ||
        varying == VARYING_SLOT_BFC0 ||
        varying == VARYING_SLOT_BFC1) &&
       key->clamp_vertex_color) {
      inst->saturate = true;
   }
}

void
vec4_visitor::emit_urb_slot(int mrf, int varying)
{
   struct brw_reg hw_reg = brw_message_reg(mrf);
   dst_reg reg = dst_reg(MRF, mrf);
   reg.type = BRW_REGISTER_TYPE_F;

   switch (varying) {
   case VARYING_SLOT_PSIZ:
      /* PSIZ is always in slot 0, and is coupled with other flags. */
      current_annotation = "indices, point width, clip flags";
      emit_psiz_and_flags(hw_reg);
      break;
   case BRW_VARYING_SLOT_NDC:
      current_annotation = "NDC";
      emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
      break;
   case VARYING_SLOT_POS:
      current_annotation = "gl_Position";
      emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
      break;
   case VARYING_SLOT_EDGE:
      /* This is present when doing unfilled polygons.  We're supposed to copy
       * the edge flag from the user-provided vertex array
       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
       * of that attribute (starts as 1.0f).  This is then used in clipping to
       * determine which edges should be drawn as wireframe.
       */
      current_annotation = "edge flag";
      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
                                    glsl_type::float_type, WRITEMASK_XYZW))));
      break;
   case BRW_VARYING_SLOT_PAD:
      /* No need to write to this slot */
      break;
   default:
      emit_generic_urb_slot(reg, varying);
      break;
   }
}

static int
align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
{
   if (brw->gen >= 6) {
      /* URB data written (does not include the message header reg) must
       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
       * section 5.4.3.2.2: URB_INTERLEAVED.
       *
       * URB entries are allocated on a multiple of 1024 bits, so an
       * extra 128 bits written here to make the end align to 256 is
       * no problem.
       */
      if ((mlen % 2) != 1)
         mlen++;
   }

   return mlen;
}
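
/* Worked example (illustrative): mlen includes the one header register, so
 * the URB data payload is mlen - 1 registers and must round up to an even
 * count, i.e. mlen must be odd.  An mlen of 4 (header + 3 data regs)
 * therefore becomes 5, padding the data to 4 registers, while an
 * already-odd mlen passes through unchanged.
 */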


/**
 * Generates the VUE payload plus the necessary URB write instructions to
 * output it.
 *
 * The VUE layout is documented in Volume 2a.
 */
void
vec4_visitor::emit_vertex()
{
   /* MRF 0 is reserved for the debugger, so start with message header
    * in MRF 1.
    */
   int base_mrf = 1;
   int mrf = base_mrf;
   /* In the process of generating our URB write message contents, we
    * may need to unspill a register or load from an array.  Those
    * reads would use MRFs 14-15.
    */
   int max_usable_mrf = 13;

   /* The following assertion verifies that max_usable_mrf causes an
    * even-numbered amount of URB write data, which will meet gen6's
    * requirements for length alignment.
    */
   assert((max_usable_mrf - base_mrf) % 2 == 0);

   /* First mrf is the g0-based message header containing URB handles and
    * such.
    */
   emit_urb_write_header(mrf++);

   if (brw->gen < 6) {
      emit_ndc_computation();
   }

   /* Lower legacy ff and ClipVertex clipping to clip distances */
   if (key->userclip_active && !prog->UsesClipDistanceOut) {
      current_annotation = "user clip distances";

      output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
      output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);

      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
   }

   /* We may need to split this up into several URB writes, so do them in a
    * loop.
    */
   int slot = 0;
   bool complete = false;
   do {
      /* URB offset is in URB row increments, and each of our MRFs is half of
       * one of those, since we're doing interleaved writes.
       */
      int offset = slot / 2;

      mrf = base_mrf + 1;
      for (; slot < prog_data->vue_map.num_slots; ++slot) {
         emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);

         /* If this was max_usable_mrf, we can't fit anything more into this
          * URB WRITE.
          */
         if (mrf > max_usable_mrf) {
            slot++;
            break;
         }
      }

      complete = slot >= prog_data->vue_map.num_slots;
      current_annotation = "URB write";
      vec4_instruction *inst = emit_urb_write_opcode(complete);
      inst->base_mrf = base_mrf;
      inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
      inst->offset += offset;
   } while (!complete);
}


src_reg
vec4_visitor::get_scratch_offset(vec4_instruction *inst,
                                 src_reg *reladdr, int reg_offset)
{
   /* Because we store the values to scratch interleaved like our
    * vertex data, we need to scale the vec4 index by 2.
    */
   int message_header_scale = 2;

   /* Pre-gen6, the message header uses byte offsets instead of vec4
    * (16-byte) offset units.
    */
   if (brw->gen < 6)
      message_header_scale *= 16;

   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
      emit_before(inst, MUL(dst_reg(index),
                            index, src_reg(message_header_scale)));

      return index;
   } else {
      return src_reg(reg_offset * message_header_scale);
   }
}
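
/* Worked example (illustrative): for vec4 slot 3 with no reladdr, gen6+
 * returns the immediate 6 (3 * 2 interleaved rows), while pre-gen6 returns
 * 96 (3 * 2 * 16 bytes).  With a reladdr, the same scaling happens at
 * runtime via the emitted ADD and MUL.
 */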

src_reg
vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
                                       src_reg *reladdr, int reg_offset)
{
   if (reladdr) {
      src_reg index = src_reg(this, glsl_type::int_type);

      emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));

      /* Pre-gen6, the message header uses byte offsets instead of vec4
       * (16-byte) offset units.
       */
      if (brw->gen < 6) {
         emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
      }

      return index;
   } else if (brw->gen >= 8) {
      /* Store the offset in a GRF so we can send-from-GRF. */
      src_reg offset = src_reg(this, glsl_type::int_type);
      emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
      return offset;
   } else {
      int message_header_scale = brw->gen < 6 ? 16 : 1;
      return src_reg(reg_offset * message_header_scale);
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from scratch space at @base_offset to @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_read(vec4_instruction *inst,
                                dst_reg temp, src_reg orig_src,
                                int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);

   emit_before(inst, SCRATCH_READ(temp, index));
}

/**
 * Emits an instruction after @inst to store the value to be written
 * to @orig_dst to scratch space at @base_offset, from @temp.
 *
 * @base_offset is measured in 32-byte units (the size of a register).
 */
void
vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
{
   int reg_offset = base_offset + inst->dst.reg_offset;
   src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);

   /* Create a temporary register to store *inst's result in.
    *
    * We have to be careful in MOVing from our temporary result register in
    * the scratch write.  If we swizzle from channels of the temporary that
    * weren't initialized, it will confuse live interval analysis, which will
    * make spilling fail to make progress.
    */
   src_reg temp = src_reg(this, glsl_type::vec4_type);
   temp.type = inst->dst.type;
   int first_writemask_chan = ffs(inst->dst.writemask) - 1;
   int swizzles[4];
   for (int i = 0; i < 4; i++)
      if (inst->dst.writemask & (1 << i))
         swizzles[i] = i;
      else
         swizzles[i] = first_writemask_chan;
   temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                               swizzles[2], swizzles[3]);

   dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
                                       inst->dst.writemask));
   vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
   write->predicate = inst->predicate;
   write->ir = inst->ir;
   write->annotation = inst->annotation;
   inst->insert_after(write);

   inst->dst.file = temp.file;
   inst->dst.reg = temp.reg;
   inst->dst.reg_offset = temp.reg_offset;
   inst->dst.reladdr = NULL;
}
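
/* Worked example (illustrative): if the spilled instruction wrote .yw, then
 * first_writemask_chan is 1 and swizzles[] becomes {1, 1, 1, 3}, so the
 * SCRATCH_WRITE reads only the initialized channels (as .yyyw) and never
 * touches the undefined x and z channels of the temporary.
 */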

/**
 * We can't generally support array access in GRF space, because a
 * single instruction's destination can only span 2 contiguous
 * registers.  So, we send all GRF arrays that get variable index
 * access to scratch space.
 */
void
vec4_visitor::move_grf_array_access_to_scratch()
{
   int scratch_loc[this->virtual_grf_count];

   for (int i = 0; i < this->virtual_grf_count; i++) {
      scratch_loc[i] = -1;
   }

   /* First, calculate the set of virtual GRFs that need to be punted
    * to scratch due to having any array access on them, and where in
    * scratch.
    */
   foreach_in_list(vec4_instruction, inst, &instructions) {
      if (inst->dst.file == GRF && inst->dst.reladdr &&
          scratch_loc[inst->dst.reg] == -1) {
         scratch_loc[inst->dst.reg] = c->last_scratch;
         c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
      }

      for (int i = 0; i < 3; i++) {
         src_reg *src = &inst->src[i];

         if (src->file == GRF && src->reladdr &&
             scratch_loc[src->reg] == -1) {
            scratch_loc[src->reg] = c->last_scratch;
            c->last_scratch += this->virtual_grf_sizes[src->reg];
         }
      }
   }

   /* Now, for anything that will be accessed through scratch, rewrite
    * it to load/store.  Note that this is a _safe list walk, because
    * we may generate a new scratch_write instruction after the one
    * we're processing.
    */
   foreach_in_list_safe(vec4_instruction, inst, &instructions) {
      /* Set up the annotation tracking for new generated instructions. */
      base_ir = inst->ir;
      current_annotation = inst->annotation;

      if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
         emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
            continue;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_scratch_read(inst, temp, inst->src[i],
                           scratch_loc[inst->src[i].reg]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Emits an instruction before @inst to load the value named by @orig_src
 * from the pull constant buffer (surface) at @base_offset to @temp.
 */
void
vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
                                      dst_reg temp, src_reg orig_src,
                                      int base_offset)
{
   int reg_offset = base_offset + orig_src.reg_offset;
   src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
   src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
   vec4_instruction *load;

   if (brw->gen >= 7) {
      dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
      grf_offset.type = offset.type;
      emit_before(inst, MOV(grf_offset, offset));

      load = new(mem_ctx) vec4_instruction(this,
                                           VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
                                           temp, index, src_reg(grf_offset));
   } else {
      load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
                                           temp, index, offset);
      load->base_mrf = 14;
      load->mlen = 1;
   }
   emit_before(inst, load);
}

/**
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
vec4_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   for (int i = 0; i < this->uniforms; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_in_list_safe(vec4_instruction, inst, &instructions) {
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const gl_constant_value **values =
               &stage_prog_data->param[uniform * 4];

            pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;

            assert(uniform < uniform_array_size);
            for (int j = 0; j < uniform_size[uniform] * 4; j++) {
               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
                  = values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Now there are no accesses of the UNIFORM file with a reladdr, so
    * no need to track them as larger-than-vec4 objects.  This will be
    * relied on in cutting out unused uniform vectors from push
    * constants.
    */
   split_uniform_registers();
}

void
vec4_visitor::resolve_ud_negate(src_reg *reg)
{
   if (reg->type != BRW_REGISTER_TYPE_UD ||
       !reg->negate)
      return;

   src_reg temp = src_reg(this, glsl_type::uvec4_type);
   emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
   *reg = temp;
}

vec4_visitor::vec4_visitor(struct brw_context *brw,
                           struct brw_vec4_compile *c,
                           struct gl_program *prog,
                           const struct brw_vec4_prog_key *key,
                           struct brw_vec4_prog_data *prog_data,
                           struct gl_shader_program *shader_prog,
                           gl_shader_stage stage,
                           void *mem_ctx,
                           bool debug_flag,
                           bool no_spills,
                           shader_time_shader_type st_base,
                           shader_time_shader_type st_written,
                           shader_time_shader_type st_reset)
   : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
     c(c),
     key(key),
     prog_data(prog_data),
     sanity_param_count(0),
     fail_msg(NULL),
     first_non_payload_grf(0),
     need_all_constants_in_pull_buffer(false),
     debug_flag(debug_flag),
     no_spills(no_spills),
     st_base(st_base),
     st_written(st_written),
     st_reset(st_reset)
{
   this->mem_ctx = mem_ctx;
   this->failed = false;

   this->base_ir = NULL;
   this->current_annotation = NULL;
   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));

   this->variable_ht = hash_table_ctor(0,
                                       hash_table_pointer_hash,
                                       hash_table_pointer_compare);

   this->virtual_grf_start = NULL;
   this->virtual_grf_end = NULL;
   this->virtual_grf_sizes = NULL;
   this->virtual_grf_count = 0;
   this->virtual_grf_reg_map = NULL;
   this->virtual_grf_reg_count = 0;
   this->virtual_grf_array_size = 0;
   this->live_intervals_valid = false;

   this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;

   this->uniforms = 0;

   /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
    * at least one.  See setup_uniforms() in brw_vec4.cpp.
    */
   this->uniform_array_size = 1;
   if (prog_data) {
      this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
   }

   this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
   this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
}

vec4_visitor::~vec4_visitor()
{
   hash_table_dtor(this->variable_ht);
}


void
vec4_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (debug_flag) {
      fprintf(stderr, "%s", msg);
   }
}

} /* namespace brw */