i965: Provide means to create registers of a given size.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
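/* Editor's sketch (not part of the original source): what fix_3src_operand()
 * does to a vec4 uniform operand of a three-source instruction, assuming u0
 * names the uniform and tmp the freshly allocated GRF:
 *
 *    src_reg op = fix_3src_operand(uniform_src);   // emits: MOV tmp, u0
 *    emit(MAD(dst, op2, op1, op));                 // MAD now reads tmp<4;4,1>
 *
 * A uniform that already uses a single-value swizzle (e.g. u0.xxxx) is
 * returned unchanged, since scalar replication is representable in the
 * three-source encoding.
 */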
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
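/* Worked example (editor's addition, not from the original commit): for
 * src0 = vec2(1.0, -2.0) the sequence above produces
 *
 *    F32TO16  tmp.xy, src0        ->  tmp.x = 0x00003C00, tmp.y = 0x0000C000
 *    SHL      dst, tmp.yyyy, 16   ->  dst   = 0xC0000000
 *    OR       dst, dst, tmp.xxxx  ->  dst   = 0xC0003C00
 *
 * which matches packHalf2x16(): the first component lands in the low 16 bits.
 */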
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549

550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up no register space, since they're baked in at
592 * link time.
593 */
594 return 0;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
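/* A few example values (editor's note), following the rules above: every
 * scalar or vector costs one vec4 slot, matrices cost one slot per column,
 * and aggregates are the sum of their members:
 *
 *    float     -> 1       mat4                        -> 4
 *    vec2      -> 1       vec2[10]                    -> 10
 *    mat3      -> 3       struct { vec3 a; float b; } -> 2
 *    sampler2D -> 0 (baked in at link time)
 */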
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
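/* Editor's sketch of the bookkeeping above: after
 *
 *    virtual_grf_alloc(1);   // returns 0
 *    virtual_grf_alloc(4);   // returns 1
 *    virtual_grf_alloc(2);   // returns 2
 *
 * we have virtual_grf_sizes   = { 1, 4, 2 },
 *         virtual_grf_reg_map = { 0, 1, 5 } and virtual_grf_reg_count = 7,
 * so reg_map[n] gives the first flat register index of virtual GRF n.
 */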
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
643 {
644 assert(size > 0);
645
646 init();
647
648 this->file = GRF;
649 this->reg = v->virtual_grf_alloc(type_size(type) * size);
650
651 this->swizzle = BRW_SWIZZLE_NOOP;
652
653 this->type = brw_type_for_base_type(type);
654 }
655
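/* Usage sketch for the sized constructor above (editor's illustration; the
 * names are hypothetical): a temporary covering a whole array can now be
 * allocated in one go instead of element by element, e.g.
 *
 *    src_reg array_tmp(this, glsl_type::vec4_type, 8);
 *    // array_tmp.reg names a virtual GRF spanning type_size(vec4) * 8 = 8
 *    // registers; the swizzle stays BRW_SWIZZLE_NOOP since the register is
 *    // addressed with reg_offset rather than read as a single vec4.
 */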
656 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
657 {
658 init();
659
660 this->file = GRF;
661 this->reg = v->virtual_grf_alloc(type_size(type));
662
663 if (type->is_array() || type->is_record()) {
664 this->writemask = WRITEMASK_XYZW;
665 } else {
666 this->writemask = (1 << type->vector_elements) - 1;
667 }
668
669 this->type = brw_type_for_base_type(type);
670 }
671
672 /* Our support for uniforms is piggy-backed on the struct
673 * gl_fragment_program, because that's where the values actually
674 * get stored, rather than in some global gl_shader_program uniform
675 * store.
676 */
677 void
678 vec4_visitor::setup_uniform_values(ir_variable *ir)
679 {
680 int namelen = strlen(ir->name);
681
682 /* The data for our (non-builtin) uniforms is stored in a series of
683 * gl_uniform_driver_storage structs for each subcomponent that
684 * glGetUniformLocation() could name. We know it's been set up in the same
685 * order we'd walk the type, so walk the list of storage and find anything
686 * with our name, or the prefix of a component that starts with our name.
687 */
688 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
689 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
690
691 if (strncmp(ir->name, storage->name, namelen) != 0 ||
692 (storage->name[namelen] != 0 &&
693 storage->name[namelen] != '.' &&
694 storage->name[namelen] != '[')) {
695 continue;
696 }
697
698 gl_constant_value *components = storage->storage;
699 unsigned vector_count = (MAX2(storage->array_elements, 1) *
700 storage->type->matrix_columns);
701
702 for (unsigned s = 0; s < vector_count; s++) {
703 assert(uniforms < uniform_array_size);
704 uniform_vector_size[uniforms] = storage->type->vector_elements;
705
706 int i;
707 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
708 stage_prog_data->param[uniforms * 4 + i] = components;
709 components++;
710 }
711 for (; i < 4; i++) {
712 static gl_constant_value zero = { 0.0 };
713 stage_prog_data->param[uniforms * 4 + i] = &zero;
714 }
715
716 uniforms++;
717 }
718 }
719 }
720
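/* Example (editor's note): a "uniform mat2 m;" has one gl_uniform_storage
 * entry with matrix_columns == 2, so vector_count is 2 and two vec4 uniform
 * slots are set up.  For each column, param[uniforms * 4 + 0..1] point at the
 * two stored components while param[uniforms * 4 + 2..3] point at the shared
 * zero constant, and uniform_vector_size records 2.
 */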
721 void
722 vec4_visitor::setup_uniform_clipplane_values()
723 {
724 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
725
726 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
727 assert(this->uniforms < uniform_array_size);
728 this->uniform_vector_size[this->uniforms] = 4;
729 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
730 this->userplane[i].type = BRW_REGISTER_TYPE_F;
731 for (int j = 0; j < 4; ++j) {
732 stage_prog_data->param[this->uniforms * 4 + j] =
733 (gl_constant_value *) &clip_planes[i][j];
734 }
735 ++this->uniforms;
736 }
737 }
738
739 /* Our support for builtin uniforms is even scarier than non-builtin.
740 * It sits on top of the PROG_STATE_VAR parameters that are
741 * automatically updated from GL context state.
742 */
743 void
744 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
745 {
746 const ir_state_slot *const slots = ir->state_slots;
747 assert(ir->state_slots != NULL);
748
749 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
750 /* This state reference has already been setup by ir_to_mesa,
751 * but we'll get the same index back here. We can reference
752 * ParameterValues directly, since unlike brw_fs.cpp, we never
753 * add new state references during compile.
754 */
755 int index = _mesa_add_state_reference(this->prog->Parameters,
756 (gl_state_index *)slots[i].tokens);
757 gl_constant_value *values =
758 &this->prog->Parameters->ParameterValues[index][0];
759
760 assert(this->uniforms < uniform_array_size);
761 this->uniform_vector_size[this->uniforms] = 0;
762 /* Add each of the unique swizzled channels of the element.
763 * This will end up matching the size of the glsl_type of this field.
764 */
765 int last_swiz = -1;
766 for (unsigned int j = 0; j < 4; j++) {
767 int swiz = GET_SWZ(slots[i].swizzle, j);
768 last_swiz = swiz;
769
770 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
771 assert(this->uniforms < uniform_array_size);
772 if (swiz <= last_swiz)
773 this->uniform_vector_size[this->uniforms]++;
774 }
775 this->uniforms++;
776 }
777 }
778
779 dst_reg *
780 vec4_visitor::variable_storage(ir_variable *var)
781 {
782 return (dst_reg *)hash_table_find(this->variable_ht, var);
783 }
784
785 void
786 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
787 enum brw_predicate *predicate)
788 {
789 ir_expression *expr = ir->as_expression();
790
791 *predicate = BRW_PREDICATE_NORMAL;
792
793 if (expr && expr->operation != ir_binop_ubo_load) {
794 src_reg op[3];
795 vec4_instruction *inst;
796
797 assert(expr->get_num_operands() <= 3);
798 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
799 expr->operands[i]->accept(this);
800 op[i] = this->result;
801
802 resolve_ud_negate(&op[i]);
803 }
804
805 switch (expr->operation) {
806 case ir_unop_logic_not:
807 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
808 inst->conditional_mod = BRW_CONDITIONAL_Z;
809 break;
810
811 case ir_binop_logic_xor:
812 inst = emit(XOR(dst_null_d(), op[0], op[1]));
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 break;
815
816 case ir_binop_logic_or:
817 inst = emit(OR(dst_null_d(), op[0], op[1]));
818 inst->conditional_mod = BRW_CONDITIONAL_NZ;
819 break;
820
821 case ir_binop_logic_and:
822 inst = emit(AND(dst_null_d(), op[0], op[1]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 break;
825
826 case ir_unop_f2b:
827 if (brw->gen >= 6) {
828 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
829 } else {
830 inst = emit(MOV(dst_null_f(), op[0]));
831 inst->conditional_mod = BRW_CONDITIONAL_NZ;
832 }
833 break;
834
835 case ir_unop_i2b:
836 if (brw->gen >= 6) {
837 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
838 } else {
839 inst = emit(MOV(dst_null_d(), op[0]));
840 inst->conditional_mod = BRW_CONDITIONAL_NZ;
841 }
842 break;
843
844 case ir_binop_all_equal:
845 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
846 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
847 break;
848
849 case ir_binop_any_nequal:
850 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
851 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
852 break;
853
854 case ir_unop_any:
855 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
856 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
857 break;
858
859 case ir_binop_greater:
860 case ir_binop_gequal:
861 case ir_binop_less:
862 case ir_binop_lequal:
863 case ir_binop_equal:
864 case ir_binop_nequal:
865 emit(CMP(dst_null_d(), op[0], op[1],
866 brw_conditional_for_comparison(expr->operation)));
867 break;
868
869 case ir_triop_csel: {
870 /* Expand the boolean condition into the flag register. */
871 inst = emit(MOV(dst_null_d(), op[0]));
872 inst->conditional_mod = BRW_CONDITIONAL_NZ;
873
874 /* Select which boolean to return. */
875 dst_reg temp(this, expr->operands[1]->type);
876 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
877 inst->predicate = BRW_PREDICATE_NORMAL;
878
879 /* Expand the result to a condition code. */
880 inst = emit(MOV(dst_null_d(), src_reg(temp)));
881 inst->conditional_mod = BRW_CONDITIONAL_NZ;
882 break;
883 }
884
885 default:
886 unreachable("not reached");
887 }
888 return;
889 }
890
891 ir->accept(this);
892
893 resolve_ud_negate(&this->result);
894
895 if (brw->gen >= 6) {
896 vec4_instruction *inst = emit(AND(dst_null_d(),
897 this->result, src_reg(1)));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899 } else {
900 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
901 inst->conditional_mod = BRW_CONDITIONAL_NZ;
902 }
903 }
904
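/* Editor's sketch of two typical lowerings performed above (the assembly
 * mnemonics and register names are illustrative only):
 *
 *    a < b        ->  CMP.l.f0   null, a, b     *predicate = NORMAL
 *    any(v != w)  ->  CMP.nz.f0  null, v, w     *predicate = ALIGN16_ANY4H
 *
 * The comparison only updates the flag register; the caller then predicates
 * the following IF or SEL on it.
 */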
905 /**
906 * Emit a gen6 IF statement with the comparison folded into the IF
907 * instruction.
908 */
909 void
910 vec4_visitor::emit_if_gen6(ir_if *ir)
911 {
912 ir_expression *expr = ir->condition->as_expression();
913
914 if (expr && expr->operation != ir_binop_ubo_load) {
915 src_reg op[3];
916 dst_reg temp;
917
918 assert(expr->get_num_operands() <= 3);
919 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
920 expr->operands[i]->accept(this);
921 op[i] = this->result;
922 }
923
924 switch (expr->operation) {
925 case ir_unop_logic_not:
926 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
927 return;
928
929 case ir_binop_logic_xor:
930 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
931 return;
932
933 case ir_binop_logic_or:
934 temp = dst_reg(this, glsl_type::bool_type);
935 emit(OR(temp, op[0], op[1]));
936 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
937 return;
938
939 case ir_binop_logic_and:
940 temp = dst_reg(this, glsl_type::bool_type);
941 emit(AND(temp, op[0], op[1]));
942 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
943 return;
944
945 case ir_unop_f2b:
946 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
947 return;
948
949 case ir_unop_i2b:
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
951 return;
952
953 case ir_binop_greater:
954 case ir_binop_gequal:
955 case ir_binop_less:
956 case ir_binop_lequal:
957 case ir_binop_equal:
958 case ir_binop_nequal:
959 emit(IF(op[0], op[1],
960 brw_conditional_for_comparison(expr->operation)));
961 return;
962
963 case ir_binop_all_equal:
964 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
965 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
966 return;
967
968 case ir_binop_any_nequal:
969 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
970 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
971 return;
972
973 case ir_unop_any:
974 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
975 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
976 return;
977
978 case ir_triop_csel: {
979 /* Expand the boolean condition into the flag register. */
980 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
981 inst->conditional_mod = BRW_CONDITIONAL_NZ;
982
983 /* Select which boolean to return. */
984 dst_reg temp(this, expr->operands[1]->type);
985 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
986 inst->predicate = BRW_PREDICATE_NORMAL;
987
988 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
989 return;
990 }
991
992 default:
993 unreachable("not reached");
994 }
995 return;
996 }
997
998 ir->condition->accept(this);
999
1000 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1001 }
1002
1003 void
1004 vec4_visitor::visit(ir_variable *ir)
1005 {
1006 dst_reg *reg = NULL;
1007
1008 if (variable_storage(ir))
1009 return;
1010
1011 switch (ir->data.mode) {
1012 case ir_var_shader_in:
1013 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1014 break;
1015
1016 case ir_var_shader_out:
1017 reg = new(mem_ctx) dst_reg(this, ir->type);
1018
1019 for (int i = 0; i < type_size(ir->type); i++) {
1020 output_reg[ir->data.location + i] = *reg;
1021 output_reg[ir->data.location + i].reg_offset = i;
1022 output_reg[ir->data.location + i].type =
1023 brw_type_for_base_type(ir->type->get_scalar_type());
1024 output_reg_annotation[ir->data.location + i] = ir->name;
1025 }
1026 break;
1027
1028 case ir_var_auto:
1029 case ir_var_temporary:
1030 reg = new(mem_ctx) dst_reg(this, ir->type);
1031 break;
1032
1033 case ir_var_uniform:
1034 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1035
1036 /* Thanks to the lower_ubo_reference pass, we will see only
1037 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1038 * variables, so no need for them to be in variable_ht.
1039 *
1040 * Some uniforms, such as samplers and atomic counters, have no actual
1041 * storage, so we should ignore them.
1042 */
1043 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1044 return;
1045
1046 /* Track how big the whole uniform variable is, in case we need to put a
1047 * copy of its data into pull constants for array access.
1048 */
1049 assert(this->uniforms < uniform_array_size);
1050 this->uniform_size[this->uniforms] = type_size(ir->type);
1051
1052 if (!strncmp(ir->name, "gl_", 3)) {
1053 setup_builtin_uniform_values(ir);
1054 } else {
1055 setup_uniform_values(ir);
1056 }
1057 break;
1058
1059 case ir_var_system_value:
1060 reg = make_reg_for_system_value(ir);
1061 break;
1062
1063 default:
1064 unreachable("not reached");
1065 }
1066
1067 reg->type = brw_type_for_base_type(ir->type);
1068 hash_table_insert(this->variable_ht, reg, ir);
1069 }
1070
1071 void
1072 vec4_visitor::visit(ir_loop *ir)
1073 {
1074 /* We don't want debugging output to print the whole body of the
1075 * loop as the annotation.
1076 */
1077 this->base_ir = NULL;
1078
1079 emit(BRW_OPCODE_DO);
1080
1081 visit_instructions(&ir->body_instructions);
1082
1083 emit(BRW_OPCODE_WHILE);
1084 }
1085
1086 void
1087 vec4_visitor::visit(ir_loop_jump *ir)
1088 {
1089 switch (ir->mode) {
1090 case ir_loop_jump::jump_break:
1091 emit(BRW_OPCODE_BREAK);
1092 break;
1093 case ir_loop_jump::jump_continue:
1094 emit(BRW_OPCODE_CONTINUE);
1095 break;
1096 }
1097 }
1098
1099
1100 void
1101 vec4_visitor::visit(ir_function_signature *)
1102 {
1103 unreachable("not reached");
1104 }
1105
1106 void
1107 vec4_visitor::visit(ir_function *ir)
1108 {
1109 /* Ignore function bodies other than main() -- we shouldn't see calls to
1110 * them since they should all be inlined.
1111 */
1112 if (strcmp(ir->name, "main") == 0) {
1113 const ir_function_signature *sig;
1114 exec_list empty;
1115
1116 sig = ir->matching_signature(NULL, &empty, false);
1117
1118 assert(sig);
1119
1120 visit_instructions(&sig->body);
1121 }
1122 }
1123
1124 bool
1125 vec4_visitor::try_emit_mad(ir_expression *ir)
1126 {
1127 /* 3-src instructions were introduced in gen6. */
1128 if (brw->gen < 6)
1129 return false;
1130
1131 /* MAD can only handle floating-point data. */
1132 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1133 return false;
1134
1135 ir_rvalue *nonmul = ir->operands[1];
1136 ir_expression *mul = ir->operands[0]->as_expression();
1137
1138 if (!mul || mul->operation != ir_binop_mul) {
1139 nonmul = ir->operands[0];
1140 mul = ir->operands[1]->as_expression();
1141
1142 if (!mul || mul->operation != ir_binop_mul)
1143 return false;
1144 }
1145
1146 nonmul->accept(this);
1147 src_reg src0 = fix_3src_operand(this->result);
1148
1149 mul->operands[0]->accept(this);
1150 src_reg src1 = fix_3src_operand(this->result);
1151
1152 mul->operands[1]->accept(this);
1153 src_reg src2 = fix_3src_operand(this->result);
1154
1155 this->result = src_reg(this, ir->type);
1156 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1157
1158 return true;
1159 }
1160
1161 bool
1162 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1163 {
1164 /* This optimization relies on CMP setting the destination to 0 when
1165 * false. Early hardware only sets the least significant bit, and
1166 * leaves the other bits undefined. So we can't use it.
1167 */
1168 if (brw->gen < 6)
1169 return false;
1170
1171 ir_expression *const cmp = ir->operands[0]->as_expression();
1172
1173 if (cmp == NULL)
1174 return false;
1175
1176 switch (cmp->operation) {
1177 case ir_binop_less:
1178 case ir_binop_greater:
1179 case ir_binop_lequal:
1180 case ir_binop_gequal:
1181 case ir_binop_equal:
1182 case ir_binop_nequal:
1183 break;
1184
1185 default:
1186 return false;
1187 }
1188
1189 cmp->operands[0]->accept(this);
1190 const src_reg cmp_src0 = this->result;
1191
1192 cmp->operands[1]->accept(this);
1193 const src_reg cmp_src1 = this->result;
1194
1195 this->result = src_reg(this, ir->type);
1196
1197 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1198 brw_conditional_for_comparison(cmp->operation)));
1199
1200 /* If the comparison is false, this->result will just happen to be zero.
1201 */
1202 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1203 this->result, src_reg(1.0f));
1204 inst->predicate = BRW_PREDICATE_NORMAL;
1205 inst->predicate_inverse = true;
1206
1207 return true;
1208 }
1209
1210 void
1211 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1212 src_reg src0, src_reg src1)
1213 {
1214 vec4_instruction *inst;
1215
1216 if (brw->gen >= 6) {
1217 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1218 inst->conditional_mod = conditionalmod;
1219 } else {
1220 emit(CMP(dst, src0, src1, conditionalmod));
1221
1222 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1223 inst->predicate = BRW_PREDICATE_NORMAL;
1224 }
1225 }
1226
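/* Editor's sketch of the two paths above for min(a, b), i.e.
 * conditionalmod == BRW_CONDITIONAL_L:
 *
 *    gen6+:   SEL.l  dst, a, b        // SEL with embedded comparison
 *    gen4/5:  CMP.l  dst, a, b        // sets the flag register
 *             (+f0) SEL dst, a, b     // predicated SEL picks a or b
 */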
1227 void
1228 vec4_visitor::emit_lrp(const dst_reg &dst,
1229 const src_reg &x, const src_reg &y, const src_reg &a)
1230 {
1231 if (brw->gen >= 6) {
1232 /* Note that the instruction's argument order is reversed from GLSL
1233 * and the IR.
1234 */
1235 emit(LRP(dst,
1236 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1237 } else {
1238 /* Earlier generations don't support three source operations, so we
1239 * need to emit x*(1-a) + y*a.
1240 */
1241 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1242 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1243 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1244 y_times_a.writemask = dst.writemask;
1245 one_minus_a.writemask = dst.writemask;
1246 x_times_one_minus_a.writemask = dst.writemask;
1247
1248 emit(MUL(y_times_a, y, a));
1249 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1250 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1251 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1252 }
1253 }
1254
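/* Worked example for the pre-gen6 path (editor's addition): for
 * lrp(x = 2.0, y = 10.0, a = 0.25) the emitted sequence computes
 *
 *    y_times_a           = 10.0 * 0.25  = 2.5
 *    one_minus_a         = -0.25 + 1.0  = 0.75
 *    x_times_one_minus_a = 2.0 * 0.75   = 1.5
 *    dst                 = 1.5 + 2.5    = 4.0
 *
 * matching the gen6+ LRP instruction (note its reversed argument order).
 */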
1255 void
1256 vec4_visitor::visit(ir_expression *ir)
1257 {
1258 unsigned int operand;
1259 src_reg op[Elements(ir->operands)];
1260 src_reg result_src;
1261 dst_reg result_dst;
1262 vec4_instruction *inst;
1263
1264 if (ir->operation == ir_binop_add) {
1265 if (try_emit_mad(ir))
1266 return;
1267 }
1268
1269 if (ir->operation == ir_unop_b2f) {
1270 if (try_emit_b2f_of_compare(ir))
1271 return;
1272 }
1273
1274 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1275 this->result.file = BAD_FILE;
1276 ir->operands[operand]->accept(this);
1277 if (this->result.file == BAD_FILE) {
1278 fprintf(stderr, "Failed to get tree for expression operand:\n");
1279 ir->operands[operand]->fprint(stderr);
1280 exit(1);
1281 }
1282 op[operand] = this->result;
1283
1284 /* Matrix expression operands should have been broken down to vector
1285 * operations already.
1286 */
1287 assert(!ir->operands[operand]->type->is_matrix());
1288 }
1289
1290 int vector_elements = ir->operands[0]->type->vector_elements;
1291 if (ir->operands[1]) {
1292 vector_elements = MAX2(vector_elements,
1293 ir->operands[1]->type->vector_elements);
1294 }
1295
1296 this->result.file = BAD_FILE;
1297
1298 /* Storage for our result. Ideally for an assignment we'd be using
1299 * the actual storage for the result here, instead.
1300 */
1301 result_src = src_reg(this, ir->type);
1302 /* convenience for the emit functions below. */
1303 result_dst = dst_reg(result_src);
1304 /* If nothing special happens, this is the result. */
1305 this->result = result_src;
1306 /* Limit writes to the channels that will be used by result_src later.
1307 * This does limit this temp's use as a temporary for multi-instruction
1308 * sequences.
1309 */
1310 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1311
1312 switch (ir->operation) {
1313 case ir_unop_logic_not:
1314 if (ctx->Const.UniformBooleanTrue != 1) {
1315 emit(NOT(result_dst, op[0]));
1316 } else {
1317 emit(XOR(result_dst, op[0], src_reg(1)));
1318 }
1319 break;
1320 case ir_unop_neg:
1321 op[0].negate = !op[0].negate;
1322 emit(MOV(result_dst, op[0]));
1323 break;
1324 case ir_unop_abs:
1325 op[0].abs = true;
1326 op[0].negate = false;
1327 emit(MOV(result_dst, op[0]));
1328 break;
1329
1330 case ir_unop_sign:
1331 if (ir->type->is_float()) {
1332 /* AND(val, 0x80000000) gives the sign bit.
1333 *
1334 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1335 * zero.
1336 */
1337 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1338
1339 op[0].type = BRW_REGISTER_TYPE_UD;
1340 result_dst.type = BRW_REGISTER_TYPE_UD;
1341 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1342
1343 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1344 inst->predicate = BRW_PREDICATE_NORMAL;
1345
1346 this->result.type = BRW_REGISTER_TYPE_F;
1347 } else {
1348 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1349 * -> non-negative val generates 0x00000000.
1350 * Predicated OR sets 1 if val is positive.
1351 */
1352 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1353
1354 emit(ASR(result_dst, op[0], src_reg(31)));
1355
1356 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1357 inst->predicate = BRW_PREDICATE_NORMAL;
1358 }
1359 break;
1360
1361 case ir_unop_rcp:
1362 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1363 break;
1364
1365 case ir_unop_exp2:
1366 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1367 break;
1368 case ir_unop_log2:
1369 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1370 break;
1371 case ir_unop_exp:
1372 case ir_unop_log:
1373 unreachable("not reached: should be handled by ir_explog_to_explog2");
1374 case ir_unop_sin:
1375 case ir_unop_sin_reduced:
1376 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1377 break;
1378 case ir_unop_cos:
1379 case ir_unop_cos_reduced:
1380 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1381 break;
1382
1383 case ir_unop_dFdx:
1384 case ir_unop_dFdx_coarse:
1385 case ir_unop_dFdx_fine:
1386 case ir_unop_dFdy:
1387 case ir_unop_dFdy_coarse:
1388 case ir_unop_dFdy_fine:
1389 unreachable("derivatives not valid in vertex shader");
1390
1391 case ir_unop_bitfield_reverse:
1392 emit(BFREV(result_dst, op[0]));
1393 break;
1394 case ir_unop_bit_count:
1395 emit(CBIT(result_dst, op[0]));
1396 break;
1397 case ir_unop_find_msb: {
1398 src_reg temp = src_reg(this, glsl_type::uint_type);
1399
1400 inst = emit(FBH(dst_reg(temp), op[0]));
1401 inst->dst.writemask = WRITEMASK_XYZW;
1402
1403 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1404 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1405 * subtract the result from 31 to convert the MSB count into an LSB count.
1406 */
1407
1408 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1409 temp.swizzle = BRW_SWIZZLE_NOOP;
1410 emit(MOV(result_dst, temp));
1411
1412 src_reg src_tmp = src_reg(result_dst);
1413 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1414
1415 src_tmp.negate = true;
1416 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1417 inst->predicate = BRW_PREDICATE_NORMAL;
1418 break;
1419 }
1420 case ir_unop_find_lsb:
1421 emit(FBL(result_dst, op[0]));
1422 break;
1423 case ir_unop_saturate:
1424 inst = emit(MOV(result_dst, op[0]));
1425 inst->saturate = true;
1426 break;
1427
1428 case ir_unop_noise:
1429 unreachable("not reached: should be handled by lower_noise");
1430
1431 case ir_binop_add:
1432 emit(ADD(result_dst, op[0], op[1]));
1433 break;
1434 case ir_binop_sub:
1435 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1436
1437 case ir_binop_mul:
1438 if (brw->gen < 8 && ir->type->is_integer()) {
1439 /* For integer multiplication, the MUL uses the low 16 bits of one of
1440 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1441 * accumulates in the contribution of the upper 16 bits of that
1442 * operand. If we can determine that one of the args is in the low
1443 * 16 bits, though, we can just emit a single MUL.
1444 */
1445 if (ir->operands[0]->is_uint16_constant()) {
1446 if (brw->gen < 7)
1447 emit(MUL(result_dst, op[0], op[1]));
1448 else
1449 emit(MUL(result_dst, op[1], op[0]));
1450 } else if (ir->operands[1]->is_uint16_constant()) {
1451 if (brw->gen < 7)
1452 emit(MUL(result_dst, op[1], op[0]));
1453 else
1454 emit(MUL(result_dst, op[0], op[1]));
1455 } else {
1456 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1457
1458 emit(MUL(acc, op[0], op[1]));
1459 emit(MACH(dst_null_d(), op[0], op[1]));
1460 emit(MOV(result_dst, src_reg(acc)));
1461 }
1462 } else {
1463 emit(MUL(result_dst, op[0], op[1]));
1464 }
1465 break;
1466 case ir_binop_imul_high: {
1467 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1468
1469 emit(MUL(acc, op[0], op[1]));
1470 emit(MACH(result_dst, op[0], op[1]));
1471 break;
1472 }
1473 case ir_binop_div:
1474 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1475 assert(ir->type->is_integer());
1476 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1477 break;
1478 case ir_binop_carry: {
1479 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1480
1481 emit(ADDC(dst_null_ud(), op[0], op[1]));
1482 emit(MOV(result_dst, src_reg(acc)));
1483 break;
1484 }
1485 case ir_binop_borrow: {
1486 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1487
1488 emit(SUBB(dst_null_ud(), op[0], op[1]));
1489 emit(MOV(result_dst, src_reg(acc)));
1490 break;
1491 }
1492 case ir_binop_mod:
1493 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1494 assert(ir->type->is_integer());
1495 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1496 break;
1497
1498 case ir_binop_less:
1499 case ir_binop_greater:
1500 case ir_binop_lequal:
1501 case ir_binop_gequal:
1502 case ir_binop_equal:
1503 case ir_binop_nequal: {
1504 emit(CMP(result_dst, op[0], op[1],
1505 brw_conditional_for_comparison(ir->operation)));
1506 if (ctx->Const.UniformBooleanTrue == 1) {
1507 emit(AND(result_dst, result_src, src_reg(1)));
1508 }
1509 break;
1510 }
1511
1512 case ir_binop_all_equal:
1513 /* "==" operator producing a scalar boolean. */
1514 if (ir->operands[0]->type->is_vector() ||
1515 ir->operands[1]->type->is_vector()) {
1516 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1517 emit(MOV(result_dst, src_reg(0)));
1518 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1519 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1520 } else {
1521 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1522 if (ctx->Const.UniformBooleanTrue == 1) {
1523 emit(AND(result_dst, result_src, src_reg(1)));
1524 }
1525 }
1526 break;
1527 case ir_binop_any_nequal:
1528 /* "!=" operator producing a scalar boolean. */
1529 if (ir->operands[0]->type->is_vector() ||
1530 ir->operands[1]->type->is_vector()) {
1531 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1532
1533 emit(MOV(result_dst, src_reg(0)));
1534 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1535 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1536 } else {
1537 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1538 if (ctx->Const.UniformBooleanTrue == 1) {
1539 emit(AND(result_dst, result_src, src_reg(1)));
1540 }
1541 }
1542 break;
1543
1544 case ir_unop_any:
1545 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1546 emit(MOV(result_dst, src_reg(0)));
1547
1548 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1549 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1550 break;
1551
1552 case ir_binop_logic_xor:
1553 emit(XOR(result_dst, op[0], op[1]));
1554 break;
1555
1556 case ir_binop_logic_or:
1557 emit(OR(result_dst, op[0], op[1]));
1558 break;
1559
1560 case ir_binop_logic_and:
1561 emit(AND(result_dst, op[0], op[1]));
1562 break;
1563
1564 case ir_binop_dot:
1565 assert(ir->operands[0]->type->is_vector());
1566 assert(ir->operands[0]->type == ir->operands[1]->type);
1567 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1568 break;
1569
1570 case ir_unop_sqrt:
1571 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1572 break;
1573 case ir_unop_rsq:
1574 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1575 break;
1576
1577 case ir_unop_bitcast_i2f:
1578 case ir_unop_bitcast_u2f:
1579 this->result = op[0];
1580 this->result.type = BRW_REGISTER_TYPE_F;
1581 break;
1582
1583 case ir_unop_bitcast_f2i:
1584 this->result = op[0];
1585 this->result.type = BRW_REGISTER_TYPE_D;
1586 break;
1587
1588 case ir_unop_bitcast_f2u:
1589 this->result = op[0];
1590 this->result.type = BRW_REGISTER_TYPE_UD;
1591 break;
1592
1593 case ir_unop_i2f:
1594 case ir_unop_i2u:
1595 case ir_unop_u2i:
1596 case ir_unop_u2f:
1597 case ir_unop_f2i:
1598 case ir_unop_f2u:
1599 emit(MOV(result_dst, op[0]));
1600 break;
1601 case ir_unop_b2i:
1602 if (ctx->Const.UniformBooleanTrue != 1) {
1603 emit(AND(result_dst, op[0], src_reg(1)));
1604 } else {
1605 emit(MOV(result_dst, op[0]));
1606 }
1607 break;
1608 case ir_unop_b2f:
1609 if (ctx->Const.UniformBooleanTrue != 1) {
1610 op[0].type = BRW_REGISTER_TYPE_UD;
1611 result_dst.type = BRW_REGISTER_TYPE_UD;
1612 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1613 result_dst.type = BRW_REGISTER_TYPE_F;
1614 } else {
1615 emit(MOV(result_dst, op[0]));
1616 }
1617 break;
1618 case ir_unop_f2b:
1619 case ir_unop_i2b:
1620 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1621 if (ctx->Const.UniformBooleanTrue == 1) {
1622 emit(AND(result_dst, result_src, src_reg(1)));
1623 }
1624 break;
1625
1626 case ir_unop_trunc:
1627 emit(RNDZ(result_dst, op[0]));
1628 break;
1629 case ir_unop_ceil:
1630 op[0].negate = !op[0].negate;
1631 inst = emit(RNDD(result_dst, op[0]));
1632 this->result.negate = true;
1633 break;
1634 case ir_unop_floor:
1635 inst = emit(RNDD(result_dst, op[0]));
1636 break;
1637 case ir_unop_fract:
1638 inst = emit(FRC(result_dst, op[0]));
1639 break;
1640 case ir_unop_round_even:
1641 emit(RNDE(result_dst, op[0]));
1642 break;
1643
1644 case ir_binop_min:
1645 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1646 break;
1647 case ir_binop_max:
1648 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1649 break;
1650
1651 case ir_binop_pow:
1652 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1653 break;
1654
1655 case ir_unop_bit_not:
1656 inst = emit(NOT(result_dst, op[0]));
1657 break;
1658 case ir_binop_bit_and:
1659 inst = emit(AND(result_dst, op[0], op[1]));
1660 break;
1661 case ir_binop_bit_xor:
1662 inst = emit(XOR(result_dst, op[0], op[1]));
1663 break;
1664 case ir_binop_bit_or:
1665 inst = emit(OR(result_dst, op[0], op[1]));
1666 break;
1667
1668 case ir_binop_lshift:
1669 inst = emit(SHL(result_dst, op[0], op[1]));
1670 break;
1671
1672 case ir_binop_rshift:
1673 if (ir->type->base_type == GLSL_TYPE_INT)
1674 inst = emit(ASR(result_dst, op[0], op[1]));
1675 else
1676 inst = emit(SHR(result_dst, op[0], op[1]));
1677 break;
1678
1679 case ir_binop_bfm:
1680 emit(BFI1(result_dst, op[0], op[1]));
1681 break;
1682
1683 case ir_binop_ubo_load: {
1684 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1685 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1686 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1687 src_reg offset;
1688
1689 /* Now, load the vector from that offset. */
1690 assert(ir->type->is_vector() || ir->type->is_scalar());
1691
1692 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1693 packed_consts.type = result.type;
1694 src_reg surf_index;
1695
1696 if (const_uniform_block) {
1697 /* The block index is a constant, so just emit the binding table entry
1698 * as an immediate.
1699 */
1700 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1701 const_uniform_block->value.u[0]);
1702 } else {
1703 /* The block index is not a constant. Evaluate the index expression
1704 * per-channel and add the base UBO index; the generator will select
1705 * a value from any live channel.
1706 */
1707 surf_index = src_reg(this, glsl_type::uint_type);
1708 emit(ADD(dst_reg(surf_index), op[0],
1709 src_reg(prog_data->base.binding_table.ubo_start)));
1710
1711 /* Assume this may touch any UBO. It would be nice to provide
1712 * a tighter bound, but the array information is already lowered away.
1713 */
1714 brw_mark_surface_used(&prog_data->base,
1715 prog_data->base.binding_table.ubo_start +
1716 shader_prog->NumUniformBlocks - 1);
1717 }
1718
1719 if (const_offset_ir) {
1720 if (brw->gen >= 8) {
1721 /* Store the offset in a GRF so we can send-from-GRF. */
1722 offset = src_reg(this, glsl_type::int_type);
1723 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1724 } else {
1725 /* Immediates are fine on older generations since they'll be moved
1726 * to a (potentially fake) MRF at the generator level.
1727 */
1728 offset = src_reg(const_offset / 16);
1729 }
1730 } else {
1731 offset = src_reg(this, glsl_type::uint_type);
1732 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1733 }
1734
1735 if (brw->gen >= 7) {
1736 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1737 grf_offset.type = offset.type;
1738
1739 emit(MOV(grf_offset, offset));
1740
1741 emit(new(mem_ctx) vec4_instruction(this,
1742 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1743 dst_reg(packed_consts),
1744 surf_index,
1745 src_reg(grf_offset)));
1746 } else {
1747 vec4_instruction *pull =
1748 emit(new(mem_ctx) vec4_instruction(this,
1749 VS_OPCODE_PULL_CONSTANT_LOAD,
1750 dst_reg(packed_consts),
1751 surf_index,
1752 offset));
1753 pull->base_mrf = 14;
1754 pull->mlen = 1;
1755 }
1756
1757 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1758 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1759 const_offset % 16 / 4,
1760 const_offset % 16 / 4,
1761 const_offset % 16 / 4);
1762
1763 /* UBO bools are any nonzero int. We need to convert them to use the
1764 * value of true stored in ctx->Const.UniformBooleanTrue.
1765 */
1766 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1767 emit(CMP(result_dst, packed_consts, src_reg(0u),
1768 BRW_CONDITIONAL_NZ));
1769 if (ctx->Const.UniformBooleanTrue == 1) {
1770 emit(AND(result_dst, result, src_reg(1)));
1771 }
1772 } else {
1773 emit(MOV(result_dst, packed_consts));
1774 }
1775 break;
1776 }
1777
1778 case ir_binop_vector_extract:
1779 unreachable("should have been lowered by vec_index_to_cond_assign");
1780
1781 case ir_triop_fma:
1782 op[0] = fix_3src_operand(op[0]);
1783 op[1] = fix_3src_operand(op[1]);
1784 op[2] = fix_3src_operand(op[2]);
1785 /* Note that the instruction's argument order is reversed from GLSL
1786 * and the IR.
1787 */
1788 emit(MAD(result_dst, op[2], op[1], op[0]));
1789 break;
1790
1791 case ir_triop_lrp:
1792 emit_lrp(result_dst, op[0], op[1], op[2]);
1793 break;
1794
1795 case ir_triop_csel:
1796 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1797 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1798 inst->predicate = BRW_PREDICATE_NORMAL;
1799 break;
1800
1801 case ir_triop_bfi:
1802 op[0] = fix_3src_operand(op[0]);
1803 op[1] = fix_3src_operand(op[1]);
1804 op[2] = fix_3src_operand(op[2]);
1805 emit(BFI2(result_dst, op[0], op[1], op[2]));
1806 break;
1807
1808 case ir_triop_bitfield_extract:
1809 op[0] = fix_3src_operand(op[0]);
1810 op[1] = fix_3src_operand(op[1]);
1811 op[2] = fix_3src_operand(op[2]);
1812 /* Note that the instruction's argument order is reversed from GLSL
1813 * and the IR.
1814 */
1815 emit(BFE(result_dst, op[2], op[1], op[0]));
1816 break;
1817
1818 case ir_triop_vector_insert:
1819 unreachable("should have been lowered by lower_vector_insert");
1820
1821 case ir_quadop_bitfield_insert:
1822 unreachable("not reached: should be handled by "
1823 "bitfield_insert_to_bfm_bfi\n");
1824
1825 case ir_quadop_vector:
1826 unreachable("not reached: should be handled by lower_quadop_vector");
1827
1828 case ir_unop_pack_half_2x16:
1829 emit_pack_half_2x16(result_dst, op[0]);
1830 break;
1831 case ir_unop_unpack_half_2x16:
1832 emit_unpack_half_2x16(result_dst, op[0]);
1833 break;
1834 case ir_unop_pack_snorm_2x16:
1835 case ir_unop_pack_snorm_4x8:
1836 case ir_unop_pack_unorm_2x16:
1837 case ir_unop_pack_unorm_4x8:
1838 case ir_unop_unpack_snorm_2x16:
1839 case ir_unop_unpack_snorm_4x8:
1840 case ir_unop_unpack_unorm_2x16:
1841 case ir_unop_unpack_unorm_4x8:
1842 unreachable("not reached: should be handled by lower_packing_builtins");
1843 case ir_unop_unpack_half_2x16_split_x:
1844 case ir_unop_unpack_half_2x16_split_y:
1845 case ir_binop_pack_half_2x16_split:
1846 case ir_unop_interpolate_at_centroid:
1847 case ir_binop_interpolate_at_sample:
1848 case ir_binop_interpolate_at_offset:
1849 unreachable("not reached: should not occur in vertex shader");
1850 case ir_binop_ldexp:
1851 unreachable("not reached: should be handled by ldexp_to_arith()");
1852 }
1853 }
1854
1855
1856 void
1857 vec4_visitor::visit(ir_swizzle *ir)
1858 {
1859 src_reg src;
1860 int i = 0;
1861 int swizzle[4];
1862
1863 /* Note that this is only swizzles in expressions, not those on the left
1864 * hand side of an assignment, which do write masking. See ir_assignment
1865 * for that.
1866 */
1867
1868 ir->val->accept(this);
1869 src = this->result;
1870 assert(src.file != BAD_FILE);
1871
1872 for (i = 0; i < ir->type->vector_elements; i++) {
1873 switch (i) {
1874 case 0:
1875 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1876 break;
1877 case 1:
1878 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1879 break;
1880 case 2:
1881 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1882 break;
1883 case 3:
1884 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1885 break;
1886 }
1887 }
1888 for (; i < 4; i++) {
1889 /* Replicate the last channel out. */
1890 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1891 }
1892
1893 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1894
1895 this->result = src;
1896 }
1897
1898 void
1899 vec4_visitor::visit(ir_dereference_variable *ir)
1900 {
1901 const struct glsl_type *type = ir->type;
1902 dst_reg *reg = variable_storage(ir->var);
1903
1904 if (!reg) {
1905 fail("Failed to find variable storage for %s\n", ir->var->name);
1906 this->result = src_reg(brw_null_reg());
1907 return;
1908 }
1909
1910 this->result = src_reg(*reg);
1911
1912 /* System values get their swizzle from the dst_reg writemask */
1913 if (ir->var->data.mode == ir_var_system_value)
1914 return;
1915
1916 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1917 this->result.swizzle = swizzle_for_size(type->vector_elements);
1918 }
1919
1920
1921 int
1922 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1923 {
1924 /* Under normal circumstances array elements are stored consecutively, so
1925 * the stride is equal to the size of the array element.
1926 */
1927 return type_size(ir->type);
1928 }
1929
1930
1931 void
1932 vec4_visitor::visit(ir_dereference_array *ir)
1933 {
1934 ir_constant *constant_index;
1935 src_reg src;
1936 int array_stride = compute_array_stride(ir);
1937
1938 constant_index = ir->array_index->constant_expression_value();
1939
1940 ir->array->accept(this);
1941 src = this->result;
1942
1943 if (constant_index) {
1944 src.reg_offset += constant_index->value.i[0] * array_stride;
1945 } else {
1946 /* Variable index array dereference. It eats the "vec4" of the
1947 * base of the array and an index that offsets the Mesa register
1948 * index.
1949 */
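/* For example (illustrative), for arr[i] where each element occupies two
 * vec4s, the index is first multiplied by the stride (2) and then folded
 * into any reladdr the array base already carried.
 */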
1950 ir->array_index->accept(this);
1951
1952 src_reg index_reg;
1953
1954 if (array_stride == 1) {
1955 index_reg = this->result;
1956 } else {
1957 index_reg = src_reg(this, glsl_type::int_type);
1958
1959 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1960 }
1961
1962 if (src.reladdr) {
1963 src_reg temp = src_reg(this, glsl_type::int_type);
1964
1965 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1966
1967 index_reg = temp;
1968 }
1969
1970 src.reladdr = ralloc(mem_ctx, src_reg);
1971 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1972 }
1973
1974 /* If the type is smaller than a vec4, replicate the last channel out. */
1975 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1976 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1977 else
1978 src.swizzle = BRW_SWIZZLE_NOOP;
1979 src.type = brw_type_for_base_type(ir->type);
1980
1981 this->result = src;
1982 }
1983
1984 void
1985 vec4_visitor::visit(ir_dereference_record *ir)
1986 {
1987 unsigned int i;
1988 const glsl_type *struct_type = ir->record->type;
1989 int offset = 0;
1990
1991 ir->record->accept(this);
1992
1993 for (i = 0; i < struct_type->length; i++) {
1994 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1995 break;
1996 offset += type_size(struct_type->fields.structure[i].type);
1997 }
1998
1999 /* If the type is smaller than a vec4, replicate the last channel out. */
2000 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2001 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2002 else
2003 this->result.swizzle = BRW_SWIZZLE_NOOP;
2004 this->result.type = brw_type_for_base_type(ir->type);
2005
2006 this->result.reg_offset += offset;
2007 }
2008
2009 /**
2010 * We want to be careful in assignment setup to hit the actual storage
2011 * instead of potentially using a temporary like we might with the
2012 * ir_dereference handler.
2013 */
2014 static dst_reg
2015 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2016 {
2017 /* The LHS must be a dereference. If the LHS is a variably-indexed array
2018 * access of a vector, it must be separated into a series of conditional moves
2019 * before reaching this point (see ir_vec_index_to_cond_assign).
2020 */
2021 assert(ir->as_dereference());
2022 ir_dereference_array *deref_array = ir->as_dereference_array();
2023 if (deref_array) {
2024 assert(!deref_array->array->type->is_vector());
2025 }
2026
2027 /* Use the rvalue deref handler for the most part. We'll ignore its
2028 * swizzles here and express any LHS swizzle through the writemask instead.
2029 */
2030 ir->accept(v);
2031 return dst_reg(v->result);
2032 }
2033
2034 void
2035 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2036 const struct glsl_type *type,
2037 enum brw_predicate predicate)
2038 {
2039 if (type->base_type == GLSL_TYPE_STRUCT) {
2040 for (unsigned int i = 0; i < type->length; i++) {
2041 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2042 }
2043 return;
2044 }
2045
2046 if (type->is_array()) {
2047 for (unsigned int i = 0; i < type->length; i++) {
2048 emit_block_move(dst, src, type->fields.array, predicate);
2049 }
2050 return;
2051 }
2052
2053 if (type->is_matrix()) {
2054 const struct glsl_type *vec_type;
2055
2056 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2057 type->vector_elements, 1);
2058
2059 for (int i = 0; i < type->matrix_columns; i++) {
2060 emit_block_move(dst, src, vec_type, predicate);
2061 }
2062 return;
2063 }
2064
2065 assert(type->is_scalar() || type->is_vector());
2066
2067 dst->type = brw_type_for_base_type(type);
2068 src->type = dst->type;
2069
2070 dst->writemask = (1 << type->vector_elements) - 1;
2071
2072 src->swizzle = swizzle_for_size(type->vector_elements);
2073
2074 vec4_instruction *inst = emit(MOV(*dst, *src));
2075 inst->predicate = predicate;
2076
2077 dst->reg_offset++;
2078 src->reg_offset++;
2079 }
2080
2081
2082 /* If the RHS processing resulted in an instruction generating a
2083 * temporary value, and it would be easy to rewrite the instruction to
2084 * generate its result right into the LHS instead, do so. This ends
2085 * up reliably removing instructions where it can be tricky to do so
2086 * later without real UD chain information.
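 *
 * For example (illustrative): for `a = b + c;` the RHS emits an ADD into a
 * temporary GRF; if the checks below pass, that ADD's destination is
 * retargeted to `a` and the trailing MOV never has to be emitted.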
2087 */
2088 bool
2089 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2090 dst_reg dst,
2091 src_reg src,
2092 vec4_instruction *pre_rhs_inst,
2093 vec4_instruction *last_rhs_inst)
2094 {
2095 /* This could be supported, but it would take more smarts. */
2096 if (ir->condition)
2097 return false;
2098
2099 if (pre_rhs_inst == last_rhs_inst)
2100 return false; /* No instructions generated to work with. */
2101
2102 /* Make sure the last instruction generated our source reg. */
2103 if (src.file != GRF ||
2104 src.file != last_rhs_inst->dst.file ||
2105 src.reg != last_rhs_inst->dst.reg ||
2106 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2107 src.reladdr ||
2108 src.abs ||
2109 src.negate ||
2110 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2111 return false;
2112
2113 /* Check that the last instruction fully initialized the channels
2114 * we want to use, in the order we want to use them. We could
2115 * potentially reswizzle the operands of many instructions so that
2116 * we could handle out of order channels, but don't yet.
2117 */
2118
2119 for (unsigned i = 0; i < 4; i++) {
2120 if (dst.writemask & (1 << i)) {
2121 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2122 return false;
2123
2124 if (BRW_GET_SWZ(src.swizzle, i) != i)
2125 return false;
2126 }
2127 }
2128
2129 /* Success! Rewrite the instruction. */
2130 last_rhs_inst->dst.file = dst.file;
2131 last_rhs_inst->dst.reg = dst.reg;
2132 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2133 last_rhs_inst->dst.reladdr = dst.reladdr;
2134 last_rhs_inst->dst.writemask &= dst.writemask;
2135
2136 return true;
2137 }
2138
2139 void
2140 vec4_visitor::visit(ir_assignment *ir)
2141 {
2142 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2143 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2144
2145 if (!ir->lhs->type->is_scalar() &&
2146 !ir->lhs->type->is_vector()) {
2147 ir->rhs->accept(this);
2148 src_reg src = this->result;
2149
2150 if (ir->condition) {
2151 emit_bool_to_cond_code(ir->condition, &predicate);
2152 }
2153
2154 /* emit_block_move doesn't account for swizzles in the source register.
2155 * This should be ok, since the source register is a structure or an
2156 * array, and those can't be swizzled. But double-check to be sure.
2157 */
2158 assert(src.swizzle ==
2159 (ir->rhs->type->is_matrix()
2160 ? swizzle_for_size(ir->rhs->type->vector_elements)
2161 : BRW_SWIZZLE_NOOP));
2162
2163 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2164 return;
2165 }
2166
2167 /* Now we're down to just a scalar/vector with writemasks. */
2168 int i;
2169
2170 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2171 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2172
2173 ir->rhs->accept(this);
2174
2175 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2176
2177 src_reg src = this->result;
2178
2179 int swizzles[4];
2180 int first_enabled_chan = 0;
2181 int src_chan = 0;
2182
2183 assert(ir->lhs->type->is_vector() ||
2184 ir->lhs->type->is_scalar());
2185 dst.writemask = ir->write_mask;
2186
2187 for (int i = 0; i < 4; i++) {
2188 if (dst.writemask & (1 << i)) {
2189 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2190 break;
2191 }
2192 }
2193
2194 /* Swizzle a small RHS vector into the channels being written.
2195 *
2196 * GLSL IR treats write_mask as dictating how many channels are
2197 * present on the RHS, while in our instructions we need to make
2198 * those channels appear in the slots of the vec4 they're written to.
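 *
 * For example (illustrative), writing a vec2 RHS (swizzle .xyyy) into the
 * .zw channels of a vec4 produces the swizzle (y, y, x, y): the RHS .x
 * lands in channel z, the RHS .y lands in channel w, and the unwritten
 * channels just replicate an already-enabled channel.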
2199 */
2200 for (int i = 0; i < 4; i++) {
2201 if (dst.writemask & (1 << i))
2202 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2203 else
2204 swizzles[i] = first_enabled_chan;
2205 }
2206 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2207 swizzles[2], swizzles[3]);
2208
2209 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2210 return;
2211 }
2212
2213 if (ir->condition) {
2214 emit_bool_to_cond_code(ir->condition, &predicate);
2215 }
2216
2217 for (i = 0; i < type_size(ir->lhs->type); i++) {
2218 vec4_instruction *inst = emit(MOV(dst, src));
2219 inst->predicate = predicate;
2220
2221 dst.reg_offset++;
2222 src.reg_offset++;
2223 }
2224 }
2225
2226 void
2227 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2228 {
2229 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2230 foreach_in_list(ir_constant, field_value, &ir->components) {
2231 emit_constant_values(dst, field_value);
2232 }
2233 return;
2234 }
2235
2236 if (ir->type->is_array()) {
2237 for (unsigned int i = 0; i < ir->type->length; i++) {
2238 emit_constant_values(dst, ir->array_elements[i]);
2239 }
2240 return;
2241 }
2242
2243 if (ir->type->is_matrix()) {
2244 for (int i = 0; i < ir->type->matrix_columns; i++) {
2245 float *vec = &ir->value.f[i * ir->type->vector_elements];
2246
2247 for (int j = 0; j < ir->type->vector_elements; j++) {
2248 dst->writemask = 1 << j;
2249 dst->type = BRW_REGISTER_TYPE_F;
2250
2251 emit(MOV(*dst, src_reg(vec[j])));
2252 }
2253 dst->reg_offset++;
2254 }
2255 return;
2256 }
2257
2258 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2259
2260 for (int i = 0; i < ir->type->vector_elements; i++) {
2261 if (!(remaining_writemask & (1 << i)))
2262 continue;
2263
2264 dst->writemask = 1 << i;
2265 dst->type = brw_type_for_base_type(ir->type);
2266
2267 /* Find other components that match the one we're about to
2268 * write. Emits fewer instructions for things like vec4(0.5,
2269 * 1.5, 1.5, 1.5).
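 * In that case only two MOVs are emitted: one with writemask .x for the
 * 0.5 and one with writemask .yzw for the repeated 1.5.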
2270 */
2271 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2272 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2273 if (ir->value.b[i] == ir->value.b[j])
2274 dst->writemask |= (1 << j);
2275 } else {
2276 /* u, i, and f storage all line up, so no need for a
2277 * switch case for comparing each type.
2278 */
2279 if (ir->value.u[i] == ir->value.u[j])
2280 dst->writemask |= (1 << j);
2281 }
2282 }
2283
2284 switch (ir->type->base_type) {
2285 case GLSL_TYPE_FLOAT:
2286 emit(MOV(*dst, src_reg(ir->value.f[i])));
2287 break;
2288 case GLSL_TYPE_INT:
2289 emit(MOV(*dst, src_reg(ir->value.i[i])));
2290 break;
2291 case GLSL_TYPE_UINT:
2292 emit(MOV(*dst, src_reg(ir->value.u[i])));
2293 break;
2294 case GLSL_TYPE_BOOL:
2295 emit(MOV(*dst,
2296 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2297 : 0)));
2298 break;
2299 default:
2300 unreachable("Non-float/uint/int/bool constant");
2301 }
2302
2303 remaining_writemask &= ~dst->writemask;
2304 }
2305 dst->reg_offset++;
2306 }
2307
2308 void
2309 vec4_visitor::visit(ir_constant *ir)
2310 {
2311 dst_reg dst = dst_reg(this, ir->type);
2312 this->result = src_reg(dst);
2313
2314 emit_constant_values(&dst, ir);
2315 }
2316
2317 void
2318 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2319 {
2320 ir_dereference *deref = static_cast<ir_dereference *>(
2321 ir->actual_parameters.get_head());
2322 ir_variable *location = deref->variable_referenced();
2323 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2324 location->data.binding);
2325
2326 /* Calculate the surface offset */
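/* For a dereference like counters[i] (illustrative), the offset is the
 * array index times ATOMIC_COUNTER_SIZE plus the counter's declared byte
 * offset within the buffer; a non-array counter just uses its offset.
 */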
2327 src_reg offset(this, glsl_type::uint_type);
2328 ir_dereference_array *deref_array = deref->as_dereference_array();
2329 if (deref_array) {
2330 deref_array->array_index->accept(this);
2331
2332 src_reg tmp(this, glsl_type::uint_type);
2333 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2334 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2335 } else {
2336 offset = location->data.atomic.offset;
2337 }
2338
2339 /* Emit the appropriate machine instruction */
2340 const char *callee = ir->callee->function_name();
2341 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2342
2343 if (!strcmp("__intrinsic_atomic_read", callee)) {
2344 emit_untyped_surface_read(surf_index, dst, offset);
2345
2346 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2347 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2348 src_reg(), src_reg());
2349
2350 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2351 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2352 src_reg(), src_reg());
2353 }
2354 }
2355
2356 void
2357 vec4_visitor::visit(ir_call *ir)
2358 {
2359 const char *callee = ir->callee->function_name();
2360
2361 if (!strcmp("__intrinsic_atomic_read", callee) ||
2362 !strcmp("__intrinsic_atomic_increment", callee) ||
2363 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2364 visit_atomic_counter_intrinsic(ir);
2365 } else {
2366 unreachable("Unsupported intrinsic.");
2367 }
2368 }
2369
2370 src_reg
2371 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2372 {
2373 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2374 inst->base_mrf = 2;
2375 inst->mlen = 1;
2376 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2377 inst->dst.writemask = WRITEMASK_XYZW;
2378
2379 inst->src[1] = sampler;
2380
2381 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2382 int param_base = inst->base_mrf;
2383 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2384 int zero_mask = 0xf & ~coord_mask;
2385
2386 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2387 coordinate));
2388
2389 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2390 src_reg(0)));
2391
2392 emit(inst);
2393 return src_reg(inst->dst);
2394 }
2395
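/* Only Haswell and Gen8+ can address more than 16 samplers; on those
 * platforms a non-immediate sampler index, or an index of 16 or higher,
 * has to take the "high sampler" message path (summary of the check below).
 */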
2396 static bool
2397 is_high_sampler(struct brw_context *brw, src_reg sampler)
2398 {
2399 if (brw->gen < 8 && !brw->is_haswell)
2400 return false;
2401
2402 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2403 }
2404
2405 void
2406 vec4_visitor::visit(ir_texture *ir)
2407 {
2408 uint32_t sampler =
2409 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2410
2411 ir_rvalue *nonconst_sampler_index =
2412 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2413
2414 /* Handle non-constant sampler array indexing */
2415 src_reg sampler_reg;
2416 if (nonconst_sampler_index) {
2417 /* The highest sampler which may be used by this operation is
2418 * the last element of the array. Mark it here, because the generator
2419 * doesn't have enough information to determine the bound.
2420 */
2421 uint32_t array_size = ir->sampler->as_dereference_array()
2422 ->array->type->array_size();
2423
2424 uint32_t max_used = sampler + array_size - 1;
2425 if (ir->op == ir_tg4 && brw->gen < 8) {
2426 max_used += prog_data->base.binding_table.gather_texture_start;
2427 } else {
2428 max_used += prog_data->base.binding_table.texture_start;
2429 }
2430
2431 brw_mark_surface_used(&prog_data->base, max_used);
2432
2433 /* Emit code to evaluate the actual indexing expression */
2434 nonconst_sampler_index->accept(this);
2435 dst_reg temp(this, glsl_type::uint_type);
2436 emit(ADD(temp, this->result, src_reg(sampler)))
2437 ->force_writemask_all = true;
2438 sampler_reg = src_reg(temp);
2439 } else {
2440 /* Single sampler, or constant array index; the indexing expression
2441 * is just an immediate.
2442 */
2443 sampler_reg = src_reg(sampler);
2444 }
2445
2446 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2447 * emitting anything other than setting up the constant result.
2448 */
2449 if (ir->op == ir_tg4) {
2450 ir_constant *chan = ir->lod_info.component->as_constant();
2451 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2452 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2453 dst_reg result(this, ir->type);
2454 this->result = src_reg(result);
2455 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2456 return;
2457 }
2458 }
2459
2460 /* Should be lowered by do_lower_texture_projection */
2461 assert(!ir->projector);
2462
2463 /* Should be lowered */
2464 assert(!ir->offset || !ir->offset->type->is_array());
2465
2466 /* Generate code to compute all the subexpression trees. This has to be
2467 * done before loading any values into MRFs for the sampler message since
2468 * generating these values may involve SEND messages that need the MRFs.
2469 */
2470 src_reg coordinate;
2471 if (ir->coordinate) {
2472 ir->coordinate->accept(this);
2473 coordinate = this->result;
2474 }
2475
2476 src_reg shadow_comparitor;
2477 if (ir->shadow_comparitor) {
2478 ir->shadow_comparitor->accept(this);
2479 shadow_comparitor = this->result;
2480 }
2481
2482 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2483 src_reg offset_value;
2484 if (has_nonconstant_offset) {
2485 ir->offset->accept(this);
2486 offset_value = src_reg(this->result);
2487 }
2488
2489 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2490 src_reg lod, dPdx, dPdy, sample_index, mcs;
2491 switch (ir->op) {
2492 case ir_tex:
2493 lod = src_reg(0.0f);
2494 lod_type = glsl_type::float_type;
2495 break;
2496 case ir_txf:
2497 case ir_txl:
2498 case ir_txs:
2499 ir->lod_info.lod->accept(this);
2500 lod = this->result;
2501 lod_type = ir->lod_info.lod->type;
2502 break;
2503 case ir_query_levels:
2504 lod = src_reg(0);
2505 lod_type = glsl_type::int_type;
2506 break;
2507 case ir_txf_ms:
2508 ir->lod_info.sample_index->accept(this);
2509 sample_index = this->result;
2510 sample_index_type = ir->lod_info.sample_index->type;
2511
2512 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2513 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2514 else
2515 mcs = src_reg(0u);
2516 break;
2517 case ir_txd:
2518 ir->lod_info.grad.dPdx->accept(this);
2519 dPdx = this->result;
2520
2521 ir->lod_info.grad.dPdy->accept(this);
2522 dPdy = this->result;
2523
2524 lod_type = ir->lod_info.grad.dPdx->type;
2525 break;
2526 case ir_txb:
2527 case ir_lod:
2528 case ir_tg4:
2529 break;
2530 }
2531
2532 enum opcode opcode;
2533 switch (ir->op) {
2534 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2535 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2536 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2537 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2538 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2539 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2540 case ir_tg4: opcode = has_nonconstant_offset
2541 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2542 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2543 case ir_txb:
2544 unreachable("TXB is not valid for vertex shaders.");
2545 case ir_lod:
2546 unreachable("LOD is not valid for vertex shaders.");
2547 default:
2548 unreachable("Unrecognized tex op");
2549 }
2550
2551 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2552
2553 if (ir->offset != NULL && ir->op != ir_txf)
2554 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2555
2556 /* Stuff the channel select bits in the top of the texture offset */
2557 if (ir->op == ir_tg4)
2558 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2559
2560 /* The message header is necessary for:
2561 * - Gen4 (always)
2562 * - Texel offsets
2563 * - Gather channel selection
2564 * - Sampler indices too large to fit in a 4-bit value.
2565 */
2566 inst->header_present =
2567 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2568 is_high_sampler(brw, sampler_reg);
2569 inst->base_mrf = 2;
2570 inst->mlen = inst->header_present + 1; /* always at least one */
2571 inst->dst = dst_reg(this, ir->type);
2572 inst->dst.writemask = WRITEMASK_XYZW;
2573 inst->shadow_compare = ir->shadow_comparitor != NULL;
2574
2575 inst->src[1] = sampler_reg;
2576
2577 /* MRF for the first parameter */
2578 int param_base = inst->base_mrf + inst->header_present;
2579
2580 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2581 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2582 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2583 } else {
2584 /* Load the coordinate */
2585 /* FINISHME: gl_clamp_mask and saturate */
2586 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2587 int zero_mask = 0xf & ~coord_mask;
2588
2589 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2590 coordinate));
2591
2592 if (zero_mask != 0) {
2593 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2594 src_reg(0)));
2595 }
2596 /* Load the shadow comparitor */
2597 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2598 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2599 WRITEMASK_X),
2600 shadow_comparitor));
2601 inst->mlen++;
2602 }
2603
2604 /* Load the LOD info */
2605 if (ir->op == ir_tex || ir->op == ir_txl) {
2606 int mrf, writemask;
2607 if (brw->gen >= 5) {
2608 mrf = param_base + 1;
2609 if (ir->shadow_comparitor) {
2610 writemask = WRITEMASK_Y;
2611 /* mlen already incremented */
2612 } else {
2613 writemask = WRITEMASK_X;
2614 inst->mlen++;
2615 }
2616 } else /* brw->gen == 4 */ {
2617 mrf = param_base;
2618 writemask = WRITEMASK_W;
2619 }
2620 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2621 } else if (ir->op == ir_txf) {
2622 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2623 } else if (ir->op == ir_txf_ms) {
2624 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2625 sample_index));
2626 if (brw->gen >= 7) {
2627 /* MCS data is in the first channel of `mcs`, but we need to get it into
2628 * the .y channel of the second vec4 of params, so replicate .x across
2629 * the whole vec4 and then mask off everything except .y
2630 */
2631 mcs.swizzle = BRW_SWIZZLE_XXXX;
2632 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2633 mcs));
2634 }
2635 inst->mlen++;
2636 } else if (ir->op == ir_txd) {
2637 const glsl_type *type = lod_type;
2638
2639 if (brw->gen >= 5) {
2640 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2641 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2642 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2643 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2644 inst->mlen++;
2645
2646 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2647 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2648 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2649 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2650 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2651 inst->mlen++;
2652
2653 if (ir->shadow_comparitor) {
2654 emit(MOV(dst_reg(MRF, param_base + 2,
2655 ir->shadow_comparitor->type, WRITEMASK_Z),
2656 shadow_comparitor));
2657 }
2658 }
2659 } else /* brw->gen == 4 */ {
2660 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2661 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2662 inst->mlen += 2;
2663 }
2664 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2665 if (ir->shadow_comparitor) {
2666 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2667 shadow_comparitor));
2668 }
2669
2670 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2671 offset_value));
2672 inst->mlen++;
2673 }
2674 }
2675
2676 emit(inst);
2677
2678 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2679 * faces * layers, but the spec requires just the layer count.
2680 */
2681 if (ir->op == ir_txs) {
2682 glsl_type const *type = ir->sampler->type;
2683 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2684 type->sampler_array) {
2685 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2686 writemask(inst->dst, WRITEMASK_Z),
2687 src_reg(inst->dst), src_reg(6));
2688 }
2689 }
2690
2691 if (brw->gen == 6 && ir->op == ir_tg4) {
2692 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2693 }
2694
2695 swizzle_result(ir, src_reg(inst->dst), sampler);
2696 }
2697
2698 /**
2699 * Apply workarounds for Gen6 gather with UINT/SINT
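 *
 * For example (illustrative), for an 8-bit format with the sign workaround
 * the UNORM result is scaled by 255, converted back to integer, and then
 * shifted left and right by 24 bits to sign-extend the low byte.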
2700 */
2701 void
2702 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2703 {
2704 if (!wa)
2705 return;
2706
2707 int width = (wa & WA_8BIT) ? 8 : 16;
2708 dst_reg dst_f = dst;
2709 dst_f.type = BRW_REGISTER_TYPE_F;
2710
2711 /* Convert from UNORM to UINT */
2712 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2713 emit(MOV(dst, src_reg(dst_f)));
2714
2715 if (wa & WA_SIGN) {
2716 /* Reinterpret the UINT value as a signed INT value by
2717 * shifting the sign bit into place, then shifting back
2718 * preserving sign.
2719 */
2720 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2721 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2722 }
2723 }
2724
2725 /**
2726 * Set up the gather channel based on the swizzle, for gather4.
2727 */
2728 uint32_t
2729 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2730 {
2731 ir_constant *chan = ir->lod_info.component->as_constant();
2732 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2733 switch (swiz) {
2734 case SWIZZLE_X: return 0;
2735 case SWIZZLE_Y:
2736 /* gather4 sampler is broken for green channel on RG32F --
2737 * we must ask for blue instead.
2738 */
2739 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2740 return 2;
2741 return 1;
2742 case SWIZZLE_Z: return 2;
2743 case SWIZZLE_W: return 3;
2744 default:
2745 unreachable("Not reached"); /* zero, one swizzles handled already */
2746 }
2747 }
2748
2749 void
2750 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2751 {
2752 int s = key->tex.swizzles[sampler];
2753
2754 this->result = src_reg(this, ir->type);
2755 dst_reg swizzled_result(this->result);
2756
2757 if (ir->op == ir_query_levels) {
2758 /* # levels is in .w */
2759 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2760 emit(MOV(swizzled_result, orig_val));
2761 return;
2762 }
2763
2764 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2765 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2766 emit(MOV(swizzled_result, orig_val));
2767 return;
2768 }
2769
2770
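/* Apply the texture swizzle.  For example (illustrative), a swizzle of
 * (R, R, R, ONE) copies the sampler's .x into .xyz of the result and
 * writes 1.0f into .w.
 */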
2771 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2772 int swizzle[4] = {0};
2773
2774 for (int i = 0; i < 4; i++) {
2775 switch (GET_SWZ(s, i)) {
2776 case SWIZZLE_ZERO:
2777 zero_mask |= (1 << i);
2778 break;
2779 case SWIZZLE_ONE:
2780 one_mask |= (1 << i);
2781 break;
2782 default:
2783 copy_mask |= (1 << i);
2784 swizzle[i] = GET_SWZ(s, i);
2785 break;
2786 }
2787 }
2788
2789 if (copy_mask) {
2790 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2791 swizzled_result.writemask = copy_mask;
2792 emit(MOV(swizzled_result, orig_val));
2793 }
2794
2795 if (zero_mask) {
2796 swizzled_result.writemask = zero_mask;
2797 emit(MOV(swizzled_result, src_reg(0.0f)));
2798 }
2799
2800 if (one_mask) {
2801 swizzled_result.writemask = one_mask;
2802 emit(MOV(swizzled_result, src_reg(1.0f)));
2803 }
2804 }
2805
2806 void
2807 vec4_visitor::visit(ir_return *)
2808 {
2809 unreachable("not reached");
2810 }
2811
2812 void
2813 vec4_visitor::visit(ir_discard *)
2814 {
2815 unreachable("not reached");
2816 }
2817
2818 void
2819 vec4_visitor::visit(ir_if *ir)
2820 {
2821 /* Don't point the annotation at the if statement, because then it, plus
2822 * the then and else blocks, would all get printed.
2823 */
2824 this->base_ir = ir->condition;
2825
2826 if (brw->gen == 6) {
2827 emit_if_gen6(ir);
2828 } else {
2829 enum brw_predicate predicate;
2830 emit_bool_to_cond_code(ir->condition, &predicate);
2831 emit(IF(predicate));
2832 }
2833
2834 visit_instructions(&ir->then_instructions);
2835
2836 if (!ir->else_instructions.is_empty()) {
2837 this->base_ir = ir->condition;
2838 emit(BRW_OPCODE_ELSE);
2839
2840 visit_instructions(&ir->else_instructions);
2841 }
2842
2843 this->base_ir = ir->condition;
2844 emit(BRW_OPCODE_ENDIF);
2845 }
2846
2847 void
2848 vec4_visitor::visit(ir_emit_vertex *)
2849 {
2850 unreachable("not reached");
2851 }
2852
2853 void
2854 vec4_visitor::visit(ir_end_primitive *)
2855 {
2856 unreachable("not reached");
2857 }
2858
2859 void
2860 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2861 dst_reg dst, src_reg offset,
2862 src_reg src0, src_reg src1)
2863 {
2864 unsigned mlen = 0;
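/* The payload is built one message register at a time: the offset goes in
 * the .x channel of the first MRF, followed by src0 and src1 in their own
 * MRFs when the operation needs them; mlen records how many were used
 * (illustrative summary).
 */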
2865
2866 /* Set the atomic operation offset. */
2867 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2868 mlen++;
2869
2870 /* Set the atomic operation arguments. */
2871 if (src0.file != BAD_FILE) {
2872 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2873 mlen++;
2874 }
2875
2876 if (src1.file != BAD_FILE) {
2877 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2878 mlen++;
2879 }
2880
2881 /* Emit the instruction. Note that this maps to the normal SIMD8
2882 * untyped atomic message on Ivy Bridge, but that's OK because
2883 * unused channels will be masked out.
2884 */
2885 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2886 src_reg(atomic_op), src_reg(surf_index));
2887 inst->base_mrf = 0;
2888 inst->mlen = mlen;
2889 }
2890
2891 void
2892 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2893 src_reg offset)
2894 {
2895 /* Set the surface read offset. */
2896 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2897
2898 /* Emit the instruction. Note that this maps to the normal SIMD8
2899 * untyped surface read message, but that's OK because unused
2900 * channels will be masked out.
2901 */
2902 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2903 dst, src_reg(surf_index));
2904 inst->base_mrf = 0;
2905 inst->mlen = 1;
2906 }
2907
2908 void
2909 vec4_visitor::emit_ndc_computation()
2910 {
2911 /* Get the position */
2912 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2913
2914 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2915 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2916 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2917
2918 current_annotation = "NDC";
2919 dst_reg ndc_w = ndc;
2920 ndc_w.writemask = WRITEMASK_W;
2921 src_reg pos_w = pos;
2922 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2923 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2924
2925 dst_reg ndc_xyz = ndc;
2926 ndc_xyz.writemask = WRITEMASK_XYZ;
2927
2928 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2929 }
2930
2931 void
2932 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2933 {
2934 if (brw->gen < 6 &&
2935 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2936 key->userclip_active || brw->has_negative_rhw_bug)) {
2937 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2938 dst_reg header1_w = header1;
2939 header1_w.writemask = WRITEMASK_W;
2940
2941 emit(MOV(header1, 0u));
2942
2943 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2944 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2945
2946 current_annotation = "Point size";
2947 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2948 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2949 }
2950
2951 if (key->userclip_active) {
2952 current_annotation = "Clipping flags";
2953 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2954 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2955
2956 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2957 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2958 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2959
2960 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2961 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2962 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2963 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2964 }
2965
2966 /* i965 clipping workaround:
2967 * 1) Test for -ve rhw
2968 * 2) If set,
2969 * set ndc = (0,0,0,0)
2970 * set ucp[6] = 1
2971 *
2972 * Later, clipping will detect ucp[6] and ensure the primitive is
2973 * clipped against all fixed planes.
2974 */
2975 if (brw->has_negative_rhw_bug) {
2976 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2977 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2978 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2979 vec4_instruction *inst;
2980 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2981 inst->predicate = BRW_PREDICATE_NORMAL;
2982 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2983 inst->predicate = BRW_PREDICATE_NORMAL;
2984 }
2985
2986 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2987 } else if (brw->gen < 6) {
2988 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2989 } else {
2990 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2991 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2992 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2993 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2994 }
2995 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2996 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2997 src_reg(output_reg[VARYING_SLOT_LAYER])));
2998 }
2999 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3000 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
3001 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3002 }
3003 }
3004 }
3005
3006 void
3007 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3008 {
3009 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3010 *
3011 * "If a linked set of shaders forming the vertex stage contains no
3012 * static write to gl_ClipVertex or gl_ClipDistance, but the
3013 * application has requested clipping against user clip planes through
3014 * the API, then the coordinate written to gl_Position is used for
3015 * comparison against the user clip planes."
3016 *
3017 * This function is only called if the shader didn't write to
3018 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3019 * if the user wrote to it; otherwise we use gl_Position.
3020 */
3021 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3022 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3023 clip_vertex = VARYING_SLOT_POS;
3024 }
3025
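/* Each call emits up to four DP4s.  As used by emit_vertex() below, offset
 * 0 fills CLIP_DIST0 with distances against user planes 0-3 and offset 4
 * fills CLIP_DIST1 for planes 4-7.
 */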
3026 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3027 ++i) {
3028 reg.writemask = 1 << i;
3029 emit(DP4(reg,
3030 src_reg(output_reg[clip_vertex]),
3031 src_reg(this->userplane[i + offset])));
3032 }
3033 }
3034
3035 void
3036 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3037 {
3038 assert (varying < VARYING_SLOT_MAX);
3039 reg.type = output_reg[varying].type;
3040 current_annotation = output_reg_annotation[varying];
3041 /* Copy the register, saturating if necessary */
3042 vec4_instruction *inst = emit(MOV(reg,
3043 src_reg(output_reg[varying])));
3044 if ((varying == VARYING_SLOT_COL0 ||
3045 varying == VARYING_SLOT_COL1 ||
3046 varying == VARYING_SLOT_BFC0 ||
3047 varying == VARYING_SLOT_BFC1) &&
3048 key->clamp_vertex_color) {
3049 inst->saturate = true;
3050 }
3051 }
3052
3053 void
3054 vec4_visitor::emit_urb_slot(int mrf, int varying)
3055 {
3056 struct brw_reg hw_reg = brw_message_reg(mrf);
3057 dst_reg reg = dst_reg(MRF, mrf);
3058 reg.type = BRW_REGISTER_TYPE_F;
3059
3060 switch (varying) {
3061 case VARYING_SLOT_PSIZ:
3062 /* PSIZ is always in slot 0, and is coupled with other flags. */
3063 current_annotation = "indices, point width, clip flags";
3064 emit_psiz_and_flags(hw_reg);
3065 break;
3066 case BRW_VARYING_SLOT_NDC:
3067 current_annotation = "NDC";
3068 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3069 break;
3070 case VARYING_SLOT_POS:
3071 current_annotation = "gl_Position";
3072 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3073 break;
3074 case VARYING_SLOT_EDGE:
3075 /* This is present when doing unfilled polygons. We're supposed to copy
3076 * the edge flag from the user-provided vertex array
3077 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3078 * of that attribute (starts as 1.0f). This is then used in clipping to
3079 * determine which edges should be drawn as wireframe.
3080 */
3081 current_annotation = "edge flag";
3082 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3083 glsl_type::float_type, WRITEMASK_XYZW))));
3084 break;
3085 case BRW_VARYING_SLOT_PAD:
3086 /* No need to write to this slot */
3087 break;
3088 default:
3089 emit_generic_urb_slot(reg, varying);
3090 break;
3091 }
3092 }
3093
3094 static int
3095 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3096 {
3097 if (brw->gen >= 6) {
3098 /* URB data written (does not include the message header reg) must
3099 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3100 * section 5.4.3.2.2: URB_INTERLEAVED.
3101 *
3102 * URB entries are allocated on a multiple of 1024 bits, so an
3103 * extra 128 bits written here to make the end align to 256 is
3104 * no problem.
3105 */
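/* For example (illustrative), an mlen of 4 (header plus 3 data registers)
 * is bumped to 5 so that an even 4 data registers are written.
 */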
3106 if ((mlen % 2) != 1)
3107 mlen++;
3108 }
3109
3110 return mlen;
3111 }
3112
3113
3114 /**
3115 * Generates the VUE payload plus the necessary URB write instructions to
3116 * output it.
3117 *
3118 * The VUE layout is documented in Volume 2a.
3119 */
3120 void
3121 vec4_visitor::emit_vertex()
3122 {
3123 /* MRF 0 is reserved for the debugger, so start with message header
3124 * in MRF 1.
3125 */
3126 int base_mrf = 1;
3127 int mrf = base_mrf;
3128 /* In the process of generating our URB write message contents, we
3129 * may need to unspill a register or load from an array. Those
3130 * reads would use MRFs 14-15.
3131 */
3132 int max_usable_mrf = 13;
3133
3134 /* The following assertion verifies that max_usable_mrf causes an
3135 * even number of URB write data registers, which will meet gen6's
3136 * requirements for length alignment.
3137 */
3138 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3139
3140 /* First mrf is the g0-based message header containing URB handles and
3141 * such.
3142 */
3143 emit_urb_write_header(mrf++);
3144
3145 if (brw->gen < 6) {
3146 emit_ndc_computation();
3147 }
3148
3149 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3150 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3151 current_annotation = "user clip distances";
3152
3153 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3154 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3155
3156 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3157 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3158 }
3159
3160 /* We may need to split this up into several URB writes, so do them in a
3161 * loop.
3162 */
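/* For example (illustrative), with 20 VUE slots and data MRFs 2..13
 * available per write, the first URB write covers slots 0-11 and a second
 * write at URB row offset 6 covers the remaining slots.
 */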
3163 int slot = 0;
3164 bool complete = false;
3165 do {
3166 /* URB offset is in URB row increments, and each of our MRFs is half of
3167 * one of those, since we're doing interleaved writes.
3168 */
3169 int offset = slot / 2;
3170
3171 mrf = base_mrf + 1;
3172 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3173 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3174
3175 /* If this was max_usable_mrf, we can't fit anything more into this
3176 * URB WRITE.
3177 */
3178 if (mrf > max_usable_mrf) {
3179 slot++;
3180 break;
3181 }
3182 }
3183
3184 complete = slot >= prog_data->vue_map.num_slots;
3185 current_annotation = "URB write";
3186 vec4_instruction *inst = emit_urb_write_opcode(complete);
3187 inst->base_mrf = base_mrf;
3188 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3189 inst->offset += offset;
3190 } while(!complete);
3191 }
3192
3193
3194 src_reg
3195 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3196 src_reg *reladdr, int reg_offset)
3197 {
3198 /* Because we store the values to scratch interleaved like our
3199 * vertex data, we need to scale the vec4 index by 2.
3200 */
3201 int message_header_scale = 2;
3202
3203 /* Pre-gen6, the message header uses byte offsets instead of vec4
3204 * (16-byte) offset units.
3205 */
3206 if (brw->gen < 6)
3207 message_header_scale *= 16;
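/* For example (illustrative), scratch vec4 index 3 becomes a message
 * offset of 3 * 2 = 6 on Gen6+, or 3 * 2 * 16 = 96 bytes before Gen6.
 */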
3208
3209 if (reladdr) {
3210 src_reg index = src_reg(this, glsl_type::int_type);
3211
3212 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3213 emit_before(inst, MUL(dst_reg(index),
3214 index, src_reg(message_header_scale)));
3215
3216 return index;
3217 } else {
3218 return src_reg(reg_offset * message_header_scale);
3219 }
3220 }
3221
3222 src_reg
3223 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3224 src_reg *reladdr, int reg_offset)
3225 {
3226 if (reladdr) {
3227 src_reg index = src_reg(this, glsl_type::int_type);
3228
3229 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3230
3231 /* Pre-gen6, the message header uses byte offsets instead of vec4
3232 * (16-byte) offset units.
3233 */
3234 if (brw->gen < 6) {
3235 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3236 }
3237
3238 return index;
3239 } else if (brw->gen >= 8) {
3240 /* Store the offset in a GRF so we can send-from-GRF. */
3241 src_reg offset = src_reg(this, glsl_type::int_type);
3242 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3243 return offset;
3244 } else {
3245 int message_header_scale = brw->gen < 6 ? 16 : 1;
3246 return src_reg(reg_offset * message_header_scale);
3247 }
3248 }
3249
3250 /**
3251 * Emits an instruction before @inst to load the value named by @orig_src
3252 * from scratch space at @base_offset to @temp.
3253 *
3254 * @base_offset is measured in 32-byte units (the size of a register).
3255 */
3256 void
3257 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3258 dst_reg temp, src_reg orig_src,
3259 int base_offset)
3260 {
3261 int reg_offset = base_offset + orig_src.reg_offset;
3262 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3263
3264 emit_before(inst, SCRATCH_READ(temp, index));
3265 }
3266
3267 /**
3268 * Emits an instruction after @inst to store the value to be written
3269 * to @orig_dst to scratch space at @base_offset, from @temp.
3270 *
3271 * @base_offset is measured in 32-byte units (the size of a register).
3272 */
3273 void
3274 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3275 {
3276 int reg_offset = base_offset + inst->dst.reg_offset;
3277 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3278
3279 /* Create a temporary register to store *inst's result in.
3280 *
3281 * We have to be careful in MOVing from our temporary result register in
3282 * the scratch write. If we swizzle from channels of the temporary that
3283 * weren't initialized, it will confuse live interval analysis, which will
3284 * make spilling fail to make progress.
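 *
 * For example (illustrative), if the instruction writes only .y, the
 * scratch MOV below reads temp.yyyy so no uninitialized channel is
 * referenced.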
3285 */
3286 src_reg temp = src_reg(this, glsl_type::vec4_type);
3287 temp.type = inst->dst.type;
3288 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3289 int swizzles[4];
3290 for (int i = 0; i < 4; i++)
3291 if (inst->dst.writemask & (1 << i))
3292 swizzles[i] = i;
3293 else
3294 swizzles[i] = first_writemask_chan;
3295 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3296 swizzles[2], swizzles[3]);
3297
3298 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3299 inst->dst.writemask));
3300 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3301 write->predicate = inst->predicate;
3302 write->ir = inst->ir;
3303 write->annotation = inst->annotation;
3304 inst->insert_after(write);
3305
3306 inst->dst.file = temp.file;
3307 inst->dst.reg = temp.reg;
3308 inst->dst.reg_offset = temp.reg_offset;
3309 inst->dst.reladdr = NULL;
3310 }
3311
3312 /**
3313 * We can't generally support array access in GRF space, because a
3314 * single instruction's destination can only span 2 contiguous
3315 * registers. So, we send all GRF arrays that get variable index
3316 * access to scratch space.
3317 */
3318 void
3319 vec4_visitor::move_grf_array_access_to_scratch()
3320 {
3321 int scratch_loc[this->virtual_grf_count];
3322
3323 for (int i = 0; i < this->virtual_grf_count; i++) {
3324 scratch_loc[i] = -1;
3325 }
3326
3327 /* First, calculate the set of virtual GRFs that need to be punted
3328 * to scratch due to having any array access on them, and where in
3329 * scratch.
3330 */
3331 foreach_in_list(vec4_instruction, inst, &instructions) {
3332 if (inst->dst.file == GRF && inst->dst.reladdr &&
3333 scratch_loc[inst->dst.reg] == -1) {
3334 scratch_loc[inst->dst.reg] = c->last_scratch;
3335 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3336 }
3337
3338 for (int i = 0 ; i < 3; i++) {
3339 src_reg *src = &inst->src[i];
3340
3341 if (src->file == GRF && src->reladdr &&
3342 scratch_loc[src->reg] == -1) {
3343 scratch_loc[src->reg] = c->last_scratch;
3344 c->last_scratch += this->virtual_grf_sizes[src->reg];
3345 }
3346 }
3347 }
3348
3349 /* Now, for anything that will be accessed through scratch, rewrite
3350 * it to load/store. Note that this is a _safe list walk, because
3351 * we may generate a new scratch_write instruction after the one
3352 * we're processing.
3353 */
3354 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3355 /* Set up the annotation tracking for newly generated instructions. */
3356 base_ir = inst->ir;
3357 current_annotation = inst->annotation;
3358
3359 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3360 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3361 }
3362
3363 for (int i = 0 ; i < 3; i++) {
3364 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3365 continue;
3366
3367 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3368
3369 emit_scratch_read(inst, temp, inst->src[i],
3370 scratch_loc[inst->src[i].reg]);
3371
3372 inst->src[i].file = temp.file;
3373 inst->src[i].reg = temp.reg;
3374 inst->src[i].reg_offset = temp.reg_offset;
3375 inst->src[i].reladdr = NULL;
3376 }
3377 }
3378 }
3379
3380 /**
3381 * Emits an instruction before @inst to load the value named by @orig_src
3382 * from the pull constant buffer (surface) at @base_offset to @temp.
3383 */
3384 void
3385 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3386 dst_reg temp, src_reg orig_src,
3387 int base_offset)
3388 {
3389 int reg_offset = base_offset + orig_src.reg_offset;
3390 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3391 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3392 vec4_instruction *load;
3393
3394 if (brw->gen >= 7) {
3395 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3396 grf_offset.type = offset.type;
3397 emit_before(inst, MOV(grf_offset, offset));
3398
3399 load = new(mem_ctx) vec4_instruction(this,
3400 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3401 temp, index, src_reg(grf_offset));
3402 } else {
3403 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3404 temp, index, offset);
3405 load->base_mrf = 14;
3406 load->mlen = 1;
3407 }
3408 emit_before(inst, load);
3409 }
3410
3411 /**
3412 * Implements array access of uniforms by inserting a
3413 * PULL_CONSTANT_LOAD instruction.
3414 *
3415 * Unlike temporary GRF array access (where we don't support it due to
3416 * the difficulty of doing relative addressing on instruction
3417 * destinations), we could potentially do array access of uniforms
3418 * that were loaded in GRF space as push constants. In real-world
3419 * usage we've seen, though, the arrays being used are always larger
3420 * than we could load as push constants, so just always move all
3421 * uniform array access out to a pull constant buffer.
3422 */
3423 void
3424 vec4_visitor::move_uniform_array_access_to_pull_constants()
3425 {
3426 int pull_constant_loc[this->uniforms];
3427
3428 for (int i = 0; i < this->uniforms; i++) {
3429 pull_constant_loc[i] = -1;
3430 }
3431
3432 /* Walk through and find array access of uniforms. Put a copy of that
3433 * uniform in the pull constant buffer.
3434 *
3435 * Note that we don't move constant-indexed accesses to arrays. No
3436 * testing has been done of the performance impact of this choice.
3437 */
3438 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3439 for (int i = 0 ; i < 3; i++) {
3440 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3441 continue;
3442
3443 int uniform = inst->src[i].reg;
3444
3445 /* If this array isn't already present in the pull constant buffer,
3446 * add it.
3447 */
3448 if (pull_constant_loc[uniform] == -1) {
3449 const gl_constant_value **values =
3450 &stage_prog_data->param[uniform * 4];
3451
3452 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3453
3454 assert(uniform < uniform_array_size);
3455 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3456 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3457 = values[j];
3458 }
3459 }
3460
3461 /* Set up the annotation tracking for newly generated instructions. */
3462 base_ir = inst->ir;
3463 current_annotation = inst->annotation;
3464
3465 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3466
3467 emit_pull_constant_load(inst, temp, inst->src[i],
3468 pull_constant_loc[uniform]);
3469
3470 inst->src[i].file = temp.file;
3471 inst->src[i].reg = temp.reg;
3472 inst->src[i].reg_offset = temp.reg_offset;
3473 inst->src[i].reladdr = NULL;
3474 }
3475 }
3476
3477 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3478 * no need to track them as larger-than-vec4 objects. This will be
3479 * relied on in cutting out unused uniform vectors from push
3480 * constants.
3481 */
3482 split_uniform_registers();
3483 }
3484
3485 void
3486 vec4_visitor::resolve_ud_negate(src_reg *reg)
3487 {
3488 if (reg->type != BRW_REGISTER_TYPE_UD ||
3489 !reg->negate)
3490 return;
3491
3492 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3493 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3494 *reg = temp;
3495 }
3496
3497 vec4_visitor::vec4_visitor(struct brw_context *brw,
3498 struct brw_vec4_compile *c,
3499 struct gl_program *prog,
3500 const struct brw_vec4_prog_key *key,
3501 struct brw_vec4_prog_data *prog_data,
3502 struct gl_shader_program *shader_prog,
3503 gl_shader_stage stage,
3504 void *mem_ctx,
3505 bool debug_flag,
3506 bool no_spills,
3507 shader_time_shader_type st_base,
3508 shader_time_shader_type st_written,
3509 shader_time_shader_type st_reset)
3510 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3511 c(c),
3512 key(key),
3513 prog_data(prog_data),
3514 sanity_param_count(0),
3515 fail_msg(NULL),
3516 first_non_payload_grf(0),
3517 need_all_constants_in_pull_buffer(false),
3518 debug_flag(debug_flag),
3519 no_spills(no_spills),
3520 st_base(st_base),
3521 st_written(st_written),
3522 st_reset(st_reset)
3523 {
3524 this->mem_ctx = mem_ctx;
3525 this->failed = false;
3526
3527 this->base_ir = NULL;
3528 this->current_annotation = NULL;
3529 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3530
3531 this->variable_ht = hash_table_ctor(0,
3532 hash_table_pointer_hash,
3533 hash_table_pointer_compare);
3534
3535 this->virtual_grf_start = NULL;
3536 this->virtual_grf_end = NULL;
3537 this->virtual_grf_sizes = NULL;
3538 this->virtual_grf_count = 0;
3539 this->virtual_grf_reg_map = NULL;
3540 this->virtual_grf_reg_count = 0;
3541 this->virtual_grf_array_size = 0;
3542 this->live_intervals_valid = false;
3543
3544 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3545
3546 this->uniforms = 0;
3547
3548 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3549 * at least one. See setup_uniforms() in brw_vec4.cpp.
3550 */
3551 this->uniform_array_size = 1;
3552 if (prog_data) {
3553 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3554 }
3555
3556 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3557 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3558 }
3559
3560 vec4_visitor::~vec4_visitor()
3561 {
3562 hash_table_dtor(this->variable_ht);
3563 }
3564
3565
3566 void
3567 vec4_visitor::fail(const char *format, ...)
3568 {
3569 va_list va;
3570 char *msg;
3571
3572 if (failed)
3573 return;
3574
3575 failed = true;
3576
3577 va_start(va, format);
3578 msg = ralloc_vasprintf(mem_ctx, format, va);
3579 va_end(va);
3580 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3581
3582 this->fail_msg = msg;
3583
3584 if (debug_flag) {
3585 fprintf(stderr, "%s", msg);
3586 }
3587 }
3588
3589 } /* namespace brw */