i965/vec4: Preserve CFG in spill_reg().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
83 src_reg src0, src_reg src1, src_reg src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
86 src0, src1, src2));
87 }
88
89
90 vec4_instruction *
91 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* Original gen4 does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
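   /* elements is 2, 3 or 4 here, selecting DP2, DP3 or DP4 respectively. */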
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 /* The gen6 math instruction ignores the source modifiers --
313 * swizzle, abs, negate, and at least some parts of the register
314 * region description.
315 *
316 * Rather than trying to enumerate all these cases, *always* expand the
317 * operand to a temp GRF for gen6.
318 *
319 * For gen7, keep the operand as-is, except if immediate, which gen7 still
320 * can't use.
321 */
322
323 if (brw->gen == 7 && src.file != IMM)
324 return src;
325
326 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
327 expanded.type = src.type;
328 emit(MOV(expanded, src));
329 return src_reg(expanded);
330 }
331
332 void
333 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
334 {
335 src = fix_math_operand(src);
336
337 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
338 /* The gen6 math instruction must be align1, so we can't do
339 * writemasks.
340 */
341 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
342
343 emit(opcode, temp_dst, src);
344
345 emit(MOV(dst, src_reg(temp_dst)));
346 } else {
347 emit(opcode, dst, src);
348 }
349 }
350
351 void
352 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
353 {
354 vec4_instruction *inst = emit(opcode, dst, src);
355 inst->base_mrf = 1;
356 inst->mlen = 1;
357 }
358
359 void
360 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
361 {
362 switch (opcode) {
363 case SHADER_OPCODE_RCP:
364 case SHADER_OPCODE_RSQ:
365 case SHADER_OPCODE_SQRT:
366 case SHADER_OPCODE_EXP2:
367 case SHADER_OPCODE_LOG2:
368 case SHADER_OPCODE_SIN:
369 case SHADER_OPCODE_COS:
370 break;
371 default:
372 unreachable("not reached: bad math opcode");
373 }
374
375 if (brw->gen >= 8) {
376 emit(opcode, dst, src);
377 } else if (brw->gen >= 6) {
378 emit_math1_gen6(opcode, dst, src);
379 } else {
380 emit_math1_gen4(opcode, dst, src);
381 }
382 }
383
384 void
385 vec4_visitor::emit_math2_gen6(enum opcode opcode,
386 dst_reg dst, src_reg src0, src_reg src1)
387 {
388 src0 = fix_math_operand(src0);
389 src1 = fix_math_operand(src1);
390
391 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
392 /* The gen6 math instruction must be align1, so we can't do
393 * writemasks.
394 */
395 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
396 temp_dst.type = dst.type;
397
398 emit(opcode, temp_dst, src0, src1);
399
400 emit(MOV(dst, src_reg(temp_dst)));
401 } else {
402 emit(opcode, dst, src0, src1);
403 }
404 }
405
406 void
407 vec4_visitor::emit_math2_gen4(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 vec4_instruction *inst = emit(opcode, dst, src0, src1);
411 inst->base_mrf = 1;
412 inst->mlen = 2;
413 }
414
415 void
416 vec4_visitor::emit_math(enum opcode opcode,
417 dst_reg dst, src_reg src0, src_reg src1)
418 {
419 switch (opcode) {
420 case SHADER_OPCODE_POW:
421 case SHADER_OPCODE_INT_QUOTIENT:
422 case SHADER_OPCODE_INT_REMAINDER:
423 break;
424 default:
425 unreachable("not reached: unsupported binary math opcode");
426 }
427
428 if (brw->gen >= 8) {
429 emit(opcode, dst, src0, src1);
430 } else if (brw->gen >= 6) {
431 emit_math2_gen6(opcode, dst, src0, src1);
432 } else {
433 emit_math2_gen4(opcode, dst, src0, src1);
434 }
435 }
436
437 void
438 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_pack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_UD);
445 assert(src0.type == BRW_REGISTER_TYPE_F);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the destination data type must be Word (W).
451 *
452 * The destination must be DWord-aligned and specify a horizontal stride
453 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
454 * each destination channel and the upper word is not modified.
455 *
456 * The above restriction implies that the f32to16 instruction must use
457 * align1 mode, because only in align1 mode is it possible to specify
458 * horizontal stride. We choose here to defy the hardware docs and emit
459 * align16 instructions.
460 *
461 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
462 * instructions. I was partially successful in that the code passed all
463 * tests. However, the code was dubiously correct and fragile, and the
464 * tests were not harsh enough to probe that frailty. Not trusting the
465 * code, I chose instead to remain in align16 mode in defiance of the hw
466 * docs).
467 *
468 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
469 * simulator, emitting a f32to16 in align16 mode with UD as destination
470 * data type is safe. The behavior differs from that specified in the PRM
471 * in that the upper word of each destination channel is cleared to 0.
472 */
473
474 dst_reg tmp_dst(this, glsl_type::uvec2_type);
475 src_reg tmp_src(tmp_dst);
476
477 #if 0
478 /* Verify the undocumented behavior on which the following instructions
479 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
480 * then the result of the bit-or instruction below will be incorrect.
481 *
482 * You should inspect the disasm output in order to verify that the MOV is
483 * not optimized away.
484 */
485 emit(MOV(tmp_dst, src_reg(0x12345678u)));
486 #endif
487
488 /* Give tmp the form below, where "." means untouched.
489 *
490 * w z y x w z y x
491 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
492 *
493 * The upper word of each write-channel must be 0 for the following
494 * bit-shift and bit-or instructions to work. Note that this
495 * relies on the undocumented hardware behavior mentioned above.
496 */
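   /* For example, packing vec2(1.0, 2.0): f32to16 leaves 0x3c00 in the low
    * word of x and 0x4000 in the low word of y, and the SHL and OR below
    * combine them into 0x40003c00, the packHalf2x16 result.
    */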
497 tmp_dst.writemask = WRITEMASK_XY;
498 emit(F32TO16(tmp_dst, src0));
499
500 /* Give the write-channels of dst the form:
501 * 0xhhhh0000
502 */
503 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
504 emit(SHL(dst, tmp_src, src_reg(16u)));
505
506 /* Finally, give the write-channels of dst the form of packHalf2x16's
507 * output:
508 * 0xhhhhllll
509 */
510 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
511 emit(OR(dst, src_reg(dst), tmp_src));
512 }
513
514 void
515 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
516 {
517 if (brw->gen < 7) {
518 unreachable("ir_unop_unpack_half_2x16 should be lowered");
519 }
520
521 assert(dst.type == BRW_REGISTER_TYPE_F);
522 assert(src0.type == BRW_REGISTER_TYPE_UD);
523
524 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
525 *
526 * Because this instruction does not have a 16-bit floating-point type,
527 * the source data type must be Word (W). The destination type must be
528 * F (Float).
529 *
530 * To use W as the source data type, we must adjust horizontal strides,
531 * which is only possible in align1 mode. All my [chadv] attempts at
532 * emitting align1 instructions for unpackHalf2x16 failed to pass the
533 * Piglit tests, so I gave up.
534 *
535 * I've verified that, on gen7 hardware and the simulator, it is safe to
536 * emit f16to32 in align16 mode with UD as source data type.
537 */
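   /* For example, unpacking 0x40003c00: the AND below leaves 0x3c00 in x,
    * the SHR leaves 0x4000 in y, and f16to32 expands them to vec2(1.0, 2.0).
    */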
538
539 dst_reg tmp_dst(this, glsl_type::uvec2_type);
540 src_reg tmp_src(tmp_dst);
541
542 tmp_dst.writemask = WRITEMASK_X;
543 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
544
545 tmp_dst.writemask = WRITEMASK_Y;
546 emit(SHR(tmp_dst, src0, src_reg(16u)));
547
548 dst.writemask = WRITEMASK_XY;
549 emit(F16TO32(dst, tmp_src));
550 }
551
552 void
553 vec4_visitor::visit_instructions(const exec_list *list)
554 {
555 foreach_in_list(ir_instruction, ir, list) {
556 base_ir = ir;
557 ir->accept(this);
558 }
559 }
560
561
562 static int
563 type_size(const struct glsl_type *type)
564 {
565 unsigned int i;
566 int size;
567
568 switch (type->base_type) {
569 case GLSL_TYPE_UINT:
570 case GLSL_TYPE_INT:
571 case GLSL_TYPE_FLOAT:
572 case GLSL_TYPE_BOOL:
573 if (type->is_matrix()) {
574 return type->matrix_columns;
575 } else {
576 /* Regardless of size of vector, it gets a vec4. This is bad
577 * packing for things like floats, but otherwise arrays become a
578 * mess. Hopefully a later pass over the code can pack scalars
579 * down if appropriate.
580 */
581 return 1;
582 }
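      /* So a float, a bool and a vec4 each take one vec4 slot, while a
       * mat3 takes three (one per column).
       */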
583 case GLSL_TYPE_ARRAY:
584 assert(type->length > 0);
585 return type_size(type->fields.array) * type->length;
586 case GLSL_TYPE_STRUCT:
587 size = 0;
588 for (i = 0; i < type->length; i++) {
589 size += type_size(type->fields.structure[i].type);
590 }
591 return size;
592 case GLSL_TYPE_SAMPLER:
593 /* Samplers take up no register space, since they're baked in at
594 * link time.
595 */
596 return 0;
597 case GLSL_TYPE_ATOMIC_UINT:
598 return 0;
599 case GLSL_TYPE_IMAGE:
600 case GLSL_TYPE_VOID:
601 case GLSL_TYPE_ERROR:
602 case GLSL_TYPE_INTERFACE:
603 unreachable("not reached");
604 }
605
606 return 0;
607 }
608
609 int
610 vec4_visitor::virtual_grf_alloc(int size)
611 {
612 if (virtual_grf_array_size <= virtual_grf_count) {
613 if (virtual_grf_array_size == 0)
614 virtual_grf_array_size = 16;
615 else
616 virtual_grf_array_size *= 2;
617 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
618 virtual_grf_array_size);
619 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
620 virtual_grf_array_size);
621 }
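   /* virtual_grf_reg_map[n] records where virtual GRF n starts in the
    * flattened per-register space, and virtual_grf_sizes[n] how many
    * registers it spans.
    */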
622 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
623 virtual_grf_reg_count += size;
624 virtual_grf_sizes[virtual_grf_count] = size;
625 return virtual_grf_count++;
626 }
627
628 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
629 {
630 init();
631
632 this->file = GRF;
633 this->reg = v->virtual_grf_alloc(type_size(type));
634
635 if (type->is_array() || type->is_record()) {
636 this->swizzle = BRW_SWIZZLE_NOOP;
637 } else {
638 this->swizzle = swizzle_for_size(type->vector_elements);
639 }
640
641 this->type = brw_type_for_base_type(type);
642 }
643
644 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
645 {
646 assert(size > 0);
647
648 init();
649
650 this->file = GRF;
651 this->reg = v->virtual_grf_alloc(type_size(type) * size);
652
653 this->swizzle = BRW_SWIZZLE_NOOP;
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
659 {
660 init();
661
662 this->file = GRF;
663 this->reg = v->virtual_grf_alloc(type_size(type));
664
665 if (type->is_array() || type->is_record()) {
666 this->writemask = WRITEMASK_XYZW;
667 } else {
668 this->writemask = (1 << type->vector_elements) - 1;
669 }
670
671 this->type = brw_type_for_base_type(type);
672 }
673
674 /* Our support for uniforms is piggy-backed on the struct
675 * gl_program, because that's where the values actually
676 * get stored, rather than in some global gl_shader_program uniform
677 * store.
678 */
679 void
680 vec4_visitor::setup_uniform_values(ir_variable *ir)
681 {
682 int namelen = strlen(ir->name);
683
684 /* The data for our (non-builtin) uniforms is stored in a series of
685 * gl_uniform_driver_storage structs for each subcomponent that
686 * glGetUniformLocation() could name. We know it's been set up in the same
687 * order we'd walk the type, so walk the list of storage and find anything
688 * with our name, or the prefix of a component that starts with our name.
689 */
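   /* For example, a uniform named "light" matches storage entries such as
    * "light", "light[2]" or "light.color", but not "lightmap", since the
    * character after the prefix must be '\0', '.' or '['.
    */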
690 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
691 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
692
693 if (strncmp(ir->name, storage->name, namelen) != 0 ||
694 (storage->name[namelen] != 0 &&
695 storage->name[namelen] != '.' &&
696 storage->name[namelen] != '[')) {
697 continue;
698 }
699
700 gl_constant_value *components = storage->storage;
701 unsigned vector_count = (MAX2(storage->array_elements, 1) *
702 storage->type->matrix_columns);
703
704 for (unsigned s = 0; s < vector_count; s++) {
705 assert(uniforms < uniform_array_size);
706 uniform_vector_size[uniforms] = storage->type->vector_elements;
707
708 int i;
709 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
710 stage_prog_data->param[uniforms * 4 + i] = components;
711 components++;
712 }
713 for (; i < 4; i++) {
714 static gl_constant_value zero = { 0.0 };
715 stage_prog_data->param[uniforms * 4 + i] = &zero;
716 }
717
718 uniforms++;
719 }
720 }
721 }
722
723 void
724 vec4_visitor::setup_uniform_clipplane_values()
725 {
726 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
727
728 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
729 assert(this->uniforms < uniform_array_size);
730 this->uniform_vector_size[this->uniforms] = 4;
731 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
732 this->userplane[i].type = BRW_REGISTER_TYPE_F;
733 for (int j = 0; j < 4; ++j) {
734 stage_prog_data->param[this->uniforms * 4 + j] =
735 (gl_constant_value *) &clip_planes[i][j];
736 }
737 ++this->uniforms;
738 }
739 }
740
741 /* Our support for builtin uniforms is even scarier than non-builtin.
742 * It sits on top of the PROG_STATE_VAR parameters that are
743 * automatically updated from GL context state.
744 */
745 void
746 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
747 {
748 const ir_state_slot *const slots = ir->state_slots;
749 assert(ir->state_slots != NULL);
750
751 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
752 /* This state reference has already been setup by ir_to_mesa,
753 * but we'll get the same index back here. We can reference
754 * ParameterValues directly, since unlike brw_fs.cpp, we never
755 * add new state references during compile.
756 */
757 int index = _mesa_add_state_reference(this->prog->Parameters,
758 (gl_state_index *)slots[i].tokens);
759 gl_constant_value *values =
760 &this->prog->Parameters->ParameterValues[index][0];
761
762 assert(this->uniforms < uniform_array_size);
763 this->uniform_vector_size[this->uniforms] = 0;
764 /* Add each of the unique swizzled channels of the element.
765 * This will end up matching the size of the glsl_type of this field.
766 */
767 int last_swiz = -1;
768 for (unsigned int j = 0; j < 4; j++) {
769 int swiz = GET_SWZ(slots[i].swizzle, j);
770 last_swiz = swiz;
771
772 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
773 assert(this->uniforms < uniform_array_size);
774 if (swiz <= last_swiz)
775 this->uniform_vector_size[this->uniforms]++;
776 }
777 this->uniforms++;
778 }
779 }
780
781 dst_reg *
782 vec4_visitor::variable_storage(ir_variable *var)
783 {
784 return (dst_reg *)hash_table_find(this->variable_ht, var);
785 }
786
787 void
788 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
789 enum brw_predicate *predicate)
790 {
791 ir_expression *expr = ir->as_expression();
792
793 *predicate = BRW_PREDICATE_NORMAL;
794
795 if (expr && expr->operation != ir_binop_ubo_load) {
796 src_reg op[3];
797 vec4_instruction *inst;
798
799 assert(expr->get_num_operands() <= 3);
800 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
801 expr->operands[i]->accept(this);
802 op[i] = this->result;
803
804 resolve_ud_negate(&op[i]);
805 }
806
807 switch (expr->operation) {
808 case ir_unop_logic_not:
809 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
810 inst->conditional_mod = BRW_CONDITIONAL_Z;
811 break;
812
813 case ir_binop_logic_xor:
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 break;
817
818 case ir_binop_logic_or:
819 inst = emit(OR(dst_null_d(), op[0], op[1]));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 break;
822
823 case ir_binop_logic_and:
824 inst = emit(AND(dst_null_d(), op[0], op[1]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 break;
827
828 case ir_unop_f2b:
829 if (brw->gen >= 6) {
830 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
831 } else {
832 inst = emit(MOV(dst_null_f(), op[0]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 }
835 break;
836
837 case ir_unop_i2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_d(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_binop_all_equal:
847 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
848 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
849 break;
850
851 case ir_binop_any_nequal:
852 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
853 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
854 break;
855
856 case ir_unop_any:
857 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
858 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
859 break;
860
861 case ir_binop_greater:
862 case ir_binop_gequal:
863 case ir_binop_less:
864 case ir_binop_lequal:
865 case ir_binop_equal:
866 case ir_binop_nequal:
867 emit(CMP(dst_null_d(), op[0], op[1],
868 brw_conditional_for_comparison(expr->operation)));
869 break;
870
871 case ir_triop_csel: {
872 /* Expand the boolean condition into the flag register. */
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875
876 /* Select which boolean to return. */
877 dst_reg temp(this, expr->operands[1]->type);
878 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
879 inst->predicate = BRW_PREDICATE_NORMAL;
880
881 /* Expand the result to a condition code. */
882 inst = emit(MOV(dst_null_d(), src_reg(temp)));
883 inst->conditional_mod = BRW_CONDITIONAL_NZ;
884 break;
885 }
886
887 default:
888 unreachable("not reached");
889 }
890 return;
891 }
892
893 ir->accept(this);
894
895 resolve_ud_negate(&this->result);
896
897 if (brw->gen >= 6) {
898 vec4_instruction *inst = emit(AND(dst_null_d(),
899 this->result, src_reg(1)));
900 inst->conditional_mod = BRW_CONDITIONAL_NZ;
901 } else {
902 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904 }
905 }
906
907 /**
908 * Emit a gen6 IF statement with the comparison folded into the IF
909 * instruction.
910 */
911 void
912 vec4_visitor::emit_if_gen6(ir_if *ir)
913 {
914 ir_expression *expr = ir->condition->as_expression();
915
916 if (expr && expr->operation != ir_binop_ubo_load) {
917 src_reg op[3];
918 dst_reg temp;
919
920 assert(expr->get_num_operands() <= 3);
921 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
922 expr->operands[i]->accept(this);
923 op[i] = this->result;
924 }
925
926 switch (expr->operation) {
927 case ir_unop_logic_not:
928 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
929 return;
930
931 case ir_binop_logic_xor:
932 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_binop_logic_or:
936 temp = dst_reg(this, glsl_type::bool_type);
937 emit(OR(temp, op[0], op[1]));
938 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
939 return;
940
941 case ir_binop_logic_and:
942 temp = dst_reg(this, glsl_type::bool_type);
943 emit(AND(temp, op[0], op[1]));
944 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
945 return;
946
947 case ir_unop_f2b:
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
949 return;
950
951 case ir_unop_i2b:
952 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_binop_greater:
956 case ir_binop_gequal:
957 case ir_binop_less:
958 case ir_binop_lequal:
959 case ir_binop_equal:
960 case ir_binop_nequal:
961 emit(IF(op[0], op[1],
962 brw_conditional_for_comparison(expr->operation)));
963 return;
964
965 case ir_binop_all_equal:
966 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
967 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
968 return;
969
970 case ir_binop_any_nequal:
971 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
972 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
973 return;
974
975 case ir_unop_any:
976 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
977 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
978 return;
979
980 case ir_triop_csel: {
981 /* Expand the boolean condition into the flag register. */
982 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
983 inst->conditional_mod = BRW_CONDITIONAL_NZ;
984
985 /* Select which boolean to return. */
986 dst_reg temp(this, expr->operands[1]->type);
987 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
988 inst->predicate = BRW_PREDICATE_NORMAL;
989
990 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
991 return;
992 }
993
994 default:
995 unreachable("not reached");
996 }
997 return;
998 }
999
1000 ir->condition->accept(this);
1001
1002 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1003 }
1004
1005 void
1006 vec4_visitor::visit(ir_variable *ir)
1007 {
1008 dst_reg *reg = NULL;
1009
1010 if (variable_storage(ir))
1011 return;
1012
1013 switch (ir->data.mode) {
1014 case ir_var_shader_in:
1015 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1016 break;
1017
1018 case ir_var_shader_out:
1019 reg = new(mem_ctx) dst_reg(this, ir->type);
1020
1021 for (int i = 0; i < type_size(ir->type); i++) {
1022 output_reg[ir->data.location + i] = *reg;
1023 output_reg[ir->data.location + i].reg_offset = i;
1024 output_reg[ir->data.location + i].type =
1025 brw_type_for_base_type(ir->type->get_scalar_type());
1026 output_reg_annotation[ir->data.location + i] = ir->name;
1027 }
1028 break;
1029
1030 case ir_var_auto:
1031 case ir_var_temporary:
1032 reg = new(mem_ctx) dst_reg(this, ir->type);
1033 break;
1034
1035 case ir_var_uniform:
1036 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1037
1038 /* Thanks to the lower_ubo_reference pass, we will see only
1039 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1040 * variables, so no need for them to be in variable_ht.
1041 *
1042 * Some uniforms, such as samplers and atomic counters, have no actual
1043 * storage, so we should ignore them.
1044 */
1045 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1046 return;
1047
1048 /* Track how big the whole uniform variable is, in case we need to put a
1049 * copy of its data into pull constants for array access.
1050 */
1051 assert(this->uniforms < uniform_array_size);
1052 this->uniform_size[this->uniforms] = type_size(ir->type);
1053
1054 if (!strncmp(ir->name, "gl_", 3)) {
1055 setup_builtin_uniform_values(ir);
1056 } else {
1057 setup_uniform_values(ir);
1058 }
1059 break;
1060
1061 case ir_var_system_value:
1062 reg = make_reg_for_system_value(ir);
1063 break;
1064
1065 default:
1066 unreachable("not reached");
1067 }
1068
1069 reg->type = brw_type_for_base_type(ir->type);
1070 hash_table_insert(this->variable_ht, reg, ir);
1071 }
1072
1073 void
1074 vec4_visitor::visit(ir_loop *ir)
1075 {
1076 /* We don't want debugging output to print the whole body of the
1077 * loop as the annotation.
1078 */
1079 this->base_ir = NULL;
1080
1081 emit(BRW_OPCODE_DO);
1082
1083 visit_instructions(&ir->body_instructions);
1084
1085 emit(BRW_OPCODE_WHILE);
1086 }
1087
1088 void
1089 vec4_visitor::visit(ir_loop_jump *ir)
1090 {
1091 switch (ir->mode) {
1092 case ir_loop_jump::jump_break:
1093 emit(BRW_OPCODE_BREAK);
1094 break;
1095 case ir_loop_jump::jump_continue:
1096 emit(BRW_OPCODE_CONTINUE);
1097 break;
1098 }
1099 }
1100
1101
1102 void
1103 vec4_visitor::visit(ir_function_signature *)
1104 {
1105 unreachable("not reached");
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_function *ir)
1110 {
1111 /* Ignore function bodies other than main() -- we shouldn't see calls to
1112 * them since they should all be inlined.
1113 */
1114 if (strcmp(ir->name, "main") == 0) {
1115 const ir_function_signature *sig;
1116 exec_list empty;
1117
1118 sig = ir->matching_signature(NULL, &empty, false);
1119
1120 assert(sig);
1121
1122 visit_instructions(&sig->body);
1123 }
1124 }
1125
1126 bool
1127 vec4_visitor::try_emit_mad(ir_expression *ir)
1128 {
1129 /* 3-src instructions were introduced in gen6. */
1130 if (brw->gen < 6)
1131 return false;
1132
1133 /* MAD can only handle floating-point data. */
1134 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1135 return false;
1136
1137 ir_rvalue *nonmul = ir->operands[1];
1138 ir_expression *mul = ir->operands[0]->as_expression();
1139
1140 if (!mul || mul->operation != ir_binop_mul) {
1141 nonmul = ir->operands[0];
1142 mul = ir->operands[1]->as_expression();
1143
1144 if (!mul || mul->operation != ir_binop_mul)
1145 return false;
1146 }
1147
1148 nonmul->accept(this);
1149 src_reg src0 = fix_3src_operand(this->result);
1150
1151 mul->operands[0]->accept(this);
1152 src_reg src1 = fix_3src_operand(this->result);
1153
1154 mul->operands[1]->accept(this);
1155 src_reg src2 = fix_3src_operand(this->result);
1156
1157 this->result = src_reg(this, ir->type);
1158 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1159
1160 return true;
1161 }
1162
1163 bool
1164 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1165 {
1166 /* This optimization relies on CMP setting the destination to 0 when
1167 * false. Early hardware only sets the least significant bit, and
1168 * leaves the other bits undefined. So we can't use it.
1169 */
1170 if (brw->gen < 6)
1171 return false;
1172
1173 ir_expression *const cmp = ir->operands[0]->as_expression();
1174
1175 if (cmp == NULL)
1176 return false;
1177
1178 switch (cmp->operation) {
1179 case ir_binop_less:
1180 case ir_binop_greater:
1181 case ir_binop_lequal:
1182 case ir_binop_gequal:
1183 case ir_binop_equal:
1184 case ir_binop_nequal:
1185 break;
1186
1187 default:
1188 return false;
1189 }
1190
1191 cmp->operands[0]->accept(this);
1192 const src_reg cmp_src0 = this->result;
1193
1194 cmp->operands[1]->accept(this);
1195 const src_reg cmp_src1 = this->result;
1196
1197 this->result = src_reg(this, ir->type);
1198
1199 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1200 brw_conditional_for_comparison(cmp->operation)));
1201
1202 /* If the comparison is false, this->result will just happen to be zero.
1203 */
1204 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1205 this->result, src_reg(1.0f));
1206 inst->predicate = BRW_PREDICATE_NORMAL;
1207 inst->predicate_inverse = true;
1208
1209 return true;
1210 }
1211
1212 void
1213 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1214 src_reg src0, src_reg src1)
1215 {
1216 vec4_instruction *inst;
1217
1218 if (brw->gen >= 6) {
1219 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1220 inst->conditional_mod = conditionalmod;
1221 } else {
1222 emit(CMP(dst, src0, src1, conditionalmod));
1223
1224 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1225 inst->predicate = BRW_PREDICATE_NORMAL;
1226 }
1227 }
1228
1229 void
1230 vec4_visitor::emit_lrp(const dst_reg &dst,
1231 const src_reg &x, const src_reg &y, const src_reg &a)
1232 {
1233 if (brw->gen >= 6) {
1234 /* Note that the instruction's argument order is reversed from GLSL
1235 * and the IR.
1236 */
1237 emit(LRP(dst,
1238 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1239 } else {
1240 /* Earlier generations don't support three source operations, so we
1241 * need to emit x*(1-a) + y*a.
1242 */
1243 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1244 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1245 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1246 y_times_a.writemask = dst.writemask;
1247 one_minus_a.writemask = dst.writemask;
1248 x_times_one_minus_a.writemask = dst.writemask;
1249
1250 emit(MUL(y_times_a, y, a));
1251 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1252 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1253 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1254 }
1255 }
1256
1257 void
1258 vec4_visitor::visit(ir_expression *ir)
1259 {
1260 unsigned int operand;
1261 src_reg op[Elements(ir->operands)];
1262 src_reg result_src;
1263 dst_reg result_dst;
1264 vec4_instruction *inst;
1265
1266 if (ir->operation == ir_binop_add) {
1267 if (try_emit_mad(ir))
1268 return;
1269 }
1270
1271 if (ir->operation == ir_unop_b2f) {
1272 if (try_emit_b2f_of_compare(ir))
1273 return;
1274 }
1275
1276 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1277 this->result.file = BAD_FILE;
1278 ir->operands[operand]->accept(this);
1279 if (this->result.file == BAD_FILE) {
1280 fprintf(stderr, "Failed to get tree for expression operand:\n");
1281 ir->operands[operand]->fprint(stderr);
1282 exit(1);
1283 }
1284 op[operand] = this->result;
1285
1286 /* Matrix expression operands should have been broken down to vector
1287 * operations already.
1288 */
1289 assert(!ir->operands[operand]->type->is_matrix());
1290 }
1291
1292 int vector_elements = ir->operands[0]->type->vector_elements;
1293 if (ir->operands[1]) {
1294 vector_elements = MAX2(vector_elements,
1295 ir->operands[1]->type->vector_elements);
1296 }
1297
1298 this->result.file = BAD_FILE;
1299
1300 /* Storage for our result. Ideally for an assignment we'd be using
1301 * the actual storage for the result here, instead.
1302 */
1303 result_src = src_reg(this, ir->type);
1304 /* convenience for the emit functions below. */
1305 result_dst = dst_reg(result_src);
1306 /* If nothing special happens, this is the result. */
1307 this->result = result_src;
1308 /* Limit writes to the channels that will be used by result_src later.
1309 * This does limit this temp's use as a temporary for multi-instruction
1310 * sequences.
1311 */
1312 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1313
1314 switch (ir->operation) {
1315 case ir_unop_logic_not:
1316 if (ctx->Const.UniformBooleanTrue != 1) {
1317 emit(NOT(result_dst, op[0]));
1318 } else {
1319 emit(XOR(result_dst, op[0], src_reg(1)));
1320 }
1321 break;
1322 case ir_unop_neg:
1323 op[0].negate = !op[0].negate;
1324 emit(MOV(result_dst, op[0]));
1325 break;
1326 case ir_unop_abs:
1327 op[0].abs = true;
1328 op[0].negate = false;
1329 emit(MOV(result_dst, op[0]));
1330 break;
1331
1332 case ir_unop_sign:
1333 if (ir->type->is_float()) {
1334 /* AND(val, 0x80000000) gives the sign bit.
1335 *
1336 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1337 * zero.
1338 */
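      /* For example, -2.5f (0xc0200000) ANDs down to 0x80000000, and ORing
       * in 0x3f800000 gives 0xbf800000, i.e. -1.0f.
       */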
1339 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1340
1341 op[0].type = BRW_REGISTER_TYPE_UD;
1342 result_dst.type = BRW_REGISTER_TYPE_UD;
1343 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1344
1345 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1346 inst->predicate = BRW_PREDICATE_NORMAL;
1347
1348 this->result.type = BRW_REGISTER_TYPE_F;
1349 } else {
1350 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1351 * -> non-negative val generates 0x00000000.
1352 * Predicated OR sets 1 if val is positive.
1353 */
1354 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1355
1356 emit(ASR(result_dst, op[0], src_reg(31)));
1357
1358 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1359 inst->predicate = BRW_PREDICATE_NORMAL;
1360 }
1361 break;
1362
1363 case ir_unop_rcp:
1364 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1365 break;
1366
1367 case ir_unop_exp2:
1368 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1369 break;
1370 case ir_unop_log2:
1371 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1372 break;
1373 case ir_unop_exp:
1374 case ir_unop_log:
1375 unreachable("not reached: should be handled by ir_explog_to_explog2");
1376 case ir_unop_sin:
1377 case ir_unop_sin_reduced:
1378 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1379 break;
1380 case ir_unop_cos:
1381 case ir_unop_cos_reduced:
1382 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1383 break;
1384
1385 case ir_unop_dFdx:
1386 case ir_unop_dFdx_coarse:
1387 case ir_unop_dFdx_fine:
1388 case ir_unop_dFdy:
1389 case ir_unop_dFdy_coarse:
1390 case ir_unop_dFdy_fine:
1391 unreachable("derivatives not valid in vertex shader");
1392
1393 case ir_unop_bitfield_reverse:
1394 emit(BFREV(result_dst, op[0]));
1395 break;
1396 case ir_unop_bit_count:
1397 emit(CBIT(result_dst, op[0]));
1398 break;
1399 case ir_unop_find_msb: {
1400 src_reg temp = src_reg(this, glsl_type::uint_type);
1401
1402 inst = emit(FBH(dst_reg(temp), op[0]));
1403 inst->dst.writemask = WRITEMASK_XYZW;
1404
1405 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1406 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1407 * subtract the result from 31 to convert the MSB count into an LSB count.
1408 */
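      /* For example, findMSB(0x00000100) is 8: FBH returns 23 (counting from
       * the MSB side), and 31 - 23 = 8.
       */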
1409
1410 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1411 temp.swizzle = BRW_SWIZZLE_NOOP;
1412 emit(MOV(result_dst, temp));
1413
1414 src_reg src_tmp = src_reg(result_dst);
1415 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1416
1417 src_tmp.negate = true;
1418 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1419 inst->predicate = BRW_PREDICATE_NORMAL;
1420 break;
1421 }
1422 case ir_unop_find_lsb:
1423 emit(FBL(result_dst, op[0]));
1424 break;
1425 case ir_unop_saturate:
1426 inst = emit(MOV(result_dst, op[0]));
1427 inst->saturate = true;
1428 break;
1429
1430 case ir_unop_noise:
1431 unreachable("not reached: should be handled by lower_noise");
1432
1433 case ir_binop_add:
1434 emit(ADD(result_dst, op[0], op[1]));
1435 break;
1436 case ir_binop_sub:
1437 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1438
1439 case ir_binop_mul:
1440 if (brw->gen < 8 && ir->type->is_integer()) {
1441 /* For integer multiplication, the MUL uses the low 16 bits of one of
1442 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1443 * accumulates the contribution of the upper 16 bits of that
1444 * operand. If we can determine that one of the args is in the low
1445 * 16 bits, though, we can just emit a single MUL.
1446 */
1447 if (ir->operands[0]->is_uint16_constant()) {
1448 if (brw->gen < 7)
1449 emit(MUL(result_dst, op[0], op[1]));
1450 else
1451 emit(MUL(result_dst, op[1], op[0]));
1452 } else if (ir->operands[1]->is_uint16_constant()) {
1453 if (brw->gen < 7)
1454 emit(MUL(result_dst, op[1], op[0]));
1455 else
1456 emit(MUL(result_dst, op[0], op[1]));
1457 } else {
1458 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1459
1460 emit(MUL(acc, op[0], op[1]));
1461 emit(MACH(dst_null_d(), op[0], op[1]));
1462 emit(MOV(result_dst, src_reg(acc)));
1463 }
1464 } else {
1465 emit(MUL(result_dst, op[0], op[1]));
1466 }
1467 break;
1468 case ir_binop_imul_high: {
1469 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1470
1471 emit(MUL(acc, op[0], op[1]));
1472 emit(MACH(result_dst, op[0], op[1]));
1473 break;
1474 }
1475 case ir_binop_div:
1476 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1477 assert(ir->type->is_integer());
1478 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1479 break;
1480 case ir_binop_carry: {
1481 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1482
1483 emit(ADDC(dst_null_ud(), op[0], op[1]));
1484 emit(MOV(result_dst, src_reg(acc)));
1485 break;
1486 }
1487 case ir_binop_borrow: {
1488 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1489
1490 emit(SUBB(dst_null_ud(), op[0], op[1]));
1491 emit(MOV(result_dst, src_reg(acc)));
1492 break;
1493 }
1494 case ir_binop_mod:
1495 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1496 assert(ir->type->is_integer());
1497 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1498 break;
1499
1500 case ir_binop_less:
1501 case ir_binop_greater:
1502 case ir_binop_lequal:
1503 case ir_binop_gequal:
1504 case ir_binop_equal:
1505 case ir_binop_nequal: {
1506 emit(CMP(result_dst, op[0], op[1],
1507 brw_conditional_for_comparison(ir->operation)));
1508 if (ctx->Const.UniformBooleanTrue == 1) {
1509 emit(AND(result_dst, result_src, src_reg(1)));
1510 }
1511 break;
1512 }
1513
1514 case ir_binop_all_equal:
1515 /* "==" operator producing a scalar boolean. */
1516 if (ir->operands[0]->type->is_vector() ||
1517 ir->operands[1]->type->is_vector()) {
1518 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1519 emit(MOV(result_dst, src_reg(0)));
1520 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1521 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1522 } else {
1523 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1524 if (ctx->Const.UniformBooleanTrue == 1) {
1525 emit(AND(result_dst, result_src, src_reg(1)));
1526 }
1527 }
1528 break;
1529 case ir_binop_any_nequal:
1530 /* "!=" operator producing a scalar boolean. */
1531 if (ir->operands[0]->type->is_vector() ||
1532 ir->operands[1]->type->is_vector()) {
1533 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1534
1535 emit(MOV(result_dst, src_reg(0)));
1536 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1537 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1538 } else {
1539 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1540 if (ctx->Const.UniformBooleanTrue == 1) {
1541 emit(AND(result_dst, result_src, src_reg(1)));
1542 }
1543 }
1544 break;
1545
1546 case ir_unop_any:
1547 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1548 emit(MOV(result_dst, src_reg(0)));
1549
1550 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1551 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1552 break;
1553
1554 case ir_binop_logic_xor:
1555 emit(XOR(result_dst, op[0], op[1]));
1556 break;
1557
1558 case ir_binop_logic_or:
1559 emit(OR(result_dst, op[0], op[1]));
1560 break;
1561
1562 case ir_binop_logic_and:
1563 emit(AND(result_dst, op[0], op[1]));
1564 break;
1565
1566 case ir_binop_dot:
1567 assert(ir->operands[0]->type->is_vector());
1568 assert(ir->operands[0]->type == ir->operands[1]->type);
1569 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1570 break;
1571
1572 case ir_unop_sqrt:
1573 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1574 break;
1575 case ir_unop_rsq:
1576 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1577 break;
1578
1579 case ir_unop_bitcast_i2f:
1580 case ir_unop_bitcast_u2f:
1581 this->result = op[0];
1582 this->result.type = BRW_REGISTER_TYPE_F;
1583 break;
1584
1585 case ir_unop_bitcast_f2i:
1586 this->result = op[0];
1587 this->result.type = BRW_REGISTER_TYPE_D;
1588 break;
1589
1590 case ir_unop_bitcast_f2u:
1591 this->result = op[0];
1592 this->result.type = BRW_REGISTER_TYPE_UD;
1593 break;
1594
1595 case ir_unop_i2f:
1596 case ir_unop_i2u:
1597 case ir_unop_u2i:
1598 case ir_unop_u2f:
1599 case ir_unop_f2i:
1600 case ir_unop_f2u:
1601 emit(MOV(result_dst, op[0]));
1602 break;
1603 case ir_unop_b2i:
1604 if (ctx->Const.UniformBooleanTrue != 1) {
1605 emit(AND(result_dst, op[0], src_reg(1)));
1606 } else {
1607 emit(MOV(result_dst, op[0]));
1608 }
1609 break;
1610 case ir_unop_b2f:
1611 if (ctx->Const.UniformBooleanTrue != 1) {
1612 op[0].type = BRW_REGISTER_TYPE_UD;
1613 result_dst.type = BRW_REGISTER_TYPE_UD;
1614 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1615 result_dst.type = BRW_REGISTER_TYPE_F;
1616 } else {
1617 emit(MOV(result_dst, op[0]));
1618 }
1619 break;
1620 case ir_unop_f2b:
1621 case ir_unop_i2b:
1622 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1623 if (ctx->Const.UniformBooleanTrue == 1) {
1624 emit(AND(result_dst, result_src, src_reg(1)));
1625 }
1626 break;
1627
1628 case ir_unop_trunc:
1629 emit(RNDZ(result_dst, op[0]));
1630 break;
1631 case ir_unop_ceil:
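      /* ceil(x) is implemented as -RNDD(-x), i.e. -floor(-x). */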
1632 op[0].negate = !op[0].negate;
1633 inst = emit(RNDD(result_dst, op[0]));
1634 this->result.negate = true;
1635 break;
1636 case ir_unop_floor:
1637 inst = emit(RNDD(result_dst, op[0]));
1638 break;
1639 case ir_unop_fract:
1640 inst = emit(FRC(result_dst, op[0]));
1641 break;
1642 case ir_unop_round_even:
1643 emit(RNDE(result_dst, op[0]));
1644 break;
1645
1646 case ir_binop_min:
1647 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1648 break;
1649 case ir_binop_max:
1650 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1651 break;
1652
1653 case ir_binop_pow:
1654 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1655 break;
1656
1657 case ir_unop_bit_not:
1658 inst = emit(NOT(result_dst, op[0]));
1659 break;
1660 case ir_binop_bit_and:
1661 inst = emit(AND(result_dst, op[0], op[1]));
1662 break;
1663 case ir_binop_bit_xor:
1664 inst = emit(XOR(result_dst, op[0], op[1]));
1665 break;
1666 case ir_binop_bit_or:
1667 inst = emit(OR(result_dst, op[0], op[1]));
1668 break;
1669
1670 case ir_binop_lshift:
1671 inst = emit(SHL(result_dst, op[0], op[1]));
1672 break;
1673
1674 case ir_binop_rshift:
1675 if (ir->type->base_type == GLSL_TYPE_INT)
1676 inst = emit(ASR(result_dst, op[0], op[1]));
1677 else
1678 inst = emit(SHR(result_dst, op[0], op[1]));
1679 break;
1680
1681 case ir_binop_bfm:
1682 emit(BFI1(result_dst, op[0], op[1]));
1683 break;
1684
1685 case ir_binop_ubo_load: {
1686 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1687 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1688 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1689 src_reg offset;
1690
1691 /* Now, load the vector from that offset. */
1692 assert(ir->type->is_vector() || ir->type->is_scalar());
1693
1694 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1695 packed_consts.type = result.type;
1696 src_reg surf_index;
1697
1698 if (const_uniform_block) {
1699 /* The block index is a constant, so just emit the binding table entry
1700 * as an immediate.
1701 */
1702 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1703 const_uniform_block->value.u[0]);
1704 } else {
1705 /* The block index is not a constant. Evaluate the index expression
1706 * per-channel and add the base UBO index; the generator will select
1707 * a value from any live channel.
1708 */
1709 surf_index = src_reg(this, glsl_type::uint_type);
1710 emit(ADD(dst_reg(surf_index), op[0],
1711 src_reg(prog_data->base.binding_table.ubo_start)));
1712
1713 /* Assume this may touch any UBO. It would be nice to provide
1714 * a tighter bound, but the array information is already lowered away.
1715 */
1716 brw_mark_surface_used(&prog_data->base,
1717 prog_data->base.binding_table.ubo_start +
1718 shader_prog->NumUniformBlocks - 1);
1719 }
1720
1721 if (const_offset_ir) {
1722 if (brw->gen >= 8) {
1723 /* Store the offset in a GRF so we can send-from-GRF. */
1724 offset = src_reg(this, glsl_type::int_type);
1725 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1726 } else {
1727 /* Immediates are fine on older generations since they'll be moved
1728 * to a (potentially fake) MRF at the generator level.
1729 */
1730 offset = src_reg(const_offset / 16);
1731 }
1732 } else {
1733 offset = src_reg(this, glsl_type::uint_type);
1734 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1735 }
1736
1737 if (brw->gen >= 7) {
1738 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1739 grf_offset.type = offset.type;
1740
1741 emit(MOV(grf_offset, offset));
1742
1743 emit(new(mem_ctx) vec4_instruction(this,
1744 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1745 dst_reg(packed_consts),
1746 surf_index,
1747 src_reg(grf_offset)));
1748 } else {
1749 vec4_instruction *pull =
1750 emit(new(mem_ctx) vec4_instruction(this,
1751 VS_OPCODE_PULL_CONSTANT_LOAD,
1752 dst_reg(packed_consts),
1753 surf_index,
1754 offset));
1755 pull->base_mrf = 14;
1756 pull->mlen = 1;
1757 }
1758
1759 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1760 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1761 const_offset % 16 / 4,
1762 const_offset % 16 / 4,
1763 const_offset % 16 / 4);
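      /* For example, a float at byte offset 8 within its 16-byte slot ends
       * up selecting the .z channel of the fetched vec4 (8 % 16 / 4 == 2).
       */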
1764
1765 /* UBO bools are any nonzero int. We need to convert them to use the
1766 * value of true stored in ctx->Const.UniformBooleanTrue.
1767 */
1768 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1769 emit(CMP(result_dst, packed_consts, src_reg(0u),
1770 BRW_CONDITIONAL_NZ));
1771 if (ctx->Const.UniformBooleanTrue == 1) {
1772 emit(AND(result_dst, result, src_reg(1)));
1773 }
1774 } else {
1775 emit(MOV(result_dst, packed_consts));
1776 }
1777 break;
1778 }
1779
1780 case ir_binop_vector_extract:
1781 unreachable("should have been lowered by vec_index_to_cond_assign");
1782
1783 case ir_triop_fma:
1784 op[0] = fix_3src_operand(op[0]);
1785 op[1] = fix_3src_operand(op[1]);
1786 op[2] = fix_3src_operand(op[2]);
1787 /* Note that the instruction's argument order is reversed from GLSL
1788 * and the IR.
1789 */
1790 emit(MAD(result_dst, op[2], op[1], op[0]));
1791 break;
1792
1793 case ir_triop_lrp:
1794 emit_lrp(result_dst, op[0], op[1], op[2]);
1795 break;
1796
1797 case ir_triop_csel:
1798 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1799 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1800 inst->predicate = BRW_PREDICATE_NORMAL;
1801 break;
1802
1803 case ir_triop_bfi:
1804 op[0] = fix_3src_operand(op[0]);
1805 op[1] = fix_3src_operand(op[1]);
1806 op[2] = fix_3src_operand(op[2]);
1807 emit(BFI2(result_dst, op[0], op[1], op[2]));
1808 break;
1809
1810 case ir_triop_bitfield_extract:
1811 op[0] = fix_3src_operand(op[0]);
1812 op[1] = fix_3src_operand(op[1]);
1813 op[2] = fix_3src_operand(op[2]);
1814 /* Note that the instruction's argument order is reversed from GLSL
1815 * and the IR.
1816 */
1817 emit(BFE(result_dst, op[2], op[1], op[0]));
1818 break;
1819
1820 case ir_triop_vector_insert:
1821 unreachable("should have been lowered by lower_vector_insert");
1822
1823 case ir_quadop_bitfield_insert:
1824 unreachable("not reached: should be handled by "
1825 "bitfield_insert_to_bfm_bfi\n");
1826
1827 case ir_quadop_vector:
1828 unreachable("not reached: should be handled by lower_quadop_vector");
1829
1830 case ir_unop_pack_half_2x16:
1831 emit_pack_half_2x16(result_dst, op[0]);
1832 break;
1833 case ir_unop_unpack_half_2x16:
1834 emit_unpack_half_2x16(result_dst, op[0]);
1835 break;
1836 case ir_unop_pack_snorm_2x16:
1837 case ir_unop_pack_snorm_4x8:
1838 case ir_unop_pack_unorm_2x16:
1839 case ir_unop_pack_unorm_4x8:
1840 case ir_unop_unpack_snorm_2x16:
1841 case ir_unop_unpack_snorm_4x8:
1842 case ir_unop_unpack_unorm_2x16:
1843 case ir_unop_unpack_unorm_4x8:
1844 unreachable("not reached: should be handled by lower_packing_builtins");
1845 case ir_unop_unpack_half_2x16_split_x:
1846 case ir_unop_unpack_half_2x16_split_y:
1847 case ir_binop_pack_half_2x16_split:
1848 case ir_unop_interpolate_at_centroid:
1849 case ir_binop_interpolate_at_sample:
1850 case ir_binop_interpolate_at_offset:
1851 unreachable("not reached: should not occur in vertex shader");
1852 case ir_binop_ldexp:
1853 unreachable("not reached: should be handled by ldexp_to_arith()");
1854 }
1855 }
1856
1857
1858 void
1859 vec4_visitor::visit(ir_swizzle *ir)
1860 {
1861 src_reg src;
1862 int i = 0;
1863 int swizzle[4];
1864
1865 /* Note that this is only swizzles in expressions, not those on the left
1866 * hand side of an assignment, which do write masking. See ir_assignment
1867 * for that.
1868 */
1869
1870 ir->val->accept(this);
1871 src = this->result;
1872 assert(src.file != BAD_FILE);
1873
1874 for (i = 0; i < ir->type->vector_elements; i++) {
1875 switch (i) {
1876 case 0:
1877 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1878 break;
1879 case 1:
1880 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1881 break;
1882 case 2:
1883 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1884 break;
1885 case 3:
1886 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1887 break;
1888 }
1889 }
1890 for (; i < 4; i++) {
1891 /* Replicate the last channel out. */
1892 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1893 }
1894
1895 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1896
1897 this->result = src;
1898 }
1899
1900 void
1901 vec4_visitor::visit(ir_dereference_variable *ir)
1902 {
1903 const struct glsl_type *type = ir->type;
1904 dst_reg *reg = variable_storage(ir->var);
1905
1906 if (!reg) {
1907 fail("Failed to find variable storage for %s\n", ir->var->name);
1908 this->result = src_reg(brw_null_reg());
1909 return;
1910 }
1911
1912 this->result = src_reg(*reg);
1913
1914 /* System values get their swizzle from the dst_reg writemask */
1915 if (ir->var->data.mode == ir_var_system_value)
1916 return;
1917
1918 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1919 this->result.swizzle = swizzle_for_size(type->vector_elements);
1920 }
1921
1922
1923 int
1924 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1925 {
1926 /* Under normal circumstances array elements are stored consecutively, so
1927 * the stride is equal to the size of the array element.
1928 */
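/* For a mat4 element, for instance, type_size() is 4 vec4 slots, so each
 * step of the array index advances reg_offset by 4.
 */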
1929 return type_size(ir->type);
1930 }
1931
1932
1933 void
1934 vec4_visitor::visit(ir_dereference_array *ir)
1935 {
1936 ir_constant *constant_index;
1937 src_reg src;
1938 int array_stride = compute_array_stride(ir);
1939
1940 constant_index = ir->array_index->constant_expression_value();
1941
1942 ir->array->accept(this);
1943 src = this->result;
1944
1945 if (constant_index) {
1946 src.reg_offset += constant_index->value.i[0] * array_stride;
1947 } else {
1948 /* Variable index array dereference. It eats the "vec4" of the
1949 * base of the array and an index that offsets the Mesa register
1950 * index.
1951 */
1952 ir->array_index->accept(this);
1953
1954 src_reg index_reg;
1955
1956 if (array_stride == 1) {
1957 index_reg = this->result;
1958 } else {
1959 index_reg = src_reg(this, glsl_type::int_type);
1960
1961 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1962 }
1963
1964 if (src.reladdr) {
1965 src_reg temp = src_reg(this, glsl_type::int_type);
1966
1967 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1968
1969 index_reg = temp;
1970 }
1971
1972 src.reladdr = ralloc(mem_ctx, src_reg);
1973 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1974 }
1975
1976 /* If the type is smaller than a vec4, replicate the last channel out. */
1977 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1978 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1979 else
1980 src.swizzle = BRW_SWIZZLE_NOOP;
1981 src.type = brw_type_for_base_type(ir->type);
1982
1983 this->result = src;
1984 }
1985
1986 void
1987 vec4_visitor::visit(ir_dereference_record *ir)
1988 {
1989 unsigned int i;
1990 const glsl_type *struct_type = ir->record->type;
1991 int offset = 0;
1992
1993 ir->record->accept(this);
1994
1995 for (i = 0; i < struct_type->length; i++) {
1996 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1997 break;
1998 offset += type_size(struct_type->fields.structure[i].type);
1999 }
2000
2001 /* If the type is smaller than a vec4, replicate the last channel out. */
2002 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2003 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2004 else
2005 this->result.swizzle = BRW_SWIZZLE_NOOP;
2006 this->result.type = brw_type_for_base_type(ir->type);
2007
2008 this->result.reg_offset += offset;
2009 }
2010
2011 /**
2012 * We want to be careful in assignment setup to hit the actual storage
2013 * instead of potentially using a temporary like we might with the
2014 * ir_dereference handler.
2015 */
2016 static dst_reg
2017 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2018 {
2019 /* The LHS must be a dereference. If the LHS is a variable indexed array
2020 * access of a vector, it must be separated into a series of conditional moves
2021 * before reaching this point (see ir_vec_index_to_cond_assign).
2022 */
2023 assert(ir->as_dereference());
2024 ir_dereference_array *deref_array = ir->as_dereference_array();
2025 if (deref_array) {
2026 assert(!deref_array->array->type->is_vector());
2027 }
2028
2029 /* Use the rvalue deref handler for the most part. We'll ignore
2030 * swizzles in it and write swizzles using writemask, though.
2031 */
2032 ir->accept(v);
2033 return dst_reg(v->result);
2034 }
2035
2036 void
2037 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2038 const struct glsl_type *type,
2039 enum brw_predicate predicate)
2040 {
2041 if (type->base_type == GLSL_TYPE_STRUCT) {
2042 for (unsigned int i = 0; i < type->length; i++) {
2043 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2044 }
2045 return;
2046 }
2047
2048 if (type->is_array()) {
2049 for (unsigned int i = 0; i < type->length; i++) {
2050 emit_block_move(dst, src, type->fields.array, predicate);
2051 }
2052 return;
2053 }
2054
2055 if (type->is_matrix()) {
2056 const struct glsl_type *vec_type;
2057
2058 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2059 type->vector_elements, 1);
2060
2061 for (int i = 0; i < type->matrix_columns; i++) {
2062 emit_block_move(dst, src, vec_type, predicate);
2063 }
2064 return;
2065 }
2066
2067 assert(type->is_scalar() || type->is_vector());
2068
2069 dst->type = brw_type_for_base_type(type);
2070 src->type = dst->type;
2071
2072 dst->writemask = (1 << type->vector_elements) - 1;
2073
2074 src->swizzle = swizzle_for_size(type->vector_elements);
2075
2076 vec4_instruction *inst = emit(MOV(*dst, *src));
2077 inst->predicate = predicate;
2078
2079 dst->reg_offset++;
2080 src->reg_offset++;
2081 }
2082
2083
2084 /* If the RHS processing resulted in an instruction generating a
2085 * temporary value, and it would be easy to rewrite the instruction to
2086 * generate its result right into the LHS instead, do so. This ends
2087 * up reliably removing instructions where it can be tricky to do so
2088 * later without real UD chain information.
2089 */
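/* For example, an "ADD tmp, a, b" followed by "MOV dst, tmp" can typically be
 * rewritten here so that the ADD writes dst directly.
 */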
2090 bool
2091 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2092 dst_reg dst,
2093 src_reg src,
2094 vec4_instruction *pre_rhs_inst,
2095 vec4_instruction *last_rhs_inst)
2096 {
2097 /* This could be supported, but it would take more smarts. */
2098 if (ir->condition)
2099 return false;
2100
2101 if (pre_rhs_inst == last_rhs_inst)
2102 return false; /* No instructions generated to work with. */
2103
2104 /* Make sure the last instruction generated our source reg. */
2105 if (src.file != GRF ||
2106 src.file != last_rhs_inst->dst.file ||
2107 src.reg != last_rhs_inst->dst.reg ||
2108 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2109 src.reladdr ||
2110 src.abs ||
2111 src.negate ||
2112 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2113 return false;
2114
2115 /* Check that the last instruction fully initialized the channels
2116 * we want to use, in the order we want to use them. We could
2117 * potentially reswizzle the operands of many instructions so that
2118 * we could handle out of order channels, but don't yet.
2119 */
2120
2121 for (unsigned i = 0; i < 4; i++) {
2122 if (dst.writemask & (1 << i)) {
2123 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2124 return false;
2125
2126 if (BRW_GET_SWZ(src.swizzle, i) != i)
2127 return false;
2128 }
2129 }
2130
2131 /* Success! Rewrite the instruction. */
2132 last_rhs_inst->dst.file = dst.file;
2133 last_rhs_inst->dst.reg = dst.reg;
2134 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2135 last_rhs_inst->dst.reladdr = dst.reladdr;
2136 last_rhs_inst->dst.writemask &= dst.writemask;
2137
2138 return true;
2139 }
2140
2141 void
2142 vec4_visitor::visit(ir_assignment *ir)
2143 {
2144 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2145 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2146
2147 if (!ir->lhs->type->is_scalar() &&
2148 !ir->lhs->type->is_vector()) {
2149 ir->rhs->accept(this);
2150 src_reg src = this->result;
2151
2152 if (ir->condition) {
2153 emit_bool_to_cond_code(ir->condition, &predicate);
2154 }
2155
2156 /* emit_block_move doesn't account for swizzles in the source register.
2157 * This should be ok, since the source register is a structure or an
2158 * array, and those can't be swizzled. But double-check to be sure.
2159 */
2160 assert(src.swizzle ==
2161 (ir->rhs->type->is_matrix()
2162 ? swizzle_for_size(ir->rhs->type->vector_elements)
2163 : BRW_SWIZZLE_NOOP));
2164
2165 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2166 return;
2167 }
2168
2169 /* Now we're down to just a scalar/vector with writemasks. */
2170 int i;
2171
2172 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2173 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2174
2175 ir->rhs->accept(this);
2176
2177 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2178
2179 src_reg src = this->result;
2180
2181 int swizzles[4];
2182 int first_enabled_chan = 0;
2183 int src_chan = 0;
2184
2185 assert(ir->lhs->type->is_vector() ||
2186 ir->lhs->type->is_scalar());
2187 dst.writemask = ir->write_mask;
2188
2189 for (int i = 0; i < 4; i++) {
2190 if (dst.writemask & (1 << i)) {
2191 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2192 break;
2193 }
2194 }
2195
2196 /* Swizzle a small RHS vector into the channels being written.
2197 *
2198 * glsl ir treats write_mask as dictating how many channels are
2199 * present on the RHS while in our instructions we need to make
2200 * those channels appear in the slots of the vec4 they're written to.
2201 */
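/* For example, writing a vec2 RHS into .yz of a vec4 LHS typically produces
 * the swizzle (y, x, y, y) here: dst.y reads the RHS's first component, dst.z
 * its second, and the unwritten slots just repeat a written channel.
 */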
2202 for (int i = 0; i < 4; i++) {
2203 if (dst.writemask & (1 << i))
2204 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2205 else
2206 swizzles[i] = first_enabled_chan;
2207 }
2208 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2209 swizzles[2], swizzles[3]);
2210
2211 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2212 return;
2213 }
2214
2215 if (ir->condition) {
2216 emit_bool_to_cond_code(ir->condition, &predicate);
2217 }
2218
2219 for (i = 0; i < type_size(ir->lhs->type); i++) {
2220 vec4_instruction *inst = emit(MOV(dst, src));
2221 inst->predicate = predicate;
2222
2223 dst.reg_offset++;
2224 src.reg_offset++;
2225 }
2226 }
2227
2228 void
2229 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2230 {
2231 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2232 foreach_in_list(ir_constant, field_value, &ir->components) {
2233 emit_constant_values(dst, field_value);
2234 }
2235 return;
2236 }
2237
2238 if (ir->type->is_array()) {
2239 for (unsigned int i = 0; i < ir->type->length; i++) {
2240 emit_constant_values(dst, ir->array_elements[i]);
2241 }
2242 return;
2243 }
2244
2245 if (ir->type->is_matrix()) {
2246 for (int i = 0; i < ir->type->matrix_columns; i++) {
2247 float *vec = &ir->value.f[i * ir->type->vector_elements];
2248
2249 for (int j = 0; j < ir->type->vector_elements; j++) {
2250 dst->writemask = 1 << j;
2251 dst->type = BRW_REGISTER_TYPE_F;
2252
2253 emit(MOV(*dst, src_reg(vec[j])));
2254 }
2255 dst->reg_offset++;
2256 }
2257 return;
2258 }
2259
2260 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2261
2262 for (int i = 0; i < ir->type->vector_elements; i++) {
2263 if (!(remaining_writemask & (1 << i)))
2264 continue;
2265
2266 dst->writemask = 1 << i;
2267 dst->type = brw_type_for_base_type(ir->type);
2268
2269 /* Find other components that match the one we're about to
2270 * write. Emits fewer instructions for things like vec4(0.5,
2271 * 1.5, 1.5, 1.5).
2272 */
2273 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2274 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2275 if (ir->value.b[i] == ir->value.b[j])
2276 dst->writemask |= (1 << j);
2277 } else {
2278 /* u, i, and f storage all line up, so no need for a
2279 * switch case for comparing each type.
2280 */
2281 if (ir->value.u[i] == ir->value.u[j])
2282 dst->writemask |= (1 << j);
2283 }
2284 }
2285
2286 switch (ir->type->base_type) {
2287 case GLSL_TYPE_FLOAT:
2288 emit(MOV(*dst, src_reg(ir->value.f[i])));
2289 break;
2290 case GLSL_TYPE_INT:
2291 emit(MOV(*dst, src_reg(ir->value.i[i])));
2292 break;
2293 case GLSL_TYPE_UINT:
2294 emit(MOV(*dst, src_reg(ir->value.u[i])));
2295 break;
2296 case GLSL_TYPE_BOOL:
2297 emit(MOV(*dst,
2298 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2299 : 0)));
2300 break;
2301 default:
2302 unreachable("Non-float/uint/int/bool constant");
2303 }
2304
2305 remaining_writemask &= ~dst->writemask;
2306 }
2307 dst->reg_offset++;
2308 }
2309
2310 void
2311 vec4_visitor::visit(ir_constant *ir)
2312 {
2313 dst_reg dst = dst_reg(this, ir->type);
2314 this->result = src_reg(dst);
2315
2316 emit_constant_values(&dst, ir);
2317 }
2318
2319 void
2320 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2321 {
2322 ir_dereference *deref = static_cast<ir_dereference *>(
2323 ir->actual_parameters.get_head());
2324 ir_variable *location = deref->variable_referenced();
2325 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2326 location->data.binding);
2327
2328 /* Calculate the surface offset */
2329 src_reg offset(this, glsl_type::uint_type);
2330 ir_dereference_array *deref_array = deref->as_dereference_array();
2331 if (deref_array) {
2332 deref_array->array_index->accept(this);
2333
2334 src_reg tmp(this, glsl_type::uint_type);
2335 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2336 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2337 } else {
2338 offset = location->data.atomic.offset;
2339 }
2340
2341 /* Emit the appropriate machine instruction */
2342 const char *callee = ir->callee->function_name();
2343 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2344
2345 if (!strcmp("__intrinsic_atomic_read", callee)) {
2346 emit_untyped_surface_read(surf_index, dst, offset);
2347
2348 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2349 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2350 src_reg(), src_reg());
2351
2352 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2353 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2354 src_reg(), src_reg());
2355 }
2356 }
2357
2358 void
2359 vec4_visitor::visit(ir_call *ir)
2360 {
2361 const char *callee = ir->callee->function_name();
2362
2363 if (!strcmp("__intrinsic_atomic_read", callee) ||
2364 !strcmp("__intrinsic_atomic_increment", callee) ||
2365 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2366 visit_atomic_counter_intrinsic(ir);
2367 } else {
2368 unreachable("Unsupported intrinsic.");
2369 }
2370 }
2371
2372 src_reg
2373 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2374 {
2375 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2376 inst->base_mrf = 2;
2377 inst->mlen = 1;
2378 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2379 inst->dst.writemask = WRITEMASK_XYZW;
2380
2381 inst->src[1] = sampler;
2382
2383 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2384 int param_base = inst->base_mrf;
2385 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2386 int zero_mask = 0xf & ~coord_mask;
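/* For a vec2 coordinate, for instance, coord_mask is 0x3 and zero_mask is
 * 0xc, so the unused r/lod slots below are written with zero.
 */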
2387
2388 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2389 coordinate));
2390
2391 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2392 src_reg(0)));
2393
2394 emit(inst);
2395 return src_reg(inst->dst);
2396 }
2397
2398 static bool
2399 is_high_sampler(struct brw_context *brw, src_reg sampler)
2400 {
2401 if (brw->gen < 8 && !brw->is_haswell)
2402 return false;
2403
2404 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2405 }
2406
2407 void
2408 vec4_visitor::visit(ir_texture *ir)
2409 {
2410 uint32_t sampler =
2411 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2412
2413 ir_rvalue *nonconst_sampler_index =
2414 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2415
2416 /* Handle non-constant sampler array indexing */
2417 src_reg sampler_reg;
2418 if (nonconst_sampler_index) {
2419 /* The highest sampler which may be used by this operation is
2420 * the last element of the array. Mark it here, because the generator
2421 * doesn't have enough information to determine the bound.
2422 */
2423 uint32_t array_size = ir->sampler->as_dereference_array()
2424 ->array->type->array_size();
2425
2426 uint32_t max_used = sampler + array_size - 1;
2427 if (ir->op == ir_tg4 && brw->gen < 8) {
2428 max_used += prog_data->base.binding_table.gather_texture_start;
2429 } else {
2430 max_used += prog_data->base.binding_table.texture_start;
2431 }
2432
2433 brw_mark_surface_used(&prog_data->base, max_used);
2434
2435 /* Emit code to evaluate the actual indexing expression */
2436 nonconst_sampler_index->accept(this);
2437 dst_reg temp(this, glsl_type::uint_type);
2438 emit(ADD(temp, this->result, src_reg(sampler)))
2439 ->force_writemask_all = true;
2440 sampler_reg = src_reg(temp);
2441 } else {
2442 /* Single sampler, or constant array index; the indexing expression
2443 * is just an immediate.
2444 */
2445 sampler_reg = src_reg(sampler);
2446 }
2447
2448 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2449 * emitting anything other than setting up the constant result.
2450 */
2451 if (ir->op == ir_tg4) {
2452 ir_constant *chan = ir->lod_info.component->as_constant();
2453 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2454 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2455 dst_reg result(this, ir->type);
2456 this->result = src_reg(result);
2457 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2458 return;
2459 }
2460 }
2461
2462 /* Should be lowered by do_lower_texture_projection */
2463 assert(!ir->projector);
2464
2465 /* Should be lowered */
2466 assert(!ir->offset || !ir->offset->type->is_array());
2467
2468 /* Generate code to compute all the subexpression trees. This has to be
2469 * done before loading any values into MRFs for the sampler message since
2470 * generating these values may involve SEND messages that need the MRFs.
2471 */
2472 src_reg coordinate;
2473 if (ir->coordinate) {
2474 ir->coordinate->accept(this);
2475 coordinate = this->result;
2476 }
2477
2478 src_reg shadow_comparitor;
2479 if (ir->shadow_comparitor) {
2480 ir->shadow_comparitor->accept(this);
2481 shadow_comparitor = this->result;
2482 }
2483
2484 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2485 src_reg offset_value;
2486 if (has_nonconstant_offset) {
2487 ir->offset->accept(this);
2488 offset_value = src_reg(this->result);
2489 }
2490
2491 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2492 src_reg lod, dPdx, dPdy, sample_index, mcs;
2493 switch (ir->op) {
2494 case ir_tex:
2495 lod = src_reg(0.0f);
2496 lod_type = glsl_type::float_type;
2497 break;
2498 case ir_txf:
2499 case ir_txl:
2500 case ir_txs:
2501 ir->lod_info.lod->accept(this);
2502 lod = this->result;
2503 lod_type = ir->lod_info.lod->type;
2504 break;
2505 case ir_query_levels:
2506 lod = src_reg(0);
2507 lod_type = glsl_type::int_type;
2508 break;
2509 case ir_txf_ms:
2510 ir->lod_info.sample_index->accept(this);
2511 sample_index = this->result;
2512 sample_index_type = ir->lod_info.sample_index->type;
2513
2514 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2515 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2516 else
2517 mcs = src_reg(0u);
2518 break;
2519 case ir_txd:
2520 ir->lod_info.grad.dPdx->accept(this);
2521 dPdx = this->result;
2522
2523 ir->lod_info.grad.dPdy->accept(this);
2524 dPdy = this->result;
2525
2526 lod_type = ir->lod_info.grad.dPdx->type;
2527 break;
2528 case ir_txb:
2529 case ir_lod:
2530 case ir_tg4:
2531 break;
2532 }
2533
2534 enum opcode opcode;
2535 switch (ir->op) {
2536 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2537 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2538 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2539 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2540 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2541 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2542 case ir_tg4: opcode = has_nonconstant_offset
2543 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2544 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2545 case ir_txb:
2546 unreachable("TXB is not valid for vertex shaders.");
2547 case ir_lod:
2548 unreachable("LOD is not valid for vertex shaders.");
2549 default:
2550 unreachable("Unrecognized tex op");
2551 }
2552
2553 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2554
2555 if (ir->offset != NULL && ir->op != ir_txf)
2556 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2557
2558 /* Stuff the channel select bits in the top of the texture offset */
2559 if (ir->op == ir_tg4)
2560 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2561
2562 /* The message header is necessary for:
2563 * - Gen4 (always)
2564 * - Texel offsets
2565 * - Gather channel selection
2566 * - Sampler indices too large to fit in a 4-bit value.
2567 */
2568 inst->header_present =
2569 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2570 is_high_sampler(brw, sampler_reg);
2571 inst->base_mrf = 2;
2572 inst->mlen = inst->header_present + 1; /* always at least one */
2573 inst->dst = dst_reg(this, ir->type);
2574 inst->dst.writemask = WRITEMASK_XYZW;
2575 inst->shadow_compare = ir->shadow_comparitor != NULL;
2576
2577 inst->src[1] = sampler_reg;
2578
2579 /* MRF for the first parameter */
2580 int param_base = inst->base_mrf + inst->header_present;
2581
2582 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2583 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2584 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2585 } else {
2586 /* Load the coordinate */
2587 /* FINISHME: gl_clamp_mask and saturate */
2588 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2589 int zero_mask = 0xf & ~coord_mask;
2590
2591 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2592 coordinate));
2593
2594 if (zero_mask != 0) {
2595 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2596 src_reg(0)));
2597 }
2598 /* Load the shadow comparitor */
2599 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2600 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2601 WRITEMASK_X),
2602 shadow_comparitor));
2603 inst->mlen++;
2604 }
2605
2606 /* Load the LOD info */
2607 if (ir->op == ir_tex || ir->op == ir_txl) {
2608 int mrf, writemask;
2609 if (brw->gen >= 5) {
2610 mrf = param_base + 1;
2611 if (ir->shadow_comparitor) {
2612 writemask = WRITEMASK_Y;
2613 /* mlen already incremented */
2614 } else {
2615 writemask = WRITEMASK_X;
2616 inst->mlen++;
2617 }
2618 } else /* brw->gen == 4 */ {
2619 mrf = param_base;
2620 writemask = WRITEMASK_W;
2621 }
2622 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2623 } else if (ir->op == ir_txf) {
2624 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2625 } else if (ir->op == ir_txf_ms) {
2626 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2627 sample_index));
2628 if (brw->gen >= 7) {
2629 /* MCS data is in the first channel of `mcs`, but we need to get it into
2630 * the .y channel of the second vec4 of params, so replicate .x across
2631 * the whole vec4 and then mask off everything except .y
2632 */
2633 mcs.swizzle = BRW_SWIZZLE_XXXX;
2634 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2635 mcs));
2636 }
2637 inst->mlen++;
2638 } else if (ir->op == ir_txd) {
2639 const glsl_type *type = lod_type;
2640
2641 if (brw->gen >= 5) {
2642 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2643 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2644 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2645 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2646 inst->mlen++;
2647
2648 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2649 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2650 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2651 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2652 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2653 inst->mlen++;
2654
2655 if (ir->shadow_comparitor) {
2656 emit(MOV(dst_reg(MRF, param_base + 2,
2657 ir->shadow_comparitor->type, WRITEMASK_Z),
2658 shadow_comparitor));
2659 }
2660 }
2661 } else /* brw->gen == 4 */ {
2662 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2663 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2664 inst->mlen += 2;
2665 }
2666 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2667 if (ir->shadow_comparitor) {
2668 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2669 shadow_comparitor));
2670 }
2671
2672 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2673 offset_value));
2674 inst->mlen++;
2675 }
2676 }
2677
2678 emit(inst);
2679
2680 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2681 * faces * layers, but the spec requires layers.
2682 */
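/* A cube map array with 4 layers, for example, reports 24 in .z; the
 * INT_QUOTIENT by 6 below turns that back into the 4 the spec expects.
 */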
2683 if (ir->op == ir_txs) {
2684 glsl_type const *type = ir->sampler->type;
2685 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2686 type->sampler_array) {
2687 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2688 writemask(inst->dst, WRITEMASK_Z),
2689 src_reg(inst->dst), src_reg(6));
2690 }
2691 }
2692
2693 if (brw->gen == 6 && ir->op == ir_tg4) {
2694 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2695 }
2696
2697 swizzle_result(ir, src_reg(inst->dst), sampler);
2698 }
2699
2700 /**
2701 * Apply workarounds for Gen6 gather with UINT/SINT
2702 */
2703 void
2704 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2705 {
2706 if (!wa)
2707 return;
2708
2709 int width = (wa & WA_8BIT) ? 8 : 16;
2710 dst_reg dst_f = dst;
2711 dst_f.type = BRW_REGISTER_TYPE_F;
2712
2713 /* Convert from UNORM to UINT */
2714 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2715 emit(MOV(dst, src_reg(dst_f)));
2716
2717 if (wa & WA_SIGN) {
2718 /* Reinterpret the UINT value as a signed INT value by
2719 * shifting the sign bit into place, then shifting back
2720 * preserving sign.
2721 */
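/* With an 8-bit workaround width this is a left shift by 24 followed by an
 * arithmetic right shift by 24, so e.g. a raw value of 0xff ends up as -1.
 */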
2722 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2723 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2724 }
2725 }
2726
2727 /**
2728 * Set up the gather channel based on the swizzle, for gather4.
2729 */
2730 uint32_t
2731 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2732 {
2733 ir_constant *chan = ir->lod_info.component->as_constant();
2734 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2735 switch (swiz) {
2736 case SWIZZLE_X: return 0;
2737 case SWIZZLE_Y:
2738 /* gather4 sampler is broken for green channel on RG32F --
2739 * we must ask for blue instead.
2740 */
2741 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2742 return 2;
2743 return 1;
2744 case SWIZZLE_Z: return 2;
2745 case SWIZZLE_W: return 3;
2746 default:
2747 unreachable("Not reached"); /* zero, one swizzles handled already */
2748 }
2749 }
2750
2751 void
2752 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2753 {
2754 int s = key->tex.swizzles[sampler];
2755
2756 this->result = src_reg(this, ir->type);
2757 dst_reg swizzled_result(this->result);
2758
2759 if (ir->op == ir_query_levels) {
2760 /* # levels is in .w */
2761 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2762 emit(MOV(swizzled_result, orig_val));
2763 return;
2764 }
2765
2766 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2767 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2768 emit(MOV(swizzled_result, orig_val));
2769 return;
2770 }
2771
2772
2773 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2774 int swizzle[4] = {0};
2775
2776 for (int i = 0; i < 4; i++) {
2777 switch (GET_SWZ(s, i)) {
2778 case SWIZZLE_ZERO:
2779 zero_mask |= (1 << i);
2780 break;
2781 case SWIZZLE_ONE:
2782 one_mask |= (1 << i);
2783 break;
2784 default:
2785 copy_mask |= (1 << i);
2786 swizzle[i] = GET_SWZ(s, i);
2787 break;
2788 }
2789 }
2790
2791 if (copy_mask) {
2792 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2793 swizzled_result.writemask = copy_mask;
2794 emit(MOV(swizzled_result, orig_val));
2795 }
2796
2797 if (zero_mask) {
2798 swizzled_result.writemask = zero_mask;
2799 emit(MOV(swizzled_result, src_reg(0.0f)));
2800 }
2801
2802 if (one_mask) {
2803 swizzled_result.writemask = one_mask;
2804 emit(MOV(swizzled_result, src_reg(1.0f)));
2805 }
2806 }
2807
2808 void
2809 vec4_visitor::visit(ir_return *)
2810 {
2811 unreachable("not reached");
2812 }
2813
2814 void
2815 vec4_visitor::visit(ir_discard *)
2816 {
2817 unreachable("not reached");
2818 }
2819
2820 void
2821 vec4_visitor::visit(ir_if *ir)
2822 {
2823 /* Don't point the annotation at the if statement, because then it, plus
2824 * the then and else blocks, would all get printed.
2825 */
2826 this->base_ir = ir->condition;
2827
2828 if (brw->gen == 6) {
2829 emit_if_gen6(ir);
2830 } else {
2831 enum brw_predicate predicate;
2832 emit_bool_to_cond_code(ir->condition, &predicate);
2833 emit(IF(predicate));
2834 }
2835
2836 visit_instructions(&ir->then_instructions);
2837
2838 if (!ir->else_instructions.is_empty()) {
2839 this->base_ir = ir->condition;
2840 emit(BRW_OPCODE_ELSE);
2841
2842 visit_instructions(&ir->else_instructions);
2843 }
2844
2845 this->base_ir = ir->condition;
2846 emit(BRW_OPCODE_ENDIF);
2847 }
2848
2849 void
2850 vec4_visitor::visit(ir_emit_vertex *)
2851 {
2852 unreachable("not reached");
2853 }
2854
2855 void
2856 vec4_visitor::visit(ir_end_primitive *)
2857 {
2858 unreachable("not reached");
2859 }
2860
2861 void
2862 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2863 dst_reg dst, src_reg offset,
2864 src_reg src0, src_reg src1)
2865 {
2866 unsigned mlen = 0;
2867
2868 /* Set the atomic operation offset. */
2869 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2870 mlen++;
2871
2872 /* Set the atomic operation arguments. */
2873 if (src0.file != BAD_FILE) {
2874 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2875 mlen++;
2876 }
2877
2878 if (src1.file != BAD_FILE) {
2879 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2880 mlen++;
2881 }
2882
2883 /* Emit the instruction. Note that this maps to the normal SIMD8
2884 * untyped atomic message on Ivy Bridge, but that's OK because
2885 * unused channels will be masked out.
2886 */
2887 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2888 src_reg(atomic_op), src_reg(surf_index));
2889 inst->base_mrf = 0;
2890 inst->mlen = mlen;
2891 }
2892
2893 void
2894 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2895 src_reg offset)
2896 {
2897 /* Set the surface read offset. */
2898 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2899
2900 /* Emit the instruction. Note that this maps to the normal SIMD8
2901 * untyped surface read message, but that's OK because unused
2902 * channels will be masked out.
2903 */
2904 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2905 dst, src_reg(surf_index));
2906 inst->base_mrf = 0;
2907 inst->mlen = 1;
2908 }
2909
2910 void
2911 vec4_visitor::emit_ndc_computation()
2912 {
2913 /* Get the position */
2914 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2915
2916 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2917 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2918 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2919
2920 current_annotation = "NDC";
2921 dst_reg ndc_w = ndc;
2922 ndc_w.writemask = WRITEMASK_W;
2923 src_reg pos_w = pos;
2924 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2925 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2926
2927 dst_reg ndc_xyz = ndc;
2928 ndc_xyz.writemask = WRITEMASK_XYZ;
2929
2930 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2931 }
2932
2933 void
2934 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2935 {
2936 if (brw->gen < 6 &&
2937 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2938 key->userclip_active || brw->has_negative_rhw_bug)) {
2939 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2940 dst_reg header1_w = header1;
2941 header1_w.writemask = WRITEMASK_W;
2942
2943 emit(MOV(header1, 0u));
2944
2945 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2946 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2947
2948 current_annotation = "Point size";
2949 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2950 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2951 }
2952
2953 if (key->userclip_active) {
2954 current_annotation = "Clipping flags";
2955 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2956 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2957
2958 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2959 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2960 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2961
2962 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2963 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2964 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2965 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2966 }
2967
2968 /* i965 clipping workaround:
2969 * 1) Test for -ve rhw
2970 * 2) If set,
2971 * set ndc = (0,0,0,0)
2972 * set ucp[6] = 1
2973 *
2974 * Later, clipping will detect ucp[6] and ensure the primitive is
2975 * clipped against all fixed planes.
2976 */
2977 if (brw->has_negative_rhw_bug) {
2978 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2979 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2980 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2981 vec4_instruction *inst;
2982 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2983 inst->predicate = BRW_PREDICATE_NORMAL;
2984 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2985 inst->predicate = BRW_PREDICATE_NORMAL;
2986 }
2987
2988 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2989 } else if (brw->gen < 6) {
2990 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2991 } else {
2992 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2993 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2994 dst_reg reg_w = reg;
2995 reg_w.writemask = WRITEMASK_W;
2996 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
2997 }
2998 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2999 dst_reg reg_y = reg;
3000 reg_y.writemask = WRITEMASK_Y;
3001 reg_y.type = BRW_REGISTER_TYPE_D;
3002 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3003 }
3004 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3005 dst_reg reg_z = reg;
3006 reg_z.writemask = WRITEMASK_Z;
3007 reg_z.type = BRW_REGISTER_TYPE_D;
3008 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3009 }
3010 }
3011 }
3012
3013 void
3014 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3015 {
3016 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3017 *
3018 * "If a linked set of shaders forming the vertex stage contains no
3019 * static write to gl_ClipVertex or gl_ClipDistance, but the
3020 * application has requested clipping against user clip planes through
3021 * the API, then the coordinate written to gl_Position is used for
3022 * comparison against the user clip planes."
3023 *
3024 * This function is only called if the shader didn't write to
3025 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3026 * if the user wrote to it; otherwise we use gl_Position.
3027 */
3028 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3029 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3030 clip_vertex = VARYING_SLOT_POS;
3031 }
3032
3033 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3034 ++i) {
3035 reg.writemask = 1 << i;
3036 emit(DP4(reg,
3037 src_reg(output_reg[clip_vertex]),
3038 src_reg(this->userplane[i + offset])));
3039 }
3040 }
3041
3042 void
3043 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3044 {
3045 assert (varying < VARYING_SLOT_MAX);
3046 reg.type = output_reg[varying].type;
3047 current_annotation = output_reg_annotation[varying];
3048 /* Copy the register, saturating if necessary */
3049 vec4_instruction *inst = emit(MOV(reg,
3050 src_reg(output_reg[varying])));
3051 if ((varying == VARYING_SLOT_COL0 ||
3052 varying == VARYING_SLOT_COL1 ||
3053 varying == VARYING_SLOT_BFC0 ||
3054 varying == VARYING_SLOT_BFC1) &&
3055 key->clamp_vertex_color) {
3056 inst->saturate = true;
3057 }
3058 }
3059
3060 void
3061 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3062 {
3063 reg.type = BRW_REGISTER_TYPE_F;
3064
3065 switch (varying) {
3066 case VARYING_SLOT_PSIZ:
3067 {
3068 /* PSIZ is always in slot 0, and is coupled with other flags. */
3069 current_annotation = "indices, point width, clip flags";
3070 emit_psiz_and_flags(reg);
3071 break;
3072 }
3073 case BRW_VARYING_SLOT_NDC:
3074 current_annotation = "NDC";
3075 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3076 break;
3077 case VARYING_SLOT_POS:
3078 current_annotation = "gl_Position";
3079 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3080 break;
3081 case VARYING_SLOT_EDGE:
3082 /* This is present when doing unfilled polygons. We're supposed to copy
3083 * the edge flag from the user-provided vertex array
3084 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3085 * of that attribute (starts as 1.0f). This is then used in clipping to
3086 * determine which edges should be drawn as wireframe.
3087 */
3088 current_annotation = "edge flag";
3089 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3090 glsl_type::float_type, WRITEMASK_XYZW))));
3091 break;
3092 case BRW_VARYING_SLOT_PAD:
3093 /* No need to write to this slot */
3094 break;
3095 default:
3096 emit_generic_urb_slot(reg, varying);
3097 break;
3098 }
3099 }
3100
3101 static int
3102 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3103 {
3104 if (brw->gen >= 6) {
3105 /* URB data written (does not include the message header reg) must
3106 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3107 * section 5.4.3.2.2: URB_INTERLEAVED.
3108 *
3109 * URB entries are allocated on a multiple of 1024 bits, so an
3110 * extra 128 bits written here to make the end align to 256 is
3111 * no problem.
3112 */
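/* mlen counts the header register as well, so an odd mlen already means an
 * even number of data registers. An even mlen (e.g. a header plus three data
 * registers, mlen == 4) gets bumped so the data stays a multiple of two regs.
 */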
3113 if ((mlen % 2) != 1)
3114 mlen++;
3115 }
3116
3117 return mlen;
3118 }
3119
3120
3121 /**
3122 * Generates the VUE payload plus the necessary URB write instructions to
3123 * output it.
3124 *
3125 * The VUE layout is documented in Volume 2a.
3126 */
3127 void
3128 vec4_visitor::emit_vertex()
3129 {
3130 /* MRF 0 is reserved for the debugger, so start with message header
3131 * in MRF 1.
3132 */
3133 int base_mrf = 1;
3134 int mrf = base_mrf;
3135 /* In the process of generating our URB write message contents, we
3136 * may need to unspill a register or load from an array. Those
3137 * reads would use MRFs 14-15.
3138 */
3139 int max_usable_mrf = 13;
3140
3141 /* The following assertion verifies that max_usable_mrf causes an
3142 * even-numbered amount of URB write data, which will meet gen6's
3143 * requirements for length alignment.
3144 */
3145 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3146
3147 /* First mrf is the g0-based message header containing URB handles and
3148 * such.
3149 */
3150 emit_urb_write_header(mrf++);
3151
3152 if (brw->gen < 6) {
3153 emit_ndc_computation();
3154 }
3155
3156 /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3157 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3158 current_annotation = "user clip distances";
3159
3160 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3161 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3162
3163 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3164 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3165 }
3166
3167 /* We may need to split this up into several URB writes, so do them in a
3168 * loop.
3169 */
3170 int slot = 0;
3171 bool complete = false;
3172 do {
3173 /* URB offset is in URB row increments, and each of our MRFs is half of
3174 * one of those, since we're doing interleaved writes.
3175 */
3176 int offset = slot / 2;
3177
3178 mrf = base_mrf + 1;
3179 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3180 emit_urb_slot(dst_reg(MRF, mrf++),
3181 prog_data->vue_map.slot_to_varying[slot]);
3182
3183 /* If this was max_usable_mrf, we can't fit anything more into this
3184 * URB WRITE.
3185 */
3186 if (mrf > max_usable_mrf) {
3187 slot++;
3188 break;
3189 }
3190 }
3191
3192 complete = slot >= prog_data->vue_map.num_slots;
3193 current_annotation = "URB write";
3194 vec4_instruction *inst = emit_urb_write_opcode(complete);
3195 inst->base_mrf = base_mrf;
3196 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3197 inst->offset += offset;
3198 } while(!complete);
3199 }
3200
3201
3202 src_reg
3203 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3204 src_reg *reladdr, int reg_offset)
3205 {
3206 /* Because we store the values to scratch interleaved like our
3207 * vertex data, we need to scale the vec4 index by 2.
3208 */
3209 int message_header_scale = 2;
3210
3211 /* Pre-gen6, the message header uses byte offsets instead of vec4
3212 * (16-byte) offset units.
3213 */
3214 if (brw->gen < 6)
3215 message_header_scale *= 16;
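/* Either way a vec4 at reg_offset N should land 32 * N bytes into the scratch
 * block: 2 * N in 16-byte units on gen6+, or a byte offset of 32 * N before.
 */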
3216
3217 if (reladdr) {
3218 src_reg index = src_reg(this, glsl_type::int_type);
3219
3220 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3221 src_reg(reg_offset)));
3222 emit_before(block, inst, MUL(dst_reg(index), index,
3223 src_reg(message_header_scale)));
3224
3225 return index;
3226 } else {
3227 return src_reg(reg_offset * message_header_scale);
3228 }
3229 }
3230
3231 src_reg
3232 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3233 src_reg *reladdr, int reg_offset)
3234 {
3235 if (reladdr) {
3236 src_reg index = src_reg(this, glsl_type::int_type);
3237
3238 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3239 src_reg(reg_offset)));
3240
3241 /* Pre-gen6, the message header uses byte offsets instead of vec4
3242 * (16-byte) offset units.
3243 */
3244 if (brw->gen < 6) {
3245 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3246 }
3247
3248 return index;
3249 } else if (brw->gen >= 8) {
3250 /* Store the offset in a GRF so we can send-from-GRF. */
3251 src_reg offset = src_reg(this, glsl_type::int_type);
3252 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3253 return offset;
3254 } else {
3255 int message_header_scale = brw->gen < 6 ? 16 : 1;
3256 return src_reg(reg_offset * message_header_scale);
3257 }
3258 }
3259
3260 /**
3261 * Emits an instruction before @inst to load the value named by @orig_src
3262 * from scratch space at @base_offset to @temp.
3263 *
3264 * @base_offset is measured in 32-byte units (the size of a register).
3265 */
3266 void
3267 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3268 dst_reg temp, src_reg orig_src,
3269 int base_offset)
3270 {
3271 int reg_offset = base_offset + orig_src.reg_offset;
3272 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3273 reg_offset);
3274
3275 emit_before(block, inst, SCRATCH_READ(temp, index));
3276 }
3277
3278 /**
3279 * Emits an instruction after @inst to store the value to be written
3280 * to @orig_dst to scratch space at @base_offset, from @temp.
3281 *
3282 * @base_offset is measured in 32-byte units (the size of a register).
3283 */
3284 void
3285 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3286 int base_offset)
3287 {
3288 int reg_offset = base_offset + inst->dst.reg_offset;
3289 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3290 reg_offset);
3291
3292 /* Create a temporary register to store *inst's result in.
3293 *
3294 * We have to be careful in MOVing from our temporary result register in
3295 * the scratch write. If we swizzle from channels of the temporary that
3296 * weren't initialized, it will confuse live interval analysis, which will
3297 * make spilling fail to make progress.
3298 */
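/* For a .xz writemask, for example, the temporary is read back with swizzle
 * (x, x, z, x), so the uninitialized .y and .w channels are never sourced.
 */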
3299 src_reg temp = src_reg(this, glsl_type::vec4_type);
3300 temp.type = inst->dst.type;
3301 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3302 int swizzles[4];
3303 for (int i = 0; i < 4; i++)
3304 if (inst->dst.writemask & (1 << i))
3305 swizzles[i] = i;
3306 else
3307 swizzles[i] = first_writemask_chan;
3308 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3309 swizzles[2], swizzles[3]);
3310
3311 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3312 inst->dst.writemask));
3313 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3314 write->predicate = inst->predicate;
3315 write->ir = inst->ir;
3316 write->annotation = inst->annotation;
3317 inst->insert_after(block, write);
3318
3319 inst->dst.file = temp.file;
3320 inst->dst.reg = temp.reg;
3321 inst->dst.reg_offset = temp.reg_offset;
3322 inst->dst.reladdr = NULL;
3323 }
3324
3325 /**
3326 * We can't generally support array access in GRF space, because a
3327 * single instruction's destination can only span 2 contiguous
3328 * registers. So, we send all GRF arrays that get variable index
3329 * access to scratch space.
3330 */
3331 void
3332 vec4_visitor::move_grf_array_access_to_scratch()
3333 {
3334 int scratch_loc[this->virtual_grf_count];
3335
3336 for (int i = 0; i < this->virtual_grf_count; i++) {
3337 scratch_loc[i] = -1;
3338 }
3339
3340 calculate_cfg();
3341
3342 /* First, calculate the set of virtual GRFs that need to be punted to
3343 * scratch because they see variable-index array access, and record where
3344 * in scratch each of them will live.
3345 */
3346 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3347 if (inst->dst.file == GRF && inst->dst.reladdr &&
3348 scratch_loc[inst->dst.reg] == -1) {
3349 scratch_loc[inst->dst.reg] = c->last_scratch;
3350 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3351 }
3352
3353 for (int i = 0 ; i < 3; i++) {
3354 src_reg *src = &inst->src[i];
3355
3356 if (src->file == GRF && src->reladdr &&
3357 scratch_loc[src->reg] == -1) {
3358 scratch_loc[src->reg] = c->last_scratch;
3359 c->last_scratch += this->virtual_grf_sizes[src->reg];
3360 }
3361 }
3362 }
3363
3364 /* Now, for anything that will be accessed through scratch, rewrite
3365 * it to load/store. Note that this is a _safe list walk, because
3366 * we may generate a new scratch_write instruction after the one
3367 * we're processing.
3368 */
3369 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3370 /* Set up the annotation tracking for newly generated instructions. */
3371 base_ir = inst->ir;
3372 current_annotation = inst->annotation;
3373
3374 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3375 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3376 }
3377
3378 for (int i = 0 ; i < 3; i++) {
3379 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3380 continue;
3381
3382 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3383
3384 emit_scratch_read(block, inst, temp, inst->src[i],
3385 scratch_loc[inst->src[i].reg]);
3386
3387 inst->src[i].file = temp.file;
3388 inst->src[i].reg = temp.reg;
3389 inst->src[i].reg_offset = temp.reg_offset;
3390 inst->src[i].reladdr = NULL;
3391 }
3392 }
3393 }
3394
3395 /**
3396 * Emits an instruction before @inst to load the value named by @orig_src
3397 * from the pull constant buffer (surface) at @base_offset to @temp.
3398 */
3399 void
3400 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3401 dst_reg temp, src_reg orig_src,
3402 int base_offset)
3403 {
3404 int reg_offset = base_offset + orig_src.reg_offset;
3405 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3406 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3407 reg_offset);
3408 vec4_instruction *load;
3409
3410 if (brw->gen >= 7) {
3411 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3412 grf_offset.type = offset.type;
3413 emit_before(block, inst, MOV(grf_offset, offset));
3414
3415 load = new(mem_ctx) vec4_instruction(this,
3416 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3417 temp, index, src_reg(grf_offset));
3418 } else {
3419 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3420 temp, index, offset);
3421 load->base_mrf = 14;
3422 load->mlen = 1;
3423 }
3424 emit_before(block, inst, load);
3425 }
3426
3427 /**
3428 * Implements array access of uniforms by inserting a
3429 * PULL_CONSTANT_LOAD instruction.
3430 *
3431 * Unlike temporary GRF array access (which we don't support due to
3432 * the difficulty of doing relative addressing on instruction
3433 * destinations), we could potentially do array access of uniforms
3434 * that were loaded in GRF space as push constants. In real-world
3435 * usage we've seen, though, the arrays being used are always larger
3436 * than we could load as push constants, so just always move all
3437 * uniform array access out to a pull constant buffer.
3438 */
3439 void
3440 vec4_visitor::move_uniform_array_access_to_pull_constants()
3441 {
3442 int pull_constant_loc[this->uniforms];
3443
3444 for (int i = 0; i < this->uniforms; i++) {
3445 pull_constant_loc[i] = -1;
3446 }
3447
3448 calculate_cfg();
3449
3450 /* Walk through and find array access of uniforms. Put a copy of that
3451 * uniform in the pull constant buffer.
3452 *
3453 * Note that we don't move constant-indexed accesses to arrays. No
3454 * testing has been done of the performance impact of this choice.
3455 */
3456 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3457 for (int i = 0 ; i < 3; i++) {
3458 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3459 continue;
3460
3461 int uniform = inst->src[i].reg;
3462
3463 /* If this array isn't already present in the pull constant buffer,
3464 * add it.
3465 */
3466 if (pull_constant_loc[uniform] == -1) {
3467 const gl_constant_value **values =
3468 &stage_prog_data->param[uniform * 4];
3469
3470 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3471
3472 assert(uniform < uniform_array_size);
3473 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3474 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3475 = values[j];
3476 }
3477 }
3478
3479 /* Set up the annotation tracking for newly generated instructions. */
3480 base_ir = inst->ir;
3481 current_annotation = inst->annotation;
3482
3483 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3484
3485 emit_pull_constant_load(block, inst, temp, inst->src[i],
3486 pull_constant_loc[uniform]);
3487
3488 inst->src[i].file = temp.file;
3489 inst->src[i].reg = temp.reg;
3490 inst->src[i].reg_offset = temp.reg_offset;
3491 inst->src[i].reladdr = NULL;
3492 }
3493 }
3494
3495 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3496 * no need to track them as larger-than-vec4 objects. This will be
3497 * relied on in cutting out unused uniform vectors from push
3498 * constants.
3499 */
3500 split_uniform_registers();
3501 }
3502
3503 void
3504 vec4_visitor::resolve_ud_negate(src_reg *reg)
3505 {
3506 if (reg->type != BRW_REGISTER_TYPE_UD ||
3507 !reg->negate)
3508 return;
3509
3510 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3511 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3512 *reg = temp;
3513 }
3514
3515 vec4_visitor::vec4_visitor(struct brw_context *brw,
3516 struct brw_vec4_compile *c,
3517 struct gl_program *prog,
3518 const struct brw_vec4_prog_key *key,
3519 struct brw_vec4_prog_data *prog_data,
3520 struct gl_shader_program *shader_prog,
3521 gl_shader_stage stage,
3522 void *mem_ctx,
3523 bool debug_flag,
3524 bool no_spills,
3525 shader_time_shader_type st_base,
3526 shader_time_shader_type st_written,
3527 shader_time_shader_type st_reset)
3528 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3529 c(c),
3530 key(key),
3531 prog_data(prog_data),
3532 sanity_param_count(0),
3533 fail_msg(NULL),
3534 first_non_payload_grf(0),
3535 need_all_constants_in_pull_buffer(false),
3536 debug_flag(debug_flag),
3537 no_spills(no_spills),
3538 st_base(st_base),
3539 st_written(st_written),
3540 st_reset(st_reset)
3541 {
3542 this->mem_ctx = mem_ctx;
3543 this->failed = false;
3544
3545 this->base_ir = NULL;
3546 this->current_annotation = NULL;
3547 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3548
3549 this->variable_ht = hash_table_ctor(0,
3550 hash_table_pointer_hash,
3551 hash_table_pointer_compare);
3552
3553 this->virtual_grf_start = NULL;
3554 this->virtual_grf_end = NULL;
3555 this->virtual_grf_sizes = NULL;
3556 this->virtual_grf_count = 0;
3557 this->virtual_grf_reg_map = NULL;
3558 this->virtual_grf_reg_count = 0;
3559 this->virtual_grf_array_size = 0;
3560 this->live_intervals_valid = false;
3561
3562 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3563
3564 this->uniforms = 0;
3565
3566 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3567 * at least one. See setup_uniforms() in brw_vec4.cpp.
3568 */
3569 this->uniform_array_size = 1;
3570 if (prog_data) {
3571 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3572 }
3573
3574 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3575 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3576 }
3577
3578 vec4_visitor::~vec4_visitor()
3579 {
3580 hash_table_dtor(this->variable_ht);
3581 }
3582
3583
3584 void
3585 vec4_visitor::fail(const char *format, ...)
3586 {
3587 va_list va;
3588 char *msg;
3589
3590 if (failed)
3591 return;
3592
3593 failed = true;
3594
3595 va_start(va, format);
3596 msg = ralloc_vasprintf(mem_ctx, format, va);
3597 va_end(va);
3598 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3599
3600 this->fail_msg = msg;
3601
3602 if (debug_flag) {
3603 fprintf(stderr, "%s", msg);
3604 }
3605 }
3606
3607 } /* namespace brw */