0caa7a50f3980e4214fc689dd6728cde55256abd
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
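/**
 * Append an already-constructed instruction to the end of the current
 * instruction stream and return it.
 */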
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
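/**
 * Insert new_inst immediately before inst in the given block, inheriting
 * inst's source IR pointer and annotation for debug output.
 */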
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
83 src_reg src0, src_reg src1, src_reg src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
86 src0, src1, src2));
87 }
88
89
90 vec4_instruction *
91 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
92 {
93 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
94 }
95
96 vec4_instruction *
97 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
98 {
99 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
100 }
101
102 vec4_instruction *
103 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
104 {
105 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
106 }
107
108 vec4_instruction *
109 vec4_visitor::emit(enum opcode opcode)
110 {
111 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
112 }
113
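/* The ALU* macros below generate one helper per opcode.  Each helper builds
 * a vec4_instruction without adding it to the instruction stream, so callers
 * typically wrap it in emit(), e.g. emit(ADD(dst, a, b)).  The ALU2_ACC
 * variants additionally mark the instruction as writing the accumulator.
 */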
114 #define ALU1(op) \
115 vec4_instruction * \
116 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
117 { \
118 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
119 src0); \
120 }
121
122 #define ALU2(op) \
123 vec4_instruction * \
124 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
125 const src_reg &src1) \
126 { \
127 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
128 src0, src1); \
129 }
130
131 #define ALU2_ACC(op) \
132 vec4_instruction * \
133 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
134 const src_reg &src1) \
135 { \
136 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
137 BRW_OPCODE_##op, dst, src0, src1); \
138 inst->writes_accumulator = true; \
139 return inst; \
140 }
141
142 #define ALU3(op) \
143 vec4_instruction * \
144 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
145 const src_reg &src1, const src_reg &src2) \
146 { \
147 assert(brw->gen >= 6); \
148 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
149 src0, src1, src2); \
150 }
151
152 ALU1(NOT)
153 ALU1(MOV)
154 ALU1(FRC)
155 ALU1(RNDD)
156 ALU1(RNDE)
157 ALU1(RNDZ)
158 ALU1(F32TO16)
159 ALU1(F16TO32)
160 ALU2(ADD)
161 ALU2(MUL)
162 ALU2_ACC(MACH)
163 ALU2(AND)
164 ALU2(OR)
165 ALU2(XOR)
166 ALU2(DP3)
167 ALU2(DP4)
168 ALU2(DPH)
169 ALU2(SHL)
170 ALU2(SHR)
171 ALU2(ASR)
172 ALU3(LRP)
173 ALU1(BFREV)
174 ALU3(BFE)
175 ALU2(BFI1)
176 ALU3(BFI2)
177 ALU1(FBH)
178 ALU1(FBL)
179 ALU1(CBIT)
180 ALU3(MAD)
181 ALU2_ACC(ADDC)
182 ALU2_ACC(SUBB)
183 ALU2(MAC)
184
185 /** Gen4 predicated IF. */
186 vec4_instruction *
187 vec4_visitor::IF(enum brw_predicate predicate)
188 {
189 vec4_instruction *inst;
190
191 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
192 inst->predicate = predicate;
193
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 vec4_instruction *
199 vec4_visitor::IF(src_reg src0, src_reg src1,
200 enum brw_conditional_mod condition)
201 {
202 assert(brw->gen == 6);
203
204 vec4_instruction *inst;
205
206 resolve_ud_negate(&src0);
207 resolve_ud_negate(&src1);
208
209 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
210 src0, src1);
211 inst->conditional_mod = condition;
212
213 return inst;
214 }
215
216 /**
217 * CMP: Sets the low bit of the destination channels with the result
218 * of the comparison, while the upper bits are undefined, and updates
219 * the flag register with the packed 16 bits of the result.
220 */
221 vec4_instruction *
222 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
223 enum brw_conditional_mod condition)
224 {
225 vec4_instruction *inst;
226
227 /* Original gen4 hardware does type conversion to the destination type
228 * before comparison, producing garbage results for floating
229 * point comparisons.
230 */
231 if (brw->gen == 4) {
232 dst.type = src0.type;
233 if (dst.file == HW_REG)
234 dst.fixed_hw_reg.type = dst.type;
235 }
236
237 resolve_ud_negate(&src0);
238 resolve_ud_negate(&src1);
239
240 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
241 inst->conditional_mod = condition;
242
243 return inst;
244 }
245
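/**
 * Build (but do not emit) a gen4-style scratch-space read; its message
 * payload is staged starting at MRF 14 with a message length of 2.
 */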
246 vec4_instruction *
247 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
248 {
249 vec4_instruction *inst;
250
251 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
252 dst, index);
253 inst->base_mrf = 14;
254 inst->mlen = 2;
255
256 return inst;
257 }
258
259 vec4_instruction *
260 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
261 const src_reg &index)
262 {
263 vec4_instruction *inst;
264
265 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
266 dst, src, index);
267 inst->base_mrf = 13;
268 inst->mlen = 3;
269
270 return inst;
271 }
272
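/** Emit a DP2, DP3 or DP4 according to the number of components (2-4). */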
273 void
274 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
275 {
276 static enum opcode dot_opcodes[] = {
277 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
278 };
279
280 emit(dot_opcodes[elements - 2], dst, src0, src1);
281 }
282
283 src_reg
284 vec4_visitor::fix_3src_operand(src_reg src)
285 {
286 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
287 * able to use a vertical stride of zero to replicate the vec4 uniform, like
288 *
289 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
290 *
291 * But you can't, since vertical stride is always four in three-source
292 * instructions. Instead, insert a MOV instruction to do the replication so
293 * that the three-source instruction can consume it.
294 */
295
296 /* The MOV is only needed if the source is a uniform or immediate. */
297 if (src.file != UNIFORM && src.file != IMM)
298 return src;
299
300 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
301 return src;
302
303 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
304 expanded.type = src.type;
305 emit(MOV(expanded, src));
306 return src_reg(expanded);
307 }
308
309 src_reg
310 vec4_visitor::fix_math_operand(src_reg src)
311 {
312 /* The gen6 math instruction ignores the source modifiers --
313 * swizzle, abs, negate, and at least some parts of the register
314 * region description.
315 *
316 * Rather than trying to enumerate all these cases, *always* expand the
317 * operand to a temp GRF for gen6.
318 *
319 * For gen7, keep the operand as-is, except if immediate, which gen7 still
320 * can't use.
321 */
322
323 if (brw->gen == 7 && src.file != IMM)
324 return src;
325
326 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
327 expanded.type = src.type;
328 emit(MOV(expanded, src));
329 return src_reg(expanded);
330 }
331
332 void
333 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
334 {
335 src = fix_math_operand(src);
336
337 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
338 /* The gen6 math instruction must be align1, so we can't do
339 * writemasks.
340 */
341 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
342
343 emit(opcode, temp_dst, src);
344
345 emit(MOV(dst, src_reg(temp_dst)));
346 } else {
347 emit(opcode, dst, src);
348 }
349 }
350
351 void
352 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
353 {
354 vec4_instruction *inst = emit(opcode, dst, src);
355 inst->base_mrf = 1;
356 inst->mlen = 1;
357 }
358
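/**
 * Emit a single-source math instruction, dispatching to the
 * generation-specific paths above: a plain math instruction on gen8+, the
 * gen6 align1 workaround path on gen6/7, and an MRF-based math message on
 * original gen4/5.
 */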
359 void
360 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
361 {
362 switch (opcode) {
363 case SHADER_OPCODE_RCP:
364 case SHADER_OPCODE_RSQ:
365 case SHADER_OPCODE_SQRT:
366 case SHADER_OPCODE_EXP2:
367 case SHADER_OPCODE_LOG2:
368 case SHADER_OPCODE_SIN:
369 case SHADER_OPCODE_COS:
370 break;
371 default:
372 unreachable("not reached: bad math opcode");
373 }
374
375 if (brw->gen >= 8) {
376 emit(opcode, dst, src);
377 } else if (brw->gen >= 6) {
378 emit_math1_gen6(opcode, dst, src);
379 } else {
380 emit_math1_gen4(opcode, dst, src);
381 }
382 }
383
384 void
385 vec4_visitor::emit_math2_gen6(enum opcode opcode,
386 dst_reg dst, src_reg src0, src_reg src1)
387 {
388 src0 = fix_math_operand(src0);
389 src1 = fix_math_operand(src1);
390
391 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
392 /* The gen6 math instruction must be align1, so we can't do
393 * writemasks.
394 */
395 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
396 temp_dst.type = dst.type;
397
398 emit(opcode, temp_dst, src0, src1);
399
400 emit(MOV(dst, src_reg(temp_dst)));
401 } else {
402 emit(opcode, dst, src0, src1);
403 }
404 }
405
406 void
407 vec4_visitor::emit_math2_gen4(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 vec4_instruction *inst = emit(opcode, dst, src0, src1);
411 inst->base_mrf = 1;
412 inst->mlen = 2;
413 }
414
415 void
416 vec4_visitor::emit_math(enum opcode opcode,
417 dst_reg dst, src_reg src0, src_reg src1)
418 {
419 switch (opcode) {
420 case SHADER_OPCODE_POW:
421 case SHADER_OPCODE_INT_QUOTIENT:
422 case SHADER_OPCODE_INT_REMAINDER:
423 break;
424 default:
425 unreachable("not reached: unsupported binary math opcode");
426 }
427
428 if (brw->gen >= 8) {
429 emit(opcode, dst, src0, src1);
430 } else if (brw->gen >= 6) {
431 emit_math2_gen6(opcode, dst, src0, src1);
432 } else {
433 emit_math2_gen4(opcode, dst, src0, src1);
434 }
435 }
436
437 void
438 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
439 {
440 if (brw->gen < 7) {
441 unreachable("ir_unop_pack_half_2x16 should be lowered");
442 }
443
444 assert(dst.type == BRW_REGISTER_TYPE_UD);
445 assert(src0.type == BRW_REGISTER_TYPE_F);
446
447 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
448 *
449 * Because this instruction does not have a 16-bit floating-point type,
450 * the destination data type must be Word (W).
451 *
452 * The destination must be DWord-aligned and specify a horizontal stride
453 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
454 * each destination channel and the upper word is not modified.
455 *
456 * The above restriction implies that the f32to16 instruction must use
457 * align1 mode, because only in align1 mode is it possible to specify
458 * horizontal stride. We choose here to defy the hardware docs and emit
459 * align16 instructions.
460 *
461 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
462 * instructions. I was partially successful in that the code passed all
463 * tests. However, the code was dubiously correct and fragile, and the
464 * tests were not harsh enough to probe that frailty. Not trusting the
465 * code, I chose instead to remain in align16 mode in defiance of the hw
466 * docs).
467 *
468 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
469 * simulator, emitting a f32to16 in align16 mode with UD as destination
470 * data type is safe. The behavior differs from that specified in the PRM
471 * in that the upper word of each destination channel is cleared to 0.
472 */
473
474 dst_reg tmp_dst(this, glsl_type::uvec2_type);
475 src_reg tmp_src(tmp_dst);
476
477 #if 0
478 /* Verify the undocumented behavior on which the following instructions
479 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
480 * then the result of the bit-or instruction below will be incorrect.
481 *
482 * You should inspect the disasm output in order to verify that the MOV is
483 * not optimized away.
484 */
485 emit(MOV(tmp_dst, src_reg(0x12345678u)));
486 #endif
487
488 /* Give tmp the form below, where "." means untouched.
489 *
490 * w z y x w z y x
491 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
492 *
493 * That the upper word of each write-channel be 0 is required for the
494 * following bit-shift and bit-or instructions to work. Note that this
495 * relies on the undocumented hardware behavior mentioned above.
496 */
497 tmp_dst.writemask = WRITEMASK_XY;
498 emit(F32TO16(tmp_dst, src0));
499
500 /* Give the write-channels of dst the form:
501 * 0xhhhh0000
502 */
503 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
504 emit(SHL(dst, tmp_src, src_reg(16u)));
505
506 /* Finally, give the write-channels of dst the form of packHalf2x16's
507 * output:
508 * 0xhhhhllll
509 */
510 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
511 emit(OR(dst, src_reg(dst), tmp_src));
512 }
513
514 void
515 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
516 {
517 if (brw->gen < 7) {
518 unreachable("ir_unop_unpack_half_2x16 should be lowered");
519 }
520
521 assert(dst.type == BRW_REGISTER_TYPE_F);
522 assert(src0.type == BRW_REGISTER_TYPE_UD);
523
524 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
525 *
526 * Because this instruction does not have a 16-bit floating-point type,
527 * the source data type must be Word (W). The destination type must be
528 * F (Float).
529 *
530 * To use W as the source data type, we must adjust horizontal strides,
531 * which is only possible in align1 mode. All my [chadv] attempts at
532 * emitting align1 instructions for unpackHalf2x16 failed to pass the
533 * Piglit tests, so I gave up.
534 *
535 * I've verified that, on gen7 hardware and the simulator, it is safe to
536 * emit f16to32 in align16 mode with UD as source data type.
537 */
538
539 dst_reg tmp_dst(this, glsl_type::uvec2_type);
540 src_reg tmp_src(tmp_dst);
541
542 tmp_dst.writemask = WRITEMASK_X;
543 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
544
545 tmp_dst.writemask = WRITEMASK_Y;
546 emit(SHR(tmp_dst, src0, src_reg(16u)));
547
548 dst.writemask = WRITEMASK_XY;
549 emit(F16TO32(dst, tmp_src));
550 }
551
552 void
553 vec4_visitor::visit_instructions(const exec_list *list)
554 {
555 foreach_in_list(ir_instruction, ir, list) {
556 base_ir = ir;
557 ir->accept(this);
558 }
559 }
560
561
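/**
 * Returns the size of the given type in vec4 units; every scalar or vector
 * occupies a full vec4 slot, while samplers and atomic counters take none.
 */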
562 static int
563 type_size(const struct glsl_type *type)
564 {
565 unsigned int i;
566 int size;
567
568 switch (type->base_type) {
569 case GLSL_TYPE_UINT:
570 case GLSL_TYPE_INT:
571 case GLSL_TYPE_FLOAT:
572 case GLSL_TYPE_BOOL:
573 if (type->is_matrix()) {
574 return type->matrix_columns;
575 } else {
576 /* Regardless of the size of the vector, it gets a vec4. This is bad
577 * packing for things like floats, but otherwise arrays become a
578 * mess. Hopefully a later pass over the code can pack scalars
579 * down if appropriate.
580 */
581 return 1;
582 }
583 case GLSL_TYPE_ARRAY:
584 assert(type->length > 0);
585 return type_size(type->fields.array) * type->length;
586 case GLSL_TYPE_STRUCT:
587 size = 0;
588 for (i = 0; i < type->length; i++) {
589 size += type_size(type->fields.structure[i].type);
590 }
591 return size;
592 case GLSL_TYPE_SAMPLER:
593 /* Samplers take up no register space, since they're baked in at
594 * link time.
595 */
596 return 0;
597 case GLSL_TYPE_ATOMIC_UINT:
598 return 0;
599 case GLSL_TYPE_IMAGE:
600 case GLSL_TYPE_VOID:
601 case GLSL_TYPE_ERROR:
602 case GLSL_TYPE_INTERFACE:
603 unreachable("not reached");
604 }
605
606 return 0;
607 }
608
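/**
 * Allocate a new virtual GRF of the given size (in vec4 registers), growing
 * the size and offset bookkeeping arrays as needed, and return its index.
 */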
609 int
610 vec4_visitor::virtual_grf_alloc(int size)
611 {
612 if (virtual_grf_array_size <= virtual_grf_count) {
613 if (virtual_grf_array_size == 0)
614 virtual_grf_array_size = 16;
615 else
616 virtual_grf_array_size *= 2;
617 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
618 virtual_grf_array_size);
619 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
620 virtual_grf_array_size);
621 }
622 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
623 virtual_grf_reg_count += size;
624 virtual_grf_sizes[virtual_grf_count] = size;
625 return virtual_grf_count++;
626 }
627
628 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
629 {
630 init();
631
632 this->file = GRF;
633 this->reg = v->virtual_grf_alloc(type_size(type));
634
635 if (type->is_array() || type->is_record()) {
636 this->swizzle = BRW_SWIZZLE_NOOP;
637 } else {
638 this->swizzle = swizzle_for_size(type->vector_elements);
639 }
640
641 this->type = brw_type_for_base_type(type);
642 }
643
644 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
645 {
646 assert(size > 0);
647
648 init();
649
650 this->file = GRF;
651 this->reg = v->virtual_grf_alloc(type_size(type) * size);
652
653 this->swizzle = BRW_SWIZZLE_NOOP;
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
659 {
660 init();
661
662 this->file = GRF;
663 this->reg = v->virtual_grf_alloc(type_size(type));
664
665 if (type->is_array() || type->is_record()) {
666 this->writemask = WRITEMASK_XYZW;
667 } else {
668 this->writemask = (1 << type->vector_elements) - 1;
669 }
670
671 this->type = brw_type_for_base_type(type);
672 }
673
674 /* Our support for uniforms is piggy-backed on the struct
675 * gl_fragment_program, because that's where the values actually
676 * get stored, rather than in some global gl_shader_program uniform
677 * store.
678 */
679 void
680 vec4_visitor::setup_uniform_values(ir_variable *ir)
681 {
682 int namelen = strlen(ir->name);
683
684 /* The data for our (non-builtin) uniforms is stored in a series of
685 * gl_uniform_driver_storage structs for each subcomponent that
686 * glGetUniformLocation() could name. We know it's been set up in the same
687 * order we'd walk the type, so walk the list of storage and find anything
688 * with our name, or the prefix of a component that starts with our name.
689 */
690 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
691 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
692
693 if (strncmp(ir->name, storage->name, namelen) != 0 ||
694 (storage->name[namelen] != 0 &&
695 storage->name[namelen] != '.' &&
696 storage->name[namelen] != '[')) {
697 continue;
698 }
699
700 gl_constant_value *components = storage->storage;
701 unsigned vector_count = (MAX2(storage->array_elements, 1) *
702 storage->type->matrix_columns);
703
704 for (unsigned s = 0; s < vector_count; s++) {
705 assert(uniforms < uniform_array_size);
706 uniform_vector_size[uniforms] = storage->type->vector_elements;
707
708 int i;
709 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
710 stage_prog_data->param[uniforms * 4 + i] = components;
711 components++;
712 }
713 for (; i < 4; i++) {
714 static gl_constant_value zero = { 0.0 };
715 stage_prog_data->param[uniforms * 4 + i] = &zero;
716 }
717
718 uniforms++;
719 }
720 }
721 }
722
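/* Set up one vec4 uniform per user clip plane enabled in the compile key,
 * pointing at the clip plane coefficients selected for this context.
 */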
723 void
724 vec4_visitor::setup_uniform_clipplane_values()
725 {
726 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
727
728 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
729 assert(this->uniforms < uniform_array_size);
730 this->uniform_vector_size[this->uniforms] = 4;
731 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
732 this->userplane[i].type = BRW_REGISTER_TYPE_F;
733 for (int j = 0; j < 4; ++j) {
734 stage_prog_data->param[this->uniforms * 4 + j] =
735 (gl_constant_value *) &clip_planes[i][j];
736 }
737 ++this->uniforms;
738 }
739 }
740
741 /* Our support for builtin uniforms is even scarier than non-builtin.
742 * It sits on top of the PROG_STATE_VAR parameters that are
743 * automatically updated from GL context state.
744 */
745 void
746 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
747 {
748 const ir_state_slot *const slots = ir->get_state_slots();
749 assert(slots != NULL);
750
751 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
752 /* This state reference has already been set up by ir_to_mesa,
753 * but we'll get the same index back here. We can reference
754 * ParameterValues directly, since unlike brw_fs.cpp, we never
755 * add new state references during compile.
756 */
757 int index = _mesa_add_state_reference(this->prog->Parameters,
758 (gl_state_index *)slots[i].tokens);
759 gl_constant_value *values =
760 &this->prog->Parameters->ParameterValues[index][0];
761
762 assert(this->uniforms < uniform_array_size);
763 this->uniform_vector_size[this->uniforms] = 0;
764 /* Add each of the unique swizzled channels of the element.
765 * This will end up matching the size of the glsl_type of this field.
766 */
767 int last_swiz = -1;
768 for (unsigned int j = 0; j < 4; j++) {
769 int swiz = GET_SWZ(slots[i].swizzle, j);
770 last_swiz = swiz;
771
772 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
773 assert(this->uniforms < uniform_array_size);
774 if (swiz <= last_swiz)
775 this->uniform_vector_size[this->uniforms]++;
776 }
777 this->uniforms++;
778 }
779 }
780
781 dst_reg *
782 vec4_visitor::variable_storage(ir_variable *var)
783 {
784 return (dst_reg *)hash_table_find(this->variable_ht, var);
785 }
786
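/**
 * Evaluate a boolean rvalue and emit instructions that leave the result in
 * the flag register, reporting through *predicate which predication mode
 * (NORMAL, ALL4H or ANY4H) a consumer of the flag should use.
 */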
787 void
788 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
789 enum brw_predicate *predicate)
790 {
791 ir_expression *expr = ir->as_expression();
792
793 *predicate = BRW_PREDICATE_NORMAL;
794
795 if (expr && expr->operation != ir_binop_ubo_load) {
796 src_reg op[3];
797 vec4_instruction *inst;
798
799 assert(expr->get_num_operands() <= 3);
800 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
801 expr->operands[i]->accept(this);
802 op[i] = this->result;
803
804 resolve_ud_negate(&op[i]);
805 }
806
807 switch (expr->operation) {
808 case ir_unop_logic_not:
809 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
810 inst->conditional_mod = BRW_CONDITIONAL_Z;
811 break;
812
813 case ir_binop_logic_xor:
814 inst = emit(XOR(dst_null_d(), op[0], op[1]));
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 break;
817
818 case ir_binop_logic_or:
819 inst = emit(OR(dst_null_d(), op[0], op[1]));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 break;
822
823 case ir_binop_logic_and:
824 inst = emit(AND(dst_null_d(), op[0], op[1]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 break;
827
828 case ir_unop_f2b:
829 if (brw->gen >= 6) {
830 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
831 } else {
832 inst = emit(MOV(dst_null_f(), op[0]));
833 inst->conditional_mod = BRW_CONDITIONAL_NZ;
834 }
835 break;
836
837 case ir_unop_i2b:
838 if (brw->gen >= 6) {
839 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
840 } else {
841 inst = emit(MOV(dst_null_d(), op[0]));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 }
844 break;
845
846 case ir_binop_all_equal:
847 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
848 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
849 break;
850
851 case ir_binop_any_nequal:
852 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
853 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
854 break;
855
856 case ir_unop_any:
857 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
858 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
859 break;
860
861 case ir_binop_greater:
862 case ir_binop_gequal:
863 case ir_binop_less:
864 case ir_binop_lequal:
865 case ir_binop_equal:
866 case ir_binop_nequal:
867 emit(CMP(dst_null_d(), op[0], op[1],
868 brw_conditional_for_comparison(expr->operation)));
869 break;
870
871 case ir_triop_csel: {
872 /* Expand the boolean condition into the flag register. */
873 inst = emit(MOV(dst_null_d(), op[0]));
874 inst->conditional_mod = BRW_CONDITIONAL_NZ;
875
876 /* Select which boolean to return. */
877 dst_reg temp(this, expr->operands[1]->type);
878 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
879 inst->predicate = BRW_PREDICATE_NORMAL;
880
881 /* Expand the result to a condition code. */
882 inst = emit(MOV(dst_null_d(), src_reg(temp)));
883 inst->conditional_mod = BRW_CONDITIONAL_NZ;
884 break;
885 }
886
887 default:
888 unreachable("not reached");
889 }
890 return;
891 }
892
893 ir->accept(this);
894
895 resolve_ud_negate(&this->result);
896
897 if (brw->gen >= 6) {
898 vec4_instruction *inst = emit(AND(dst_null_d(),
899 this->result, src_reg(1)));
900 inst->conditional_mod = BRW_CONDITIONAL_NZ;
901 } else {
902 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
903 inst->conditional_mod = BRW_CONDITIONAL_NZ;
904 }
905 }
906
907 /**
908 * Emit a gen6 IF statement with the comparison folded into the IF
909 * instruction.
910 */
911 void
912 vec4_visitor::emit_if_gen6(ir_if *ir)
913 {
914 ir_expression *expr = ir->condition->as_expression();
915
916 if (expr && expr->operation != ir_binop_ubo_load) {
917 src_reg op[3];
918 dst_reg temp;
919
920 assert(expr->get_num_operands() <= 3);
921 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
922 expr->operands[i]->accept(this);
923 op[i] = this->result;
924 }
925
926 switch (expr->operation) {
927 case ir_unop_logic_not:
928 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
929 return;
930
931 case ir_binop_logic_xor:
932 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
933 return;
934
935 case ir_binop_logic_or:
936 temp = dst_reg(this, glsl_type::bool_type);
937 emit(OR(temp, op[0], op[1]));
938 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
939 return;
940
941 case ir_binop_logic_and:
942 temp = dst_reg(this, glsl_type::bool_type);
943 emit(AND(temp, op[0], op[1]));
944 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
945 return;
946
947 case ir_unop_f2b:
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
949 return;
950
951 case ir_unop_i2b:
952 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
953 return;
954
955 case ir_binop_greater:
956 case ir_binop_gequal:
957 case ir_binop_less:
958 case ir_binop_lequal:
959 case ir_binop_equal:
960 case ir_binop_nequal:
961 emit(IF(op[0], op[1],
962 brw_conditional_for_comparison(expr->operation)));
963 return;
964
965 case ir_binop_all_equal:
966 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
967 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
968 return;
969
970 case ir_binop_any_nequal:
971 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
972 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
973 return;
974
975 case ir_unop_any:
976 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
977 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
978 return;
979
980 case ir_triop_csel: {
981 /* Expand the boolean condition into the flag register. */
982 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
983 inst->conditional_mod = BRW_CONDITIONAL_NZ;
984
985 /* Select which boolean to return. */
986 dst_reg temp(this, expr->operands[1]->type);
987 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
988 inst->predicate = BRW_PREDICATE_NORMAL;
989
990 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
991 return;
992 }
993
994 default:
995 unreachable("not reached");
996 }
997 return;
998 }
999
1000 ir->condition->accept(this);
1001
1002 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
1003 }
1004
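/* Allocate backing storage for a variable declaration (input, output,
 * temporary, uniform or system value) and remember it in variable_ht.
 */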
1005 void
1006 vec4_visitor::visit(ir_variable *ir)
1007 {
1008 dst_reg *reg = NULL;
1009
1010 if (variable_storage(ir))
1011 return;
1012
1013 switch (ir->data.mode) {
1014 case ir_var_shader_in:
1015 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
1016 break;
1017
1018 case ir_var_shader_out:
1019 reg = new(mem_ctx) dst_reg(this, ir->type);
1020
1021 for (int i = 0; i < type_size(ir->type); i++) {
1022 output_reg[ir->data.location + i] = *reg;
1023 output_reg[ir->data.location + i].reg_offset = i;
1024 output_reg[ir->data.location + i].type =
1025 brw_type_for_base_type(ir->type->get_scalar_type());
1026 output_reg_annotation[ir->data.location + i] = ir->name;
1027 }
1028 break;
1029
1030 case ir_var_auto:
1031 case ir_var_temporary:
1032 reg = new(mem_ctx) dst_reg(this, ir->type);
1033 break;
1034
1035 case ir_var_uniform:
1036 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1037
1038 /* Thanks to the lower_ubo_reference pass, we will see only
1039 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1040 * variables, so no need for them to be in variable_ht.
1041 *
1042 * Some uniforms, such as samplers and atomic counters, have no actual
1043 * storage, so we should ignore them.
1044 */
1045 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
1046 return;
1047
1048 /* Track how big the whole uniform variable is, in case we need to put a
1049 * copy of its data into pull constants for array access.
1050 */
1051 assert(this->uniforms < uniform_array_size);
1052 this->uniform_size[this->uniforms] = type_size(ir->type);
1053
1054 if (!strncmp(ir->name, "gl_", 3)) {
1055 setup_builtin_uniform_values(ir);
1056 } else {
1057 setup_uniform_values(ir);
1058 }
1059 break;
1060
1061 case ir_var_system_value:
1062 reg = make_reg_for_system_value(ir);
1063 break;
1064
1065 default:
1066 unreachable("not reached");
1067 }
1068
1069 reg->type = brw_type_for_base_type(ir->type);
1070 hash_table_insert(this->variable_ht, reg, ir);
1071 }
1072
1073 void
1074 vec4_visitor::visit(ir_loop *ir)
1075 {
1076 /* We don't want debugging output to print the whole body of the
1077 * loop as the annotation.
1078 */
1079 this->base_ir = NULL;
1080
1081 emit(BRW_OPCODE_DO);
1082
1083 visit_instructions(&ir->body_instructions);
1084
1085 emit(BRW_OPCODE_WHILE);
1086 }
1087
1088 void
1089 vec4_visitor::visit(ir_loop_jump *ir)
1090 {
1091 switch (ir->mode) {
1092 case ir_loop_jump::jump_break:
1093 emit(BRW_OPCODE_BREAK);
1094 break;
1095 case ir_loop_jump::jump_continue:
1096 emit(BRW_OPCODE_CONTINUE);
1097 break;
1098 }
1099 }
1100
1101
1102 void
1103 vec4_visitor::visit(ir_function_signature *)
1104 {
1105 unreachable("not reached");
1106 }
1107
1108 void
1109 vec4_visitor::visit(ir_function *ir)
1110 {
1111 /* Ignore function bodies other than main() -- we shouldn't see calls to
1112 * them since they should all be inlined.
1113 */
1114 if (strcmp(ir->name, "main") == 0) {
1115 const ir_function_signature *sig;
1116 exec_list empty;
1117
1118 sig = ir->matching_signature(NULL, &empty, false);
1119
1120 assert(sig);
1121
1122 visit_instructions(&sig->body);
1123 }
1124 }
1125
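/**
 * Try to rewrite an add of a multiply as a single MAD.  Returns false
 * without emitting anything if the expression doesn't match or the
 * hardware/type combination can't use MAD.
 */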
1126 bool
1127 vec4_visitor::try_emit_mad(ir_expression *ir)
1128 {
1129 /* 3-src instructions were introduced in gen6. */
1130 if (brw->gen < 6)
1131 return false;
1132
1133 /* MAD can only handle floating-point data. */
1134 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1135 return false;
1136
1137 ir_rvalue *nonmul = ir->operands[1];
1138 ir_expression *mul = ir->operands[0]->as_expression();
1139
1140 if (!mul || mul->operation != ir_binop_mul) {
1141 nonmul = ir->operands[0];
1142 mul = ir->operands[1]->as_expression();
1143
1144 if (!mul || mul->operation != ir_binop_mul)
1145 return false;
1146 }
1147
1148 nonmul->accept(this);
1149 src_reg src0 = fix_3src_operand(this->result);
1150
1151 mul->operands[0]->accept(this);
1152 src_reg src1 = fix_3src_operand(this->result);
1153
1154 mul->operands[1]->accept(this);
1155 src_reg src2 = fix_3src_operand(this->result);
1156
1157 this->result = src_reg(this, ir->type);
1158 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1159
1160 return true;
1161 }
1162
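/**
 * Try to emit b2f(comparison) as a CMP followed by a predicated SEL of
 * 1.0f, relying on CMP writing 0 to the destination when false.
 */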
1163 bool
1164 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1165 {
1166 /* This optimization relies on CMP setting the destination to 0 when
1167 * false. Early hardware only sets the least significant bit, and
1168 * leaves the other bits undefined. So we can't use it.
1169 */
1170 if (brw->gen < 6)
1171 return false;
1172
1173 ir_expression *const cmp = ir->operands[0]->as_expression();
1174
1175 if (cmp == NULL)
1176 return false;
1177
1178 switch (cmp->operation) {
1179 case ir_binop_less:
1180 case ir_binop_greater:
1181 case ir_binop_lequal:
1182 case ir_binop_gequal:
1183 case ir_binop_equal:
1184 case ir_binop_nequal:
1185 break;
1186
1187 default:
1188 return false;
1189 }
1190
1191 cmp->operands[0]->accept(this);
1192 const src_reg cmp_src0 = this->result;
1193
1194 cmp->operands[1]->accept(this);
1195 const src_reg cmp_src1 = this->result;
1196
1197 this->result = src_reg(this, ir->type);
1198
1199 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1200 brw_conditional_for_comparison(cmp->operation)));
1201
1202 /* If the comparison is false, this->result will just happen to be zero.
1203 */
1204 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1205 this->result, src_reg(1.0f));
1206 inst->predicate = BRW_PREDICATE_NORMAL;
1207 inst->predicate_inverse = true;
1208
1209 return true;
1210 }
1211
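/**
 * Emit a MIN/MAX: a single SEL with a conditional mod on gen6+, or a CMP
 * followed by a predicated SEL on earlier hardware.
 */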
1212 void
1213 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1214 src_reg src0, src_reg src1)
1215 {
1216 vec4_instruction *inst;
1217
1218 if (brw->gen >= 6) {
1219 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1220 inst->conditional_mod = conditionalmod;
1221 } else {
1222 emit(CMP(dst, src0, src1, conditionalmod));
1223
1224 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1225 inst->predicate = BRW_PREDICATE_NORMAL;
1226 }
1227 }
1228
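/**
 * Emit a linear interpolation x*(1-a) + y*a, using the LRP instruction on
 * gen6+ and an expanded MUL/ADD sequence on older generations.
 */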
1229 void
1230 vec4_visitor::emit_lrp(const dst_reg &dst,
1231 const src_reg &x, const src_reg &y, const src_reg &a)
1232 {
1233 if (brw->gen >= 6) {
1234 /* Note that the instruction's argument order is reversed from GLSL
1235 * and the IR.
1236 */
1237 emit(LRP(dst,
1238 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1239 } else {
1240 /* Earlier generations don't support three source operations, so we
1241 * need to emit x*(1-a) + y*a.
1242 */
1243 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1244 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1245 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1246 y_times_a.writemask = dst.writemask;
1247 one_minus_a.writemask = dst.writemask;
1248 x_times_one_minus_a.writemask = dst.writemask;
1249
1250 emit(MUL(y_times_a, y, a));
1251 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1252 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1253 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1254 }
1255 }
1256
1257 void
1258 vec4_visitor::visit(ir_expression *ir)
1259 {
1260 unsigned int operand;
1261 src_reg op[Elements(ir->operands)];
1262 src_reg result_src;
1263 dst_reg result_dst;
1264 vec4_instruction *inst;
1265
1266 if (ir->operation == ir_binop_add) {
1267 if (try_emit_mad(ir))
1268 return;
1269 }
1270
1271 if (ir->operation == ir_unop_b2f) {
1272 if (try_emit_b2f_of_compare(ir))
1273 return;
1274 }
1275
1276 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1277 this->result.file = BAD_FILE;
1278 ir->operands[operand]->accept(this);
1279 if (this->result.file == BAD_FILE) {
1280 fprintf(stderr, "Failed to get tree for expression operand:\n");
1281 ir->operands[operand]->fprint(stderr);
1282 exit(1);
1283 }
1284 op[operand] = this->result;
1285
1286 /* Matrix expression operands should have been broken down to vector
1287 * operations already.
1288 */
1289 assert(!ir->operands[operand]->type->is_matrix());
1290 }
1291
1292 int vector_elements = ir->operands[0]->type->vector_elements;
1293 if (ir->operands[1]) {
1294 vector_elements = MAX2(vector_elements,
1295 ir->operands[1]->type->vector_elements);
1296 }
1297
1298 this->result.file = BAD_FILE;
1299
1300 /* Storage for our result. Ideally for an assignment we'd be using
1301 * the actual storage for the result here, instead.
1302 */
1303 result_src = src_reg(this, ir->type);
1304 /* convenience for the emit functions below. */
1305 result_dst = dst_reg(result_src);
1306 /* If nothing special happens, this is the result. */
1307 this->result = result_src;
1308 /* Limit writes to the channels that will be used by result_src later.
1309 * This does limit this temp's use as a temporary for multi-instruction
1310 * sequences.
1311 */
1312 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1313
1314 switch (ir->operation) {
1315 case ir_unop_logic_not:
1316 if (ctx->Const.UniformBooleanTrue != 1) {
1317 emit(NOT(result_dst, op[0]));
1318 } else {
1319 emit(XOR(result_dst, op[0], src_reg(1u)));
1320 }
1321 break;
1322 case ir_unop_neg:
1323 op[0].negate = !op[0].negate;
1324 emit(MOV(result_dst, op[0]));
1325 break;
1326 case ir_unop_abs:
1327 op[0].abs = true;
1328 op[0].negate = false;
1329 emit(MOV(result_dst, op[0]));
1330 break;
1331
1332 case ir_unop_sign:
1333 if (ir->type->is_float()) {
1334 /* AND(val, 0x80000000) gives the sign bit.
1335 *
1336 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1337 * zero.
1338 */
1339 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1340
1341 op[0].type = BRW_REGISTER_TYPE_UD;
1342 result_dst.type = BRW_REGISTER_TYPE_UD;
1343 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1344
1345 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1346 inst->predicate = BRW_PREDICATE_NORMAL;
1347
1348 this->result.type = BRW_REGISTER_TYPE_F;
1349 } else {
1350 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1351 * -> non-negative val generates 0x00000000.
1352 * Predicated OR sets 1 if val is positive.
1353 */
1354 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1355
1356 emit(ASR(result_dst, op[0], src_reg(31)));
1357
1358 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1359 inst->predicate = BRW_PREDICATE_NORMAL;
1360 }
1361 break;
1362
1363 case ir_unop_rcp:
1364 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1365 break;
1366
1367 case ir_unop_exp2:
1368 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1369 break;
1370 case ir_unop_log2:
1371 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1372 break;
1373 case ir_unop_exp:
1374 case ir_unop_log:
1375 unreachable("not reached: should be handled by ir_explog_to_explog2");
1376 case ir_unop_sin:
1377 case ir_unop_sin_reduced:
1378 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1379 break;
1380 case ir_unop_cos:
1381 case ir_unop_cos_reduced:
1382 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1383 break;
1384
1385 case ir_unop_dFdx:
1386 case ir_unop_dFdx_coarse:
1387 case ir_unop_dFdx_fine:
1388 case ir_unop_dFdy:
1389 case ir_unop_dFdy_coarse:
1390 case ir_unop_dFdy_fine:
1391 unreachable("derivatives not valid in vertex shader");
1392
1393 case ir_unop_bitfield_reverse:
1394 emit(BFREV(result_dst, op[0]));
1395 break;
1396 case ir_unop_bit_count:
1397 emit(CBIT(result_dst, op[0]));
1398 break;
1399 case ir_unop_find_msb: {
1400 src_reg temp = src_reg(this, glsl_type::uint_type);
1401
1402 inst = emit(FBH(dst_reg(temp), op[0]));
1403 inst->dst.writemask = WRITEMASK_XYZW;
1404
1405 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1406 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1407 * subtract the result from 31 to convert the MSB count into an LSB count.
1408 */
1409
1410 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1411 temp.swizzle = BRW_SWIZZLE_NOOP;
1412 emit(MOV(result_dst, temp));
1413
1414 src_reg src_tmp = src_reg(result_dst);
1415 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1416
1417 src_tmp.negate = true;
1418 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1419 inst->predicate = BRW_PREDICATE_NORMAL;
1420 break;
1421 }
1422 case ir_unop_find_lsb:
1423 emit(FBL(result_dst, op[0]));
1424 break;
1425 case ir_unop_saturate:
1426 inst = emit(MOV(result_dst, op[0]));
1427 inst->saturate = true;
1428 break;
1429
1430 case ir_unop_noise:
1431 unreachable("not reached: should be handled by lower_noise");
1432
1433 case ir_binop_add:
1434 emit(ADD(result_dst, op[0], op[1]));
1435 break;
1436 case ir_binop_sub:
1437 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1438
1439 case ir_binop_mul:
1440 if (brw->gen < 8 && ir->type->is_integer()) {
1441 /* For integer multiplication, the MUL uses the low 16 bits of one of
1442 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1443 * accumulates in the contribution of the upper 16 bits of that
1444 * operand. If we can determine that one of the args is in the low
1445 * 16 bits, though, we can just emit a single MUL.
1446 */
1447 if (ir->operands[0]->is_uint16_constant()) {
1448 if (brw->gen < 7)
1449 emit(MUL(result_dst, op[0], op[1]));
1450 else
1451 emit(MUL(result_dst, op[1], op[0]));
1452 } else if (ir->operands[1]->is_uint16_constant()) {
1453 if (brw->gen < 7)
1454 emit(MUL(result_dst, op[1], op[0]));
1455 else
1456 emit(MUL(result_dst, op[0], op[1]));
1457 } else {
1458 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1459
1460 emit(MUL(acc, op[0], op[1]));
1461 emit(MACH(dst_null_d(), op[0], op[1]));
1462 emit(MOV(result_dst, src_reg(acc)));
1463 }
1464 } else {
1465 emit(MUL(result_dst, op[0], op[1]));
1466 }
1467 break;
1468 case ir_binop_imul_high: {
1469 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1470
1471 emit(MUL(acc, op[0], op[1]));
1472 emit(MACH(result_dst, op[0], op[1]));
1473 break;
1474 }
1475 case ir_binop_div:
1476 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1477 assert(ir->type->is_integer());
1478 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1479 break;
1480 case ir_binop_carry: {
1481 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1482
1483 emit(ADDC(dst_null_ud(), op[0], op[1]));
1484 emit(MOV(result_dst, src_reg(acc)));
1485 break;
1486 }
1487 case ir_binop_borrow: {
1488 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1489
1490 emit(SUBB(dst_null_ud(), op[0], op[1]));
1491 emit(MOV(result_dst, src_reg(acc)));
1492 break;
1493 }
1494 case ir_binop_mod:
1495 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1496 assert(ir->type->is_integer());
1497 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1498 break;
1499
1500 case ir_binop_less:
1501 case ir_binop_greater:
1502 case ir_binop_lequal:
1503 case ir_binop_gequal:
1504 case ir_binop_equal:
1505 case ir_binop_nequal: {
1506 emit(CMP(result_dst, op[0], op[1],
1507 brw_conditional_for_comparison(ir->operation)));
1508 if (ctx->Const.UniformBooleanTrue == 1) {
1509 emit(AND(result_dst, result_src, src_reg(1u)));
1510 }
1511 break;
1512 }
1513
1514 case ir_binop_all_equal:
1515 /* "==" operator producing a scalar boolean. */
1516 if (ir->operands[0]->type->is_vector() ||
1517 ir->operands[1]->type->is_vector()) {
1518 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1519 emit(MOV(result_dst, src_reg(0)));
1520 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1521 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1522 } else {
1523 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1524 if (ctx->Const.UniformBooleanTrue == 1) {
1525 emit(AND(result_dst, result_src, src_reg(1u)));
1526 }
1527 }
1528 break;
1529 case ir_binop_any_nequal:
1530 /* "!=" operator producing a scalar boolean. */
1531 if (ir->operands[0]->type->is_vector() ||
1532 ir->operands[1]->type->is_vector()) {
1533 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1534
1535 emit(MOV(result_dst, src_reg(0)));
1536 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1537 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1538 } else {
1539 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1540 if (ctx->Const.UniformBooleanTrue == 1) {
1541 emit(AND(result_dst, result_src, src_reg(1u)));
1542 }
1543 }
1544 break;
1545
1546 case ir_unop_any:
1547 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1548 emit(MOV(result_dst, src_reg(0)));
1549
1550 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1551 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1552 break;
1553
1554 case ir_binop_logic_xor:
1555 emit(XOR(result_dst, op[0], op[1]));
1556 break;
1557
1558 case ir_binop_logic_or:
1559 emit(OR(result_dst, op[0], op[1]));
1560 break;
1561
1562 case ir_binop_logic_and:
1563 emit(AND(result_dst, op[0], op[1]));
1564 break;
1565
1566 case ir_binop_dot:
1567 assert(ir->operands[0]->type->is_vector());
1568 assert(ir->operands[0]->type == ir->operands[1]->type);
1569 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1570 break;
1571
1572 case ir_unop_sqrt:
1573 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1574 break;
1575 case ir_unop_rsq:
1576 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1577 break;
1578
1579 case ir_unop_bitcast_i2f:
1580 case ir_unop_bitcast_u2f:
1581 this->result = op[0];
1582 this->result.type = BRW_REGISTER_TYPE_F;
1583 break;
1584
1585 case ir_unop_bitcast_f2i:
1586 this->result = op[0];
1587 this->result.type = BRW_REGISTER_TYPE_D;
1588 break;
1589
1590 case ir_unop_bitcast_f2u:
1591 this->result = op[0];
1592 this->result.type = BRW_REGISTER_TYPE_UD;
1593 break;
1594
1595 case ir_unop_i2f:
1596 case ir_unop_i2u:
1597 case ir_unop_u2i:
1598 case ir_unop_u2f:
1599 case ir_unop_f2i:
1600 case ir_unop_f2u:
1601 emit(MOV(result_dst, op[0]));
1602 break;
1603 case ir_unop_b2i:
1604 if (ctx->Const.UniformBooleanTrue != 1) {
1605 emit(AND(result_dst, op[0], src_reg(1u)));
1606 } else {
1607 emit(MOV(result_dst, op[0]));
1608 }
1609 break;
1610 case ir_unop_b2f:
1611 if (ctx->Const.UniformBooleanTrue != 1) {
1612 op[0].type = BRW_REGISTER_TYPE_UD;
1613 result_dst.type = BRW_REGISTER_TYPE_UD;
1614 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1615 result_dst.type = BRW_REGISTER_TYPE_F;
1616 } else {
1617 emit(MOV(result_dst, op[0]));
1618 }
1619 break;
1620 case ir_unop_f2b:
1621 case ir_unop_i2b:
1622 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1623 if (ctx->Const.UniformBooleanTrue == 1) {
1624 emit(AND(result_dst, result_src, src_reg(1u)));
1625 }
1626 break;
1627
1628 case ir_unop_trunc:
1629 emit(RNDZ(result_dst, op[0]));
1630 break;
1631 case ir_unop_ceil:
1632 op[0].negate = !op[0].negate;
1633 inst = emit(RNDD(result_dst, op[0]));
1634 this->result.negate = true;
1635 break;
1636 case ir_unop_floor:
1637 inst = emit(RNDD(result_dst, op[0]));
1638 break;
1639 case ir_unop_fract:
1640 inst = emit(FRC(result_dst, op[0]));
1641 break;
1642 case ir_unop_round_even:
1643 emit(RNDE(result_dst, op[0]));
1644 break;
1645
1646 case ir_binop_min:
1647 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1648 break;
1649 case ir_binop_max:
1650 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1651 break;
1652
1653 case ir_binop_pow:
1654 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1655 break;
1656
1657 case ir_unop_bit_not:
1658 inst = emit(NOT(result_dst, op[0]));
1659 break;
1660 case ir_binop_bit_and:
1661 inst = emit(AND(result_dst, op[0], op[1]));
1662 break;
1663 case ir_binop_bit_xor:
1664 inst = emit(XOR(result_dst, op[0], op[1]));
1665 break;
1666 case ir_binop_bit_or:
1667 inst = emit(OR(result_dst, op[0], op[1]));
1668 break;
1669
1670 case ir_binop_lshift:
1671 inst = emit(SHL(result_dst, op[0], op[1]));
1672 break;
1673
1674 case ir_binop_rshift:
1675 if (ir->type->base_type == GLSL_TYPE_INT)
1676 inst = emit(ASR(result_dst, op[0], op[1]));
1677 else
1678 inst = emit(SHR(result_dst, op[0], op[1]));
1679 break;
1680
1681 case ir_binop_bfm:
1682 emit(BFI1(result_dst, op[0], op[1]));
1683 break;
1684
1685 case ir_binop_ubo_load: {
1686 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1687 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1688 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1689 src_reg offset;
1690
1691 /* Now, load the vector from that offset. */
1692 assert(ir->type->is_vector() || ir->type->is_scalar());
1693
1694 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1695 packed_consts.type = result.type;
1696 src_reg surf_index;
1697
1698 if (const_uniform_block) {
1699 /* The block index is a constant, so just emit the binding table entry
1700 * as an immediate.
1701 */
1702 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1703 const_uniform_block->value.u[0]);
1704 } else {
1705 /* The block index is not a constant. Evaluate the index expression
1706 * per-channel and add the base UBO index; the generator will select
1707 * a value from any live channel.
1708 */
1709 surf_index = src_reg(this, glsl_type::uint_type);
1710 emit(ADD(dst_reg(surf_index), op[0],
1711 src_reg(prog_data->base.binding_table.ubo_start)));
1712
1713 /* Assume this may touch any UBO. It would be nice to provide
1714 * a tighter bound, but the array information is already lowered away.
1715 */
1716 brw_mark_surface_used(&prog_data->base,
1717 prog_data->base.binding_table.ubo_start +
1718 shader_prog->NumUniformBlocks - 1);
1719 }
1720
1721 if (const_offset_ir) {
1722 if (brw->gen >= 8) {
1723 /* Store the offset in a GRF so we can send-from-GRF. */
1724 offset = src_reg(this, glsl_type::int_type);
1725 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1726 } else {
1727 /* Immediates are fine on older generations since they'll be moved
1728 * to a (potentially fake) MRF at the generator level.
1729 */
1730 offset = src_reg(const_offset / 16);
1731 }
1732 } else {
1733 offset = src_reg(this, glsl_type::uint_type);
1734 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1735 }
1736
1737 if (brw->gen >= 7) {
1738 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1739 grf_offset.type = offset.type;
1740
1741 emit(MOV(grf_offset, offset));
1742
1743 emit(new(mem_ctx) vec4_instruction(this,
1744 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1745 dst_reg(packed_consts),
1746 surf_index,
1747 src_reg(grf_offset)));
1748 } else {
1749 vec4_instruction *pull =
1750 emit(new(mem_ctx) vec4_instruction(this,
1751 VS_OPCODE_PULL_CONSTANT_LOAD,
1752 dst_reg(packed_consts),
1753 surf_index,
1754 offset));
1755 pull->base_mrf = 14;
1756 pull->mlen = 1;
1757 }
1758
1759 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1760 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1761 const_offset % 16 / 4,
1762 const_offset % 16 / 4,
1763 const_offset % 16 / 4);
1764
1765 /* UBO bools are any nonzero int. We need to convert them to use the
1766 * value of true stored in ctx->Const.UniformBooleanTrue.
1767 */
1768 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1769 emit(CMP(result_dst, packed_consts, src_reg(0u),
1770 BRW_CONDITIONAL_NZ));
1771 if (ctx->Const.UniformBooleanTrue == 1) {
1772 emit(AND(result_dst, result, src_reg(1u)));
1773 }
1774 } else {
1775 emit(MOV(result_dst, packed_consts));
1776 }
1777 break;
1778 }
1779
1780 case ir_binop_vector_extract:
1781 unreachable("should have been lowered by vec_index_to_cond_assign");
1782
1783 case ir_triop_fma:
1784 op[0] = fix_3src_operand(op[0]);
1785 op[1] = fix_3src_operand(op[1]);
1786 op[2] = fix_3src_operand(op[2]);
1787 /* Note that the instruction's argument order is reversed from GLSL
1788 * and the IR.
1789 */
1790 emit(MAD(result_dst, op[2], op[1], op[0]));
1791 break;
1792
1793 case ir_triop_lrp:
1794 emit_lrp(result_dst, op[0], op[1], op[2]);
1795 break;
1796
1797 case ir_triop_csel:
1798 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1799 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1800 inst->predicate = BRW_PREDICATE_NORMAL;
1801 break;
1802
1803 case ir_triop_bfi:
1804 op[0] = fix_3src_operand(op[0]);
1805 op[1] = fix_3src_operand(op[1]);
1806 op[2] = fix_3src_operand(op[2]);
1807 emit(BFI2(result_dst, op[0], op[1], op[2]));
1808 break;
1809
1810 case ir_triop_bitfield_extract:
1811 op[0] = fix_3src_operand(op[0]);
1812 op[1] = fix_3src_operand(op[1]);
1813 op[2] = fix_3src_operand(op[2]);
1814 /* Note that the instruction's argument order is reversed from GLSL
1815 * and the IR.
1816 */
1817 emit(BFE(result_dst, op[2], op[1], op[0]));
1818 break;
1819
1820 case ir_triop_vector_insert:
1821 unreachable("should have been lowered by lower_vector_insert");
1822
1823 case ir_quadop_bitfield_insert:
1824 unreachable("not reached: should be handled by "
1825 "bitfield_insert_to_bfm_bfi\n");
1826
1827 case ir_quadop_vector:
1828 unreachable("not reached: should be handled by lower_quadop_vector");
1829
1830 case ir_unop_pack_half_2x16:
1831 emit_pack_half_2x16(result_dst, op[0]);
1832 break;
1833 case ir_unop_unpack_half_2x16:
1834 emit_unpack_half_2x16(result_dst, op[0]);
1835 break;
1836 case ir_unop_pack_snorm_2x16:
1837 case ir_unop_pack_snorm_4x8:
1838 case ir_unop_pack_unorm_2x16:
1839 case ir_unop_pack_unorm_4x8:
1840 case ir_unop_unpack_snorm_2x16:
1841 case ir_unop_unpack_snorm_4x8:
1842 case ir_unop_unpack_unorm_2x16:
1843 case ir_unop_unpack_unorm_4x8:
1844 unreachable("not reached: should be handled by lower_packing_builtins");
1845 case ir_unop_unpack_half_2x16_split_x:
1846 case ir_unop_unpack_half_2x16_split_y:
1847 case ir_binop_pack_half_2x16_split:
1848 case ir_unop_interpolate_at_centroid:
1849 case ir_binop_interpolate_at_sample:
1850 case ir_binop_interpolate_at_offset:
1851 unreachable("not reached: should not occur in vertex shader");
1852 case ir_binop_ldexp:
1853 unreachable("not reached: should be handled by ldexp_to_arith()");
1854 }
1855 }
1856
1857
1858 void
1859 vec4_visitor::visit(ir_swizzle *ir)
1860 {
1861 src_reg src;
1862 int i = 0;
1863 int swizzle[4];
1864
1865 /* Note that this is only swizzles in expressions, not those on the left
1866 * hand side of an assignment, which do write masking. See ir_assignment
1867 * for that.
1868 */
1869
1870 ir->val->accept(this);
1871 src = this->result;
1872 assert(src.file != BAD_FILE);
1873
1874 for (i = 0; i < ir->type->vector_elements; i++) {
1875 switch (i) {
1876 case 0:
1877 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1878 break;
1879 case 1:
1880 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1881 break;
1882 case 2:
1883 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1884 break;
1885 case 3:
1886 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1887 break;
1888 }
1889 }
1890 for (; i < 4; i++) {
1891 /* Replicate the last channel out. */
1892 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1893 }
1894
1895 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1896
1897 this->result = src;
1898 }
1899
1900 void
1901 vec4_visitor::visit(ir_dereference_variable *ir)
1902 {
1903 const struct glsl_type *type = ir->type;
1904 dst_reg *reg = variable_storage(ir->var);
1905
1906 if (!reg) {
1907 fail("Failed to find variable storage for %s\n", ir->var->name);
1908 this->result = src_reg(brw_null_reg());
1909 return;
1910 }
1911
1912 this->result = src_reg(*reg);
1913
1914 /* System values get their swizzle from the dst_reg writemask */
1915 if (ir->var->data.mode == ir_var_system_value)
1916 return;
1917
1918 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1919 this->result.swizzle = swizzle_for_size(type->vector_elements);
1920 }
1921
1922
1923 int
1924 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1925 {
1926 /* Under normal circumstances array elements are stored consecutively, so
1927 * the stride is equal to the size of the array element.
1928 */
1929 return type_size(ir->type);
1930 }
1931
1932
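/**
 * Handle array dereferences. A constant index simply advances reg_offset by
 * index * array_stride; a variable index is accumulated into src.reladdr so
 * the access can be resolved with relative addressing (or moved to scratch or
 * pull constants) later on.
 */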
1933 void
1934 vec4_visitor::visit(ir_dereference_array *ir)
1935 {
1936 ir_constant *constant_index;
1937 src_reg src;
1938 int array_stride = compute_array_stride(ir);
1939
1940 constant_index = ir->array_index->constant_expression_value();
1941
1942 ir->array->accept(this);
1943 src = this->result;
1944
1945 if (constant_index) {
1946 src.reg_offset += constant_index->value.i[0] * array_stride;
1947 } else {
1948 /* Variable index array dereference. The access consumes the "vec4"
1949 * base of the array plus an index register that offsets the Mesa
1950 * register index.
1951 */
1952 ir->array_index->accept(this);
1953
1954 src_reg index_reg;
1955
1956 if (array_stride == 1) {
1957 index_reg = this->result;
1958 } else {
1959 index_reg = src_reg(this, glsl_type::int_type);
1960
1961 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1962 }
1963
1964 if (src.reladdr) {
1965 src_reg temp = src_reg(this, glsl_type::int_type);
1966
1967 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1968
1969 index_reg = temp;
1970 }
1971
1972 src.reladdr = ralloc(mem_ctx, src_reg);
1973 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1974 }
1975
1976 /* If the type is smaller than a vec4, replicate the last channel out. */
1977 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1978 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1979 else
1980 src.swizzle = BRW_SWIZZLE_NOOP;
1981 src.type = brw_type_for_base_type(ir->type);
1982
1983 this->result = src;
1984 }
1985
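/**
 * Handle record (struct) member dereferences: sum the sizes of the fields
 * preceding the named member and add that to reg_offset of the result.
 */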
1986 void
1987 vec4_visitor::visit(ir_dereference_record *ir)
1988 {
1989 unsigned int i;
1990 const glsl_type *struct_type = ir->record->type;
1991 int offset = 0;
1992
1993 ir->record->accept(this);
1994
1995 for (i = 0; i < struct_type->length; i++) {
1996 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1997 break;
1998 offset += type_size(struct_type->fields.structure[i].type);
1999 }
2000
2001 /* If the type is smaller than a vec4, replicate the last channel out. */
2002 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
2003 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
2004 else
2005 this->result.swizzle = BRW_SWIZZLE_NOOP;
2006 this->result.type = brw_type_for_base_type(ir->type);
2007
2008 this->result.reg_offset += offset;
2009 }
2010
2011 /**
2012 * We want to be careful in assignment setup to hit the actual storage
2013 * instead of potentially using a temporary like we might with the
2014 * ir_dereference handler.
2015 */
2016 static dst_reg
2017 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
2018 {
2019 /* The LHS must be a dereference. If the LHS is a variable indexed array
2020 * access of a vector, it must be separated into a series of conditional moves
2021 * before reaching this point (see ir_vec_index_to_cond_assign).
2022 */
2023 assert(ir->as_dereference());
2024 ir_dereference_array *deref_array = ir->as_dereference_array();
2025 if (deref_array) {
2026 assert(!deref_array->array->type->is_vector());
2027 }
2028
2029 /* Use the rvalue deref handler for the most part. We'll ignore its
2030 * swizzles and express the write swizzle through the writemask, though.
2031 */
2032 ir->accept(v);
2033 return dst_reg(v->result);
2034 }
2035
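/**
 * Copy an aggregate value one vec4 (or smaller) at a time, recursing over
 * struct fields, array elements and matrix columns and advancing the dst/src
 * reg_offset as it goes. The predicate is applied to every MOV so conditional
 * assignments of aggregates work.
 */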
2036 void
2037 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
2038 const struct glsl_type *type,
2039 enum brw_predicate predicate)
2040 {
2041 if (type->base_type == GLSL_TYPE_STRUCT) {
2042 for (unsigned int i = 0; i < type->length; i++) {
2043 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
2044 }
2045 return;
2046 }
2047
2048 if (type->is_array()) {
2049 for (unsigned int i = 0; i < type->length; i++) {
2050 emit_block_move(dst, src, type->fields.array, predicate);
2051 }
2052 return;
2053 }
2054
2055 if (type->is_matrix()) {
2056 const struct glsl_type *vec_type;
2057
2058 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
2059 type->vector_elements, 1);
2060
2061 for (int i = 0; i < type->matrix_columns; i++) {
2062 emit_block_move(dst, src, vec_type, predicate);
2063 }
2064 return;
2065 }
2066
2067 assert(type->is_scalar() || type->is_vector());
2068
2069 dst->type = brw_type_for_base_type(type);
2070 src->type = dst->type;
2071
2072 dst->writemask = (1 << type->vector_elements) - 1;
2073
2074 src->swizzle = swizzle_for_size(type->vector_elements);
2075
2076 vec4_instruction *inst = emit(MOV(*dst, *src));
2077 inst->predicate = predicate;
2078
2079 dst->reg_offset++;
2080 src->reg_offset++;
2081 }
2082
2083
2084 /* If the RHS processing resulted in an instruction generating a
2085 * temporary value, and it would be easy to rewrite the instruction to
2086 * generate its result right into the LHS instead, do so. This ends
2087 * up reliably removing instructions where it can be tricky to do so
2088 * later without real UD chain information.
2089 */
2090 bool
2091 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2092 dst_reg dst,
2093 src_reg src,
2094 vec4_instruction *pre_rhs_inst,
2095 vec4_instruction *last_rhs_inst)
2096 {
2097 /* This could be supported, but it would take more smarts. */
2098 if (ir->condition)
2099 return false;
2100
2101 if (pre_rhs_inst == last_rhs_inst)
2102 return false; /* No instructions generated to work with. */
2103
2104 /* Make sure the last instruction generated our source reg. */
2105 if (src.file != GRF ||
2106 src.file != last_rhs_inst->dst.file ||
2107 src.reg != last_rhs_inst->dst.reg ||
2108 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2109 src.reladdr ||
2110 src.abs ||
2111 src.negate ||
2112 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2113 return false;
2114
2115 /* Check that the last instruction fully initialized the channels
2116 * we want to use, in the order we want to use them. We could
2117 * potentially reswizzle the operands of many instructions so that
2118 * we could handle out of order channels, but don't yet.
2119 */
2120
2121 for (unsigned i = 0; i < 4; i++) {
2122 if (dst.writemask & (1 << i)) {
2123 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2124 return false;
2125
2126 if (BRW_GET_SWZ(src.swizzle, i) != i)
2127 return false;
2128 }
2129 }
2130
2131 /* Success! Rewrite the instruction. */
2132 last_rhs_inst->dst.file = dst.file;
2133 last_rhs_inst->dst.reg = dst.reg;
2134 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2135 last_rhs_inst->dst.reladdr = dst.reladdr;
2136 last_rhs_inst->dst.writemask &= dst.writemask;
2137
2138 return true;
2139 }
2140
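/**
 * Handle assignments. Aggregate LHS types go through emit_block_move; for
 * scalars and vectors we swizzle the RHS into the channels named by the write
 * mask, try to fold the result directly into the destination of the last RHS
 * instruction, and otherwise emit (possibly predicated) MOVs.
 */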
2141 void
2142 vec4_visitor::visit(ir_assignment *ir)
2143 {
2144 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2145 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2146
2147 if (!ir->lhs->type->is_scalar() &&
2148 !ir->lhs->type->is_vector()) {
2149 ir->rhs->accept(this);
2150 src_reg src = this->result;
2151
2152 if (ir->condition) {
2153 emit_bool_to_cond_code(ir->condition, &predicate);
2154 }
2155
2156 /* emit_block_move doesn't account for swizzles in the source register.
2157 * This should be ok, since the source register is a structure or an
2158 * array, and those can't be swizzled. But double-check to be sure.
2159 */
2160 assert(src.swizzle ==
2161 (ir->rhs->type->is_matrix()
2162 ? swizzle_for_size(ir->rhs->type->vector_elements)
2163 : BRW_SWIZZLE_NOOP));
2164
2165 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2166 return;
2167 }
2168
2169 /* Now we're down to just a scalar/vector with writemasks. */
2170 int i;
2171
2172 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2173 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2174
2175 ir->rhs->accept(this);
2176
2177 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2178
2179 src_reg src = this->result;
2180
2181 int swizzles[4];
2182 int first_enabled_chan = 0;
2183 int src_chan = 0;
2184
2185 assert(ir->lhs->type->is_vector() ||
2186 ir->lhs->type->is_scalar());
2187 dst.writemask = ir->write_mask;
2188
2189 for (int i = 0; i < 4; i++) {
2190 if (dst.writemask & (1 << i)) {
2191 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2192 break;
2193 }
2194 }
2195
2196 /* Swizzle a small RHS vector into the channels being written.
2197 *
2198 * GLSL IR treats write_mask as dictating how many channels are present
2199 * on the RHS, while our instructions need those channels to land in the
2200 * slots of the vec4 they're actually written to.
2201 */
2202 for (int i = 0; i < 4; i++) {
2203 if (dst.writemask & (1 << i))
2204 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2205 else
2206 swizzles[i] = first_enabled_chan;
2207 }
2208 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2209 swizzles[2], swizzles[3]);
2210
2211 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2212 return;
2213 }
2214
2215 if (ir->condition) {
2216 emit_bool_to_cond_code(ir->condition, &predicate);
2217 }
2218
2219 for (i = 0; i < type_size(ir->lhs->type); i++) {
2220 vec4_instruction *inst = emit(MOV(dst, src));
2221 inst->predicate = predicate;
2222
2223 dst.reg_offset++;
2224 src.reg_offset++;
2225 }
2226 }
2227
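/**
 * Emit immediate MOVs for an ir_constant, recursing over aggregate types.
 * For vectors, channels holding the same value are merged into a single
 * writemask, so vec4(0.5, 1.5, 1.5, 1.5) becomes roughly (illustrative
 * pseudo-assembly, not the exact generated code):
 *
 *    mov dst.x, 0.5F
 *    mov dst.yzw, 1.5F
 */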
2228 void
2229 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2230 {
2231 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2232 foreach_in_list(ir_constant, field_value, &ir->components) {
2233 emit_constant_values(dst, field_value);
2234 }
2235 return;
2236 }
2237
2238 if (ir->type->is_array()) {
2239 for (unsigned int i = 0; i < ir->type->length; i++) {
2240 emit_constant_values(dst, ir->array_elements[i]);
2241 }
2242 return;
2243 }
2244
2245 if (ir->type->is_matrix()) {
2246 for (int i = 0; i < ir->type->matrix_columns; i++) {
2247 float *vec = &ir->value.f[i * ir->type->vector_elements];
2248
2249 for (int j = 0; j < ir->type->vector_elements; j++) {
2250 dst->writemask = 1 << j;
2251 dst->type = BRW_REGISTER_TYPE_F;
2252
2253 emit(MOV(*dst, src_reg(vec[j])));
2254 }
2255 dst->reg_offset++;
2256 }
2257 return;
2258 }
2259
2260 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2261
2262 for (int i = 0; i < ir->type->vector_elements; i++) {
2263 if (!(remaining_writemask & (1 << i)))
2264 continue;
2265
2266 dst->writemask = 1 << i;
2267 dst->type = brw_type_for_base_type(ir->type);
2268
2269 /* Find other components that match the one we're about to
2270 * write. Emits fewer instructions for things like vec4(0.5,
2271 * 1.5, 1.5, 1.5).
2272 */
2273 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2274 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2275 if (ir->value.b[i] == ir->value.b[j])
2276 dst->writemask |= (1 << j);
2277 } else {
2278 /* u, i, and f storage all line up, so no need for a
2279 * switch case for comparing each type.
2280 */
2281 if (ir->value.u[i] == ir->value.u[j])
2282 dst->writemask |= (1 << j);
2283 }
2284 }
2285
2286 switch (ir->type->base_type) {
2287 case GLSL_TYPE_FLOAT:
2288 emit(MOV(*dst, src_reg(ir->value.f[i])));
2289 break;
2290 case GLSL_TYPE_INT:
2291 emit(MOV(*dst, src_reg(ir->value.i[i])));
2292 break;
2293 case GLSL_TYPE_UINT:
2294 emit(MOV(*dst, src_reg(ir->value.u[i])));
2295 break;
2296 case GLSL_TYPE_BOOL:
2297 emit(MOV(*dst,
2298 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2299 : 0u)));
2300 break;
2301 default:
2302 unreachable("Non-float/uint/int/bool constant");
2303 }
2304
2305 remaining_writemask &= ~dst->writemask;
2306 }
2307 dst->reg_offset++;
2308 }
2309
2310 void
2311 vec4_visitor::visit(ir_constant *ir)
2312 {
2313 dst_reg dst = dst_reg(this, ir->type);
2314 this->result = src_reg(dst);
2315
2316 emit_constant_values(&dst, ir);
2317 }
2318
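/**
 * Lower the atomic counter intrinsics (__intrinsic_atomic_read, _increment,
 * _predecrement) to untyped surface read / untyped atomic messages on the
 * counter's ABO binding table surface, computing the surface offset from the
 * declared counter offset plus any (possibly variable) array index.
 */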
2319 void
2320 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2321 {
2322 ir_dereference *deref = static_cast<ir_dereference *>(
2323 ir->actual_parameters.get_head());
2324 ir_variable *location = deref->variable_referenced();
2325 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2326 location->data.binding);
2327
2328 /* Calculate the surface offset */
2329 src_reg offset(this, glsl_type::uint_type);
2330 ir_dereference_array *deref_array = deref->as_dereference_array();
2331 if (deref_array) {
2332 deref_array->array_index->accept(this);
2333
2334 src_reg tmp(this, glsl_type::uint_type);
2335 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2336 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2337 } else {
2338 offset = location->data.atomic.offset;
2339 }
2340
2341 /* Emit the appropriate machine instruction */
2342 const char *callee = ir->callee->function_name();
2343 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2344
2345 if (!strcmp("__intrinsic_atomic_read", callee)) {
2346 emit_untyped_surface_read(surf_index, dst, offset);
2347
2348 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2349 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2350 src_reg(), src_reg());
2351
2352 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2353 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2354 src_reg(), src_reg());
2355 }
2356 }
2357
2358 void
2359 vec4_visitor::visit(ir_call *ir)
2360 {
2361 const char *callee = ir->callee->function_name();
2362
2363 if (!strcmp("__intrinsic_atomic_read", callee) ||
2364 !strcmp("__intrinsic_atomic_increment", callee) ||
2365 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2366 visit_atomic_counter_intrinsic(ir);
2367 } else {
2368 unreachable("Unsupported intrinsic.");
2369 }
2370 }
2371
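/**
 * Fetch the MCS (multisample control surface) value for a texelFetch from a
 * compressed multisample surface using the TXF_MCS message. The result is
 * later fed into the .y channel of the second parameter vec4 of the TXF_CMS
 * payload (see the ir_txf_ms handling below).
 */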
2372 src_reg
2373 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2374 {
2375 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2376 inst->base_mrf = 2;
2377 inst->mlen = 1;
2378 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2379 inst->dst.writemask = WRITEMASK_XYZW;
2380
2381 inst->src[1] = sampler;
2382
2383 /* Parameters are u, v, r, lod; lod will always be zero due to API restrictions. */
2384 int param_base = inst->base_mrf;
2385 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2386 int zero_mask = 0xf & ~coord_mask;
2387
2388 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2389 coordinate));
2390
2391 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2392 src_reg(0)));
2393
2394 emit(inst);
2395 return src_reg(inst->dst);
2396 }
2397
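/**
 * Sampler indices above 15 don't fit in the 4-bit sampler field of the
 * message descriptor, so they (and any non-immediate sampler index) need to
 * go through the message header. Only Haswell and Gen8+ support that here.
 */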
2398 static bool
2399 is_high_sampler(struct brw_context *brw, src_reg sampler)
2400 {
2401 if (brw->gen < 8 && !brw->is_haswell)
2402 return false;
2403
2404 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2405 }
2406
2407 void
2408 vec4_visitor::visit(ir_texture *ir)
2409 {
2410 uint32_t sampler =
2411 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2412
2413 ir_rvalue *nonconst_sampler_index =
2414 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2415
2416 /* Handle non-constant sampler array indexing */
2417 src_reg sampler_reg;
2418 if (nonconst_sampler_index) {
2419 /* The highest sampler which may be used by this operation is
2420 * the last element of the array. Mark it here, because the generator
2421 * doesn't have enough information to determine the bound.
2422 */
2423 uint32_t array_size = ir->sampler->as_dereference_array()
2424 ->array->type->array_size();
2425
2426 uint32_t max_used = sampler + array_size - 1;
2427 if (ir->op == ir_tg4 && brw->gen < 8) {
2428 max_used += prog_data->base.binding_table.gather_texture_start;
2429 } else {
2430 max_used += prog_data->base.binding_table.texture_start;
2431 }
2432
2433 brw_mark_surface_used(&prog_data->base, max_used);
2434
2435 /* Emit code to evaluate the actual indexing expression */
2436 nonconst_sampler_index->accept(this);
2437 dst_reg temp(this, glsl_type::uint_type);
2438 emit(ADD(temp, this->result, src_reg(sampler)))
2439 ->force_writemask_all = true;
2440 sampler_reg = src_reg(temp);
2441 } else {
2442 /* Single sampler, or constant array index; the indexing expression
2443 * is just an immediate.
2444 */
2445 sampler_reg = src_reg(sampler);
2446 }
2447
2448 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2449 * emitting anything other than setting up the constant result.
2450 */
2451 if (ir->op == ir_tg4) {
2452 ir_constant *chan = ir->lod_info.component->as_constant();
2453 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2454 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2455 dst_reg result(this, ir->type);
2456 this->result = src_reg(result);
2457 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2458 return;
2459 }
2460 }
2461
2462 /* Should be lowered by do_lower_texture_projection */
2463 assert(!ir->projector);
2464
2465 /* Should be lowered */
2466 assert(!ir->offset || !ir->offset->type->is_array());
2467
2468 /* Generate code to compute all the subexpression trees. This has to be
2469 * done before loading any values into MRFs for the sampler message since
2470 * generating these values may involve SEND messages that need the MRFs.
2471 */
2472 src_reg coordinate;
2473 if (ir->coordinate) {
2474 ir->coordinate->accept(this);
2475 coordinate = this->result;
2476 }
2477
2478 src_reg shadow_comparitor;
2479 if (ir->shadow_comparitor) {
2480 ir->shadow_comparitor->accept(this);
2481 shadow_comparitor = this->result;
2482 }
2483
2484 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2485 src_reg offset_value;
2486 if (has_nonconstant_offset) {
2487 ir->offset->accept(this);
2488 offset_value = src_reg(this->result);
2489 }
2490
2491 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2492 src_reg lod, dPdx, dPdy, sample_index, mcs;
2493 switch (ir->op) {
2494 case ir_tex:
2495 lod = src_reg(0.0f);
2496 lod_type = glsl_type::float_type;
2497 break;
2498 case ir_txf:
2499 case ir_txl:
2500 case ir_txs:
2501 ir->lod_info.lod->accept(this);
2502 lod = this->result;
2503 lod_type = ir->lod_info.lod->type;
2504 break;
2505 case ir_query_levels:
2506 lod = src_reg(0);
2507 lod_type = glsl_type::int_type;
2508 break;
2509 case ir_txf_ms:
2510 ir->lod_info.sample_index->accept(this);
2511 sample_index = this->result;
2512 sample_index_type = ir->lod_info.sample_index->type;
2513
2514 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2515 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2516 else
2517 mcs = src_reg(0u);
2518 break;
2519 case ir_txd:
2520 ir->lod_info.grad.dPdx->accept(this);
2521 dPdx = this->result;
2522
2523 ir->lod_info.grad.dPdy->accept(this);
2524 dPdy = this->result;
2525
2526 lod_type = ir->lod_info.grad.dPdx->type;
2527 break;
2528 case ir_txb:
2529 case ir_lod:
2530 case ir_tg4:
2531 break;
2532 }
2533
2534 enum opcode opcode;
2535 switch (ir->op) {
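   /* Plain texture() has no implicit derivatives in a vertex shader, so it is
    * implemented as TXL with the explicit LOD of 0.0 set up above.
    */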
2536 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2537 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2538 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2539 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2540 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2541 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2542 case ir_tg4: opcode = has_nonconstant_offset
2543 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2544 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2545 case ir_txb:
2546 unreachable("TXB is not valid for vertex shaders.");
2547 case ir_lod:
2548 unreachable("LOD is not valid for vertex shaders.");
2549 default:
2550 unreachable("Unrecognized tex op");
2551 }
2552
2553 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2554
2555 if (ir->offset != NULL && !has_nonconstant_offset) {
2556 inst->texture_offset =
2557 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2558 ir->offset->type->vector_elements);
2559 }
2560
2561 /* Stuff the channel select bits in the top of the texture offset */
2562 if (ir->op == ir_tg4)
2563 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2564
2565 /* The message header is necessary for:
2566 * - Gen4 (always)
2567 * - Texel offsets
2568 * - Gather channel selection
2569 * - Sampler indices too large to fit in a 4-bit value.
2570 */
2571 inst->header_present =
2572 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2573 is_high_sampler(brw, sampler_reg);
2574 inst->base_mrf = 2;
2575 inst->mlen = inst->header_present + 1; /* always at least one */
2576 inst->dst = dst_reg(this, ir->type);
2577 inst->dst.writemask = WRITEMASK_XYZW;
2578 inst->shadow_compare = ir->shadow_comparitor != NULL;
2579
2580 inst->src[1] = sampler_reg;
2581
2582 /* MRF for the first parameter */
2583 int param_base = inst->base_mrf + inst->header_present;
2584
2585 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2586 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2587 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2588 } else {
2589 /* Load the coordinate */
2590 /* FINISHME: gl_clamp_mask and saturate */
2591 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2592 int zero_mask = 0xf & ~coord_mask;
2593
2594 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2595 coordinate));
2596
2597 if (zero_mask != 0) {
2598 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2599 src_reg(0)));
2600 }
2601 /* Load the shadow comparitor */
2602 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2603 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2604 WRITEMASK_X),
2605 shadow_comparitor));
2606 inst->mlen++;
2607 }
2608
2609 /* Load the LOD info */
2610 if (ir->op == ir_tex || ir->op == ir_txl) {
2611 int mrf, writemask;
2612 if (brw->gen >= 5) {
2613 mrf = param_base + 1;
2614 if (ir->shadow_comparitor) {
2615 writemask = WRITEMASK_Y;
2616 /* mlen already incremented */
2617 } else {
2618 writemask = WRITEMASK_X;
2619 inst->mlen++;
2620 }
2621 } else /* brw->gen == 4 */ {
2622 mrf = param_base;
2623 writemask = WRITEMASK_W;
2624 }
2625 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2626 } else if (ir->op == ir_txf) {
2627 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2628 } else if (ir->op == ir_txf_ms) {
2629 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2630 sample_index));
2631 if (brw->gen >= 7) {
2632 /* MCS data is in the first channel of `mcs`, but we need to get it into
2633 * the .y channel of the second vec4 of params, so replicate .x across
2634 * the whole vec4 and then mask off everything except .y
2635 */
2636 mcs.swizzle = BRW_SWIZZLE_XXXX;
2637 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2638 mcs));
2639 }
2640 inst->mlen++;
2641 } else if (ir->op == ir_txd) {
2642 const glsl_type *type = lod_type;
2643
2644 if (brw->gen >= 5) {
2645 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2646 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2647 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2648 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2649 inst->mlen++;
2650
2651 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2652 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2653 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2654 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2655 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2656 inst->mlen++;
2657
2658 if (ir->shadow_comparitor) {
2659 emit(MOV(dst_reg(MRF, param_base + 2,
2660 ir->shadow_comparitor->type, WRITEMASK_Z),
2661 shadow_comparitor));
2662 }
2663 }
2664 } else /* brw->gen == 4 */ {
2665 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2666 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2667 inst->mlen += 2;
2668 }
2669 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2670 if (ir->shadow_comparitor) {
2671 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2672 shadow_comparitor));
2673 }
2674
2675 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2676 offset_value));
2677 inst->mlen++;
2678 }
2679 }
2680
2681 emit(inst);
2682
2683 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2684 * faces * layers, but the spec requires just layers.
2685 */
2686 if (ir->op == ir_txs) {
2687 glsl_type const *type = ir->sampler->type;
2688 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2689 type->sampler_array) {
2690 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2691 writemask(inst->dst, WRITEMASK_Z),
2692 src_reg(inst->dst), src_reg(6));
2693 }
2694 }
2695
2696 if (brw->gen == 6 && ir->op == ir_tg4) {
2697 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2698 }
2699
2700 swizzle_result(ir, src_reg(inst->dst), sampler);
2701 }
2702
2703 /**
2704 * Apply workarounds for Gen6 gather with UINT/SINT
2705 */
2706 void
2707 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2708 {
2709 if (!wa)
2710 return;
2711
2712 int width = (wa & WA_8BIT) ? 8 : 16;
2713 dst_reg dst_f = dst;
2714 dst_f.type = BRW_REGISTER_TYPE_F;
2715
2716 /* Convert from UNORM to UINT */
2717 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2718 emit(MOV(dst, src_reg(dst_f)));
2719
2720 if (wa & WA_SIGN) {
2721 /* Reinterpret the UINT value as a signed INT value by
2722 * shifting the sign bit into place, then shifting back
2723 * preserving sign.
2724 */
2725 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2726 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2727 }
2728 }
2729
2730 /**
2731 * Set up the gather channel based on the swizzle, for gather4.
2732 */
2733 uint32_t
2734 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2735 {
2736 ir_constant *chan = ir->lod_info.component->as_constant();
2737 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2738 switch (swiz) {
2739 case SWIZZLE_X: return 0;
2740 case SWIZZLE_Y:
2741 /* gather4 sampler is broken for green channel on RG32F --
2742 * we must ask for blue instead.
2743 */
2744 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2745 return 2;
2746 return 1;
2747 case SWIZZLE_Z: return 2;
2748 case SWIZZLE_W: return 3;
2749 default:
2750 unreachable("Not reached"); /* zero, one swizzles handled already */
2751 }
2752 }
2753
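/**
 * Apply the GL_TEXTURE_SWIZZLE_* state from the program key to the result of
 * a texture operation. Channels swizzled to ZERO or ONE are written as
 * immediates; the remaining channels are copied with a matching source
 * swizzle. For example, a swizzle of (R, R, R, ONE) would end up as roughly
 * (illustrative only):
 *
 *    mov result.xyz, orig.xxx
 *    mov result.w, 1.0F
 */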
2754 void
2755 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2756 {
2757 int s = key->tex.swizzles[sampler];
2758
2759 this->result = src_reg(this, ir->type);
2760 dst_reg swizzled_result(this->result);
2761
2762 if (ir->op == ir_query_levels) {
2763 /* # levels is in .w */
2764 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2765 emit(MOV(swizzled_result, orig_val));
2766 return;
2767 }
2768
2769 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2770 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2771 emit(MOV(swizzled_result, orig_val));
2772 return;
2773 }
2774
2775
2776 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2777 int swizzle[4] = {0};
2778
2779 for (int i = 0; i < 4; i++) {
2780 switch (GET_SWZ(s, i)) {
2781 case SWIZZLE_ZERO:
2782 zero_mask |= (1 << i);
2783 break;
2784 case SWIZZLE_ONE:
2785 one_mask |= (1 << i);
2786 break;
2787 default:
2788 copy_mask |= (1 << i);
2789 swizzle[i] = GET_SWZ(s, i);
2790 break;
2791 }
2792 }
2793
2794 if (copy_mask) {
2795 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2796 swizzled_result.writemask = copy_mask;
2797 emit(MOV(swizzled_result, orig_val));
2798 }
2799
2800 if (zero_mask) {
2801 swizzled_result.writemask = zero_mask;
2802 emit(MOV(swizzled_result, src_reg(0.0f)));
2803 }
2804
2805 if (one_mask) {
2806 swizzled_result.writemask = one_mask;
2807 emit(MOV(swizzled_result, src_reg(1.0f)));
2808 }
2809 }
2810
2811 void
2812 vec4_visitor::visit(ir_return *)
2813 {
2814 unreachable("not reached");
2815 }
2816
2817 void
2818 vec4_visitor::visit(ir_discard *)
2819 {
2820 unreachable("not reached");
2821 }
2822
2823 void
2824 vec4_visitor::visit(ir_if *ir)
2825 {
2826 /* Don't point the annotation at the if statement, because then it plus
2827 * the then and else blocks get printed.
2828 */
2829 this->base_ir = ir->condition;
2830
2831 if (brw->gen == 6) {
2832 emit_if_gen6(ir);
2833 } else {
2834 enum brw_predicate predicate;
2835 emit_bool_to_cond_code(ir->condition, &predicate);
2836 emit(IF(predicate));
2837 }
2838
2839 visit_instructions(&ir->then_instructions);
2840
2841 if (!ir->else_instructions.is_empty()) {
2842 this->base_ir = ir->condition;
2843 emit(BRW_OPCODE_ELSE);
2844
2845 visit_instructions(&ir->else_instructions);
2846 }
2847
2848 this->base_ir = ir->condition;
2849 emit(BRW_OPCODE_ENDIF);
2850 }
2851
2852 void
2853 vec4_visitor::visit(ir_emit_vertex *)
2854 {
2855 unreachable("not reached");
2856 }
2857
2858 void
2859 vec4_visitor::visit(ir_end_primitive *)
2860 {
2861 unreachable("not reached");
2862 }
2863
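/**
 * Build the MRF payload for an untyped atomic: the offset goes in m0.x,
 * followed by up to two operands, then emit SHADER_OPCODE_UNTYPED_ATOMIC with
 * the atomic opcode and surface index as sources.
 */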
2864 void
2865 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2866 dst_reg dst, src_reg offset,
2867 src_reg src0, src_reg src1)
2868 {
2869 unsigned mlen = 0;
2870
2871 /* Set the atomic operation offset. */
2872 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2873 mlen++;
2874
2875 /* Set the atomic operation arguments. */
2876 if (src0.file != BAD_FILE) {
2877 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2878 mlen++;
2879 }
2880
2881 if (src1.file != BAD_FILE) {
2882 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2883 mlen++;
2884 }
2885
2886 /* Emit the instruction. Note that this maps to the normal SIMD8
2887 * untyped atomic message on Ivy Bridge, but that's OK because
2888 * unused channels will be masked out.
2889 */
2890 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2891 src_reg(atomic_op), src_reg(surf_index));
2892 inst->base_mrf = 0;
2893 inst->mlen = mlen;
2894 }
2895
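/**
 * Emit an untyped surface read: the offset goes in m0.x and
 * SHADER_OPCODE_UNTYPED_SURFACE_READ carries the surface index as a source.
 */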
2896 void
2897 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2898 src_reg offset)
2899 {
2900 /* Set the surface read offset. */
2901 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2902
2903 /* Emit the instruction. Note that this maps to the normal SIMD8
2904 * untyped surface read message, but that's OK because unused
2905 * channels will be masked out.
2906 */
2907 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2908 dst, src_reg(surf_index));
2909 inst->base_mrf = 0;
2910 inst->mlen = 1;
2911 }
2912
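/**
 * Compute the normalized device coordinates (x/w, y/w, z/w, 1/w) that the
 * pre-Gen6 fixed-function pipeline expects: RCP of pos.w into ndc.w, then
 * multiply pos.xyz by that reciprocal.
 */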
2913 void
2914 vec4_visitor::emit_ndc_computation()
2915 {
2916 /* Get the position */
2917 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2918
2919 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2920 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2921 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2922
2923 current_annotation = "NDC";
2924 dst_reg ndc_w = ndc;
2925 ndc_w.writemask = WRITEMASK_W;
2926 src_reg pos_w = pos;
2927 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2928 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2929
2930 dst_reg ndc_xyz = ndc;
2931 ndc_xyz.writemask = WRITEMASK_XYZ;
2932
2933 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2934 }
2935
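/**
 * Fill the PSIZ VUE slot. On pre-Gen6 this packs point size, user clip flags
 * and the negative-rhw workaround bit into the header dword; on Gen6+ it
 * writes point size, layer and viewport index into the .w, .y and .z channels
 * when the corresponding varyings are valid.
 */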
2936 void
2937 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2938 {
2939 if (brw->gen < 6 &&
2940 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2941 key->userclip_active || brw->has_negative_rhw_bug)) {
2942 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2943 dst_reg header1_w = header1;
2944 header1_w.writemask = WRITEMASK_W;
2945
2946 emit(MOV(header1, 0u));
2947
2948 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2949 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2950
2951 current_annotation = "Point size";
2952 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2953 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2954 }
2955
2956 if (key->userclip_active) {
2957 current_annotation = "Clipping flags";
2958 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2959 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2960
2961 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2962 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2963 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2964
2965 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2966 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2967 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2968 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2969 }
2970
2971 /* i965 clipping workaround:
2972 * 1) Test for -ve rhw
2973 * 2) If set,
2974 * set ndc = (0,0,0,0)
2975 * set ucp[6] = 1
2976 *
2977 * Later, clipping will detect ucp[6] and ensure the primitive is
2978 * clipped against all fixed planes.
2979 */
2980 if (brw->has_negative_rhw_bug) {
2981 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2982 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2983 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2984 vec4_instruction *inst;
2985 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2986 inst->predicate = BRW_PREDICATE_NORMAL;
2987 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2988 inst->predicate = BRW_PREDICATE_NORMAL;
2989 }
2990
2991 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2992 } else if (brw->gen < 6) {
2993 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2994 } else {
2995 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2996 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2997 dst_reg reg_w = reg;
2998 reg_w.writemask = WRITEMASK_W;
2999 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
3000 }
3001 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
3002 dst_reg reg_y = reg;
3003 reg_y.writemask = WRITEMASK_Y;
3004 reg_y.type = BRW_REGISTER_TYPE_D;
3005 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
3006 }
3007 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
3008 dst_reg reg_z = reg;
3009 reg_z.writemask = WRITEMASK_Z;
3010 reg_z.type = BRW_REGISTER_TYPE_D;
3011 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
3012 }
3013 }
3014 }
3015
3016 void
3017 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
3018 {
3019 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
3020 *
3021 * "If a linked set of shaders forming the vertex stage contains no
3022 * static write to gl_ClipVertex or gl_ClipDistance, but the
3023 * application has requested clipping against user clip planes through
3024 * the API, then the coordinate written to gl_Position is used for
3025 * comparison against the user clip planes."
3026 *
3027 * This function is only called if the shader didn't write to
3028 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
3029 * if the user wrote to it; otherwise we use gl_Position.
3030 */
3031 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
3032 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
3033 clip_vertex = VARYING_SLOT_POS;
3034 }
3035
3036 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
3037 ++i) {
3038 reg.writemask = 1 << i;
3039 emit(DP4(reg,
3040 src_reg(output_reg[clip_vertex]),
3041 src_reg(this->userplane[i + offset])));
3042 }
3043 }
3044
3045 void
3046 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
3047 {
3048 assert (varying < VARYING_SLOT_MAX);
3049 reg.type = output_reg[varying].type;
3050 current_annotation = output_reg_annotation[varying];
3051 /* Copy the register, saturating if necessary */
3052 vec4_instruction *inst = emit(MOV(reg,
3053 src_reg(output_reg[varying])));
3054 if ((varying == VARYING_SLOT_COL0 ||
3055 varying == VARYING_SLOT_COL1 ||
3056 varying == VARYING_SLOT_BFC0 ||
3057 varying == VARYING_SLOT_BFC1) &&
3058 key->clamp_vertex_color) {
3059 inst->saturate = true;
3060 }
3061 }
3062
3063 void
3064 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
3065 {
3066 reg.type = BRW_REGISTER_TYPE_F;
3067
3068 switch (varying) {
3069 case VARYING_SLOT_PSIZ:
3070 {
3071 /* PSIZ is always in slot 0, and is coupled with other flags. */
3072 current_annotation = "indices, point width, clip flags";
3073 emit_psiz_and_flags(reg);
3074 break;
3075 }
3076 case BRW_VARYING_SLOT_NDC:
3077 current_annotation = "NDC";
3078 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
3079 break;
3080 case VARYING_SLOT_POS:
3081 current_annotation = "gl_Position";
3082 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
3083 break;
3084 case VARYING_SLOT_EDGE:
3085 /* This is present when doing unfilled polygons. We're supposed to copy
3086 * the edge flag from the user-provided vertex array
3087 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3088 * of that attribute (starts as 1.0f). This is then used in clipping to
3089 * determine which edges should be drawn as wireframe.
3090 */
3091 current_annotation = "edge flag";
3092 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3093 glsl_type::float_type, WRITEMASK_XYZW))));
3094 break;
3095 case BRW_VARYING_SLOT_PAD:
3096 /* No need to write to this slot */
3097 break;
3098 default:
3099 emit_generic_urb_slot(reg, varying);
3100 break;
3101 }
3102 }
3103
3104 static int
3105 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3106 {
3107 if (brw->gen >= 6) {
3108 /* URB data written (does not include the message header reg) must
3109 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3110 * section 5.4.3.2.2: URB_INTERLEAVED.
3111 *
3112 * URB entries are allocated on a multiple of 1024 bits, so an
3113 * extra 128 bits written here to make the end align to 256 is
3114 * no problem.
3115 */
3116 if ((mlen % 2) != 1)
3117 mlen++;
3118 }
3119
3120 return mlen;
3121 }
3122
3123
3124 /**
3125 * Generates the VUE payload plus the necessary URB write instructions to
3126 * output it.
3127 *
3128 * The VUE layout is documented in Volume 2a.
3129 */
3130 void
3131 vec4_visitor::emit_vertex()
3132 {
3133 /* MRF 0 is reserved for the debugger, so start with message header
3134 * in MRF 1.
3135 */
3136 int base_mrf = 1;
3137 int mrf = base_mrf;
3138 /* In the process of generating our URB write message contents, we
3139 * may need to unspill a register or load from an array. Those
3140 * reads would use MRFs 14-15.
3141 */
3142 int max_usable_mrf = 13;
3143
3144 /* The following assertion verifies that max_usable_mrf causes an
3145 * even-numbered amount of URB write data, which will meet gen6's
3146 * requirements for length alignment.
3147 */
3148 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3149
3150 /* First mrf is the g0-based message header containing URB handles and
3151 * such.
3152 */
3153 emit_urb_write_header(mrf++);
3154
3155 if (brw->gen < 6) {
3156 emit_ndc_computation();
3157 }
3158
3159 /* Lower legacy fixed-function and gl_ClipVertex clipping to clip distances */
3160 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3161 current_annotation = "user clip distances";
3162
3163 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3164 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3165
3166 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3167 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3168 }
3169
3170 /* We may need to split this up into several URB writes, so do them in a
3171 * loop.
3172 */
3173 int slot = 0;
3174 bool complete = false;
3175 do {
3176 /* URB offset is in URB row increments, and each of our MRFs is half of
3177 * one of those, since we're doing interleaved writes.
3178 */
3179 int offset = slot / 2;
3180
3181 mrf = base_mrf + 1;
3182 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3183 emit_urb_slot(dst_reg(MRF, mrf++),
3184 prog_data->vue_map.slot_to_varying[slot]);
3185
3186 /* If this was max_usable_mrf, we can't fit anything more into this
3187 * URB WRITE.
3188 */
3189 if (mrf > max_usable_mrf) {
3190 slot++;
3191 break;
3192 }
3193 }
3194
3195 complete = slot >= prog_data->vue_map.num_slots;
3196 current_annotation = "URB write";
3197 vec4_instruction *inst = emit_urb_write_opcode(complete);
3198 inst->base_mrf = base_mrf;
3199 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3200 inst->offset += offset;
3201 } while(!complete);
3202 }
3203
3204
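/**
 * Compute the message header offset for a scratch read/write. Scratch data is
 * stored interleaved like vertex data, so vec4 offsets are scaled by 2, and
 * pre-Gen6 message headers want byte offsets (hence the extra factor of 16).
 * A reladdr source turns the constant into an ADD/MUL sequence emitted before
 * the instruction.
 */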
3205 src_reg
3206 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3207 src_reg *reladdr, int reg_offset)
3208 {
3209 /* Because we store the values to scratch interleaved like our
3210 * vertex data, we need to scale the vec4 index by 2.
3211 */
3212 int message_header_scale = 2;
3213
3214 /* Pre-gen6, the message header uses byte offsets instead of vec4
3215 * (16-byte) offset units.
3216 */
3217 if (brw->gen < 6)
3218 message_header_scale *= 16;
3219
3220 if (reladdr) {
3221 src_reg index = src_reg(this, glsl_type::int_type);
3222
3223 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3224 src_reg(reg_offset)));
3225 emit_before(block, inst, MUL(dst_reg(index), index,
3226 src_reg(message_header_scale)));
3227
3228 return index;
3229 } else {
3230 return src_reg(reg_offset * message_header_scale);
3231 }
3232 }
3233
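/**
 * Compute the offset source for a pull constant load. Relative addressing
 * needs an ADD before the instruction (and a *16 on pre-Gen6, where the
 * header takes byte offsets); Gen8+ moves even a constant offset into a GRF
 * so the message can be sent from GRF; otherwise an immediate suffices.
 */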
3234 src_reg
3235 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3236 src_reg *reladdr, int reg_offset)
3237 {
3238 if (reladdr) {
3239 src_reg index = src_reg(this, glsl_type::int_type);
3240
3241 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3242 src_reg(reg_offset)));
3243
3244 /* Pre-gen6, the message header uses byte offsets instead of vec4
3245 * (16-byte) offset units.
3246 */
3247 if (brw->gen < 6) {
3248 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3249 }
3250
3251 return index;
3252 } else if (brw->gen >= 8) {
3253 /* Store the offset in a GRF so we can send-from-GRF. */
3254 src_reg offset = src_reg(this, glsl_type::int_type);
3255 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3256 return offset;
3257 } else {
3258 int message_header_scale = brw->gen < 6 ? 16 : 1;
3259 return src_reg(reg_offset * message_header_scale);
3260 }
3261 }
3262
3263 /**
3264 * Emits an instruction before @inst to load the value named by @orig_src
3265 * from scratch space at @base_offset to @temp.
3266 *
3267 * @base_offset is measured in 32-byte units (the size of a register).
3268 */
3269 void
3270 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3271 dst_reg temp, src_reg orig_src,
3272 int base_offset)
3273 {
3274 int reg_offset = base_offset + orig_src.reg_offset;
3275 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3276 reg_offset);
3277
3278 emit_before(block, inst, SCRATCH_READ(temp, index));
3279 }
3280
3281 /**
3282 * Emits an instruction after @inst to store the value to be written
3283 * to @orig_dst to scratch space at @base_offset, from @temp.
3284 *
3285 * @base_offset is measured in 32-byte units (the size of a register).
3286 */
3287 void
3288 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3289 int base_offset)
3290 {
3291 int reg_offset = base_offset + inst->dst.reg_offset;
3292 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3293 reg_offset);
3294
3295 /* Create a temporary register to store *inst's result in.
3296 *
3297 * We have to be careful in MOVing from our temporary result register in
3298 * the scratch write. If we swizzle from channels of the temporary that
3299 * weren't initialized, it will confuse live interval analysis, which will
3300 * make spilling fail to make progress.
3301 */
3302 src_reg temp = src_reg(this, glsl_type::vec4_type);
3303 temp.type = inst->dst.type;
3304 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3305 int swizzles[4];
3306 for (int i = 0; i < 4; i++)
3307 if (inst->dst.writemask & (1 << i))
3308 swizzles[i] = i;
3309 else
3310 swizzles[i] = first_writemask_chan;
3311 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3312 swizzles[2], swizzles[3]);
3313
3314 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3315 inst->dst.writemask));
3316 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3317 write->predicate = inst->predicate;
3318 write->ir = inst->ir;
3319 write->annotation = inst->annotation;
3320 inst->insert_after(block, write);
3321
3322 inst->dst.file = temp.file;
3323 inst->dst.reg = temp.reg;
3324 inst->dst.reg_offset = temp.reg_offset;
3325 inst->dst.reladdr = NULL;
3326 }
3327
3328 /**
3329 * We can't generally support array access in GRF space, because a
3330 * single instruction's destination can only span 2 contiguous
3331 * registers. So, we send all GRF arrays that get variable index
3332 * access to scratch space.
3333 */
3334 void
3335 vec4_visitor::move_grf_array_access_to_scratch()
3336 {
3337 int scratch_loc[this->virtual_grf_count];
3338 memset(scratch_loc, -1, sizeof(scratch_loc));
3339
3340 /* First, calculate the set of virtual GRFs that need to be punted
3341 * to scratch due to having any array access on them, and where in
3342 * scratch.
3343 */
3344 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3345 if (inst->dst.file == GRF && inst->dst.reladdr &&
3346 scratch_loc[inst->dst.reg] == -1) {
3347 scratch_loc[inst->dst.reg] = c->last_scratch;
3348 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3349 }
3350
3351 for (int i = 0 ; i < 3; i++) {
3352 src_reg *src = &inst->src[i];
3353
3354 if (src->file == GRF && src->reladdr &&
3355 scratch_loc[src->reg] == -1) {
3356 scratch_loc[src->reg] = c->last_scratch;
3357 c->last_scratch += this->virtual_grf_sizes[src->reg];
3358 }
3359 }
3360 }
3361
3362 /* Now, for anything that will be accessed through scratch, rewrite
3363 * it to load/store. Note that this is a _safe list walk, because
3364 * we may generate a new scratch_write instruction after the one
3365 * we're processing.
3366 */
3367 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3368 /* Set up the annotation tracking for new generated instructions. */
3369 base_ir = inst->ir;
3370 current_annotation = inst->annotation;
3371
3372 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3373 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3374 }
3375
3376 for (int i = 0 ; i < 3; i++) {
3377 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3378 continue;
3379
3380 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3381
3382 emit_scratch_read(block, inst, temp, inst->src[i],
3383 scratch_loc[inst->src[i].reg]);
3384
3385 inst->src[i].file = temp.file;
3386 inst->src[i].reg = temp.reg;
3387 inst->src[i].reg_offset = temp.reg_offset;
3388 inst->src[i].reladdr = NULL;
3389 }
3390 }
3391 }
3392
3393 /**
3394 * Emits an instruction before @inst to load the value named by @orig_src
3395 * from the pull constant buffer (surface) at @base_offset to @temp.
3396 */
3397 void
3398 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3399 dst_reg temp, src_reg orig_src,
3400 int base_offset)
3401 {
3402 int reg_offset = base_offset + orig_src.reg_offset;
3403 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3404 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3405 reg_offset);
3406 vec4_instruction *load;
3407
3408 if (brw->gen >= 7) {
3409 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3410 grf_offset.type = offset.type;
3411 emit_before(block, inst, MOV(grf_offset, offset));
3412
3413 load = new(mem_ctx) vec4_instruction(this,
3414 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3415 temp, index, src_reg(grf_offset));
3416 } else {
3417 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3418 temp, index, offset);
3419 load->base_mrf = 14;
3420 load->mlen = 1;
3421 }
3422 emit_before(block, inst, load);
3423 }
3424
3425 /**
3426 * Implements array access of uniforms by inserting a
3427 * PULL_CONSTANT_LOAD instruction.
3428 *
3429 * Unlike temporary GRF array access (where we don't support it due to
3430 * the difficulty of doing relative addressing on instruction
3431 * destinations), we could potentially do array access of uniforms
3432 * that were loaded in GRF space as push constants. In real-world
3433 * usage we've seen, though, the arrays being used are always larger
3434 * than we could load as push constants, so just always move all
3435 * uniform array access out to a pull constant buffer.
3436 */
3437 void
3438 vec4_visitor::move_uniform_array_access_to_pull_constants()
3439 {
3440 int pull_constant_loc[this->uniforms];
3441 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3442
3443 /* Walk through and find array access of uniforms. Put a copy of that
3444 * uniform in the pull constant buffer.
3445 *
3446 * Note that we don't move constant-indexed accesses to arrays. No
3447 * testing has been done of the performance impact of this choice.
3448 */
3449 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3450 for (int i = 0 ; i < 3; i++) {
3451 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3452 continue;
3453
3454 int uniform = inst->src[i].reg;
3455
3456 /* If this array isn't already present in the pull constant buffer,
3457 * add it.
3458 */
3459 if (pull_constant_loc[uniform] == -1) {
3460 const gl_constant_value **values =
3461 &stage_prog_data->param[uniform * 4];
3462
3463 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3464
3465 assert(uniform < uniform_array_size);
3466 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3467 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3468 = values[j];
3469 }
3470 }
3471
3472 /* Set up the annotation tracking for new generated instructions. */
3473 base_ir = inst->ir;
3474 current_annotation = inst->annotation;
3475
3476 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3477
3478 emit_pull_constant_load(block, inst, temp, inst->src[i],
3479 pull_constant_loc[uniform]);
3480
3481 inst->src[i].file = temp.file;
3482 inst->src[i].reg = temp.reg;
3483 inst->src[i].reg_offset = temp.reg_offset;
3484 inst->src[i].reladdr = NULL;
3485 }
3486 }
3487
3488 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3489 * no need to track them as larger-than-vec4 objects. This will be
3490 * relied on in cutting out unused uniform vectors from push
3491 * constants.
3492 */
3493 split_uniform_registers();
3494 }
3495
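/**
 * The negate modifier can't be used directly on an unsigned (UD) source where
 * these values end up being consumed, so apply it with a separate MOV into a
 * uvec4 temporary and substitute the temporary.
 */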
3496 void
3497 vec4_visitor::resolve_ud_negate(src_reg *reg)
3498 {
3499 if (reg->type != BRW_REGISTER_TYPE_UD ||
3500 !reg->negate)
3501 return;
3502
3503 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3504 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3505 *reg = temp;
3506 }
3507
3508 vec4_visitor::vec4_visitor(struct brw_context *brw,
3509 struct brw_vec4_compile *c,
3510 struct gl_program *prog,
3511 const struct brw_vec4_prog_key *key,
3512 struct brw_vec4_prog_data *prog_data,
3513 struct gl_shader_program *shader_prog,
3514 gl_shader_stage stage,
3515 void *mem_ctx,
3516 bool debug_flag,
3517 bool no_spills,
3518 shader_time_shader_type st_base,
3519 shader_time_shader_type st_written,
3520 shader_time_shader_type st_reset)
3521 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3522 c(c),
3523 key(key),
3524 prog_data(prog_data),
3525 sanity_param_count(0),
3526 fail_msg(NULL),
3527 first_non_payload_grf(0),
3528 need_all_constants_in_pull_buffer(false),
3529 debug_flag(debug_flag),
3530 no_spills(no_spills),
3531 st_base(st_base),
3532 st_written(st_written),
3533 st_reset(st_reset)
3534 {
3535 this->mem_ctx = mem_ctx;
3536 this->failed = false;
3537
3538 this->base_ir = NULL;
3539 this->current_annotation = NULL;
3540 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3541
3542 this->variable_ht = hash_table_ctor(0,
3543 hash_table_pointer_hash,
3544 hash_table_pointer_compare);
3545
3546 this->virtual_grf_start = NULL;
3547 this->virtual_grf_end = NULL;
3548 this->virtual_grf_sizes = NULL;
3549 this->virtual_grf_count = 0;
3550 this->virtual_grf_reg_map = NULL;
3551 this->virtual_grf_reg_count = 0;
3552 this->virtual_grf_array_size = 0;
3553 this->live_intervals_valid = false;
3554
3555 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3556
3557 this->uniforms = 0;
3558
3559 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3560 * at least one. See setup_uniforms() in brw_vec4.cpp.
3561 */
3562 this->uniform_array_size = 1;
3563 if (prog_data) {
3564 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3565 }
3566
3567 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3568 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3569 }
3570
3571 vec4_visitor::~vec4_visitor()
3572 {
3573 hash_table_dtor(this->variable_ht);
3574 }
3575
3576
3577 void
3578 vec4_visitor::fail(const char *format, ...)
3579 {
3580 va_list va;
3581 char *msg;
3582
3583 if (failed)
3584 return;
3585
3586 failed = true;
3587
3588 va_start(va, format);
3589 msg = ralloc_vasprintf(mem_ctx, format, va);
3590 va_end(va);
3591 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3592
3593 this->fail_msg = msg;
3594
3595 if (debug_flag) {
3596 fprintf(stderr, "%s", msg);
3597 }
3598 }
3599
3600 } /* namespace brw */