i965/vec4: Don't fix_math_operand() on Gen >= 8.
mesa.git: src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->sampler = 0;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU2_ACC(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
131 { \
132 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
133 BRW_OPCODE_##op, dst, src0, src1); \
134 inst->writes_accumulator = true; \
135 return inst; \
136 }
137
138 #define ALU3(op) \
139 vec4_instruction * \
140 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
141 { \
142 assert(brw->gen >= 6); \
143 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
144 src0, src1, src2); \
145 }
146
147 ALU1(NOT)
148 ALU1(MOV)
149 ALU1(FRC)
150 ALU1(RNDD)
151 ALU1(RNDE)
152 ALU1(RNDZ)
153 ALU1(F32TO16)
154 ALU1(F16TO32)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2_ACC(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(DP3)
162 ALU2(DP4)
163 ALU2(DPH)
164 ALU2(SHL)
165 ALU2(SHR)
166 ALU2(ASR)
167 ALU3(LRP)
168 ALU1(BFREV)
169 ALU3(BFE)
170 ALU2(BFI1)
171 ALU3(BFI2)
172 ALU1(FBH)
173 ALU1(FBL)
174 ALU1(CBIT)
175 ALU3(MAD)
176 ALU2_ACC(ADDC)
177 ALU2_ACC(SUBB)
178 ALU2(MAC)
179
180 /** Gen4 predicated IF. */
181 vec4_instruction *
182 vec4_visitor::IF(uint32_t predicate)
183 {
184 vec4_instruction *inst;
185
186 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
187 inst->predicate = predicate;
188
189 return inst;
190 }
191
192 /** Gen6 IF with embedded comparison. */
193 vec4_instruction *
194 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
195 {
196 assert(brw->gen == 6);
197
198 vec4_instruction *inst;
199
200 resolve_ud_negate(&src0);
201 resolve_ud_negate(&src1);
202
203 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
204 src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 /**
211 * CMP: Sets the low bit of the destination channels with the result
212 * of the comparison, while the upper bits are undefined, and updates
213 * the flag register with the packed 16 bits of the result.
214 */
215 vec4_instruction *
216 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
217 {
218 vec4_instruction *inst;
219
220 /* original gen4 does type conversion to the destination type
221 * before comparison, producing garbage results for floating
222 * point comparisons.
223 */
224 if (brw->gen == 4) {
225 dst.type = src0.type;
226 if (dst.file == HW_REG)
227 dst.fixed_hw_reg.type = dst.type;
228 }
229
230 resolve_ud_negate(&src0);
231 resolve_ud_negate(&src1);
232
233 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
234 inst->conditional_mod = condition;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
245 dst, index);
246 inst->base_mrf = 14;
247 inst->mlen = 2;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
258 dst, src, index);
259 inst->base_mrf = 13;
260 inst->mlen = 3;
261
262 return inst;
263 }
264
265 void
266 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
267 {
268 static enum opcode dot_opcodes[] = {
269 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
270 };
271
272 emit(dot_opcodes[elements - 2], dst, src0, src1);
273 }
274
275 src_reg
276 vec4_visitor::fix_3src_operand(src_reg src)
277 {
278 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
279 * able to use vertical stride of zero to replicate the vec4 uniform, like
280 *
281 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
282 *
283 * But you can't, since vertical stride is always four in three-source
284 * instructions. Instead, insert a MOV instruction to do the replication so
285 * that the three-source instruction can consume it.
286 */
287
288 /* The MOV is only needed if the source is a uniform or immediate. */
289 if (src.file != UNIFORM && src.file != IMM)
290 return src;
291
292 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
293 return src;
294
295 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
296 expanded.type = src.type;
297 emit(MOV(expanded, src));
298 return src_reg(expanded);
299 }
300
301 src_reg
302 vec4_visitor::fix_math_operand(src_reg src)
303 {
304 /* The gen6 math instruction ignores the source modifiers --
305 * swizzle, abs, negate, and at least some parts of the register
306 * region description.
307 *
308 * Rather than trying to enumerate all these cases, *always* expand the
309 * operand to a temp GRF for gen6.
310 *
311 * For gen7, keep the operand as-is, except if immediate, which gen7 still
312 * can't use.
313 */
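/* Gen8 and later math instructions don't need this workaround at all, so
 * emit_math() bypasses fix_math_operand() entirely when brw->gen >= 8 (see
 * the gen >= 8 paths in the emit_math() variants below).
 */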
314
315 if (brw->gen == 7 && src.file != IMM)
316 return src;
317
318 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
319 expanded.type = src.type;
320 emit(MOV(expanded, src));
321 return src_reg(expanded);
322 }
323
324 void
325 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
326 {
327 src = fix_math_operand(src);
328
329 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
330 /* The gen6 math instruction must be align1, so we can't do
331 * writemasks.
332 */
333 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
334
335 emit(opcode, temp_dst, src);
336
337 emit(MOV(dst, src_reg(temp_dst)));
338 } else {
339 emit(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
345 {
346 vec4_instruction *inst = emit(opcode, dst, src);
347 inst->base_mrf = 1;
348 inst->mlen = 1;
349 }
350
351 void
352 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
353 {
354 switch (opcode) {
355 case SHADER_OPCODE_RCP:
356 case SHADER_OPCODE_RSQ:
357 case SHADER_OPCODE_SQRT:
358 case SHADER_OPCODE_EXP2:
359 case SHADER_OPCODE_LOG2:
360 case SHADER_OPCODE_SIN:
361 case SHADER_OPCODE_COS:
362 break;
363 default:
364 assert(!"not reached: bad math opcode");
365 return;
366 }
367
368 if (brw->gen >= 8) {
369 emit(opcode, dst, src);
370 } else if (brw->gen >= 6) {
371 emit_math1_gen6(opcode, dst, src);
372 } else {
373 emit_math1_gen4(opcode, dst, src);
374 }
375 }
376
377 void
378 vec4_visitor::emit_math2_gen6(enum opcode opcode,
379 dst_reg dst, src_reg src0, src_reg src1)
380 {
381 src0 = fix_math_operand(src0);
382 src1 = fix_math_operand(src1);
383
384 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
385 /* The gen6 math instruction must be align1, so we can't do
386 * writemasks.
387 */
388 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
389 temp_dst.type = dst.type;
390
391 emit(opcode, temp_dst, src0, src1);
392
393 emit(MOV(dst, src_reg(temp_dst)));
394 } else {
395 emit(opcode, dst, src0, src1);
396 }
397 }
398
399 void
400 vec4_visitor::emit_math2_gen4(enum opcode opcode,
401 dst_reg dst, src_reg src0, src_reg src1)
402 {
403 vec4_instruction *inst = emit(opcode, dst, src0, src1);
404 inst->base_mrf = 1;
405 inst->mlen = 2;
406 }
407
408 void
409 vec4_visitor::emit_math(enum opcode opcode,
410 dst_reg dst, src_reg src0, src_reg src1)
411 {
412 switch (opcode) {
413 case SHADER_OPCODE_POW:
414 case SHADER_OPCODE_INT_QUOTIENT:
415 case SHADER_OPCODE_INT_REMAINDER:
416 break;
417 default:
418 assert(!"not reached: unsupported binary math opcode");
419 return;
420 }
421
422 if (brw->gen >= 8) {
423 emit(opcode, dst, src0, src1);
424 } else if (brw->gen >= 6) {
425 emit_math2_gen6(opcode, dst, src0, src1);
426 } else {
427 emit_math2_gen4(opcode, dst, src0, src1);
428 }
429 }
430
431 void
432 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
433 {
434 if (brw->gen < 7)
435 assert(!"ir_unop_pack_half_2x16 should be lowered");
436
437 assert(dst.type == BRW_REGISTER_TYPE_UD);
438 assert(src0.type == BRW_REGISTER_TYPE_F);
439
440 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
441 *
442 * Because this instruction does not have a 16-bit floating-point type,
443 * the destination data type must be Word (W).
444 *
445 * The destination must be DWord-aligned and specify a horizontal stride
446 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
447 * each destination channel and the upper word is not modified.
448 *
449 * The above restriction implies that the f32to16 instruction must use
450 * align1 mode, because only in align1 mode is it possible to specify
451 * horizontal stride. We choose here to defy the hardware docs and emit
452 * align16 instructions.
453 *
454 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
455 * instructions. I was partially successful in that the code passed all
456 * tests. However, the code was dubiously correct and fragile, and the
457 * tests were not harsh enough to probe that frailty. Not trusting the
458 * code, I chose instead to remain in align16 mode in defiance of the hw
459 * docs).
460 *
461 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
462 * simulator, emitting a f32to16 in align16 mode with UD as destination
463 * data type is safe. The behavior differs from that specified in the PRM
464 * in that the upper word of each destination channel is cleared to 0.
465 */
466
467 dst_reg tmp_dst(this, glsl_type::uvec2_type);
468 src_reg tmp_src(tmp_dst);
469
470 #if 0
471 /* Verify the undocumented behavior on which the following instructions
472 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
473 * then the result of the bit-or instruction below will be incorrect.
474 *
475 * You should inspect the disasm output in order to verify that the MOV is
476 * not optimized away.
477 */
478 emit(MOV(tmp_dst, src_reg(0x12345678u)));
479 #endif
480
481 /* Give tmp the form below, where "." means untouched.
482 *
483 * w z y x w z y x
484 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
485 *
486 * The upper word of each write-channel must be 0 for the following
487 * bit-shift and bit-or instructions to work. Note that this
488 * relies on the undocumented hardware behavior mentioned above.
489 */
490 tmp_dst.writemask = WRITEMASK_XY;
491 emit(F32TO16(tmp_dst, src0));
492
493 /* Give the write-channels of dst the form:
494 * 0xhhhh0000
495 */
496 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
497 emit(SHL(dst, tmp_src, src_reg(16u)));
498
499 /* Finally, give the write-channels of dst the form of packHalf2x16's
500 * output:
501 * 0xhhhhllll
502 */
503 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
504 emit(OR(dst, src_reg(dst), tmp_src));
505 }
506
507 void
508 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
509 {
510 if (brw->gen < 7)
511 assert(!"ir_unop_unpack_half_2x16 should be lowered");
512
513 assert(dst.type == BRW_REGISTER_TYPE_F);
514 assert(src0.type == BRW_REGISTER_TYPE_UD);
515
516 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
517 *
518 * Because this instruction does not have a 16-bit floating-point type,
519 * the source data type must be Word (W). The destination type must be
520 * F (Float).
521 *
522 * To use W as the source data type, we must adjust horizontal strides,
523 * which is only possible in align1 mode. All my [chadv] attempts at
524 * emitting align1 instructions for unpackHalf2x16 failed to pass the
525 * Piglit tests, so I gave up.
526 *
527 * I've verified that, on gen7 hardware and the simulator, it is safe to
528 * emit f16to32 in align16 mode with UD as source data type.
529 */
530
531 dst_reg tmp_dst(this, glsl_type::uvec2_type);
532 src_reg tmp_src(tmp_dst);
533
534 tmp_dst.writemask = WRITEMASK_X;
535 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
536
537 tmp_dst.writemask = WRITEMASK_Y;
538 emit(SHR(tmp_dst, src0, src_reg(16u)));
539
540 dst.writemask = WRITEMASK_XY;
541 emit(F16TO32(dst, tmp_src));
542 }
543
544 void
545 vec4_visitor::visit_instructions(const exec_list *list)
546 {
547 foreach_list(node, list) {
548 ir_instruction *ir = (ir_instruction *)node;
549
550 base_ir = ir;
551 ir->accept(this);
552 }
553 }
554
555
556 static int
557 type_size(const struct glsl_type *type)
558 {
559 unsigned int i;
560 int size;
561
562 switch (type->base_type) {
563 case GLSL_TYPE_UINT:
564 case GLSL_TYPE_INT:
565 case GLSL_TYPE_FLOAT:
566 case GLSL_TYPE_BOOL:
567 if (type->is_matrix()) {
568 return type->matrix_columns;
569 } else {
570 /* Regardless of the size of the vector, it gets a vec4. This is bad
571 * packing for things like floats, but otherwise arrays become a
572 * mess. Hopefully a later pass over the code can pack scalars
573 * down if appropriate.
574 */
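/* For example, a lone float and a vec4 each occupy one slot here, while a
 * mat4 takes the is_matrix() branch above and occupies four slots, one per
 * column.
 */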
575 return 1;
576 }
577 case GLSL_TYPE_ARRAY:
578 assert(type->length > 0);
579 return type_size(type->fields.array) * type->length;
580 case GLSL_TYPE_STRUCT:
581 size = 0;
582 for (i = 0; i < type->length; i++) {
583 size += type_size(type->fields.structure[i].type);
584 }
585 return size;
586 case GLSL_TYPE_SAMPLER:
587 /* Samplers take up one slot in UNIFORMS[], but they're baked in
588 * at link time.
589 */
590 return 1;
591 case GLSL_TYPE_ATOMIC_UINT:
592 return 0;
593 case GLSL_TYPE_IMAGE:
594 case GLSL_TYPE_VOID:
595 case GLSL_TYPE_ERROR:
596 case GLSL_TYPE_INTERFACE:
597 assert(0);
598 break;
599 }
600
601 return 0;
602 }
603
604 int
605 vec4_visitor::virtual_grf_alloc(int size)
606 {
607 if (virtual_grf_array_size <= virtual_grf_count) {
608 if (virtual_grf_array_size == 0)
609 virtual_grf_array_size = 16;
610 else
611 virtual_grf_array_size *= 2;
612 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
613 virtual_grf_array_size);
614 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
615 virtual_grf_array_size);
616 }
617 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
618 virtual_grf_reg_count += size;
619 virtual_grf_sizes[virtual_grf_count] = size;
620 return virtual_grf_count++;
621 }
622
623 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(type_size(type));
629
630 if (type->is_array() || type->is_record()) {
631 this->swizzle = BRW_SWIZZLE_NOOP;
632 } else {
633 this->swizzle = swizzle_for_size(type->vector_elements);
634 }
635
636 this->type = brw_type_for_base_type(type);
637 }
638
639 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
640 {
641 init();
642
643 this->file = GRF;
644 this->reg = v->virtual_grf_alloc(type_size(type));
645
646 if (type->is_array() || type->is_record()) {
647 this->writemask = WRITEMASK_XYZW;
648 } else {
649 this->writemask = (1 << type->vector_elements) - 1;
650 }
651
652 this->type = brw_type_for_base_type(type);
653 }
654
655 /* Our support for uniforms is piggy-backed on the struct
656 * gl_fragment_program, because that's where the values actually
657 * get stored, rather than in some global gl_shader_program uniform
658 * store.
659 */
660 void
661 vec4_visitor::setup_uniform_values(ir_variable *ir)
662 {
663 int namelen = strlen(ir->name);
664
665 /* The data for our (non-builtin) uniforms is stored in a series of
666 * gl_uniform_driver_storage structs for each subcomponent that
667 * glGetUniformLocation() could name. We know it's been set up in the same
668 * order we'd walk the type, so walk the list of storage and find anything
669 * with our name, or the prefix of a component that starts with our name.
670 */
671 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
672 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
673
674 if (strncmp(ir->name, storage->name, namelen) != 0 ||
675 (storage->name[namelen] != 0 &&
676 storage->name[namelen] != '.' &&
677 storage->name[namelen] != '[')) {
678 continue;
679 }
680
681 gl_constant_value *components = storage->storage;
682 unsigned vector_count = (MAX2(storage->array_elements, 1) *
683 storage->type->matrix_columns);
684
685 for (unsigned s = 0; s < vector_count; s++) {
686 assert(uniforms < uniform_array_size);
687 uniform_vector_size[uniforms] = storage->type->vector_elements;
688
689 int i;
690 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
691 stage_prog_data->param[uniforms * 4 + i] = &components->f;
692 components++;
693 }
694 for (; i < 4; i++) {
695 static float zero = 0;
696 stage_prog_data->param[uniforms * 4 + i] = &zero;
697 }
698
699 uniforms++;
700 }
701 }
702 }
703
704 void
705 vec4_visitor::setup_uniform_clipplane_values()
706 {
707 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
708
709 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
710 assert(this->uniforms < uniform_array_size);
711 this->uniform_vector_size[this->uniforms] = 4;
712 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
713 this->userplane[i].type = BRW_REGISTER_TYPE_F;
714 for (int j = 0; j < 4; ++j) {
715 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
716 }
717 ++this->uniforms;
718 }
719 }
720
721 /* Our support for builtin uniforms is even scarier than non-builtin.
722 * It sits on top of the PROG_STATE_VAR parameters that are
723 * automatically updated from GL context state.
724 */
725 void
726 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
727 {
728 const ir_state_slot *const slots = ir->state_slots;
729 assert(ir->state_slots != NULL);
730
731 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
732 /* This state reference has already been set up by ir_to_mesa,
733 * but we'll get the same index back here. We can reference
734 * ParameterValues directly, since unlike brw_fs.cpp, we never
735 * add new state references during compile.
736 */
737 int index = _mesa_add_state_reference(this->prog->Parameters,
738 (gl_state_index *)slots[i].tokens);
739 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
740
741 assert(this->uniforms < uniform_array_size);
742 this->uniform_vector_size[this->uniforms] = 0;
743 /* Add each of the unique swizzled channels of the element.
744 * This will end up matching the size of the glsl_type of this field.
745 */
746 int last_swiz = -1;
747 for (unsigned int j = 0; j < 4; j++) {
748 int swiz = GET_SWZ(slots[i].swizzle, j);
749 last_swiz = swiz;
750
751 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
752 assert(this->uniforms < uniform_array_size);
753 if (swiz <= last_swiz)
754 this->uniform_vector_size[this->uniforms]++;
755 }
756 this->uniforms++;
757 }
758 }
759
760 dst_reg *
761 vec4_visitor::variable_storage(ir_variable *var)
762 {
763 return (dst_reg *)hash_table_find(this->variable_ht, var);
764 }
765
766 void
767 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
768 {
769 ir_expression *expr = ir->as_expression();
770
771 *predicate = BRW_PREDICATE_NORMAL;
772
773 if (expr) {
774 src_reg op[2];
775 vec4_instruction *inst;
776
777 assert(expr->get_num_operands() <= 2);
778 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
779 expr->operands[i]->accept(this);
780 op[i] = this->result;
781
782 resolve_ud_negate(&op[i]);
783 }
784
785 switch (expr->operation) {
786 case ir_unop_logic_not:
787 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
788 inst->conditional_mod = BRW_CONDITIONAL_Z;
789 break;
790
791 case ir_binop_logic_xor:
792 inst = emit(XOR(dst_null_d(), op[0], op[1]));
793 inst->conditional_mod = BRW_CONDITIONAL_NZ;
794 break;
795
796 case ir_binop_logic_or:
797 inst = emit(OR(dst_null_d(), op[0], op[1]));
798 inst->conditional_mod = BRW_CONDITIONAL_NZ;
799 break;
800
801 case ir_binop_logic_and:
802 inst = emit(AND(dst_null_d(), op[0], op[1]));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 break;
805
806 case ir_unop_f2b:
807 if (brw->gen >= 6) {
808 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
809 } else {
810 inst = emit(MOV(dst_null_f(), op[0]));
811 inst->conditional_mod = BRW_CONDITIONAL_NZ;
812 }
813 break;
814
815 case ir_unop_i2b:
816 if (brw->gen >= 6) {
817 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
818 } else {
819 inst = emit(MOV(dst_null_d(), op[0]));
820 inst->conditional_mod = BRW_CONDITIONAL_NZ;
821 }
822 break;
823
824 case ir_binop_all_equal:
825 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
826 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
827 break;
828
829 case ir_binop_any_nequal:
830 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
831 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
832 break;
833
834 case ir_unop_any:
835 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
836 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
837 break;
838
839 case ir_binop_greater:
840 case ir_binop_gequal:
841 case ir_binop_less:
842 case ir_binop_lequal:
843 case ir_binop_equal:
844 case ir_binop_nequal:
845 emit(CMP(dst_null_d(), op[0], op[1],
846 brw_conditional_for_comparison(expr->operation)));
847 break;
848
849 default:
850 assert(!"not reached");
851 break;
852 }
853 return;
854 }
855
856 ir->accept(this);
857
858 resolve_ud_negate(&this->result);
859
860 if (brw->gen >= 6) {
861 vec4_instruction *inst = emit(AND(dst_null_d(),
862 this->result, src_reg(1)));
863 inst->conditional_mod = BRW_CONDITIONAL_NZ;
864 } else {
865 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 }
868 }
869
870 /**
871 * Emit a gen6 IF statement with the comparison folded into the IF
872 * instruction.
873 */
874 void
875 vec4_visitor::emit_if_gen6(ir_if *ir)
876 {
877 ir_expression *expr = ir->condition->as_expression();
878
879 if (expr) {
880 src_reg op[2];
881 dst_reg temp;
882
883 assert(expr->get_num_operands() <= 2);
884 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
885 expr->operands[i]->accept(this);
886 op[i] = this->result;
887 }
888
889 switch (expr->operation) {
890 case ir_unop_logic_not:
891 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
892 return;
893
894 case ir_binop_logic_xor:
895 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
896 return;
897
898 case ir_binop_logic_or:
899 temp = dst_reg(this, glsl_type::bool_type);
900 emit(OR(temp, op[0], op[1]));
901 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
902 return;
903
904 case ir_binop_logic_and:
905 temp = dst_reg(this, glsl_type::bool_type);
906 emit(AND(temp, op[0], op[1]));
907 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
908 return;
909
910 case ir_unop_f2b:
911 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_unop_i2b:
915 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 return;
917
918 case ir_binop_greater:
919 case ir_binop_gequal:
920 case ir_binop_less:
921 case ir_binop_lequal:
922 case ir_binop_equal:
923 case ir_binop_nequal:
924 emit(IF(op[0], op[1],
925 brw_conditional_for_comparison(expr->operation)));
926 return;
927
928 case ir_binop_all_equal:
929 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
930 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
931 return;
932
933 case ir_binop_any_nequal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
935 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
936 return;
937
938 case ir_unop_any:
939 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 default:
944 assert(!"not reached");
945 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
946 return;
947 }
948 return;
949 }
950
951 ir->condition->accept(this);
952
953 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
954 }
955
956 void
957 vec4_visitor::visit(ir_variable *ir)
958 {
959 dst_reg *reg = NULL;
960
961 if (variable_storage(ir))
962 return;
963
964 switch (ir->data.mode) {
965 case ir_var_shader_in:
966 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
967 break;
968
969 case ir_var_shader_out:
970 reg = new(mem_ctx) dst_reg(this, ir->type);
971
972 for (int i = 0; i < type_size(ir->type); i++) {
973 output_reg[ir->data.location + i] = *reg;
974 output_reg[ir->data.location + i].reg_offset = i;
975 output_reg[ir->data.location + i].type =
976 brw_type_for_base_type(ir->type->get_scalar_type());
977 output_reg_annotation[ir->data.location + i] = ir->name;
978 }
979 break;
980
981 case ir_var_auto:
982 case ir_var_temporary:
983 reg = new(mem_ctx) dst_reg(this, ir->type);
984 break;
985
986 case ir_var_uniform:
987 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
988
989 /* Thanks to the lower_ubo_reference pass, we will see only
990 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
991 * variables, so no need for them to be in variable_ht.
992 *
993 * Atomic counters take no uniform storage, no need to do
994 * anything here.
995 */
996 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
997 return;
998
999 /* Track how big the whole uniform variable is, in case we need to put a
1000 * copy of its data into pull constants for array access.
1001 */
1002 assert(this->uniforms < uniform_array_size);
1003 this->uniform_size[this->uniforms] = type_size(ir->type);
1004
1005 if (!strncmp(ir->name, "gl_", 3)) {
1006 setup_builtin_uniform_values(ir);
1007 } else {
1008 setup_uniform_values(ir);
1009 }
1010 break;
1011
1012 case ir_var_system_value:
1013 reg = make_reg_for_system_value(ir);
1014 break;
1015
1016 default:
1017 assert(!"not reached");
1018 }
1019
1020 reg->type = brw_type_for_base_type(ir->type);
1021 hash_table_insert(this->variable_ht, reg, ir);
1022 }
1023
1024 void
1025 vec4_visitor::visit(ir_loop *ir)
1026 {
1027 /* We don't want debugging output to print the whole body of the
1028 * loop as the annotation.
1029 */
1030 this->base_ir = NULL;
1031
1032 emit(BRW_OPCODE_DO);
1033
1034 visit_instructions(&ir->body_instructions);
1035
1036 emit(BRW_OPCODE_WHILE);
1037 }
1038
1039 void
1040 vec4_visitor::visit(ir_loop_jump *ir)
1041 {
1042 switch (ir->mode) {
1043 case ir_loop_jump::jump_break:
1044 emit(BRW_OPCODE_BREAK);
1045 break;
1046 case ir_loop_jump::jump_continue:
1047 emit(BRW_OPCODE_CONTINUE);
1048 break;
1049 }
1050 }
1051
1052
1053 void
1054 vec4_visitor::visit(ir_function_signature *ir)
1055 {
1056 assert(0);
1057 (void)ir;
1058 }
1059
1060 void
1061 vec4_visitor::visit(ir_function *ir)
1062 {
1063 /* Ignore function bodies other than main() -- we shouldn't see calls to
1064 * them since they should all be inlined.
1065 */
1066 if (strcmp(ir->name, "main") == 0) {
1067 const ir_function_signature *sig;
1068 exec_list empty;
1069
1070 sig = ir->matching_signature(NULL, &empty);
1071
1072 assert(sig);
1073
1074 visit_instructions(&sig->body);
1075 }
1076 }
1077
1078 bool
1079 vec4_visitor::try_emit_sat(ir_expression *ir)
1080 {
1081 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1082 if (!sat_src)
1083 return false;
1084
1085 sat_src->accept(this);
1086 src_reg src = this->result;
1087
1088 this->result = src_reg(this, ir->type);
1089 vec4_instruction *inst;
1090 inst = emit(MOV(dst_reg(this->result), src));
1091 inst->saturate = true;
1092
1093 return true;
1094 }
1095
1096 bool
1097 vec4_visitor::try_emit_mad(ir_expression *ir)
1098 {
1099 /* 3-src instructions were introduced in gen6. */
1100 if (brw->gen < 6)
1101 return false;
1102
1103 /* MAD can only handle floating-point data. */
1104 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1105 return false;
1106
1107 ir_rvalue *nonmul = ir->operands[1];
1108 ir_expression *mul = ir->operands[0]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul) {
1111 nonmul = ir->operands[0];
1112 mul = ir->operands[1]->as_expression();
1113
1114 if (!mul || mul->operation != ir_binop_mul)
1115 return false;
1116 }
1117
1118 nonmul->accept(this);
1119 src_reg src0 = fix_3src_operand(this->result);
1120
1121 mul->operands[0]->accept(this);
1122 src_reg src1 = fix_3src_operand(this->result);
1123
1124 mul->operands[1]->accept(this);
1125 src_reg src2 = fix_3src_operand(this->result);
1126
1127 this->result = src_reg(this, ir->type);
1128 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1129
1130 return true;
1131 }
1132
1133 bool
1134 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1135 {
1136 ir_expression *const cmp = ir->operands[0]->as_expression();
1137
1138 if (cmp == NULL)
1139 return false;
1140
1141 switch (cmp->operation) {
1142 case ir_binop_less:
1143 case ir_binop_greater:
1144 case ir_binop_lequal:
1145 case ir_binop_gequal:
1146 case ir_binop_equal:
1147 case ir_binop_nequal:
1148 break;
1149
1150 default:
1151 return false;
1152 }
1153
1154 cmp->operands[0]->accept(this);
1155 const src_reg cmp_src0 = this->result;
1156
1157 cmp->operands[1]->accept(this);
1158 const src_reg cmp_src1 = this->result;
1159
1160 this->result = src_reg(this, ir->type);
1161
1162 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1163 brw_conditional_for_comparison(cmp->operation)));
1164
1165 /* If the comparison is false, this->result will just happen to be zero.
1166 */
1167 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1168 this->result, src_reg(1.0f));
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 inst->predicate_inverse = true;
1171
1172 return true;
1173 }
1174
1175 void
1176 vec4_visitor::emit_bool_comparison(unsigned int op,
1177 dst_reg dst, src_reg src0, src_reg src1)
1178 {
1179 /* original gen4 does destination conversion before comparison. */
1180 if (brw->gen < 5)
1181 dst.type = src0.type;
1182
1183 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1184
1185 dst.type = BRW_REGISTER_TYPE_D;
1186 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1187 }
1188
1189 void
1190 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1191 src_reg src0, src_reg src1)
1192 {
1193 vec4_instruction *inst;
1194
1195 if (brw->gen >= 6) {
1196 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1197 inst->conditional_mod = conditionalmod;
1198 } else {
1199 emit(CMP(dst, src0, src1, conditionalmod));
1200
1201 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1202 inst->predicate = BRW_PREDICATE_NORMAL;
1203 }
1204 }
1205
1206 void
1207 vec4_visitor::emit_lrp(const dst_reg &dst,
1208 const src_reg &x, const src_reg &y, const src_reg &a)
1209 {
1210 if (brw->gen >= 6) {
1211 /* Note that the instruction's argument order is reversed from GLSL
1212 * and the IR.
1213 */
1214 emit(LRP(dst,
1215 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1216 } else {
1217 /* Earlier generations don't support three source operations, so we
1218 * need to emit x*(1-a) + y*a.
1219 */
1220 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1221 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1222 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1223 y_times_a.writemask = dst.writemask;
1224 one_minus_a.writemask = dst.writemask;
1225 x_times_one_minus_a.writemask = dst.writemask;
1226
1227 emit(MUL(y_times_a, y, a));
1228 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1229 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1230 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1231 }
1232 }
1233
1234 void
1235 vec4_visitor::visit(ir_expression *ir)
1236 {
1237 unsigned int operand;
1238 src_reg op[Elements(ir->operands)];
1239 src_reg result_src;
1240 dst_reg result_dst;
1241 vec4_instruction *inst;
1242
1243 if (try_emit_sat(ir))
1244 return;
1245
1246 if (ir->operation == ir_binop_add) {
1247 if (try_emit_mad(ir))
1248 return;
1249 }
1250
1251 if (ir->operation == ir_unop_b2f) {
1252 if (try_emit_b2f_of_compare(ir))
1253 return;
1254 }
1255
1256 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1257 this->result.file = BAD_FILE;
1258 ir->operands[operand]->accept(this);
1259 if (this->result.file == BAD_FILE) {
1260 fprintf(stderr, "Failed to get tree for expression operand:\n");
1261 ir->operands[operand]->fprint(stderr);
1262 exit(1);
1263 }
1264 op[operand] = this->result;
1265
1266 /* Matrix expression operands should have been broken down to vector
1267 * operations already.
1268 */
1269 assert(!ir->operands[operand]->type->is_matrix());
1270 }
1271
1272 int vector_elements = ir->operands[0]->type->vector_elements;
1273 if (ir->operands[1]) {
1274 vector_elements = MAX2(vector_elements,
1275 ir->operands[1]->type->vector_elements);
1276 }
1277
1278 this->result.file = BAD_FILE;
1279
1280 /* Storage for our result. Ideally for an assignment we'd be using
1281 * the actual storage for the result here, instead.
1282 */
1283 result_src = src_reg(this, ir->type);
1284 /* convenience for the emit functions below. */
1285 result_dst = dst_reg(result_src);
1286 /* If nothing special happens, this is the result. */
1287 this->result = result_src;
1288 /* Limit writes to the channels that will be used by result_src later.
1289 * This does limit this temp's use as a temporary for multi-instruction
1290 * sequences.
1291 */
1292 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1293
1294 switch (ir->operation) {
1295 case ir_unop_logic_not:
1296 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1297 * the ones' complement of the whole register, not just bit 0.
1298 */
1299 emit(XOR(result_dst, op[0], src_reg(1)));
1300 break;
1301 case ir_unop_neg:
1302 op[0].negate = !op[0].negate;
1303 emit(MOV(result_dst, op[0]));
1304 break;
1305 case ir_unop_abs:
1306 op[0].abs = true;
1307 op[0].negate = false;
1308 emit(MOV(result_dst, op[0]));
1309 break;
1310
1311 case ir_unop_sign:
1312 if (ir->type->is_float()) {
1313 /* AND(val, 0x80000000) gives the sign bit.
1314 *
1315 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1316 * zero.
1317 */
1318 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1319
1320 op[0].type = BRW_REGISTER_TYPE_UD;
1321 result_dst.type = BRW_REGISTER_TYPE_UD;
1322 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1323
1324 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1325 inst->predicate = BRW_PREDICATE_NORMAL;
1326
1327 this->result.type = BRW_REGISTER_TYPE_F;
1328 } else {
1329 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1330 * -> non-negative val generates 0x00000000.
1331 * Predicated OR sets 1 if val is positive.
1332 */
1333 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1334
1335 emit(ASR(result_dst, op[0], src_reg(31)));
1336
1337 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1338 inst->predicate = BRW_PREDICATE_NORMAL;
1339 }
1340 break;
1341
1342 case ir_unop_rcp:
1343 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1344 break;
1345
1346 case ir_unop_exp2:
1347 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1348 break;
1349 case ir_unop_log2:
1350 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1351 break;
1352 case ir_unop_exp:
1353 case ir_unop_log:
1354 assert(!"not reached: should be handled by ir_explog_to_explog2");
1355 break;
1356 case ir_unop_sin:
1357 case ir_unop_sin_reduced:
1358 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1359 break;
1360 case ir_unop_cos:
1361 case ir_unop_cos_reduced:
1362 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1363 break;
1364
1365 case ir_unop_dFdx:
1366 case ir_unop_dFdy:
1367 assert(!"derivatives not valid in vertex shader");
1368 break;
1369
1370 case ir_unop_bitfield_reverse:
1371 emit(BFREV(result_dst, op[0]));
1372 break;
1373 case ir_unop_bit_count:
1374 emit(CBIT(result_dst, op[0]));
1375 break;
1376 case ir_unop_find_msb: {
1377 src_reg temp = src_reg(this, glsl_type::uint_type);
1378
1379 inst = emit(FBH(dst_reg(temp), op[0]));
1380 inst->dst.writemask = WRITEMASK_XYZW;
1381
1382 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1383 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1384 * subtract the result from 31 to convert the MSB count into an LSB count.
1385 */
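/* For example, with an input of 0x00000100 (bit 8 set) FBH reports 23 when
 * counting down from bit 31, and 31 - 23 = 8 is the value findMSB() expects.
 */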
1386
1387 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1388 temp.swizzle = BRW_SWIZZLE_NOOP;
1389 emit(MOV(result_dst, temp));
1390
1391 src_reg src_tmp = src_reg(result_dst);
1392 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1393
1394 src_tmp.negate = true;
1395 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1396 inst->predicate = BRW_PREDICATE_NORMAL;
1397 break;
1398 }
1399 case ir_unop_find_lsb:
1400 emit(FBL(result_dst, op[0]));
1401 break;
1402
1403 case ir_unop_noise:
1404 assert(!"not reached: should be handled by lower_noise");
1405 break;
1406
1407 case ir_binop_add:
1408 emit(ADD(result_dst, op[0], op[1]));
1409 break;
1410 case ir_binop_sub:
1411 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1412 break;
1413
1414 case ir_binop_mul:
1415 if (brw->gen < 8 && ir->type->is_integer()) {
1416 /* For integer multiplication, the MUL uses the low 16 bits of one of
1417 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1418 * accumulates in the contribution of the upper 16 bits of that
1419 * operand. If we can determine that one of the args is in the low
1420 * 16 bits, though, we can just emit a single MUL.
1421 */
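/* Note the operand ordering below: the 16-bit constant is placed in src0
 * for gen < 7 and in src1 for gen7+, matching which source the MUL reads
 * its low 16 bits from on each generation.
 */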
1422 if (ir->operands[0]->is_uint16_constant()) {
1423 if (brw->gen < 7)
1424 emit(MUL(result_dst, op[0], op[1]));
1425 else
1426 emit(MUL(result_dst, op[1], op[0]));
1427 } else if (ir->operands[1]->is_uint16_constant()) {
1428 if (brw->gen < 7)
1429 emit(MUL(result_dst, op[1], op[0]));
1430 else
1431 emit(MUL(result_dst, op[0], op[1]));
1432 } else {
1433 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1434
1435 emit(MUL(acc, op[0], op[1]));
1436 emit(MACH(dst_null_d(), op[0], op[1]));
1437 emit(MOV(result_dst, src_reg(acc)));
1438 }
1439 } else {
1440 emit(MUL(result_dst, op[0], op[1]));
1441 }
1442 break;
1443 case ir_binop_imul_high: {
1444 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1445
1446 emit(MUL(acc, op[0], op[1]));
1447 emit(MACH(result_dst, op[0], op[1]));
1448 break;
1449 }
1450 case ir_binop_div:
1451 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1452 assert(ir->type->is_integer());
1453 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1454 break;
1455 case ir_binop_carry: {
1456 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1457
1458 emit(ADDC(dst_null_ud(), op[0], op[1]));
1459 emit(MOV(result_dst, src_reg(acc)));
1460 break;
1461 }
1462 case ir_binop_borrow: {
1463 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1464
1465 emit(SUBB(dst_null_ud(), op[0], op[1]));
1466 emit(MOV(result_dst, src_reg(acc)));
1467 break;
1468 }
1469 case ir_binop_mod:
1470 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1471 assert(ir->type->is_integer());
1472 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1473 break;
1474
1475 case ir_binop_less:
1476 case ir_binop_greater:
1477 case ir_binop_lequal:
1478 case ir_binop_gequal:
1479 case ir_binop_equal:
1480 case ir_binop_nequal: {
1481 emit(CMP(result_dst, op[0], op[1],
1482 brw_conditional_for_comparison(ir->operation)));
1483 emit(AND(result_dst, result_src, src_reg(0x1)));
1484 break;
1485 }
1486
1487 case ir_binop_all_equal:
1488 /* "==" operator producing a scalar boolean. */
1489 if (ir->operands[0]->type->is_vector() ||
1490 ir->operands[1]->type->is_vector()) {
1491 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1492 emit(MOV(result_dst, src_reg(0)));
1493 inst = emit(MOV(result_dst, src_reg(1)));
1494 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1495 } else {
1496 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1497 emit(AND(result_dst, result_src, src_reg(0x1)));
1498 }
1499 break;
1500 case ir_binop_any_nequal:
1501 /* "!=" operator producing a scalar boolean. */
1502 if (ir->operands[0]->type->is_vector() ||
1503 ir->operands[1]->type->is_vector()) {
1504 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1505
1506 emit(MOV(result_dst, src_reg(0)));
1507 inst = emit(MOV(result_dst, src_reg(1)));
1508 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1509 } else {
1510 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1511 emit(AND(result_dst, result_src, src_reg(0x1)));
1512 }
1513 break;
1514
1515 case ir_unop_any:
1516 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1517 emit(MOV(result_dst, src_reg(0)));
1518
1519 inst = emit(MOV(result_dst, src_reg(1)));
1520 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1521 break;
1522
1523 case ir_binop_logic_xor:
1524 emit(XOR(result_dst, op[0], op[1]));
1525 break;
1526
1527 case ir_binop_logic_or:
1528 emit(OR(result_dst, op[0], op[1]));
1529 break;
1530
1531 case ir_binop_logic_and:
1532 emit(AND(result_dst, op[0], op[1]));
1533 break;
1534
1535 case ir_binop_dot:
1536 assert(ir->operands[0]->type->is_vector());
1537 assert(ir->operands[0]->type == ir->operands[1]->type);
1538 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1539 break;
1540
1541 case ir_unop_sqrt:
1542 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1543 break;
1544 case ir_unop_rsq:
1545 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1546 break;
1547
1548 case ir_unop_bitcast_i2f:
1549 case ir_unop_bitcast_u2f:
1550 this->result = op[0];
1551 this->result.type = BRW_REGISTER_TYPE_F;
1552 break;
1553
1554 case ir_unop_bitcast_f2i:
1555 this->result = op[0];
1556 this->result.type = BRW_REGISTER_TYPE_D;
1557 break;
1558
1559 case ir_unop_bitcast_f2u:
1560 this->result = op[0];
1561 this->result.type = BRW_REGISTER_TYPE_UD;
1562 break;
1563
1564 case ir_unop_i2f:
1565 case ir_unop_i2u:
1566 case ir_unop_u2i:
1567 case ir_unop_u2f:
1568 case ir_unop_b2f:
1569 case ir_unop_b2i:
1570 case ir_unop_f2i:
1571 case ir_unop_f2u:
1572 emit(MOV(result_dst, op[0]));
1573 break;
1574 case ir_unop_f2b:
1575 case ir_unop_i2b: {
1576 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1577 emit(AND(result_dst, result_src, src_reg(1)));
1578 break;
1579 }
1580
1581 case ir_unop_trunc:
1582 emit(RNDZ(result_dst, op[0]));
1583 break;
1584 case ir_unop_ceil:
1585 op[0].negate = !op[0].negate;
1586 inst = emit(RNDD(result_dst, op[0]));
1587 this->result.negate = true;
1588 break;
1589 case ir_unop_floor:
1590 inst = emit(RNDD(result_dst, op[0]));
1591 break;
1592 case ir_unop_fract:
1593 inst = emit(FRC(result_dst, op[0]));
1594 break;
1595 case ir_unop_round_even:
1596 emit(RNDE(result_dst, op[0]));
1597 break;
1598
1599 case ir_binop_min:
1600 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1601 break;
1602 case ir_binop_max:
1603 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1604 break;
1605
1606 case ir_binop_pow:
1607 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1608 break;
1609
1610 case ir_unop_bit_not:
1611 inst = emit(NOT(result_dst, op[0]));
1612 break;
1613 case ir_binop_bit_and:
1614 inst = emit(AND(result_dst, op[0], op[1]));
1615 break;
1616 case ir_binop_bit_xor:
1617 inst = emit(XOR(result_dst, op[0], op[1]));
1618 break;
1619 case ir_binop_bit_or:
1620 inst = emit(OR(result_dst, op[0], op[1]));
1621 break;
1622
1623 case ir_binop_lshift:
1624 inst = emit(SHL(result_dst, op[0], op[1]));
1625 break;
1626
1627 case ir_binop_rshift:
1628 if (ir->type->base_type == GLSL_TYPE_INT)
1629 inst = emit(ASR(result_dst, op[0], op[1]));
1630 else
1631 inst = emit(SHR(result_dst, op[0], op[1]));
1632 break;
1633
1634 case ir_binop_bfm:
1635 emit(BFI1(result_dst, op[0], op[1]));
1636 break;
1637
1638 case ir_binop_ubo_load: {
1639 ir_constant *uniform_block = ir->operands[0]->as_constant();
1640 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1641 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1642 src_reg offset;
1643
1644 /* Now, load the vector from that offset. */
1645 assert(ir->type->is_vector() || ir->type->is_scalar());
1646
1647 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1648 packed_consts.type = result.type;
1649 src_reg surf_index =
1650 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1651 if (const_offset_ir) {
1652 if (brw->gen >= 8) {
1653 /* Store the offset in a GRF so we can send-from-GRF. */
1654 offset = src_reg(this, glsl_type::int_type);
1655 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1656 } else {
1657 /* Immediates are fine on older generations since they'll be moved
1658 * to a (potentially fake) MRF at the generator level.
1659 */
1660 offset = src_reg(const_offset / 16);
1661 }
1662 } else {
1663 offset = src_reg(this, glsl_type::uint_type);
1664 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1665 }
1666
1667 if (brw->gen >= 7) {
1668 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1669 grf_offset.type = offset.type;
1670
1671 emit(MOV(grf_offset, offset));
1672
1673 emit(new(mem_ctx) vec4_instruction(this,
1674 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1675 dst_reg(packed_consts),
1676 surf_index,
1677 src_reg(grf_offset)));
1678 } else {
1679 vec4_instruction *pull =
1680 emit(new(mem_ctx) vec4_instruction(this,
1681 VS_OPCODE_PULL_CONSTANT_LOAD,
1682 dst_reg(packed_consts),
1683 surf_index,
1684 offset));
1685 pull->base_mrf = 14;
1686 pull->mlen = 1;
1687 }
1688
1689 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1690 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1691 const_offset % 16 / 4,
1692 const_offset % 16 / 4,
1693 const_offset % 16 / 4);
1694
1695 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1696 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1697 emit(CMP(result_dst, packed_consts, src_reg(0u),
1698 BRW_CONDITIONAL_NZ));
1699 emit(AND(result_dst, result, src_reg(0x1)));
1700 } else {
1701 emit(MOV(result_dst, packed_consts));
1702 }
1703 break;
1704 }
1705
1706 case ir_binop_vector_extract:
1707 assert(!"should have been lowered by vec_index_to_cond_assign");
1708 break;
1709
1710 case ir_triop_fma:
1711 op[0] = fix_3src_operand(op[0]);
1712 op[1] = fix_3src_operand(op[1]);
1713 op[2] = fix_3src_operand(op[2]);
1714 /* Note that the instruction's argument order is reversed from GLSL
1715 * and the IR.
1716 */
1717 emit(MAD(result_dst, op[2], op[1], op[0]));
1718 break;
1719
1720 case ir_triop_lrp:
1721 emit_lrp(result_dst, op[0], op[1], op[2]);
1722 break;
1723
1724 case ir_triop_csel:
1725 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1726 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1727 inst->predicate = BRW_PREDICATE_NORMAL;
1728 break;
1729
1730 case ir_triop_bfi:
1731 op[0] = fix_3src_operand(op[0]);
1732 op[1] = fix_3src_operand(op[1]);
1733 op[2] = fix_3src_operand(op[2]);
1734 emit(BFI2(result_dst, op[0], op[1], op[2]));
1735 break;
1736
1737 case ir_triop_bitfield_extract:
1738 op[0] = fix_3src_operand(op[0]);
1739 op[1] = fix_3src_operand(op[1]);
1740 op[2] = fix_3src_operand(op[2]);
1741 /* Note that the instruction's argument order is reversed from GLSL
1742 * and the IR.
1743 */
1744 emit(BFE(result_dst, op[2], op[1], op[0]));
1745 break;
1746
1747 case ir_triop_vector_insert:
1748 assert(!"should have been lowered by lower_vector_insert");
1749 break;
1750
1751 case ir_quadop_bitfield_insert:
1752 assert(!"not reached: should be handled by "
1753 "bitfield_insert_to_bfm_bfi\n");
1754 break;
1755
1756 case ir_quadop_vector:
1757 assert(!"not reached: should be handled by lower_quadop_vector");
1758 break;
1759
1760 case ir_unop_pack_half_2x16:
1761 emit_pack_half_2x16(result_dst, op[0]);
1762 break;
1763 case ir_unop_unpack_half_2x16:
1764 emit_unpack_half_2x16(result_dst, op[0]);
1765 break;
1766 case ir_unop_pack_snorm_2x16:
1767 case ir_unop_pack_snorm_4x8:
1768 case ir_unop_pack_unorm_2x16:
1769 case ir_unop_pack_unorm_4x8:
1770 case ir_unop_unpack_snorm_2x16:
1771 case ir_unop_unpack_snorm_4x8:
1772 case ir_unop_unpack_unorm_2x16:
1773 case ir_unop_unpack_unorm_4x8:
1774 assert(!"not reached: should be handled by lower_packing_builtins");
1775 break;
1776 case ir_unop_unpack_half_2x16_split_x:
1777 case ir_unop_unpack_half_2x16_split_y:
1778 case ir_binop_pack_half_2x16_split:
1779 assert(!"not reached: should not occur in vertex shader");
1780 break;
1781 case ir_binop_ldexp:
1782 assert(!"not reached: should be handled by ldexp_to_arith()");
1783 break;
1784 }
1785 }
1786
1787
1788 void
1789 vec4_visitor::visit(ir_swizzle *ir)
1790 {
1791 src_reg src;
1792 int i = 0;
1793 int swizzle[4];
1794
1795 /* Note that this is only swizzles in expressions, not those on the left
1796 * hand side of an assignment, which do write masking. See ir_assignment
1797 * for that.
1798 */
1799
1800 ir->val->accept(this);
1801 src = this->result;
1802 assert(src.file != BAD_FILE);
1803
1804 for (i = 0; i < ir->type->vector_elements; i++) {
1805 switch (i) {
1806 case 0:
1807 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1808 break;
1809 case 1:
1810 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1811 break;
1812 case 2:
1813 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1814 break;
1815 case 3:
1816 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1817 break;
1818 }
1819 }
1820 for (; i < 4; i++) {
1821 /* Replicate the last channel out. */
1822 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1823 }
1824
1825 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1826
1827 this->result = src;
1828 }
1829
1830 void
1831 vec4_visitor::visit(ir_dereference_variable *ir)
1832 {
1833 const struct glsl_type *type = ir->type;
1834 dst_reg *reg = variable_storage(ir->var);
1835
1836 if (!reg) {
1837 fail("Failed to find variable storage for %s\n", ir->var->name);
1838 this->result = src_reg(brw_null_reg());
1839 return;
1840 }
1841
1842 this->result = src_reg(*reg);
1843
1844 /* System values get their swizzle from the dst_reg writemask */
1845 if (ir->var->data.mode == ir_var_system_value)
1846 return;
1847
1848 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1849 this->result.swizzle = swizzle_for_size(type->vector_elements);
1850 }
1851
1852
1853 int
1854 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1855 {
1856 /* Under normal circumstances array elements are stored consecutively, so
1857 * the stride is equal to the size of the array element.
1858 */
1859 return type_size(ir->type);
1860 }
1861
1862
1863 void
1864 vec4_visitor::visit(ir_dereference_array *ir)
1865 {
1866 ir_constant *constant_index;
1867 src_reg src;
1868 int array_stride = compute_array_stride(ir);
1869
1870 constant_index = ir->array_index->constant_expression_value();
1871
1872 ir->array->accept(this);
1873 src = this->result;
1874
1875 if (constant_index) {
1876 src.reg_offset += constant_index->value.i[0] * array_stride;
1877 } else {
1878 /* Variable index array dereference. It eats the "vec4" of the
1879 * base of the array and an index that offsets the Mesa register
1880 * index.
1881 */
1882 ir->array_index->accept(this);
1883
1884 src_reg index_reg;
1885
1886 if (array_stride == 1) {
1887 index_reg = this->result;
1888 } else {
1889 index_reg = src_reg(this, glsl_type::int_type);
1890
1891 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1892 }
1893
1894 if (src.reladdr) {
1895 src_reg temp = src_reg(this, glsl_type::int_type);
1896
1897 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1898
1899 index_reg = temp;
1900 }
1901
1902 src.reladdr = ralloc(mem_ctx, src_reg);
1903 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1904 }
1905
1906 /* If the type is smaller than a vec4, replicate the last channel out. */
1907 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1908 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1909 else
1910 src.swizzle = BRW_SWIZZLE_NOOP;
1911 src.type = brw_type_for_base_type(ir->type);
1912
1913 this->result = src;
1914 }
1915
1916 void
1917 vec4_visitor::visit(ir_dereference_record *ir)
1918 {
1919 unsigned int i;
1920 const glsl_type *struct_type = ir->record->type;
1921 int offset = 0;
1922
1923 ir->record->accept(this);
1924
1925 for (i = 0; i < struct_type->length; i++) {
1926 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1927 break;
1928 offset += type_size(struct_type->fields.structure[i].type);
1929 }
1930
1931 /* If the type is smaller than a vec4, replicate the last channel out. */
1932 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1933 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1934 else
1935 this->result.swizzle = BRW_SWIZZLE_NOOP;
1936 this->result.type = brw_type_for_base_type(ir->type);
1937
1938 this->result.reg_offset += offset;
1939 }
1940
1941 /**
1942 * We want to be careful in assignment setup to hit the actual storage
1943 * instead of potentially using a temporary like we might with the
1944 * ir_dereference handler.
1945 */
1946 static dst_reg
1947 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1948 {
1949 /* The LHS must be a dereference. If the LHS is a variable indexed array
1950     * access of a vector, it must be separated into a series of conditional moves
1951 * before reaching this point (see ir_vec_index_to_cond_assign).
1952 */
1953 assert(ir->as_dereference());
1954 ir_dereference_array *deref_array = ir->as_dereference_array();
1955 if (deref_array) {
1956 assert(!deref_array->array->type->is_vector());
1957 }
1958
1959 /* Use the rvalue deref handler for the most part. We'll ignore
1960 * swizzles in it and write swizzles using writemask, though.
1961 */
1962 ir->accept(v);
1963 return dst_reg(v->result);
1964 }
1965
1966 void
1967 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1968 const struct glsl_type *type, uint32_t predicate)
1969 {
1970 if (type->base_type == GLSL_TYPE_STRUCT) {
1971 for (unsigned int i = 0; i < type->length; i++) {
1972 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1973 }
1974 return;
1975 }
1976
1977 if (type->is_array()) {
1978 for (unsigned int i = 0; i < type->length; i++) {
1979 emit_block_move(dst, src, type->fields.array, predicate);
1980 }
1981 return;
1982 }
1983
1984 if (type->is_matrix()) {
1985 const struct glsl_type *vec_type;
1986
1987 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1988 type->vector_elements, 1);
1989
1990 for (int i = 0; i < type->matrix_columns; i++) {
1991 emit_block_move(dst, src, vec_type, predicate);
1992 }
1993 return;
1994 }
1995
1996 assert(type->is_scalar() || type->is_vector());
1997
1998 dst->type = brw_type_for_base_type(type);
1999 src->type = dst->type;
2000
2001 dst->writemask = (1 << type->vector_elements) - 1;
2002
2003 src->swizzle = swizzle_for_size(type->vector_elements);
2004
2005 vec4_instruction *inst = emit(MOV(*dst, *src));
2006 inst->predicate = predicate;
2007
2008 dst->reg_offset++;
2009 src->reg_offset++;
2010 }
2011
2012
2013 /* If the RHS processing resulted in an instruction generating a
2014 * temporary value, and it would be easy to rewrite the instruction to
2015 * generate its result right into the LHS instead, do so. This ends
2016 * up reliably removing instructions where it can be tricky to do so
2017 * later without real UD chain information.
2018 */
2019 bool
2020 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2021 dst_reg dst,
2022 src_reg src,
2023 vec4_instruction *pre_rhs_inst,
2024 vec4_instruction *last_rhs_inst)
2025 {
2026 /* This could be supported, but it would take more smarts. */
2027 if (ir->condition)
2028 return false;
2029
2030 if (pre_rhs_inst == last_rhs_inst)
2031 return false; /* No instructions generated to work with. */
2032
2033 /* Make sure the last instruction generated our source reg. */
2034 if (src.file != GRF ||
2035 src.file != last_rhs_inst->dst.file ||
2036 src.reg != last_rhs_inst->dst.reg ||
2037 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2038 src.reladdr ||
2039 src.abs ||
2040 src.negate ||
2041 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2042 return false;
2043
2044    /* Check that the last instruction fully initialized the channels
2045 * we want to use, in the order we want to use them. We could
2046 * potentially reswizzle the operands of many instructions so that
2047 * we could handle out of order channels, but don't yet.
2048 */
2049
2050 for (unsigned i = 0; i < 4; i++) {
2051 if (dst.writemask & (1 << i)) {
2052 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2053 return false;
2054
2055 if (BRW_GET_SWZ(src.swizzle, i) != i)
2056 return false;
2057 }
2058 }
2059
2060 /* Success! Rewrite the instruction. */
2061 last_rhs_inst->dst.file = dst.file;
2062 last_rhs_inst->dst.reg = dst.reg;
2063 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2064 last_rhs_inst->dst.reladdr = dst.reladdr;
2065 last_rhs_inst->dst.writemask &= dst.writemask;
2066
2067 return true;
2068 }
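/* A minimal standalone sketch of the check above (the helper name and the
 * plain-int swizzle representation are illustrative, not driver API): the
 * rewrite is only legal when every channel we are about to write was produced
 * by the last RHS instruction and is read without reordering.
 */
static bool
example_channels_match_in_order(int dst_writemask, int last_writemask,
                                const int src_swizzle[4])
{
   for (int i = 0; i < 4; i++) {
      if (dst_writemask & (1 << i)) {
         if (!(last_writemask & (1 << i)))
            return false;      /* channel was never written by the producer */
         if (src_swizzle[i] != i)
            return false;      /* channel would be read out of order */
      }
   }
   return true;
}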
2069
2070 void
2071 vec4_visitor::visit(ir_assignment *ir)
2072 {
2073 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2074 uint32_t predicate = BRW_PREDICATE_NONE;
2075
2076 if (!ir->lhs->type->is_scalar() &&
2077 !ir->lhs->type->is_vector()) {
2078 ir->rhs->accept(this);
2079 src_reg src = this->result;
2080
2081 if (ir->condition) {
2082 emit_bool_to_cond_code(ir->condition, &predicate);
2083 }
2084
2085 /* emit_block_move doesn't account for swizzles in the source register.
2086 * This should be ok, since the source register is a structure or an
2087 * array, and those can't be swizzled. But double-check to be sure.
2088 */
2089 assert(src.swizzle ==
2090 (ir->rhs->type->is_matrix()
2091 ? swizzle_for_size(ir->rhs->type->vector_elements)
2092 : BRW_SWIZZLE_NOOP));
2093
2094 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2095 return;
2096 }
2097
2098 /* Now we're down to just a scalar/vector with writemasks. */
2099 int i;
2100
2101 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2102 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2103
2104 ir->rhs->accept(this);
2105
2106 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2107
2108 src_reg src = this->result;
2109
2110 int swizzles[4];
2111 int first_enabled_chan = 0;
2112 int src_chan = 0;
2113
2114 assert(ir->lhs->type->is_vector() ||
2115 ir->lhs->type->is_scalar());
2116 dst.writemask = ir->write_mask;
2117
2118 for (int i = 0; i < 4; i++) {
2119 if (dst.writemask & (1 << i)) {
2120 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2121 break;
2122 }
2123 }
2124
2125 /* Swizzle a small RHS vector into the channels being written.
2126 *
2127     * GLSL IR treats write_mask as dictating how many channels are
2128     * present on the RHS, while in our instructions we need to make
2129 * those channels appear in the slots of the vec4 they're written to.
2130 */
2131 for (int i = 0; i < 4; i++) {
2132 if (dst.writemask & (1 << i))
2133 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2134 else
2135 swizzles[i] = first_enabled_chan;
2136 }
2137 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2138 swizzles[2], swizzles[3]);
2139
2140 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2141 return;
2142 }
2143
2144 if (ir->condition) {
2145 emit_bool_to_cond_code(ir->condition, &predicate);
2146 }
2147
2148 for (i = 0; i < type_size(ir->lhs->type); i++) {
2149 vec4_instruction *inst = emit(MOV(dst, src));
2150 inst->predicate = predicate;
2151
2152 dst.reg_offset++;
2153 src.reg_offset++;
2154 }
2155 }
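/* A minimal standalone sketch of the remapping above, using plain ints in
 * place of the BRW swizzle macros (the helper name is illustrative, not
 * driver API).  Enabled channels consume RHS components in order; disabled
 * channels just repeat the first enabled channel's selection.
 */
static void
example_remap_rhs_swizzle(int writemask, const int rhs_swizzle[4],
                          int out_swizzle[4])
{
   int first_enabled_chan = 0;
   for (int i = 0; i < 4; i++) {
      if (writemask & (1 << i)) {
         first_enabled_chan = rhs_swizzle[i];
         break;
      }
   }

   int src_chan = 0;
   for (int i = 0; i < 4; i++) {
      if (writemask & (1 << i))
         out_swizzle[i] = rhs_swizzle[src_chan++];  /* pack RHS channels into place */
      else
         out_swizzle[i] = first_enabled_chan;       /* harmless filler selection */
   }
   /* e.g. a vec2 RHS with selection {0, 1, 1, 1} written through writemask
    * .xy yields {0, 1, 0, 0}: .x reads src.x and .y reads src.y.
    */
}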
2156
2157 void
2158 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2159 {
2160 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2161 foreach_list(node, &ir->components) {
2162 ir_constant *field_value = (ir_constant *)node;
2163
2164 emit_constant_values(dst, field_value);
2165 }
2166 return;
2167 }
2168
2169 if (ir->type->is_array()) {
2170 for (unsigned int i = 0; i < ir->type->length; i++) {
2171 emit_constant_values(dst, ir->array_elements[i]);
2172 }
2173 return;
2174 }
2175
2176 if (ir->type->is_matrix()) {
2177 for (int i = 0; i < ir->type->matrix_columns; i++) {
2178 float *vec = &ir->value.f[i * ir->type->vector_elements];
2179
2180 for (int j = 0; j < ir->type->vector_elements; j++) {
2181 dst->writemask = 1 << j;
2182 dst->type = BRW_REGISTER_TYPE_F;
2183
2184 emit(MOV(*dst, src_reg(vec[j])));
2185 }
2186 dst->reg_offset++;
2187 }
2188 return;
2189 }
2190
2191 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2192
2193 for (int i = 0; i < ir->type->vector_elements; i++) {
2194 if (!(remaining_writemask & (1 << i)))
2195 continue;
2196
2197 dst->writemask = 1 << i;
2198 dst->type = brw_type_for_base_type(ir->type);
2199
2200 /* Find other components that match the one we're about to
2201 * write. Emits fewer instructions for things like vec4(0.5,
2202 * 1.5, 1.5, 1.5).
2203 */
2204 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2205 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2206 if (ir->value.b[i] == ir->value.b[j])
2207 dst->writemask |= (1 << j);
2208 } else {
2209 /* u, i, and f storage all line up, so no need for a
2210 * switch case for comparing each type.
2211 */
2212 if (ir->value.u[i] == ir->value.u[j])
2213 dst->writemask |= (1 << j);
2214 }
2215 }
2216
2217 switch (ir->type->base_type) {
2218 case GLSL_TYPE_FLOAT:
2219 emit(MOV(*dst, src_reg(ir->value.f[i])));
2220 break;
2221 case GLSL_TYPE_INT:
2222 emit(MOV(*dst, src_reg(ir->value.i[i])));
2223 break;
2224 case GLSL_TYPE_UINT:
2225 emit(MOV(*dst, src_reg(ir->value.u[i])));
2226 break;
2227 case GLSL_TYPE_BOOL:
2228 emit(MOV(*dst, src_reg(ir->value.b[i])));
2229 break;
2230 default:
2231 assert(!"Non-float/uint/int/bool constant");
2232 break;
2233 }
2234
2235 remaining_writemask &= ~dst->writemask;
2236 }
2237 dst->reg_offset++;
2238 }
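/* A minimal standalone sketch of the coalescing loop above, with plain floats
 * and ints in place of ir_constant and the register types (names are
 * illustrative, not driver API).  Identical components are folded into one
 * writemask so a constant like vec4(0.5, 1.5, 1.5, 1.5) needs two MOVs
 * (masks 0x1 and 0xe) instead of four.
 */
static int
example_constant_writemask_groups(const float values[4], int num_components,
                                  int group_masks[4], float group_values[4])
{
   int num_groups = 0;
   int remaining = (1 << num_components) - 1;

   for (int i = 0; i < num_components; i++) {
      if (!(remaining & (1 << i)))
         continue;

      int mask = 1 << i;
      for (int j = i + 1; j < num_components; j++) {
         if (values[i] == values[j])
            mask |= 1 << j;            /* fold matching components together */
      }

      group_masks[num_groups] = mask;
      group_values[num_groups] = values[i];
      num_groups++;                    /* one MOV per group */

      remaining &= ~mask;
   }

   return num_groups;
}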
2239
2240 void
2241 vec4_visitor::visit(ir_constant *ir)
2242 {
2243 dst_reg dst = dst_reg(this, ir->type);
2244 this->result = src_reg(dst);
2245
2246 emit_constant_values(&dst, ir);
2247 }
2248
2249 void
2250 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2251 {
2252 ir_dereference *deref = static_cast<ir_dereference *>(
2253 ir->actual_parameters.get_head());
2254 ir_variable *location = deref->variable_referenced();
2255 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2256 location->data.atomic.buffer_index);
2257
2258 /* Calculate the surface offset */
2259 src_reg offset(this, glsl_type::uint_type);
2260 ir_dereference_array *deref_array = deref->as_dereference_array();
2261 if (deref_array) {
2262 deref_array->array_index->accept(this);
2263
2264 src_reg tmp(this, glsl_type::uint_type);
2265 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2266 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2267 } else {
2268 offset = location->data.atomic.offset;
2269 }
2270
2271 /* Emit the appropriate machine instruction */
2272 const char *callee = ir->callee->function_name();
2273 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2274
2275 if (!strcmp("__intrinsic_atomic_read", callee)) {
2276 emit_untyped_surface_read(surf_index, dst, offset);
2277
2278 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2279 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2280 src_reg(), src_reg());
2281
2282 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2283 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2284 src_reg(), src_reg());
2285 }
2286 }
2287
2288 void
2289 vec4_visitor::visit(ir_call *ir)
2290 {
2291 const char *callee = ir->callee->function_name();
2292
2293 if (!strcmp("__intrinsic_atomic_read", callee) ||
2294 !strcmp("__intrinsic_atomic_increment", callee) ||
2295 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2296 visit_atomic_counter_intrinsic(ir);
2297 } else {
2298 assert(!"Unsupported intrinsic.");
2299 }
2300 }
2301
2302 src_reg
2303 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2304 {
2305 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2306 inst->base_mrf = 2;
2307 inst->mlen = 1;
2308 inst->sampler = sampler;
2309 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2310 inst->dst.writemask = WRITEMASK_XYZW;
2311
2312    /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2313 int param_base = inst->base_mrf;
2314 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2315 int zero_mask = 0xf & ~coord_mask;
2316
2317 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2318 coordinate));
2319
2320 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2321 src_reg(0)));
2322
2323 emit(inst);
2324 return src_reg(inst->dst);
2325 }
2326
2327 void
2328 vec4_visitor::visit(ir_texture *ir)
2329 {
2330 int sampler =
2331 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2332
2333 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2334 * emitting anything other than setting up the constant result.
2335 */
2336 if (ir->op == ir_tg4) {
2337 ir_constant *chan = ir->lod_info.component->as_constant();
2338 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2339 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2340 dst_reg result(this, ir->type);
2341 this->result = src_reg(result);
2342 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2343 return;
2344 }
2345 }
2346
2347 /* Should be lowered by do_lower_texture_projection */
2348 assert(!ir->projector);
2349
2350 /* Should be lowered */
2351 assert(!ir->offset || !ir->offset->type->is_array());
2352
2353 /* Generate code to compute all the subexpression trees. This has to be
2354 * done before loading any values into MRFs for the sampler message since
2355 * generating these values may involve SEND messages that need the MRFs.
2356 */
2357 src_reg coordinate;
2358 if (ir->coordinate) {
2359 ir->coordinate->accept(this);
2360 coordinate = this->result;
2361 }
2362
2363 src_reg shadow_comparitor;
2364 if (ir->shadow_comparitor) {
2365 ir->shadow_comparitor->accept(this);
2366 shadow_comparitor = this->result;
2367 }
2368
2369 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2370 src_reg offset_value;
2371 if (has_nonconstant_offset) {
2372 ir->offset->accept(this);
2373 offset_value = src_reg(this->result);
2374 }
2375
2376 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2377 src_reg lod, dPdx, dPdy, sample_index, mcs;
2378 switch (ir->op) {
2379 case ir_tex:
2380 lod = src_reg(0.0f);
2381 lod_type = glsl_type::float_type;
2382 break;
2383 case ir_txf:
2384 case ir_txl:
2385 case ir_txs:
2386 ir->lod_info.lod->accept(this);
2387 lod = this->result;
2388 lod_type = ir->lod_info.lod->type;
2389 break;
2390 case ir_query_levels:
2391 lod = src_reg(0);
2392 lod_type = glsl_type::int_type;
2393 break;
2394 case ir_txf_ms:
2395 ir->lod_info.sample_index->accept(this);
2396 sample_index = this->result;
2397 sample_index_type = ir->lod_info.sample_index->type;
2398
2399 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2400 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2401 else
2402 mcs = src_reg(0u);
2403 break;
2404 case ir_txd:
2405 ir->lod_info.grad.dPdx->accept(this);
2406 dPdx = this->result;
2407
2408 ir->lod_info.grad.dPdy->accept(this);
2409 dPdy = this->result;
2410
2411 lod_type = ir->lod_info.grad.dPdx->type;
2412 break;
2413 case ir_txb:
2414 case ir_lod:
2415 case ir_tg4:
2416 break;
2417 }
2418
2419 vec4_instruction *inst = NULL;
2420 switch (ir->op) {
2421 case ir_tex:
2422 case ir_txl:
2423 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2424 break;
2425 case ir_txd:
2426 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2427 break;
2428 case ir_txf:
2429 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2430 break;
2431 case ir_txf_ms:
2432 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2433 break;
2434 case ir_txs:
2435 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2436 break;
2437 case ir_tg4:
2438 if (has_nonconstant_offset)
2439 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2440 else
2441 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2442 break;
2443 case ir_query_levels:
2444 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2445 break;
2446 case ir_txb:
2447 assert(!"TXB is not valid for vertex shaders.");
2448 break;
2449 case ir_lod:
2450 assert(!"LOD is not valid for vertex shaders.");
2451 break;
2452 default:
2453 assert(!"Unrecognized tex op");
2454 }
2455
2456 if (ir->offset != NULL && ir->op != ir_txf)
2457 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2458
2459 /* Stuff the channel select bits in the top of the texture offset */
2460 if (ir->op == ir_tg4)
2461 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2462
2463 /* The message header is necessary for:
2464 * - Gen4 (always)
2465 * - Texel offsets
2466 * - Gather channel selection
2467 * - Sampler indices too large to fit in a 4-bit value.
2468 */
2469 inst->header_present =
2470 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2471 sampler >= 16;
2472 inst->base_mrf = 2;
2473 inst->mlen = inst->header_present + 1; /* always at least one */
2474 inst->sampler = sampler;
2475 inst->dst = dst_reg(this, ir->type);
2476 inst->dst.writemask = WRITEMASK_XYZW;
2477 inst->shadow_compare = ir->shadow_comparitor != NULL;
2478
2479 /* MRF for the first parameter */
2480 int param_base = inst->base_mrf + inst->header_present;
2481
2482 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2483 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2484 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2485 } else {
2486 /* Load the coordinate */
2487 /* FINISHME: gl_clamp_mask and saturate */
2488 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2489 int zero_mask = 0xf & ~coord_mask;
2490
2491 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2492 coordinate));
2493
2494 if (zero_mask != 0) {
2495 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2496 src_reg(0)));
2497 }
2498 /* Load the shadow comparitor */
2499 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2500 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2501 WRITEMASK_X),
2502 shadow_comparitor));
2503 inst->mlen++;
2504 }
2505
2506 /* Load the LOD info */
2507 if (ir->op == ir_tex || ir->op == ir_txl) {
2508 int mrf, writemask;
2509 if (brw->gen >= 5) {
2510 mrf = param_base + 1;
2511 if (ir->shadow_comparitor) {
2512 writemask = WRITEMASK_Y;
2513 /* mlen already incremented */
2514 } else {
2515 writemask = WRITEMASK_X;
2516 inst->mlen++;
2517 }
2518 } else /* brw->gen == 4 */ {
2519 mrf = param_base;
2520 writemask = WRITEMASK_W;
2521 }
2522 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2523 } else if (ir->op == ir_txf) {
2524 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2525 } else if (ir->op == ir_txf_ms) {
2526 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2527 sample_index));
2528 if (brw->gen >= 7)
2529 /* MCS data is in the first channel of `mcs`, but we need to get it into
2530 * the .y channel of the second vec4 of params, so replicate .x across
2531 * the whole vec4 and then mask off everything except .y
2532 */
2533 mcs.swizzle = BRW_SWIZZLE_XXXX;
2534 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2535 mcs));
2536 inst->mlen++;
2537 } else if (ir->op == ir_txd) {
2538 const glsl_type *type = lod_type;
2539
2540 if (brw->gen >= 5) {
2541 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2542 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2543 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2544 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2545 inst->mlen++;
2546
2547 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2548 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2549 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2550 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2551 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2552 inst->mlen++;
2553
2554 if (ir->shadow_comparitor) {
2555 emit(MOV(dst_reg(MRF, param_base + 2,
2556 ir->shadow_comparitor->type, WRITEMASK_Z),
2557 shadow_comparitor));
2558 }
2559 }
2560 } else /* brw->gen == 4 */ {
2561 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2562 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2563 inst->mlen += 2;
2564 }
2565 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2566 if (ir->shadow_comparitor) {
2567 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2568 shadow_comparitor));
2569 }
2570
2571 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2572 offset_value));
2573 inst->mlen++;
2574 }
2575 }
2576
2577 emit(inst);
2578
2579    /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2580     * faces * layers, but the spec requires just layers.
2581 */
2582 if (ir->op == ir_txs) {
2583 glsl_type const *type = ir->sampler->type;
2584 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2585 type->sampler_array) {
2586 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2587 writemask(inst->dst, WRITEMASK_Z),
2588 src_reg(inst->dst), src_reg(6));
2589 }
2590 }
2591
2592 if (brw->gen == 6 && ir->op == ir_tg4) {
2593 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2594 }
2595
2596 swizzle_result(ir, src_reg(inst->dst), sampler);
2597 }
2598
2599 /**
2600 * Apply workarounds for Gen6 gather with UINT/SINT
2601 */
2602 void
2603 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2604 {
2605 if (!wa)
2606 return;
2607
2608 int width = (wa & WA_8BIT) ? 8 : 16;
2609 dst_reg dst_f = dst;
2610 dst_f.type = BRW_REGISTER_TYPE_F;
2611
2612 /* Convert from UNORM to UINT */
2613 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2614 emit(MOV(dst, src_reg(dst_f)));
2615
2616 if (wa & WA_SIGN) {
2617 /* Reinterpret the UINT value as a signed INT value by
2618 * shifting the sign bit into place, then shifting back
2619 * preserving sign.
2620 */
2621 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2622 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2623 }
2624 }
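/* A minimal standalone sketch of the per-channel arithmetic above (the helper
 * name and the bool flags are illustrative stand-ins for the WA_8BIT/WA_SIGN
 * tests, not driver API).  The gathered UNORM value is rescaled to its
 * integer range, and for signed formats the shift pair sign-extends it from
 * `width` bits to 32 bits, relying on an arithmetic right shift just as the
 * ASR above does.
 */
static int
example_gen6_gather_wa(float unorm, bool is_8bit, bool is_signed)
{
   int width = is_8bit ? 8 : 16;

   /* MUL + MOV: back from UNORM to an integer in [0, 2^width - 1]. */
   int value = (int)(unorm * (float)((1 << width) - 1));

   if (is_signed) {
      unsigned shifted = (unsigned)value << (32 - width); /* sign bit at bit 31 */
      value = (int)shifted >> (32 - width);               /* shift back, keeping sign */
   }

   return value;   /* e.g. an 8-bit 0xff comes back as -1 */
}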
2625
2626 /**
2627 * Set up the gather channel based on the swizzle, for gather4.
2628 */
2629 uint32_t
2630 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2631 {
2632 ir_constant *chan = ir->lod_info.component->as_constant();
2633 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2634 switch (swiz) {
2635 case SWIZZLE_X: return 0;
2636 case SWIZZLE_Y:
2637 /* gather4 sampler is broken for green channel on RG32F --
2638 * we must ask for blue instead.
2639 */
2640 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2641 return 2;
2642 return 1;
2643 case SWIZZLE_Z: return 2;
2644 case SWIZZLE_W: return 3;
2645 default:
2646 assert(!"Not reached"); /* zero, one swizzles handled already */
2647 return 0;
2648 }
2649 }
2650
2651 void
2652 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2653 {
2654 int s = key->tex.swizzles[sampler];
2655
2656 this->result = src_reg(this, ir->type);
2657 dst_reg swizzled_result(this->result);
2658
2659 if (ir->op == ir_query_levels) {
2660 /* # levels is in .w */
2661 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2662 emit(MOV(swizzled_result, orig_val));
2663 return;
2664 }
2665
2666 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2667 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2668 emit(MOV(swizzled_result, orig_val));
2669 return;
2670 }
2671
2672
2673 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2674 int swizzle[4] = {0};
2675
2676 for (int i = 0; i < 4; i++) {
2677 switch (GET_SWZ(s, i)) {
2678 case SWIZZLE_ZERO:
2679 zero_mask |= (1 << i);
2680 break;
2681 case SWIZZLE_ONE:
2682 one_mask |= (1 << i);
2683 break;
2684 default:
2685 copy_mask |= (1 << i);
2686 swizzle[i] = GET_SWZ(s, i);
2687 break;
2688 }
2689 }
2690
2691 if (copy_mask) {
2692 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2693 swizzled_result.writemask = copy_mask;
2694 emit(MOV(swizzled_result, orig_val));
2695 }
2696
2697 if (zero_mask) {
2698 swizzled_result.writemask = zero_mask;
2699 emit(MOV(swizzled_result, src_reg(0.0f)));
2700 }
2701
2702 if (one_mask) {
2703 swizzled_result.writemask = one_mask;
2704 emit(MOV(swizzled_result, src_reg(1.0f)));
2705 }
2706 }
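/* A minimal standalone sketch of the mask classification above, using plain
 * ints for the per-channel selectors (the helper name is illustrative, not
 * driver API; 4 and 5 stand for the zero and one selectors as in
 * SWIZZLE_ZERO/SWIZZLE_ONE).  Each destination channel ends up in exactly one
 * of the three MOVs emitted above.
 */
static void
example_split_texture_swizzle(const int selector[4],
                              int *copy_mask, int *zero_mask, int *one_mask)
{
   *copy_mask = *zero_mask = *one_mask = 0;

   for (int i = 0; i < 4; i++) {
      if (selector[i] == 4)
         *zero_mask |= 1 << i;        /* written with 0.0f */
      else if (selector[i] == 5)
         *one_mask |= 1 << i;         /* written with 1.0f */
      else
         *copy_mask |= 1 << i;        /* copied from the sampled result */
   }
}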
2707
2708 void
2709 vec4_visitor::visit(ir_return *)
2710 {
2711 assert(!"not reached");
2712 }
2713
2714 void
2715 vec4_visitor::visit(ir_discard *)
2716 {
2717 assert(!"not reached");
2718 }
2719
2720 void
2721 vec4_visitor::visit(ir_if *ir)
2722 {
2723 /* Don't point the annotation at the if statement, because then it plus
2724 * the then and else blocks get printed.
2725 */
2726 this->base_ir = ir->condition;
2727
2728 if (brw->gen == 6) {
2729 emit_if_gen6(ir);
2730 } else {
2731 uint32_t predicate;
2732 emit_bool_to_cond_code(ir->condition, &predicate);
2733 emit(IF(predicate));
2734 }
2735
2736 visit_instructions(&ir->then_instructions);
2737
2738 if (!ir->else_instructions.is_empty()) {
2739 this->base_ir = ir->condition;
2740 emit(BRW_OPCODE_ELSE);
2741
2742 visit_instructions(&ir->else_instructions);
2743 }
2744
2745 this->base_ir = ir->condition;
2746 emit(BRW_OPCODE_ENDIF);
2747 }
2748
2749 void
2750 vec4_visitor::visit(ir_emit_vertex *)
2751 {
2752 assert(!"not reached");
2753 }
2754
2755 void
2756 vec4_visitor::visit(ir_end_primitive *)
2757 {
2758 assert(!"not reached");
2759 }
2760
2761 void
2762 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2763 dst_reg dst, src_reg offset,
2764 src_reg src0, src_reg src1)
2765 {
2766 unsigned mlen = 0;
2767
2768 /* Set the atomic operation offset. */
2769 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2770 mlen++;
2771
2772 /* Set the atomic operation arguments. */
2773 if (src0.file != BAD_FILE) {
2774 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2775 mlen++;
2776 }
2777
2778 if (src1.file != BAD_FILE) {
2779 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2780 mlen++;
2781 }
2782
2783 /* Emit the instruction. Note that this maps to the normal SIMD8
2784 * untyped atomic message on Ivy Bridge, but that's OK because
2785 * unused channels will be masked out.
2786 */
2787 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2788 src_reg(atomic_op), src_reg(surf_index));
2789 inst->base_mrf = 0;
2790 inst->mlen = mlen;
2791 }
2792
2793 void
2794 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2795 src_reg offset)
2796 {
2797 /* Set the surface read offset. */
2798 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2799
2800 /* Emit the instruction. Note that this maps to the normal SIMD8
2801 * untyped surface read message, but that's OK because unused
2802 * channels will be masked out.
2803 */
2804 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2805 dst, src_reg(surf_index));
2806 inst->base_mrf = 0;
2807 inst->mlen = 1;
2808 }
2809
2810 void
2811 vec4_visitor::emit_ndc_computation()
2812 {
2813 /* Get the position */
2814 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2815
2816 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2817 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2818 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2819
2820 current_annotation = "NDC";
2821 dst_reg ndc_w = ndc;
2822 ndc_w.writemask = WRITEMASK_W;
2823 src_reg pos_w = pos;
2824 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2825 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2826
2827 dst_reg ndc_xyz = ndc;
2828 ndc_xyz.writemask = WRITEMASK_XYZ;
2829
2830 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2831 }
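/* A minimal standalone sketch of the NDC arithmetic above (the helper name is
 * illustrative, not driver API): an RCP of .w followed by a MUL of .xyz by
 * that reciprocal, with the reciprocal itself landing in .w.
 */
static void
example_ndc_from_clip_position(const float pos[4], float ndc[4])
{
   float inv_w = 1.0f / pos[3];   /* SHADER_OPCODE_RCP on pos.w */

   ndc[0] = pos[0] * inv_w;       /* MUL of .xyz by 1/w */
   ndc[1] = pos[1] * inv_w;
   ndc[2] = pos[2] * inv_w;
   ndc[3] = inv_w;                /* 1/w stays in the .w channel */
}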
2832
2833 void
2834 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2835 {
2836 if (brw->gen < 6 &&
2837 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2838 key->userclip_active || brw->has_negative_rhw_bug)) {
2839 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2840 dst_reg header1_w = header1;
2841 header1_w.writemask = WRITEMASK_W;
2842
2843 emit(MOV(header1, 0u));
2844
2845 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2846 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2847
2848 current_annotation = "Point size";
2849 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2850 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2851 }
2852
2853 if (key->userclip_active) {
2854 current_annotation = "Clipping flags";
2855 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2856 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2857
2858 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2859 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2860 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2861
2862 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2863 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2864 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2865 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2866 }
2867
2868 /* i965 clipping workaround:
2869 * 1) Test for -ve rhw
2870 * 2) If set,
2871 * set ndc = (0,0,0,0)
2872 * set ucp[6] = 1
2873 *
2874 * Later, clipping will detect ucp[6] and ensure the primitive is
2875 * clipped against all fixed planes.
2876 */
2877 if (brw->has_negative_rhw_bug) {
2878 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2879 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2880 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2881 vec4_instruction *inst;
2882 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2883 inst->predicate = BRW_PREDICATE_NORMAL;
2884 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2885 inst->predicate = BRW_PREDICATE_NORMAL;
2886 }
2887
2888 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2889 } else if (brw->gen < 6) {
2890 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2891 } else {
2892 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2893 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2894 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2895 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2896 }
2897 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2898 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2899 src_reg(output_reg[VARYING_SLOT_LAYER])));
2900 }
2901 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2902 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2903 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2904 }
2905 }
2906 }
2907
2908 void
2909 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2910 {
2911 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2912 *
2913 * "If a linked set of shaders forming the vertex stage contains no
2914 * static write to gl_ClipVertex or gl_ClipDistance, but the
2915 * application has requested clipping against user clip planes through
2916 * the API, then the coordinate written to gl_Position is used for
2917 * comparison against the user clip planes."
2918 *
2919 * This function is only called if the shader didn't write to
2920 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2921 * if the user wrote to it; otherwise we use gl_Position.
2922 */
2923 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2924 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2925 clip_vertex = VARYING_SLOT_POS;
2926 }
2927
2928 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2929 ++i) {
2930 reg.writemask = 1 << i;
2931 emit(DP4(reg,
2932 src_reg(output_reg[clip_vertex]),
2933 src_reg(this->userplane[i + offset])));
2934 }
2935 }
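/* A minimal standalone sketch of what each DP4 above computes (the helper
 * name is illustrative, not driver API): the signed distance of the clip
 * vertex, or gl_Position, from one user clip plane.  A negative result means
 * the vertex lies outside that plane.
 */
static float
example_user_clip_distance(const float clip_vertex[4], const float plane[4])
{
   return clip_vertex[0] * plane[0] +
          clip_vertex[1] * plane[1] +
          clip_vertex[2] * plane[2] +
          clip_vertex[3] * plane[3];
}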
2936
2937 void
2938 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2939 {
2940 assert (varying < VARYING_SLOT_MAX);
2941 reg.type = output_reg[varying].type;
2942 current_annotation = output_reg_annotation[varying];
2943 /* Copy the register, saturating if necessary */
2944 vec4_instruction *inst = emit(MOV(reg,
2945 src_reg(output_reg[varying])));
2946 if ((varying == VARYING_SLOT_COL0 ||
2947 varying == VARYING_SLOT_COL1 ||
2948 varying == VARYING_SLOT_BFC0 ||
2949 varying == VARYING_SLOT_BFC1) &&
2950 key->clamp_vertex_color) {
2951 inst->saturate = true;
2952 }
2953 }
2954
2955 void
2956 vec4_visitor::emit_urb_slot(int mrf, int varying)
2957 {
2958 struct brw_reg hw_reg = brw_message_reg(mrf);
2959 dst_reg reg = dst_reg(MRF, mrf);
2960 reg.type = BRW_REGISTER_TYPE_F;
2961
2962 switch (varying) {
2963 case VARYING_SLOT_PSIZ:
2964 /* PSIZ is always in slot 0, and is coupled with other flags. */
2965 current_annotation = "indices, point width, clip flags";
2966 emit_psiz_and_flags(hw_reg);
2967 break;
2968 case BRW_VARYING_SLOT_NDC:
2969 current_annotation = "NDC";
2970 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2971 break;
2972 case VARYING_SLOT_POS:
2973 current_annotation = "gl_Position";
2974 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2975 break;
2976 case VARYING_SLOT_EDGE:
2977 /* This is present when doing unfilled polygons. We're supposed to copy
2978 * the edge flag from the user-provided vertex array
2979 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2980 * of that attribute (starts as 1.0f). This is then used in clipping to
2981 * determine which edges should be drawn as wireframe.
2982 */
2983 current_annotation = "edge flag";
2984 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2985 glsl_type::float_type, WRITEMASK_XYZW))));
2986 break;
2987 case BRW_VARYING_SLOT_PAD:
2988 /* No need to write to this slot */
2989 break;
2990 default:
2991 emit_generic_urb_slot(reg, varying);
2992 break;
2993 }
2994 }
2995
2996 static int
2997 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2998 {
2999 if (brw->gen >= 6) {
3000 /* URB data written (does not include the message header reg) must
3001 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3002 * section 5.4.3.2.2: URB_INTERLEAVED.
3003 *
3004 * URB entries are allocated on a multiple of 1024 bits, so an
3005 * extra 128 bits written here to make the end align to 256 is
3006 * no problem.
3007 */
3008 if ((mlen % 2) != 1)
3009 mlen++;
3010 }
3011
3012 return mlen;
3013 }
3014
3015
3016 /**
3017 * Generates the VUE payload plus the necessary URB write instructions to
3018 * output it.
3019 *
3020 * The VUE layout is documented in Volume 2a.
3021 */
3022 void
3023 vec4_visitor::emit_vertex()
3024 {
3025 /* MRF 0 is reserved for the debugger, so start with message header
3026 * in MRF 1.
3027 */
3028 int base_mrf = 1;
3029 int mrf = base_mrf;
3030 /* In the process of generating our URB write message contents, we
3031 * may need to unspill a register or load from an array. Those
3032 * reads would use MRFs 14-15.
3033 */
3034 int max_usable_mrf = 13;
3035
3036 /* The following assertion verifies that max_usable_mrf causes an
3037 * even-numbered amount of URB write data, which will meet gen6's
3038 * requirements for length alignment.
3039 */
3040 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3041
3042 /* First mrf is the g0-based message header containing URB handles and
3043 * such.
3044 */
3045 emit_urb_write_header(mrf++);
3046
3047 if (brw->gen < 6) {
3048 emit_ndc_computation();
3049 }
3050
3051    /* Lower legacy fixed-function and ClipVertex clipping to clip distances */
3052 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3053 current_annotation = "user clip distances";
3054
3055 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3056 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3057
3058 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3059 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3060 }
3061
3062 /* We may need to split this up into several URB writes, so do them in a
3063 * loop.
3064 */
3065 int slot = 0;
3066 bool complete = false;
3067 do {
3068 /* URB offset is in URB row increments, and each of our MRFs is half of
3069 * one of those, since we're doing interleaved writes.
3070 */
3071 int offset = slot / 2;
3072
3073 mrf = base_mrf + 1;
3074 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3075 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3076
3077 /* If this was max_usable_mrf, we can't fit anything more into this
3078 * URB WRITE.
3079 */
3080 if (mrf > max_usable_mrf) {
3081 slot++;
3082 break;
3083 }
3084 }
3085
3086 complete = slot >= prog_data->vue_map.num_slots;
3087 current_annotation = "URB write";
3088 vec4_instruction *inst = emit_urb_write_opcode(complete);
3089 inst->base_mrf = base_mrf;
3090 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3091 inst->offset += offset;
3092 } while(!complete);
3093 }
3094
3095
3096 src_reg
3097 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3098 src_reg *reladdr, int reg_offset)
3099 {
3100 /* Because we store the values to scratch interleaved like our
3101 * vertex data, we need to scale the vec4 index by 2.
3102 */
3103 int message_header_scale = 2;
3104
3105 /* Pre-gen6, the message header uses byte offsets instead of vec4
3106 * (16-byte) offset units.
3107 */
3108 if (brw->gen < 6)
3109 message_header_scale *= 16;
3110
3111 if (reladdr) {
3112 src_reg index = src_reg(this, glsl_type::int_type);
3113
3114 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3115 emit_before(inst, MUL(dst_reg(index),
3116 index, src_reg(message_header_scale)));
3117
3118 return index;
3119 } else {
3120 return src_reg(reg_offset * message_header_scale);
3121 }
3122 }
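/* A minimal standalone sketch of the constant-offset case above (the helper
 * name is illustrative, not driver API).  Values are stored to scratch
 * interleaved like vertex data, so the vec4 index is scaled by 2, and before
 * Gen6 the message header expects byte offsets, adding a further factor of 16.
 */
static int
example_constant_scratch_offset(int reg_offset, int gen)
{
   int message_header_scale = 2;    /* interleaved storage: scale vec4 index by 2 */

   if (gen < 6)
      message_header_scale *= 16;   /* 16 bytes per vec4, so byte units */

   return reg_offset * message_header_scale;
}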
3123
3124 src_reg
3125 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3126 src_reg *reladdr, int reg_offset)
3127 {
3128 if (reladdr) {
3129 src_reg index = src_reg(this, glsl_type::int_type);
3130
3131 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3132
3133 /* Pre-gen6, the message header uses byte offsets instead of vec4
3134 * (16-byte) offset units.
3135 */
3136 if (brw->gen < 6) {
3137 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3138 }
3139
3140 return index;
3141 } else if (brw->gen >= 8) {
3142 /* Store the offset in a GRF so we can send-from-GRF. */
3143 src_reg offset = src_reg(this, glsl_type::int_type);
3144 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3145 return offset;
3146 } else {
3147 int message_header_scale = brw->gen < 6 ? 16 : 1;
3148 return src_reg(reg_offset * message_header_scale);
3149 }
3150 }
3151
3152 /**
3153 * Emits an instruction before @inst to load the value named by @orig_src
3154 * from scratch space at @base_offset to @temp.
3155 *
3156 * @base_offset is measured in 32-byte units (the size of a register).
3157 */
3158 void
3159 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3160 dst_reg temp, src_reg orig_src,
3161 int base_offset)
3162 {
3163 int reg_offset = base_offset + orig_src.reg_offset;
3164 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3165
3166 emit_before(inst, SCRATCH_READ(temp, index));
3167 }
3168
3169 /**
3170 * Emits an instruction after @inst to store the value to be written
3171 * to @orig_dst to scratch space at @base_offset, from @temp.
3172 *
3173 * @base_offset is measured in 32-byte units (the size of a register).
3174 */
3175 void
3176 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3177 {
3178 int reg_offset = base_offset + inst->dst.reg_offset;
3179 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3180
3181 /* Create a temporary register to store *inst's result in.
3182 *
3183 * We have to be careful in MOVing from our temporary result register in
3184 * the scratch write. If we swizzle from channels of the temporary that
3185 * weren't initialized, it will confuse live interval analysis, which will
3186 * make spilling fail to make progress.
3187 */
3188 src_reg temp = src_reg(this, glsl_type::vec4_type);
3189 temp.type = inst->dst.type;
3190 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3191 int swizzles[4];
3192 for (int i = 0; i < 4; i++)
3193 if (inst->dst.writemask & (1 << i))
3194 swizzles[i] = i;
3195 else
3196 swizzles[i] = first_writemask_chan;
3197 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3198 swizzles[2], swizzles[3]);
3199
3200 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3201 inst->dst.writemask));
3202 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3203 write->predicate = inst->predicate;
3204 write->ir = inst->ir;
3205 write->annotation = inst->annotation;
3206 inst->insert_after(write);
3207
3208 inst->dst.file = temp.file;
3209 inst->dst.reg = temp.reg;
3210 inst->dst.reg_offset = temp.reg_offset;
3211 inst->dst.reladdr = NULL;
3212 }
3213
3214 /**
3215 * We can't generally support array access in GRF space, because a
3216 * single instruction's destination can only span 2 contiguous
3217 * registers. So, we send all GRF arrays that get variable index
3218 * access to scratch space.
3219 */
3220 void
3221 vec4_visitor::move_grf_array_access_to_scratch()
3222 {
3223 int scratch_loc[this->virtual_grf_count];
3224
3225 for (int i = 0; i < this->virtual_grf_count; i++) {
3226 scratch_loc[i] = -1;
3227 }
3228
3229 /* First, calculate the set of virtual GRFs that need to be punted
3230     * to scratch due to having any array access on them, and where in
3231     * scratch each one will live.
3232 */
3233 foreach_list(node, &this->instructions) {
3234 vec4_instruction *inst = (vec4_instruction *)node;
3235
3236 if (inst->dst.file == GRF && inst->dst.reladdr &&
3237 scratch_loc[inst->dst.reg] == -1) {
3238 scratch_loc[inst->dst.reg] = c->last_scratch;
3239 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3240 }
3241
3242 for (int i = 0 ; i < 3; i++) {
3243 src_reg *src = &inst->src[i];
3244
3245 if (src->file == GRF && src->reladdr &&
3246 scratch_loc[src->reg] == -1) {
3247 scratch_loc[src->reg] = c->last_scratch;
3248 c->last_scratch += this->virtual_grf_sizes[src->reg];
3249 }
3250 }
3251 }
3252
3253 /* Now, for anything that will be accessed through scratch, rewrite
3254 * it to load/store. Note that this is a _safe list walk, because
3255 * we may generate a new scratch_write instruction after the one
3256 * we're processing.
3257 */
3258 foreach_list_safe(node, &this->instructions) {
3259 vec4_instruction *inst = (vec4_instruction *)node;
3260
3261       /* Set up the annotation tracking for newly generated instructions. */
3262 base_ir = inst->ir;
3263 current_annotation = inst->annotation;
3264
3265 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3266 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3267 }
3268
3269 for (int i = 0 ; i < 3; i++) {
3270 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3271 continue;
3272
3273 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3274
3275 emit_scratch_read(inst, temp, inst->src[i],
3276 scratch_loc[inst->src[i].reg]);
3277
3278 inst->src[i].file = temp.file;
3279 inst->src[i].reg = temp.reg;
3280 inst->src[i].reg_offset = temp.reg_offset;
3281 inst->src[i].reladdr = NULL;
3282 }
3283 }
3284 }
3285
3286 /**
3287 * Emits an instruction before @inst to load the value named by @orig_src
3288 * from the pull constant buffer (surface) at @base_offset to @temp.
3289 */
3290 void
3291 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3292 dst_reg temp, src_reg orig_src,
3293 int base_offset)
3294 {
3295 int reg_offset = base_offset + orig_src.reg_offset;
3296 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3297 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3298 vec4_instruction *load;
3299
3300 if (brw->gen >= 7) {
3301 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3302 grf_offset.type = offset.type;
3303 emit_before(inst, MOV(grf_offset, offset));
3304
3305 load = new(mem_ctx) vec4_instruction(this,
3306 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3307 temp, index, src_reg(grf_offset));
3308 } else {
3309 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3310 temp, index, offset);
3311 load->base_mrf = 14;
3312 load->mlen = 1;
3313 }
3314 emit_before(inst, load);
3315 }
3316
3317 /**
3318 * Implements array access of uniforms by inserting a
3319 * PULL_CONSTANT_LOAD instruction.
3320 *
3321 * Unlike temporary GRF array access (where we don't support it due to
3322 * the difficulty of doing relative addressing on instruction
3323 * destinations), we could potentially do array access of uniforms
3324 * that were loaded in GRF space as push constants. In real-world
3325 * usage we've seen, though, the arrays being used are always larger
3326 * than we could load as push constants, so just always move all
3327 * uniform array access out to a pull constant buffer.
3328 */
3329 void
3330 vec4_visitor::move_uniform_array_access_to_pull_constants()
3331 {
3332 int pull_constant_loc[this->uniforms];
3333
3334 for (int i = 0; i < this->uniforms; i++) {
3335 pull_constant_loc[i] = -1;
3336 }
3337
3338 /* Walk through and find array access of uniforms. Put a copy of that
3339 * uniform in the pull constant buffer.
3340 *
3341 * Note that we don't move constant-indexed accesses to arrays. No
3342 * testing has been done of the performance impact of this choice.
3343 */
3344 foreach_list_safe(node, &this->instructions) {
3345 vec4_instruction *inst = (vec4_instruction *)node;
3346
3347 for (int i = 0 ; i < 3; i++) {
3348 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3349 continue;
3350
3351 int uniform = inst->src[i].reg;
3352
3353 /* If this array isn't already present in the pull constant buffer,
3354 * add it.
3355 */
3356 if (pull_constant_loc[uniform] == -1) {
3357 const float **values = &stage_prog_data->param[uniform * 4];
3358
3359 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3360
3361 assert(uniform < uniform_array_size);
3362 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3363 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3364 = values[j];
3365 }
3366 }
3367
3368          /* Set up the annotation tracking for newly generated instructions. */
3369 base_ir = inst->ir;
3370 current_annotation = inst->annotation;
3371
3372 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3373
3374 emit_pull_constant_load(inst, temp, inst->src[i],
3375 pull_constant_loc[uniform]);
3376
3377 inst->src[i].file = temp.file;
3378 inst->src[i].reg = temp.reg;
3379 inst->src[i].reg_offset = temp.reg_offset;
3380 inst->src[i].reladdr = NULL;
3381 }
3382 }
3383
3384 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3385 * no need to track them as larger-than-vec4 objects. This will be
3386 * relied on in cutting out unused uniform vectors from push
3387 * constants.
3388 */
3389 split_uniform_registers();
3390 }
3391
3392 void
3393 vec4_visitor::resolve_ud_negate(src_reg *reg)
3394 {
3395 if (reg->type != BRW_REGISTER_TYPE_UD ||
3396 !reg->negate)
3397 return;
3398
3399 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3400 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3401 *reg = temp;
3402 }
3403
3404 vec4_visitor::vec4_visitor(struct brw_context *brw,
3405 struct brw_vec4_compile *c,
3406 struct gl_program *prog,
3407 const struct brw_vec4_prog_key *key,
3408 struct brw_vec4_prog_data *prog_data,
3409 struct gl_shader_program *shader_prog,
3410 gl_shader_stage stage,
3411 void *mem_ctx,
3412 bool debug_flag,
3413 bool no_spills,
3414 shader_time_shader_type st_base,
3415 shader_time_shader_type st_written,
3416 shader_time_shader_type st_reset)
3417 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3418 c(c),
3419 key(key),
3420 prog_data(prog_data),
3421 sanity_param_count(0),
3422 fail_msg(NULL),
3423 first_non_payload_grf(0),
3424 need_all_constants_in_pull_buffer(false),
3425 debug_flag(debug_flag),
3426 no_spills(no_spills),
3427 st_base(st_base),
3428 st_written(st_written),
3429 st_reset(st_reset)
3430 {
3431 this->mem_ctx = mem_ctx;
3432 this->failed = false;
3433
3434 this->base_ir = NULL;
3435 this->current_annotation = NULL;
3436 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3437
3438 this->variable_ht = hash_table_ctor(0,
3439 hash_table_pointer_hash,
3440 hash_table_pointer_compare);
3441
3442 this->virtual_grf_start = NULL;
3443 this->virtual_grf_end = NULL;
3444 this->virtual_grf_sizes = NULL;
3445 this->virtual_grf_count = 0;
3446 this->virtual_grf_reg_map = NULL;
3447 this->virtual_grf_reg_count = 0;
3448 this->virtual_grf_array_size = 0;
3449 this->live_intervals_valid = false;
3450
3451 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3452
3453 this->uniforms = 0;
3454
3455 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3456 * at least one. See setup_uniforms() in brw_vec4.cpp.
3457 */
3458 this->uniform_array_size = 1;
3459 if (prog_data) {
3460 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3461 }
3462
3463 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3464 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3465 }
3466
3467 vec4_visitor::~vec4_visitor()
3468 {
3469 hash_table_dtor(this->variable_ht);
3470 }
3471
3472
3473 void
3474 vec4_visitor::fail(const char *format, ...)
3475 {
3476 va_list va;
3477 char *msg;
3478
3479 if (failed)
3480 return;
3481
3482 failed = true;
3483
3484 va_start(va, format);
3485 msg = ralloc_vasprintf(mem_ctx, format, va);
3486 va_end(va);
3487 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3488
3489 this->fail_msg = msg;
3490
3491 if (debug_flag) {
3492 fprintf(stderr, "%s", msg);
3493 }
3494 }
3495
3496 } /* namespace brw */