i965: Fix dump_prog_cache to handle compacted instructions.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, dst_reg dst,
34 src_reg src0, src_reg src1, src_reg src2)
35 {
36 this->opcode = opcode;
37 this->dst = dst;
38 this->src[0] = src0;
39 this->src[1] = src1;
40 this->src[2] = src2;
41 this->saturate = false;
42 this->force_writemask_all = false;
43 this->no_dd_clear = false;
44 this->no_dd_check = false;
45 this->writes_accumulator = false;
46 this->conditional_mod = BRW_CONDITIONAL_NONE;
47 this->sampler = 0;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(dst_reg dst, src_reg src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
123 { \
124 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
125 src0, src1); \
126 }
127
128 #define ALU2_ACC(op) \
129 vec4_instruction * \
130 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
131 { \
132 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
133 BRW_OPCODE_##op, dst, src0, src1); \
134 inst->writes_accumulator = true; \
135 return inst; \
136 }
137
138 #define ALU3(op) \
139 vec4_instruction * \
140 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
141 { \
142 assert(brw->gen >= 6); \
143 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
144 src0, src1, src2); \
145 }
146
147 ALU1(NOT)
148 ALU1(MOV)
149 ALU1(FRC)
150 ALU1(RNDD)
151 ALU1(RNDE)
152 ALU1(RNDZ)
153 ALU1(F32TO16)
154 ALU1(F16TO32)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2_ACC(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(DP3)
162 ALU2(DP4)
163 ALU2(DPH)
164 ALU2(SHL)
165 ALU2(SHR)
166 ALU2(ASR)
167 ALU3(LRP)
168 ALU1(BFREV)
169 ALU3(BFE)
170 ALU2(BFI1)
171 ALU3(BFI2)
172 ALU1(FBH)
173 ALU1(FBL)
174 ALU1(CBIT)
175 ALU3(MAD)
176 ALU2_ACC(ADDC)
177 ALU2_ACC(SUBB)
178 ALU2(MAC)
179
180 /** Gen4 predicated IF. */
181 vec4_instruction *
182 vec4_visitor::IF(uint32_t predicate)
183 {
184 vec4_instruction *inst;
185
186 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
187 inst->predicate = predicate;
188
189 return inst;
190 }
191
192 /** Gen6 IF with embedded comparison. */
193 vec4_instruction *
194 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
195 {
196 assert(brw->gen == 6);
197
198 vec4_instruction *inst;
199
200 resolve_ud_negate(&src0);
201 resolve_ud_negate(&src1);
202
203 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
204 src0, src1);
205 inst->conditional_mod = condition;
206
207 return inst;
208 }
209
210 /**
211 * CMP: Sets the low bit of the destination channels with the result
212 * of the comparison, while the upper bits are undefined, and updates
213 * the flag register with the packed 16 bits of the result.
214 */
215 vec4_instruction *
216 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
217 {
218 vec4_instruction *inst;
219
220 /* Original gen4 does type conversion to the destination type
221 * before comparison, producing garbage results for floating
222 * point comparisons.
223 */
224 if (brw->gen == 4) {
225 dst.type = src0.type;
226 if (dst.file == HW_REG)
227 dst.fixed_hw_reg.type = dst.type;
228 }
229
230 resolve_ud_negate(&src0);
231 resolve_ud_negate(&src1);
232
233 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
234 inst->conditional_mod = condition;
235
236 return inst;
237 }
238
239 vec4_instruction *
240 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
241 {
242 vec4_instruction *inst;
243
244 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
245 dst, index);
246 inst->base_mrf = 14;
247 inst->mlen = 2;
248
249 return inst;
250 }
251
252 vec4_instruction *
253 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
254 {
255 vec4_instruction *inst;
256
257 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
258 dst, src, index);
259 inst->base_mrf = 13;
260 inst->mlen = 3;
261
262 return inst;
263 }
264
265 void
266 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
267 {
268 static enum opcode dot_opcodes[] = {
269 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
270 };
271
272 emit(dot_opcodes[elements - 2], dst, src0, src1);
273 }
274
275 src_reg
276 vec4_visitor::fix_3src_operand(src_reg src)
277 {
278 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
279 * able to use vertical stride of zero to replicate the vec4 uniform, like
280 *
281 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
282 *
283 * But you can't, since vertical stride is always four in three-source
284 * instructions. Instead, insert a MOV instruction to do the replication so
285 * that the three-source instruction can consume it.
286 */
287
288 /* The MOV is only needed if the source is a uniform or immediate. */
289 if (src.file != UNIFORM && src.file != IMM)
290 return src;
291
292 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
293 return src;
294
295 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
296 expanded.type = src.type;
297 emit(MOV(expanded, src));
298 return src_reg(expanded);
299 }
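/* Illustrative sketch (hypothetical registers): when one operand of a MAD
 * is a whole vec4 uniform u, the fixup above turns
 *    mad dst, src0, src1, u
 * into roughly
 *    mov tmp, u              // tmp is a freshly allocated GRF vec4
 *    mad dst, src0, src1, tmp
 * so the three-source instruction, which always reads with a vertical
 * stride of four, sees an ordinary GRF instead of the uniform. A uniform
 * already reduced to a single-value swizzle (e.g. u.xxxx) is left alone.
 */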
300
301 src_reg
302 vec4_visitor::fix_math_operand(src_reg src)
303 {
304 /* The gen6 math instruction ignores the source modifiers --
305 * swizzle, abs, negate, and at least some parts of the register
306 * region description.
307 *
308 * Rather than trying to enumerate all these cases, *always* expand the
309 * operand to a temp GRF for gen6.
310 *
311 * For gen7, keep the operand as-is, except for immediates, which gen7 still
312 * can't use.
313 */
314
315 if (brw->gen == 7 && src.file != IMM)
316 return src;
317
318 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
319 expanded.type = src.type;
320 emit(MOV(expanded, src));
321 return src_reg(expanded);
322 }
323
324 void
325 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
326 {
327 src = fix_math_operand(src);
328
329 if (dst.writemask != WRITEMASK_XYZW) {
330 /* The gen6 math instruction must be align1, so we can't do
331 * writemasks.
332 */
333 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
334
335 emit(opcode, temp_dst, src);
336
337 emit(MOV(dst, src_reg(temp_dst)));
338 } else {
339 emit(opcode, dst, src);
340 }
341 }
342
343 void
344 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
345 {
346 vec4_instruction *inst = emit(opcode, dst, src);
347 inst->base_mrf = 1;
348 inst->mlen = 1;
349 }
350
351 void
352 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
353 {
354 switch (opcode) {
355 case SHADER_OPCODE_RCP:
356 case SHADER_OPCODE_RSQ:
357 case SHADER_OPCODE_SQRT:
358 case SHADER_OPCODE_EXP2:
359 case SHADER_OPCODE_LOG2:
360 case SHADER_OPCODE_SIN:
361 case SHADER_OPCODE_COS:
362 break;
363 default:
364 assert(!"not reached: bad math opcode");
365 return;
366 }
367
368 if (brw->gen >= 6) {
369 return emit_math1_gen6(opcode, dst, src);
370 } else {
371 return emit_math1_gen4(opcode, dst, src);
372 }
373 }
374
375 void
376 vec4_visitor::emit_math2_gen6(enum opcode opcode,
377 dst_reg dst, src_reg src0, src_reg src1)
378 {
379 src0 = fix_math_operand(src0);
380 src1 = fix_math_operand(src1);
381
382 if (dst.writemask != WRITEMASK_XYZW) {
383 /* The gen6 math instruction must be align1, so we can't do
384 * writemasks.
385 */
386 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
387 temp_dst.type = dst.type;
388
389 emit(opcode, temp_dst, src0, src1);
390
391 emit(MOV(dst, src_reg(temp_dst)));
392 } else {
393 emit(opcode, dst, src0, src1);
394 }
395 }
396
397 void
398 vec4_visitor::emit_math2_gen4(enum opcode opcode,
399 dst_reg dst, src_reg src0, src_reg src1)
400 {
401 vec4_instruction *inst = emit(opcode, dst, src0, src1);
402 inst->base_mrf = 1;
403 inst->mlen = 2;
404 }
405
406 void
407 vec4_visitor::emit_math(enum opcode opcode,
408 dst_reg dst, src_reg src0, src_reg src1)
409 {
410 switch (opcode) {
411 case SHADER_OPCODE_POW:
412 case SHADER_OPCODE_INT_QUOTIENT:
413 case SHADER_OPCODE_INT_REMAINDER:
414 break;
415 default:
416 assert(!"not reached: unsupported binary math opcode");
417 return;
418 }
419
420 if (brw->gen >= 6) {
421 return emit_math2_gen6(opcode, dst, src0, src1);
422 } else {
423 return emit_math2_gen4(opcode, dst, src0, src1);
424 }
425 }
426
427 void
428 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
429 {
430 if (brw->gen < 7)
431 assert(!"ir_unop_pack_half_2x16 should be lowered");
432
433 assert(dst.type == BRW_REGISTER_TYPE_UD);
434 assert(src0.type == BRW_REGISTER_TYPE_F);
435
436 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
437 *
438 * Because this instruction does not have a 16-bit floating-point type,
439 * the destination data type must be Word (W).
440 *
441 * The destination must be DWord-aligned and specify a horizontal stride
442 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
443 * each destination channel and the upper word is not modified.
444 *
445 * The above restriction implies that the f32to16 instruction must use
446 * align1 mode, because only in align1 mode is it possible to specify
447 * horizontal stride. We choose here to defy the hardware docs and emit
448 * align16 instructions.
449 *
450 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
451 * instructions. I was partially successful in that the code passed all
452 * tests. However, the code was dubiously correct and fragile, and the
453 * tests were not harsh enough to probe that frailty. Not trusting the
454 * code, I chose instead to remain in align16 mode in defiance of the hw
455 * docs).
456 *
457 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
458 * simulator, emitting a f32to16 in align16 mode with UD as destination
459 * data type is safe. The behavior differs from that specified in the PRM
460 * in that the upper word of each destination channel is cleared to 0.
461 */
462
463 dst_reg tmp_dst(this, glsl_type::uvec2_type);
464 src_reg tmp_src(tmp_dst);
465
466 #if 0
467 /* Verify the undocumented behavior on which the following instructions
468 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
469 * then the result of the bit-or instruction below will be incorrect.
470 *
471 * You should inspect the disasm output in order to verify that the MOV is
472 * not optimized away.
473 */
474 emit(MOV(tmp_dst, src_reg(0x12345678u)));
475 #endif
476
477 /* Give tmp the form below, where "." means untouched.
478 *
479 * w z y x w z y x
480 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
481 *
482 * That the upper word of each write-channel be 0 is required for the
483 * following bit-shift and bit-or instructions to work. Note that this
484 * relies on the undocumented hardware behavior mentioned above.
485 */
486 tmp_dst.writemask = WRITEMASK_XY;
487 emit(F32TO16(tmp_dst, src0));
488
489 /* Give the write-channels of dst the form:
490 * 0xhhhh0000
491 */
492 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
493 emit(SHL(dst, tmp_src, src_reg(16u)));
494
495 /* Finally, give the write-channels of dst the form of packHalf2x16's
496 * output:
497 * 0xhhhhllll
498 */
499 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
500 emit(OR(dst, src_reg(dst), tmp_src));
501 }
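/* Worked example (illustrative values): packHalf2x16(vec2(1.0, 2.0)).
 * f32to16 writes the half-float bit patterns into the low words of
 * tmp.x and tmp.y:  tmp = (0x00003c00, 0x00004000, ., .).
 * SHL of tmp.yyyy by 16 puts 0x40000000 in dst, and the final OR with
 * tmp.xxxx yields 0x40003c00 -- the Y half in the high word and the
 * X half in the low word, matching GLSL packHalf2x16().
 */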
502
503 void
504 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
505 {
506 if (brw->gen < 7)
507 assert(!"ir_unop_unpack_half_2x16 should be lowered");
508
509 assert(dst.type == BRW_REGISTER_TYPE_F);
510 assert(src0.type == BRW_REGISTER_TYPE_UD);
511
512 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
513 *
514 * Because this instruction does not have a 16-bit floating-point type,
515 * the source data type must be Word (W). The destination type must be
516 * F (Float).
517 *
518 * To use W as the source data type, we must adjust horizontal strides,
519 * which is only possible in align1 mode. All my [chadv] attempts at
520 * emitting align1 instructions for unpackHalf2x16 failed to pass the
521 * Piglit tests, so I gave up.
522 *
523 * I've verified that, on gen7 hardware and the simulator, it is safe to
524 * emit f16to32 in align16 mode with UD as source data type.
525 */
526
527 dst_reg tmp_dst(this, glsl_type::uvec2_type);
528 src_reg tmp_src(tmp_dst);
529
530 tmp_dst.writemask = WRITEMASK_X;
531 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
532
533 tmp_dst.writemask = WRITEMASK_Y;
534 emit(SHR(tmp_dst, src0, src_reg(16u)));
535
536 dst.writemask = WRITEMASK_XY;
537 emit(F16TO32(dst, tmp_src));
538 }
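/* Worked example (illustrative value): unpackHalf2x16(0x40003c00u).
 * The AND extracts 0x00003c00 into tmp.x, the SHR extracts 0x00004000
 * into tmp.y, and f16to32 converts both, giving dst.xy = (1.0, 2.0).
 */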
539
540 void
541 vec4_visitor::visit_instructions(const exec_list *list)
542 {
543 foreach_list(node, list) {
544 ir_instruction *ir = (ir_instruction *)node;
545
546 base_ir = ir;
547 ir->accept(this);
548 }
549 }
550
551
552 static int
553 type_size(const struct glsl_type *type)
554 {
555 unsigned int i;
556 int size;
557
558 switch (type->base_type) {
559 case GLSL_TYPE_UINT:
560 case GLSL_TYPE_INT:
561 case GLSL_TYPE_FLOAT:
562 case GLSL_TYPE_BOOL:
563 if (type->is_matrix()) {
564 return type->matrix_columns;
565 } else {
566 /* Regardless of size of vector, it gets a vec4. This is bad
567 * packing for things like floats, but otherwise arrays become a
568 * mess. Hopefully a later pass over the code can pack scalars
569 * down if appropriate.
570 */
571 return 1;
572 }
573 case GLSL_TYPE_ARRAY:
574 assert(type->length > 0);
575 return type_size(type->fields.array) * type->length;
576 case GLSL_TYPE_STRUCT:
577 size = 0;
578 for (i = 0; i < type->length; i++) {
579 size += type_size(type->fields.structure[i].type);
580 }
581 return size;
582 case GLSL_TYPE_SAMPLER:
583 /* Samplers take up one slot in UNIFORMS[], but they're baked in
584 * at link time.
585 */
586 return 1;
587 case GLSL_TYPE_ATOMIC_UINT:
588 return 0;
589 case GLSL_TYPE_IMAGE:
590 case GLSL_TYPE_VOID:
591 case GLSL_TYPE_ERROR:
592 case GLSL_TYPE_INTERFACE:
593 assert(0);
594 break;
595 }
596
597 return 0;
598 }
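/* A few illustrative results of type_size(), counted in vec4 slots:
 *   float, vec2, vec4        -> 1   (every scalar/vector pads to a full slot)
 *   mat3                     -> 3   (one slot per column)
 *   vec4[8]                  -> 8
 *   struct { vec3 n; mat2 m; } -> 3 (1 + 2, summed field by field)
 */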
599
600 int
601 vec4_visitor::virtual_grf_alloc(int size)
602 {
603 if (virtual_grf_array_size <= virtual_grf_count) {
604 if (virtual_grf_array_size == 0)
605 virtual_grf_array_size = 16;
606 else
607 virtual_grf_array_size *= 2;
608 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
609 virtual_grf_array_size);
610 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
611 virtual_grf_array_size);
612 }
613 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
614 virtual_grf_reg_count += size;
615 virtual_grf_sizes[virtual_grf_count] = size;
616 return virtual_grf_count++;
617 }
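/* Illustrative allocation sequence: starting from an empty visitor,
 * virtual_grf_alloc(1), virtual_grf_alloc(3), virtual_grf_alloc(2)
 * return GRF indices 0, 1, 2, leaving virtual_grf_sizes = {1, 3, 2},
 * virtual_grf_reg_map = {0, 1, 4}, and virtual_grf_reg_count = 6.
 */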
618
619 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
620 {
621 init();
622
623 this->file = GRF;
624 this->reg = v->virtual_grf_alloc(type_size(type));
625
626 if (type->is_array() || type->is_record()) {
627 this->swizzle = BRW_SWIZZLE_NOOP;
628 } else {
629 this->swizzle = swizzle_for_size(type->vector_elements);
630 }
631
632 this->type = brw_type_for_base_type(type);
633 }
634
635 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
636 {
637 init();
638
639 this->file = GRF;
640 this->reg = v->virtual_grf_alloc(type_size(type));
641
642 if (type->is_array() || type->is_record()) {
643 this->writemask = WRITEMASK_XYZW;
644 } else {
645 this->writemask = (1 << type->vector_elements) - 1;
646 }
647
648 this->type = brw_type_for_base_type(type);
649 }
650
651 /* Our support for uniforms is piggy-backed on the struct
652 * gl_fragment_program, because that's where the values actually
653 * get stored, rather than in some global gl_shader_program uniform
654 * store.
655 */
656 void
657 vec4_visitor::setup_uniform_values(ir_variable *ir)
658 {
659 int namelen = strlen(ir->name);
660
661 /* The data for our (non-builtin) uniforms is stored in a series of
662 * gl_uniform_driver_storage structs for each subcomponent that
663 * glGetUniformLocation() could name. We know it's been set up in the same
664 * order we'd walk the type, so walk the list of storage and find anything
665 * with our name, or the prefix of a component that starts with our name.
666 */
667 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
668 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
669
670 if (strncmp(ir->name, storage->name, namelen) != 0 ||
671 (storage->name[namelen] != 0 &&
672 storage->name[namelen] != '.' &&
673 storage->name[namelen] != '[')) {
674 continue;
675 }
676
677 gl_constant_value *components = storage->storage;
678 unsigned vector_count = (MAX2(storage->array_elements, 1) *
679 storage->type->matrix_columns);
680
681 for (unsigned s = 0; s < vector_count; s++) {
682 assert(uniforms < uniform_array_size);
683 uniform_vector_size[uniforms] = storage->type->vector_elements;
684
685 int i;
686 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
687 stage_prog_data->param[uniforms * 4 + i] = &components->f;
688 components++;
689 }
690 for (; i < 4; i++) {
691 static float zero = 0;
692 stage_prog_data->param[uniforms * 4 + i] = &zero;
693 }
694
695 uniforms++;
696 }
697 }
698 }
699
700 void
701 vec4_visitor::setup_uniform_clipplane_values()
702 {
703 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
704
705 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
706 assert(this->uniforms < uniform_array_size);
707 this->uniform_vector_size[this->uniforms] = 4;
708 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
709 this->userplane[i].type = BRW_REGISTER_TYPE_F;
710 for (int j = 0; j < 4; ++j) {
711 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
712 }
713 ++this->uniforms;
714 }
715 }
716
717 /* Our support for builtin uniforms is even scarier than non-builtin.
718 * It sits on top of the PROG_STATE_VAR parameters that are
719 * automatically updated from GL context state.
720 */
721 void
722 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
723 {
724 const ir_state_slot *const slots = ir->state_slots;
725 assert(ir->state_slots != NULL);
726
727 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
728 /* This state reference has already been setup by ir_to_mesa,
729 * but we'll get the same index back here. We can reference
730 * ParameterValues directly, since unlike brw_fs.cpp, we never
731 * add new state references during compile.
732 */
733 int index = _mesa_add_state_reference(this->prog->Parameters,
734 (gl_state_index *)slots[i].tokens);
735 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
736
737 assert(this->uniforms < uniform_array_size);
738 this->uniform_vector_size[this->uniforms] = 0;
739 /* Add each of the unique swizzled channels of the element.
740 * This will end up matching the size of the glsl_type of this field.
741 */
742 int last_swiz = -1;
743 for (unsigned int j = 0; j < 4; j++) {
744 int swiz = GET_SWZ(slots[i].swizzle, j);
745 last_swiz = swiz;
746
747 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
748 assert(this->uniforms < uniform_array_size);
749 if (swiz <= last_swiz)
750 this->uniform_vector_size[this->uniforms]++;
751 }
752 this->uniforms++;
753 }
754 }
755
756 dst_reg *
757 vec4_visitor::variable_storage(ir_variable *var)
758 {
759 return (dst_reg *)hash_table_find(this->variable_ht, var);
760 }
761
762 void
763 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
764 {
765 ir_expression *expr = ir->as_expression();
766
767 *predicate = BRW_PREDICATE_NORMAL;
768
769 if (expr) {
770 src_reg op[2];
771 vec4_instruction *inst;
772
773 assert(expr->get_num_operands() <= 2);
774 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
775 expr->operands[i]->accept(this);
776 op[i] = this->result;
777
778 resolve_ud_negate(&op[i]);
779 }
780
781 switch (expr->operation) {
782 case ir_unop_logic_not:
783 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
784 inst->conditional_mod = BRW_CONDITIONAL_Z;
785 break;
786
787 case ir_binop_logic_xor:
788 inst = emit(XOR(dst_null_d(), op[0], op[1]));
789 inst->conditional_mod = BRW_CONDITIONAL_NZ;
790 break;
791
792 case ir_binop_logic_or:
793 inst = emit(OR(dst_null_d(), op[0], op[1]));
794 inst->conditional_mod = BRW_CONDITIONAL_NZ;
795 break;
796
797 case ir_binop_logic_and:
798 inst = emit(AND(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_unop_f2b:
803 if (brw->gen >= 6) {
804 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
805 } else {
806 inst = emit(MOV(dst_null_f(), op[0]));
807 inst->conditional_mod = BRW_CONDITIONAL_NZ;
808 }
809 break;
810
811 case ir_unop_i2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_d(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_binop_all_equal:
821 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
822 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
823 break;
824
825 case ir_binop_any_nequal:
826 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
827 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
828 break;
829
830 case ir_unop_any:
831 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
832 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
833 break;
834
835 case ir_binop_greater:
836 case ir_binop_gequal:
837 case ir_binop_less:
838 case ir_binop_lequal:
839 case ir_binop_equal:
840 case ir_binop_nequal:
841 emit(CMP(dst_null_d(), op[0], op[1],
842 brw_conditional_for_comparison(expr->operation)));
843 break;
844
845 default:
846 assert(!"not reached");
847 break;
848 }
849 return;
850 }
851
852 ir->accept(this);
853
854 resolve_ud_negate(&this->result);
855
856 if (brw->gen >= 6) {
857 vec4_instruction *inst = emit(AND(dst_null_d(),
858 this->result, src_reg(1)));
859 inst->conditional_mod = BRW_CONDITIONAL_NZ;
860 } else {
861 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
862 inst->conditional_mod = BRW_CONDITIONAL_NZ;
863 }
864 }
865
866 /**
867 * Emit a gen6 IF statement with the comparison folded into the IF
868 * instruction.
869 */
870 void
871 vec4_visitor::emit_if_gen6(ir_if *ir)
872 {
873 ir_expression *expr = ir->condition->as_expression();
874
875 if (expr) {
876 src_reg op[2];
877 dst_reg temp;
878
879 assert(expr->get_num_operands() <= 2);
880 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
881 expr->operands[i]->accept(this);
882 op[i] = this->result;
883 }
884
885 switch (expr->operation) {
886 case ir_unop_logic_not:
887 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
888 return;
889
890 case ir_binop_logic_xor:
891 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
892 return;
893
894 case ir_binop_logic_or:
895 temp = dst_reg(this, glsl_type::bool_type);
896 emit(OR(temp, op[0], op[1]));
897 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
898 return;
899
900 case ir_binop_logic_and:
901 temp = dst_reg(this, glsl_type::bool_type);
902 emit(AND(temp, op[0], op[1]));
903 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
904 return;
905
906 case ir_unop_f2b:
907 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
908 return;
909
910 case ir_unop_i2b:
911 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_binop_greater:
915 case ir_binop_gequal:
916 case ir_binop_less:
917 case ir_binop_lequal:
918 case ir_binop_equal:
919 case ir_binop_nequal:
920 emit(IF(op[0], op[1],
921 brw_conditional_for_comparison(expr->operation)));
922 return;
923
924 case ir_binop_all_equal:
925 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
926 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
927 return;
928
929 case ir_binop_any_nequal:
930 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
931 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
932 return;
933
934 case ir_unop_any:
935 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
936 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
937 return;
938
939 default:
940 assert(!"not reached");
941 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
942 return;
943 }
944 return;
945 }
946
947 ir->condition->accept(this);
948
949 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
950 }
951
952 void
953 vec4_visitor::visit(ir_variable *ir)
954 {
955 dst_reg *reg = NULL;
956
957 if (variable_storage(ir))
958 return;
959
960 switch (ir->data.mode) {
961 case ir_var_shader_in:
962 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
963 break;
964
965 case ir_var_shader_out:
966 reg = new(mem_ctx) dst_reg(this, ir->type);
967
968 for (int i = 0; i < type_size(ir->type); i++) {
969 output_reg[ir->data.location + i] = *reg;
970 output_reg[ir->data.location + i].reg_offset = i;
971 output_reg[ir->data.location + i].type =
972 brw_type_for_base_type(ir->type->get_scalar_type());
973 output_reg_annotation[ir->data.location + i] = ir->name;
974 }
975 break;
976
977 case ir_var_auto:
978 case ir_var_temporary:
979 reg = new(mem_ctx) dst_reg(this, ir->type);
980 break;
981
982 case ir_var_uniform:
983 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
984
985 /* Thanks to the lower_ubo_reference pass, we will see only
986 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
987 * variables, so no need for them to be in variable_ht.
988 *
989 * Atomic counters take no uniform storage, no need to do
990 * anything here.
991 */
992 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
993 return;
994
995 /* Track how big the whole uniform variable is, in case we need to put a
996 * copy of its data into pull constants for array access.
997 */
998 assert(this->uniforms < uniform_array_size);
999 this->uniform_size[this->uniforms] = type_size(ir->type);
1000
1001 if (!strncmp(ir->name, "gl_", 3)) {
1002 setup_builtin_uniform_values(ir);
1003 } else {
1004 setup_uniform_values(ir);
1005 }
1006 break;
1007
1008 case ir_var_system_value:
1009 reg = make_reg_for_system_value(ir);
1010 break;
1011
1012 default:
1013 assert(!"not reached");
1014 }
1015
1016 reg->type = brw_type_for_base_type(ir->type);
1017 hash_table_insert(this->variable_ht, reg, ir);
1018 }
1019
1020 void
1021 vec4_visitor::visit(ir_loop *ir)
1022 {
1023 /* We don't want debugging output to print the whole body of the
1024 * loop as the annotation.
1025 */
1026 this->base_ir = NULL;
1027
1028 emit(BRW_OPCODE_DO);
1029
1030 visit_instructions(&ir->body_instructions);
1031
1032 emit(BRW_OPCODE_WHILE);
1033 }
1034
1035 void
1036 vec4_visitor::visit(ir_loop_jump *ir)
1037 {
1038 switch (ir->mode) {
1039 case ir_loop_jump::jump_break:
1040 emit(BRW_OPCODE_BREAK);
1041 break;
1042 case ir_loop_jump::jump_continue:
1043 emit(BRW_OPCODE_CONTINUE);
1044 break;
1045 }
1046 }
1047
1048
1049 void
1050 vec4_visitor::visit(ir_function_signature *ir)
1051 {
1052 assert(0);
1053 (void)ir;
1054 }
1055
1056 void
1057 vec4_visitor::visit(ir_function *ir)
1058 {
1059 /* Ignore function bodies other than main() -- we shouldn't see calls to
1060 * them since they should all be inlined.
1061 */
1062 if (strcmp(ir->name, "main") == 0) {
1063 const ir_function_signature *sig;
1064 exec_list empty;
1065
1066 sig = ir->matching_signature(NULL, &empty);
1067
1068 assert(sig);
1069
1070 visit_instructions(&sig->body);
1071 }
1072 }
1073
1074 bool
1075 vec4_visitor::try_emit_sat(ir_expression *ir)
1076 {
1077 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1078 if (!sat_src)
1079 return false;
1080
1081 sat_src->accept(this);
1082 src_reg src = this->result;
1083
1084 this->result = src_reg(this, ir->type);
1085 vec4_instruction *inst;
1086 inst = emit(MOV(dst_reg(this->result), src));
1087 inst->saturate = true;
1088
1089 return true;
1090 }
1091
1092 bool
1093 vec4_visitor::try_emit_mad(ir_expression *ir)
1094 {
1095 /* 3-src instructions were introduced in gen6. */
1096 if (brw->gen < 6)
1097 return false;
1098
1099 /* MAD can only handle floating-point data. */
1100 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1101 return false;
1102
1103 ir_rvalue *nonmul = ir->operands[1];
1104 ir_expression *mul = ir->operands[0]->as_expression();
1105
1106 if (!mul || mul->operation != ir_binop_mul) {
1107 nonmul = ir->operands[0];
1108 mul = ir->operands[1]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul)
1111 return false;
1112 }
1113
1114 nonmul->accept(this);
1115 src_reg src0 = fix_3src_operand(this->result);
1116
1117 mul->operands[0]->accept(this);
1118 src_reg src1 = fix_3src_operand(this->result);
1119
1120 mul->operands[1]->accept(this);
1121 src_reg src2 = fix_3src_operand(this->result);
1122
1123 this->result = src_reg(this, ir->type);
1124 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1125
1126 return true;
1127 }
1128
1129 void
1130 vec4_visitor::emit_bool_comparison(unsigned int op,
1131 dst_reg dst, src_reg src0, src_reg src1)
1132 {
1133 /* original gen4 does destination conversion before comparison. */
1134 if (brw->gen < 5)
1135 dst.type = src0.type;
1136
1137 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1138
1139 dst.type = BRW_REGISTER_TYPE_D;
1140 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1141 }
1142
1143 void
1144 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1145 src_reg src0, src_reg src1)
1146 {
1147 vec4_instruction *inst;
1148
1149 if (brw->gen >= 6) {
1150 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1151 inst->conditional_mod = conditionalmod;
1152 } else {
1153 emit(CMP(dst, src0, src1, conditionalmod));
1154
1155 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1156 inst->predicate = BRW_PREDICATE_NORMAL;
1157 }
1158 }
1159
1160 void
1161 vec4_visitor::emit_lrp(const dst_reg &dst,
1162 const src_reg &x, const src_reg &y, const src_reg &a)
1163 {
1164 if (brw->gen >= 6) {
1165 /* Note that the instruction's argument order is reversed from GLSL
1166 * and the IR.
1167 */
1168 emit(LRP(dst,
1169 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1170 } else {
1171 /* Earlier generations don't support three source operations, so we
1172 * need to emit x*(1-a) + y*a.
1173 */
1174 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1175 one_minus_a.writemask = dst.writemask;
1176
1177 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1178 vec4_instruction *mul = emit(MUL(dst_null_f(), y, a));
1179 mul->writes_accumulator = true;
1180 emit(MAC(dst, x, src_reg(one_minus_a)));
1181 }
1182 }
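/* Worked example of the pre-gen6 expansion (illustrative values):
 * lrp(x = 2.0, y = 6.0, a = 0.25) is emitted as
 *    one_minus_a = -a + 1.0           = 0.75
 *    acc         = y * a              = 1.5   (the MUL writes the accumulator)
 *    dst         = x * one_minus_a + acc      (the MAC reads the accumulator)
 * giving 2.0 * 0.75 + 1.5 = 3.0, which matches mix(2.0, 6.0, 0.25).
 */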
1183
1184 void
1185 vec4_visitor::visit(ir_expression *ir)
1186 {
1187 unsigned int operand;
1188 src_reg op[Elements(ir->operands)];
1189 src_reg result_src;
1190 dst_reg result_dst;
1191 vec4_instruction *inst;
1192
1193 if (try_emit_sat(ir))
1194 return;
1195
1196 if (ir->operation == ir_binop_add) {
1197 if (try_emit_mad(ir))
1198 return;
1199 }
1200
1201 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1202 this->result.file = BAD_FILE;
1203 ir->operands[operand]->accept(this);
1204 if (this->result.file == BAD_FILE) {
1205 fprintf(stderr, "Failed to get tree for expression operand:\n");
1206 ir->operands[operand]->fprint(stderr);
1207 exit(1);
1208 }
1209 op[operand] = this->result;
1210
1211 /* Matrix expression operands should have been broken down to vector
1212 * operations already.
1213 */
1214 assert(!ir->operands[operand]->type->is_matrix());
1215 }
1216
1217 int vector_elements = ir->operands[0]->type->vector_elements;
1218 if (ir->operands[1]) {
1219 vector_elements = MAX2(vector_elements,
1220 ir->operands[1]->type->vector_elements);
1221 }
1222
1223 this->result.file = BAD_FILE;
1224
1225 /* Storage for our result. Ideally for an assignment we'd be using
1226 * the actual storage for the result here, instead.
1227 */
1228 result_src = src_reg(this, ir->type);
1229 /* convenience for the emit functions below. */
1230 result_dst = dst_reg(result_src);
1231 /* If nothing special happens, this is the result. */
1232 this->result = result_src;
1233 /* Limit writes to the channels that will be used by result_src later.
1234 * This does limit this temp's use as a temporary for multi-instruction
1235 * sequences.
1236 */
1237 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1238
1239 switch (ir->operation) {
1240 case ir_unop_logic_not:
1241 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes the
1242 * ones' complement of the whole register, not just bit 0.
1243 */
1244 emit(XOR(result_dst, op[0], src_reg(1)));
1245 break;
1246 case ir_unop_neg:
1247 op[0].negate = !op[0].negate;
1248 emit(MOV(result_dst, op[0]));
1249 break;
1250 case ir_unop_abs:
1251 op[0].abs = true;
1252 op[0].negate = false;
1253 emit(MOV(result_dst, op[0]));
1254 break;
1255
1256 case ir_unop_sign:
1257 if (ir->type->is_float()) {
1258 /* AND(val, 0x80000000) gives the sign bit.
1259 *
1260 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1261 * zero.
1262 */
1263 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1264
1265 op[0].type = BRW_REGISTER_TYPE_UD;
1266 result_dst.type = BRW_REGISTER_TYPE_UD;
1267 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1268
1269 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1270 inst->predicate = BRW_PREDICATE_NORMAL;
1271
1272 this->result.type = BRW_REGISTER_TYPE_F;
1273 } else {
1274 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1275 * -> non-negative val generates 0x00000000.
1276 * Predicated OR sets 1 if val is positive.
1277 */
1278 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1279
1280 emit(ASR(result_dst, op[0], src_reg(31)));
1281
1282 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1283 inst->predicate = BRW_PREDICATE_NORMAL;
1284 }
1285 break;
1286
1287 case ir_unop_rcp:
1288 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1289 break;
1290
1291 case ir_unop_exp2:
1292 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1293 break;
1294 case ir_unop_log2:
1295 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1296 break;
1297 case ir_unop_exp:
1298 case ir_unop_log:
1299 assert(!"not reached: should be handled by ir_explog_to_explog2");
1300 break;
1301 case ir_unop_sin:
1302 case ir_unop_sin_reduced:
1303 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1304 break;
1305 case ir_unop_cos:
1306 case ir_unop_cos_reduced:
1307 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1308 break;
1309
1310 case ir_unop_dFdx:
1311 case ir_unop_dFdy:
1312 assert(!"derivatives not valid in vertex shader");
1313 break;
1314
1315 case ir_unop_bitfield_reverse:
1316 emit(BFREV(result_dst, op[0]));
1317 break;
1318 case ir_unop_bit_count:
1319 emit(CBIT(result_dst, op[0]));
1320 break;
1321 case ir_unop_find_msb: {
1322 src_reg temp = src_reg(this, glsl_type::uint_type);
1323
1324 inst = emit(FBH(dst_reg(temp), op[0]));
1325 inst->dst.writemask = WRITEMASK_XYZW;
1326
1327 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1328 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1329 * subtract the result from 31 to convert the MSB count into an LSB count.
1330 */
1331
1332 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1333 temp.swizzle = BRW_SWIZZLE_NOOP;
1334 emit(MOV(result_dst, temp));
1335
1336 src_reg src_tmp = src_reg(result_dst);
1337 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1338
1339 src_tmp.negate = true;
1340 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1341 inst->predicate = BRW_PREDICATE_NORMAL;
1342 break;
1343 }
1344 case ir_unop_find_lsb:
1345 emit(FBL(result_dst, op[0]));
1346 break;
1347
1348 case ir_unop_noise:
1349 assert(!"not reached: should be handled by lower_noise");
1350 break;
1351
1352 case ir_binop_add:
1353 emit(ADD(result_dst, op[0], op[1]));
1354 break;
1355 case ir_binop_sub:
1356 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1357 break;
1358
1359 case ir_binop_mul:
1360 if (brw->gen < 8 && ir->type->is_integer()) {
1361 /* For integer multiplication, the MUL uses the low 16 bits of one of
1362 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1363 * then accumulates the contribution of the upper 16 bits of that
1364 * operand. If we can determine that one of the args is in the low
1365 * 16 bits, though, we can just emit a single MUL.
1366 */
1367 if (ir->operands[0]->is_uint16_constant()) {
1368 if (brw->gen < 7)
1369 emit(MUL(result_dst, op[0], op[1]));
1370 else
1371 emit(MUL(result_dst, op[1], op[0]));
1372 } else if (ir->operands[1]->is_uint16_constant()) {
1373 if (brw->gen < 7)
1374 emit(MUL(result_dst, op[1], op[0]));
1375 else
1376 emit(MUL(result_dst, op[0], op[1]));
1377 } else {
1378 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1379
1380 emit(MUL(acc, op[0], op[1]));
1381 emit(MACH(dst_null_d(), op[0], op[1]));
1382 emit(MOV(result_dst, src_reg(acc)));
1383 }
1384 } else {
1385 emit(MUL(result_dst, op[0], op[1]));
1386 }
1387 break;
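/* Illustrative example of the shortcut above: for "x * 3" the constant 3
 * fits in 16 unsigned bits, so a single MUL suffices, with the constant
 * placed in whichever source slot reads only 16 bits on the current
 * generation. For "x * y" with arbitrary 32-bit operands, the full
 * MUL + MACH + MOV-from-accumulator sequence is emitted instead.
 */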
1388 case ir_binop_imul_high: {
1389 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1390
1391 emit(MUL(acc, op[0], op[1]));
1392 emit(MACH(result_dst, op[0], op[1]));
1393 break;
1394 }
1395 case ir_binop_div:
1396 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1397 assert(ir->type->is_integer());
1398 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1399 break;
1400 case ir_binop_carry: {
1401 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1402
1403 emit(ADDC(dst_null_ud(), op[0], op[1]));
1404 emit(MOV(result_dst, src_reg(acc)));
1405 break;
1406 }
1407 case ir_binop_borrow: {
1408 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1409
1410 emit(SUBB(dst_null_ud(), op[0], op[1]));
1411 emit(MOV(result_dst, src_reg(acc)));
1412 break;
1413 }
1414 case ir_binop_mod:
1415 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1416 assert(ir->type->is_integer());
1417 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1418 break;
1419
1420 case ir_binop_less:
1421 case ir_binop_greater:
1422 case ir_binop_lequal:
1423 case ir_binop_gequal:
1424 case ir_binop_equal:
1425 case ir_binop_nequal: {
1426 emit(CMP(result_dst, op[0], op[1],
1427 brw_conditional_for_comparison(ir->operation)));
1428 emit(AND(result_dst, result_src, src_reg(0x1)));
1429 break;
1430 }
1431
1432 case ir_binop_all_equal:
1433 /* "==" operator producing a scalar boolean. */
1434 if (ir->operands[0]->type->is_vector() ||
1435 ir->operands[1]->type->is_vector()) {
1436 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1437 emit(MOV(result_dst, src_reg(0)));
1438 inst = emit(MOV(result_dst, src_reg(1)));
1439 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1440 } else {
1441 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1442 emit(AND(result_dst, result_src, src_reg(0x1)));
1443 }
1444 break;
1445 case ir_binop_any_nequal:
1446 /* "!=" operator producing a scalar boolean. */
1447 if (ir->operands[0]->type->is_vector() ||
1448 ir->operands[1]->type->is_vector()) {
1449 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1450
1451 emit(MOV(result_dst, src_reg(0)));
1452 inst = emit(MOV(result_dst, src_reg(1)));
1453 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1454 } else {
1455 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1456 emit(AND(result_dst, result_src, src_reg(0x1)));
1457 }
1458 break;
1459
1460 case ir_unop_any:
1461 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1462 emit(MOV(result_dst, src_reg(0)));
1463
1464 inst = emit(MOV(result_dst, src_reg(1)));
1465 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1466 break;
1467
1468 case ir_binop_logic_xor:
1469 emit(XOR(result_dst, op[0], op[1]));
1470 break;
1471
1472 case ir_binop_logic_or:
1473 emit(OR(result_dst, op[0], op[1]));
1474 break;
1475
1476 case ir_binop_logic_and:
1477 emit(AND(result_dst, op[0], op[1]));
1478 break;
1479
1480 case ir_binop_dot:
1481 assert(ir->operands[0]->type->is_vector());
1482 assert(ir->operands[0]->type == ir->operands[1]->type);
1483 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1484 break;
1485
1486 case ir_unop_sqrt:
1487 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1488 break;
1489 case ir_unop_rsq:
1490 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1491 break;
1492
1493 case ir_unop_bitcast_i2f:
1494 case ir_unop_bitcast_u2f:
1495 this->result = op[0];
1496 this->result.type = BRW_REGISTER_TYPE_F;
1497 break;
1498
1499 case ir_unop_bitcast_f2i:
1500 this->result = op[0];
1501 this->result.type = BRW_REGISTER_TYPE_D;
1502 break;
1503
1504 case ir_unop_bitcast_f2u:
1505 this->result = op[0];
1506 this->result.type = BRW_REGISTER_TYPE_UD;
1507 break;
1508
1509 case ir_unop_i2f:
1510 case ir_unop_i2u:
1511 case ir_unop_u2i:
1512 case ir_unop_u2f:
1513 case ir_unop_b2f:
1514 case ir_unop_b2i:
1515 case ir_unop_f2i:
1516 case ir_unop_f2u:
1517 emit(MOV(result_dst, op[0]));
1518 break;
1519 case ir_unop_f2b:
1520 case ir_unop_i2b: {
1521 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1522 emit(AND(result_dst, result_src, src_reg(1)));
1523 break;
1524 }
1525
1526 case ir_unop_trunc:
1527 emit(RNDZ(result_dst, op[0]));
1528 break;
1529 case ir_unop_ceil:
1530 op[0].negate = !op[0].negate;
1531 inst = emit(RNDD(result_dst, op[0]));
1532 this->result.negate = true;
1533 break;
1534 case ir_unop_floor:
1535 inst = emit(RNDD(result_dst, op[0]));
1536 break;
1537 case ir_unop_fract:
1538 inst = emit(FRC(result_dst, op[0]));
1539 break;
1540 case ir_unop_round_even:
1541 emit(RNDE(result_dst, op[0]));
1542 break;
1543
1544 case ir_binop_min:
1545 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1546 break;
1547 case ir_binop_max:
1548 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1549 break;
1550
1551 case ir_binop_pow:
1552 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1553 break;
1554
1555 case ir_unop_bit_not:
1556 inst = emit(NOT(result_dst, op[0]));
1557 break;
1558 case ir_binop_bit_and:
1559 inst = emit(AND(result_dst, op[0], op[1]));
1560 break;
1561 case ir_binop_bit_xor:
1562 inst = emit(XOR(result_dst, op[0], op[1]));
1563 break;
1564 case ir_binop_bit_or:
1565 inst = emit(OR(result_dst, op[0], op[1]));
1566 break;
1567
1568 case ir_binop_lshift:
1569 inst = emit(SHL(result_dst, op[0], op[1]));
1570 break;
1571
1572 case ir_binop_rshift:
1573 if (ir->type->base_type == GLSL_TYPE_INT)
1574 inst = emit(ASR(result_dst, op[0], op[1]));
1575 else
1576 inst = emit(SHR(result_dst, op[0], op[1]));
1577 break;
1578
1579 case ir_binop_bfm:
1580 emit(BFI1(result_dst, op[0], op[1]));
1581 break;
1582
1583 case ir_binop_ubo_load: {
1584 ir_constant *uniform_block = ir->operands[0]->as_constant();
1585 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1586 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1587 src_reg offset;
1588
1589 /* Now, load the vector from that offset. */
1590 assert(ir->type->is_vector() || ir->type->is_scalar());
1591
1592 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1593 packed_consts.type = result.type;
1594 src_reg surf_index =
1595 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1596 if (const_offset_ir) {
1597 if (brw->gen >= 8) {
1598 /* Store the offset in a GRF so we can send-from-GRF. */
1599 offset = src_reg(this, glsl_type::int_type);
1600 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1601 } else {
1602 /* Immediates are fine on older generations since they'll be moved
1603 * to a (potentially fake) MRF at the generator level.
1604 */
1605 offset = src_reg(const_offset / 16);
1606 }
1607 } else {
1608 offset = src_reg(this, glsl_type::uint_type);
1609 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1610 }
1611
1612 if (brw->gen >= 7) {
1613 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1614 grf_offset.type = offset.type;
1615
1616 emit(MOV(grf_offset, offset));
1617
1618 emit(new(mem_ctx) vec4_instruction(this,
1619 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1620 dst_reg(packed_consts),
1621 surf_index,
1622 src_reg(grf_offset)));
1623 } else {
1624 vec4_instruction *pull =
1625 emit(new(mem_ctx) vec4_instruction(this,
1626 VS_OPCODE_PULL_CONSTANT_LOAD,
1627 dst_reg(packed_consts),
1628 surf_index,
1629 offset));
1630 pull->base_mrf = 14;
1631 pull->mlen = 1;
1632 }
1633
1634 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1635 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1636 const_offset % 16 / 4,
1637 const_offset % 16 / 4,
1638 const_offset % 16 / 4);
1639
1640 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1641 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1642 emit(CMP(result_dst, packed_consts, src_reg(0u),
1643 BRW_CONDITIONAL_NZ));
1644 emit(AND(result_dst, result, src_reg(0x1)));
1645 } else {
1646 emit(MOV(result_dst, packed_consts));
1647 }
1648 break;
1649 }
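/* Illustrative offset handling for the UBO load above: a scalar float at
 * byte offset 20 reads the 16-byte block at offset 16 (20 / 16 == 1), and
 * 20 % 16 / 4 == 1 selects component Y, so packed_consts ends up with a
 * YYYY-style swizzle that replicates the requested scalar into the result.
 */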
1650
1651 case ir_binop_vector_extract:
1652 assert(!"should have been lowered by vec_index_to_cond_assign");
1653 break;
1654
1655 case ir_triop_fma:
1656 op[0] = fix_3src_operand(op[0]);
1657 op[1] = fix_3src_operand(op[1]);
1658 op[2] = fix_3src_operand(op[2]);
1659 /* Note that the instruction's argument order is reversed from GLSL
1660 * and the IR.
1661 */
1662 emit(MAD(result_dst, op[2], op[1], op[0]));
1663 break;
1664
1665 case ir_triop_lrp:
1666 emit_lrp(result_dst, op[0], op[1], op[2]);
1667 break;
1668
1669 case ir_triop_csel:
1670 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1671 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1672 inst->predicate = BRW_PREDICATE_NORMAL;
1673 break;
1674
1675 case ir_triop_bfi:
1676 op[0] = fix_3src_operand(op[0]);
1677 op[1] = fix_3src_operand(op[1]);
1678 op[2] = fix_3src_operand(op[2]);
1679 emit(BFI2(result_dst, op[0], op[1], op[2]));
1680 break;
1681
1682 case ir_triop_bitfield_extract:
1683 op[0] = fix_3src_operand(op[0]);
1684 op[1] = fix_3src_operand(op[1]);
1685 op[2] = fix_3src_operand(op[2]);
1686 /* Note that the instruction's argument order is reversed from GLSL
1687 * and the IR.
1688 */
1689 emit(BFE(result_dst, op[2], op[1], op[0]));
1690 break;
1691
1692 case ir_triop_vector_insert:
1693 assert(!"should have been lowered by lower_vector_insert");
1694 break;
1695
1696 case ir_quadop_bitfield_insert:
1697 assert(!"not reached: should be handled by "
1698 "bitfield_insert_to_bfm_bfi\n");
1699 break;
1700
1701 case ir_quadop_vector:
1702 assert(!"not reached: should be handled by lower_quadop_vector");
1703 break;
1704
1705 case ir_unop_pack_half_2x16:
1706 emit_pack_half_2x16(result_dst, op[0]);
1707 break;
1708 case ir_unop_unpack_half_2x16:
1709 emit_unpack_half_2x16(result_dst, op[0]);
1710 break;
1711 case ir_unop_pack_snorm_2x16:
1712 case ir_unop_pack_snorm_4x8:
1713 case ir_unop_pack_unorm_2x16:
1714 case ir_unop_pack_unorm_4x8:
1715 case ir_unop_unpack_snorm_2x16:
1716 case ir_unop_unpack_snorm_4x8:
1717 case ir_unop_unpack_unorm_2x16:
1718 case ir_unop_unpack_unorm_4x8:
1719 assert(!"not reached: should be handled by lower_packing_builtins");
1720 break;
1721 case ir_unop_unpack_half_2x16_split_x:
1722 case ir_unop_unpack_half_2x16_split_y:
1723 case ir_binop_pack_half_2x16_split:
1724 assert(!"not reached: should not occur in vertex shader");
1725 break;
1726 case ir_binop_ldexp:
1727 assert(!"not reached: should be handled by ldexp_to_arith()");
1728 break;
1729 }
1730 }
1731
1732
1733 void
1734 vec4_visitor::visit(ir_swizzle *ir)
1735 {
1736 src_reg src;
1737 int i = 0;
1738 int swizzle[4];
1739
1740 /* Note that this is only swizzles in expressions, not those on the left
1741 * hand side of an assignment, which do write masking. See ir_assignment
1742 * for that.
1743 */
1744
1745 ir->val->accept(this);
1746 src = this->result;
1747 assert(src.file != BAD_FILE);
1748
1749 for (i = 0; i < ir->type->vector_elements; i++) {
1750 switch (i) {
1751 case 0:
1752 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1753 break;
1754 case 1:
1755 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1756 break;
1757 case 2:
1758 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1759 break;
1760 case 3:
1761 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1762 break;
1763 }
1764 }
1765 for (; i < 4; i++) {
1766 /* Replicate the last channel out. */
1767 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1768 }
1769
1770 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1771
1772 this->result = src;
1773 }
1774
1775 void
1776 vec4_visitor::visit(ir_dereference_variable *ir)
1777 {
1778 const struct glsl_type *type = ir->type;
1779 dst_reg *reg = variable_storage(ir->var);
1780
1781 if (!reg) {
1782 fail("Failed to find variable storage for %s\n", ir->var->name);
1783 this->result = src_reg(brw_null_reg());
1784 return;
1785 }
1786
1787 this->result = src_reg(*reg);
1788
1789 /* System values get their swizzle from the dst_reg writemask */
1790 if (ir->var->data.mode == ir_var_system_value)
1791 return;
1792
1793 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1794 this->result.swizzle = swizzle_for_size(type->vector_elements);
1795 }
1796
1797
1798 int
1799 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1800 {
1801 /* Under normal circumstances array elements are stored consecutively, so
1802 * the stride is equal to the size of the array element.
1803 */
1804 return type_size(ir->type);
1805 }
1806
1807
1808 void
1809 vec4_visitor::visit(ir_dereference_array *ir)
1810 {
1811 ir_constant *constant_index;
1812 src_reg src;
1813 int array_stride = compute_array_stride(ir);
1814
1815 constant_index = ir->array_index->constant_expression_value();
1816
1817 ir->array->accept(this);
1818 src = this->result;
1819
1820 if (constant_index) {
1821 src.reg_offset += constant_index->value.i[0] * array_stride;
1822 } else {
1823 /* Variable index array dereference. It eats the "vec4" of the
1824 * base of the array and an index that offsets the Mesa register
1825 * index.
1826 */
1827 ir->array_index->accept(this);
1828
1829 src_reg index_reg;
1830
1831 if (array_stride == 1) {
1832 index_reg = this->result;
1833 } else {
1834 index_reg = src_reg(this, glsl_type::int_type);
1835
1836 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1837 }
1838
1839 if (src.reladdr) {
1840 src_reg temp = src_reg(this, glsl_type::int_type);
1841
1842 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1843
1844 index_reg = temp;
1845 }
1846
1847 src.reladdr = ralloc(mem_ctx, src_reg);
1848 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1849 }
1850
1851 /* If the type is smaller than a vec4, replicate the last channel out. */
1852 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1853 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1854 else
1855 src.swizzle = BRW_SWIZZLE_NOOP;
1856 src.type = brw_type_for_base_type(ir->type);
1857
1858 this->result = src;
1859 }
1860
1861 void
1862 vec4_visitor::visit(ir_dereference_record *ir)
1863 {
1864 unsigned int i;
1865 const glsl_type *struct_type = ir->record->type;
1866 int offset = 0;
1867
1868 ir->record->accept(this);
1869
1870 for (i = 0; i < struct_type->length; i++) {
1871 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1872 break;
1873 offset += type_size(struct_type->fields.structure[i].type);
1874 }
1875
1876 /* If the type is smaller than a vec4, replicate the last channel out. */
1877 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1878 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1879 else
1880 this->result.swizzle = BRW_SWIZZLE_NOOP;
1881 this->result.type = brw_type_for_base_type(ir->type);
1882
1883 this->result.reg_offset += offset;
1884 }
1885
1886 /**
1887 * We want to be careful in assignment setup to hit the actual storage
1888 * instead of potentially using a temporary like we might with the
1889 * ir_dereference handler.
1890 */
1891 static dst_reg
1892 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1893 {
1894 /* The LHS must be a dereference. If the LHS is a variable indexed array
1895 * access of a vector, it must be separated into a series conditional moves
1896 * before reaching this point (see ir_vec_index_to_cond_assign).
1897 */
1898 assert(ir->as_dereference());
1899 ir_dereference_array *deref_array = ir->as_dereference_array();
1900 if (deref_array) {
1901 assert(!deref_array->array->type->is_vector());
1902 }
1903
1904 /* Use the rvalue deref handler for the most part. We'll ignore
1905 * swizzles in it and write swizzles using writemask, though.
1906 */
1907 ir->accept(v);
1908 return dst_reg(v->result);
1909 }
1910
1911 void
1912 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1913 const struct glsl_type *type, uint32_t predicate)
1914 {
1915 if (type->base_type == GLSL_TYPE_STRUCT) {
1916 for (unsigned int i = 0; i < type->length; i++) {
1917 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1918 }
1919 return;
1920 }
1921
1922 if (type->is_array()) {
1923 for (unsigned int i = 0; i < type->length; i++) {
1924 emit_block_move(dst, src, type->fields.array, predicate);
1925 }
1926 return;
1927 }
1928
1929 if (type->is_matrix()) {
1930 const struct glsl_type *vec_type;
1931
1932 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1933 type->vector_elements, 1);
1934
1935 for (int i = 0; i < type->matrix_columns; i++) {
1936 emit_block_move(dst, src, vec_type, predicate);
1937 }
1938 return;
1939 }
1940
1941 assert(type->is_scalar() || type->is_vector());
1942
1943 dst->type = brw_type_for_base_type(type);
1944 src->type = dst->type;
1945
1946 dst->writemask = (1 << type->vector_elements) - 1;
1947
1948 src->swizzle = swizzle_for_size(type->vector_elements);
1949
1950 vec4_instruction *inst = emit(MOV(*dst, *src));
1951 inst->predicate = predicate;
1952
1953 dst->reg_offset++;
1954 src->reg_offset++;
1955 }
1956
1957
1958 /* If the RHS processing resulted in an instruction generating a
1959 * temporary value, and it would be easy to rewrite the instruction to
1960 * generate its result right into the LHS instead, do so. This ends
1961 * up reliably removing instructions where it can be tricky to do so
1962 * later without real UD chain information.
1963 */
1964 bool
1965 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1966 dst_reg dst,
1967 src_reg src,
1968 vec4_instruction *pre_rhs_inst,
1969 vec4_instruction *last_rhs_inst)
1970 {
1971 /* This could be supported, but it would take more smarts. */
1972 if (ir->condition)
1973 return false;
1974
1975 if (pre_rhs_inst == last_rhs_inst)
1976 return false; /* No instructions generated to work with. */
1977
1978 /* Make sure the last instruction generated our source reg. */
1979 if (src.file != GRF ||
1980 src.file != last_rhs_inst->dst.file ||
1981 src.reg != last_rhs_inst->dst.reg ||
1982 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1983 src.reladdr ||
1984 src.abs ||
1985 src.negate ||
1986 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1987 return false;
1988
1989 /* Check that the last instruction fully initialized the channels
1990 * we want to use, in the order we want to use them. We could
1991 * potentially reswizzle the operands of many instructions so that
1992 * we could handle out of order channels, but don't yet.
1993 */
1994
1995 for (unsigned i = 0; i < 4; i++) {
1996 if (dst.writemask & (1 << i)) {
1997 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1998 return false;
1999
2000 if (BRW_GET_SWZ(src.swizzle, i) != i)
2001 return false;
2002 }
2003 }
2004
2005 /* Success! Rewrite the instruction. */
2006 last_rhs_inst->dst.file = dst.file;
2007 last_rhs_inst->dst.reg = dst.reg;
2008 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2009 last_rhs_inst->dst.reladdr = dst.reladdr;
2010 last_rhs_inst->dst.writemask &= dst.writemask;
2011
2012 return true;
2013 }
2014
2015 void
2016 vec4_visitor::visit(ir_assignment *ir)
2017 {
2018 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2019 uint32_t predicate = BRW_PREDICATE_NONE;
2020
2021 if (!ir->lhs->type->is_scalar() &&
2022 !ir->lhs->type->is_vector()) {
2023 ir->rhs->accept(this);
2024 src_reg src = this->result;
2025
2026 if (ir->condition) {
2027 emit_bool_to_cond_code(ir->condition, &predicate);
2028 }
2029
2030 /* emit_block_move doesn't account for swizzles in the source register.
2031 * This should be ok, since the source register is a structure or an
2032 * array, and those can't be swizzled. But double-check to be sure.
2033 */
2034 assert(src.swizzle ==
2035 (ir->rhs->type->is_matrix()
2036 ? swizzle_for_size(ir->rhs->type->vector_elements)
2037 : BRW_SWIZZLE_NOOP));
2038
2039 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2040 return;
2041 }
2042
2043 /* Now we're down to just a scalar/vector with writemasks. */
2044 int i;
2045
2046 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2047 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2048
2049 ir->rhs->accept(this);
2050
2051 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2052
2053 src_reg src = this->result;
2054
2055 int swizzles[4];
2056 int first_enabled_chan = 0;
2057 int src_chan = 0;
2058
2059 assert(ir->lhs->type->is_vector() ||
2060 ir->lhs->type->is_scalar());
2061 dst.writemask = ir->write_mask;
2062
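/* Find the swizzle source channel feeding the first enabled destination
 * channel; it is replicated into the unwritten channels below.
 */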
2063 for (int i = 0; i < 4; i++) {
2064 if (dst.writemask & (1 << i)) {
2065 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2066 break;
2067 }
2068 }
2069
2070 /* Swizzle a small RHS vector into the channels being written.
2071 *
2072 * glsl ir treats write_mask as dictating how many channels are
2073 * present on the RHS while in our instructions we need to make
2074 * those channels appear in the slots of the vec4 they're written to.
2075 */
2076 for (int i = 0; i < 4; i++) {
2077 if (dst.writemask & (1 << i))
2078 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2079 else
2080 swizzles[i] = first_enabled_chan;
2081 }
2082 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2083 swizzles[2], swizzles[3]);
2084
2085 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2086 return;
2087 }
2088
2089 if (ir->condition) {
2090 emit_bool_to_cond_code(ir->condition, &predicate);
2091 }
2092
2093 for (i = 0; i < type_size(ir->lhs->type); i++) {
2094 vec4_instruction *inst = emit(MOV(dst, src));
2095 inst->predicate = predicate;
2096
2097 dst.reg_offset++;
2098 src.reg_offset++;
2099 }
2100 }
2101
2102 void
2103 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2104 {
2105 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2106 foreach_list(node, &ir->components) {
2107 ir_constant *field_value = (ir_constant *)node;
2108
2109 emit_constant_values(dst, field_value);
2110 }
2111 return;
2112 }
2113
2114 if (ir->type->is_array()) {
2115 for (unsigned int i = 0; i < ir->type->length; i++) {
2116 emit_constant_values(dst, ir->array_elements[i]);
2117 }
2118 return;
2119 }
2120
2121 if (ir->type->is_matrix()) {
2122 for (int i = 0; i < ir->type->matrix_columns; i++) {
2123 float *vec = &ir->value.f[i * ir->type->vector_elements];
2124
2125 for (int j = 0; j < ir->type->vector_elements; j++) {
2126 dst->writemask = 1 << j;
2127 dst->type = BRW_REGISTER_TYPE_F;
2128
2129 emit(MOV(*dst, src_reg(vec[j])));
2130 }
2131 dst->reg_offset++;
2132 }
2133 return;
2134 }
2135
2136 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2137
2138 for (int i = 0; i < ir->type->vector_elements; i++) {
2139 if (!(remaining_writemask & (1 << i)))
2140 continue;
2141
2142 dst->writemask = 1 << i;
2143 dst->type = brw_type_for_base_type(ir->type);
2144
2145 /* Find other components that match the one we're about to
2146 * write. Emits fewer instructions for things like vec4(0.5,
2147 * 1.5, 1.5, 1.5).
2148 */
2149 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2150 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2151 if (ir->value.b[i] == ir->value.b[j])
2152 dst->writemask |= (1 << j);
2153 } else {
2154 /* u, i, and f storage all line up, so no need for a
2155 * switch case for comparing each type.
2156 */
2157 if (ir->value.u[i] == ir->value.u[j])
2158 dst->writemask |= (1 << j);
2159 }
2160 }
2161
2162 switch (ir->type->base_type) {
2163 case GLSL_TYPE_FLOAT:
2164 emit(MOV(*dst, src_reg(ir->value.f[i])));
2165 break;
2166 case GLSL_TYPE_INT:
2167 emit(MOV(*dst, src_reg(ir->value.i[i])));
2168 break;
2169 case GLSL_TYPE_UINT:
2170 emit(MOV(*dst, src_reg(ir->value.u[i])));
2171 break;
2172 case GLSL_TYPE_BOOL:
2173 emit(MOV(*dst, src_reg(ir->value.b[i])));
2174 break;
2175 default:
2176 assert(!"Non-float/uint/int/bool constant");
2177 break;
2178 }
2179
2180 remaining_writemask &= ~dst->writemask;
2181 }
2182 dst->reg_offset++;
2183 }
2184
2185 void
2186 vec4_visitor::visit(ir_constant *ir)
2187 {
2188 dst_reg dst = dst_reg(this, ir->type);
2189 this->result = src_reg(dst);
2190
2191 emit_constant_values(&dst, ir);
2192 }
2193
2194 void
2195 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2196 {
2197 ir_dereference *deref = static_cast<ir_dereference *>(
2198 ir->actual_parameters.get_head());
2199 ir_variable *location = deref->variable_referenced();
2200 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2201 location->data.atomic.buffer_index);
2202
2203 /* Calculate the surface offset */
2204 src_reg offset(this, glsl_type::uint_type);
2205 ir_dereference_array *deref_array = deref->as_dereference_array();
2206 if (deref_array) {
2207 deref_array->array_index->accept(this);
2208
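/* Scale the array index by the size of a single counter and add the
 * variable's base offset within the atomic counter buffer.
 */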
2209 src_reg tmp(this, glsl_type::uint_type);
2210 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2211 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2212 } else {
2213 offset = location->data.atomic.offset;
2214 }
2215
2216 /* Emit the appropriate machine instruction */
2217 const char *callee = ir->callee->function_name();
2218 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2219
2220 if (!strcmp("__intrinsic_atomic_read", callee)) {
2221 emit_untyped_surface_read(surf_index, dst, offset);
2222
2223 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2224 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2225 src_reg(), src_reg());
2226
2227 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2228 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2229 src_reg(), src_reg());
2230 }
2231 }
2232
2233 void
2234 vec4_visitor::visit(ir_call *ir)
2235 {
2236 const char *callee = ir->callee->function_name();
2237
2238 if (!strcmp("__intrinsic_atomic_read", callee) ||
2239 !strcmp("__intrinsic_atomic_increment", callee) ||
2240 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2241 visit_atomic_counter_intrinsic(ir);
2242 } else {
2243 assert(!"Unsupported intrinsic.");
2244 }
2245 }
2246
2247 src_reg
2248 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2249 {
2250 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2251 inst->base_mrf = 2;
2252 inst->mlen = 1;
2253 inst->sampler = sampler;
2254 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2255 inst->dst.writemask = WRITEMASK_XYZW;
2256
2257 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions */
2258 int param_base = inst->base_mrf;
2259 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2260 int zero_mask = 0xf & ~coord_mask;
2261
2262 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2263 coordinate));
2264
2265 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2266 src_reg(0)));
2267
2268 emit(inst);
2269 return src_reg(inst->dst);
2270 }
2271
2272 void
2273 vec4_visitor::visit(ir_texture *ir)
2274 {
2275 int sampler =
2276 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2277
2278 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2279 * emitting anything other than setting up the constant result.
2280 */
2281 if (ir->op == ir_tg4) {
2282 ir_constant *chan = ir->lod_info.component->as_constant();
2283 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2284 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2285 dst_reg result(this, ir->type);
2286 this->result = src_reg(result);
2287 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2288 return;
2289 }
2290 }
2291
2292 /* Should be lowered by do_lower_texture_projection */
2293 assert(!ir->projector);
2294
2295 /* Should be lowered */
2296 assert(!ir->offset || !ir->offset->type->is_array());
2297
2298 /* Generate code to compute all the subexpression trees. This has to be
2299 * done before loading any values into MRFs for the sampler message since
2300 * generating these values may involve SEND messages that need the MRFs.
2301 */
2302 src_reg coordinate;
2303 if (ir->coordinate) {
2304 ir->coordinate->accept(this);
2305 coordinate = this->result;
2306 }
2307
2308 src_reg shadow_comparitor;
2309 if (ir->shadow_comparitor) {
2310 ir->shadow_comparitor->accept(this);
2311 shadow_comparitor = this->result;
2312 }
2313
2314 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2315 src_reg offset_value;
2316 if (has_nonconstant_offset) {
2317 ir->offset->accept(this);
2318 offset_value = src_reg(this->result);
2319 }
2320
2321 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2322 src_reg lod, dPdx, dPdy, sample_index, mcs;
2323 switch (ir->op) {
2324 case ir_tex:
2325 lod = src_reg(0.0f);
2326 lod_type = glsl_type::float_type;
2327 break;
2328 case ir_txf:
2329 case ir_txl:
2330 case ir_txs:
2331 ir->lod_info.lod->accept(this);
2332 lod = this->result;
2333 lod_type = ir->lod_info.lod->type;
2334 break;
2335 case ir_query_levels:
2336 lod = src_reg(0);
2337 lod_type = glsl_type::int_type;
2338 break;
2339 case ir_txf_ms:
2340 ir->lod_info.sample_index->accept(this);
2341 sample_index = this->result;
2342 sample_index_type = ir->lod_info.sample_index->type;
2343
2344 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2345 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2346 else
2347 mcs = src_reg(0u);
2348 break;
2349 case ir_txd:
2350 ir->lod_info.grad.dPdx->accept(this);
2351 dPdx = this->result;
2352
2353 ir->lod_info.grad.dPdy->accept(this);
2354 dPdy = this->result;
2355
2356 lod_type = ir->lod_info.grad.dPdx->type;
2357 break;
2358 case ir_txb:
2359 case ir_lod:
2360 case ir_tg4:
2361 break;
2362 }
2363
2364 vec4_instruction *inst = NULL;
2365 switch (ir->op) {
2366 case ir_tex:
2367 case ir_txl:
2368 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2369 break;
2370 case ir_txd:
2371 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2372 break;
2373 case ir_txf:
2374 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2375 break;
2376 case ir_txf_ms:
2377 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2378 break;
2379 case ir_txs:
2380 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2381 break;
2382 case ir_tg4:
2383 if (has_nonconstant_offset)
2384 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2385 else
2386 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2387 break;
2388 case ir_query_levels:
2389 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2390 break;
2391 case ir_txb:
2392 assert(!"TXB is not valid for vertex shaders.");
2393 break;
2394 case ir_lod:
2395 assert(!"LOD is not valid for vertex shaders.");
2396 break;
2397 default:
2398 assert(!"Unrecognized tex op");
2399 }
2400
2401 if (ir->offset != NULL && ir->op != ir_txf)
2402 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2403
2404 /* Stuff the channel select bits in the top of the texture offset */
2405 if (ir->op == ir_tg4)
2406 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2407
2408 /* The message header is necessary for:
2409 * - Gen4 (always)
2410 * - Texel offsets
2411 * - Gather channel selection
2412 * - Sampler indices too large to fit in a 4-bit value.
2413 */
2414 inst->header_present =
2415 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2416 sampler >= 16;
2417 inst->base_mrf = 2;
2418 inst->mlen = inst->header_present + 1; /* always at least one */
2419 inst->sampler = sampler;
2420 inst->dst = dst_reg(this, ir->type);
2421 inst->dst.writemask = WRITEMASK_XYZW;
2422 inst->shadow_compare = ir->shadow_comparitor != NULL;
2423
2424 /* MRF for the first parameter */
2425 int param_base = inst->base_mrf + inst->header_present;
2426
2427 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2428 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2429 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2430 } else {
2431 /* Load the coordinate */
2432 /* FINISHME: gl_clamp_mask and saturate */
2433 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2434 int zero_mask = 0xf & ~coord_mask;
2435
2436 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2437 coordinate));
2438
2439 if (zero_mask != 0) {
2440 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2441 src_reg(0)));
2442 }
2443 /* Load the shadow comparitor */
2444 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2445 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2446 WRITEMASK_X),
2447 shadow_comparitor));
2448 inst->mlen++;
2449 }
2450
2451 /* Load the LOD info */
2452 if (ir->op == ir_tex || ir->op == ir_txl) {
2453 int mrf, writemask;
2454 if (brw->gen >= 5) {
2455 mrf = param_base + 1;
2456 if (ir->shadow_comparitor) {
2457 writemask = WRITEMASK_Y;
2458 /* mlen already incremented */
2459 } else {
2460 writemask = WRITEMASK_X;
2461 inst->mlen++;
2462 }
2463 } else /* brw->gen == 4 */ {
2464 mrf = param_base;
2465 writemask = WRITEMASK_W;
2466 }
2467 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2468 } else if (ir->op == ir_txf) {
2469 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2470 } else if (ir->op == ir_txf_ms) {
2471 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2472 sample_index));
2473 if (brw->gen >= 7)
2474 /* MCS data is in the first channel of `mcs`, but we need to get it into
2475 * the .y channel of the second vec4 of params, so replicate .x across
2476 * the whole vec4 and then mask off everything except .y
2477 */
2478 mcs.swizzle = BRW_SWIZZLE_XXXX;
2479 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2480 mcs));
2481 inst->mlen++;
2482 } else if (ir->op == ir_txd) {
2483 const glsl_type *type = lod_type;
2484
2485 if (brw->gen >= 5) {
2486 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2487 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2488 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2489 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2490 inst->mlen++;
2491
2492 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2493 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2494 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2495 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2496 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2497 inst->mlen++;
2498
2499 if (ir->shadow_comparitor) {
2500 emit(MOV(dst_reg(MRF, param_base + 2,
2501 ir->shadow_comparitor->type, WRITEMASK_Z),
2502 shadow_comparitor));
2503 }
2504 }
2505 } else /* brw->gen == 4 */ {
2506 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2507 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2508 inst->mlen += 2;
2509 }
2510 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2511 if (ir->shadow_comparitor) {
2512 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2513 shadow_comparitor));
2514 }
2515
2516 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2517 offset_value));
2518 inst->mlen++;
2519 }
2520 }
2521
2522 emit(inst);
2523
2524 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2525 * faces * layers, but the spec requires just the layer count.
2526 */
2527 if (ir->op == ir_txs) {
2528 glsl_type const *type = ir->sampler->type;
2529 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2530 type->sampler_array) {
2531 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2532 writemask(inst->dst, WRITEMASK_Z),
2533 src_reg(inst->dst), src_reg(6));
2534 }
2535 }
2536
2537 if (brw->gen == 6 && ir->op == ir_tg4) {
2538 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2539 }
2540
2541 swizzle_result(ir, src_reg(inst->dst), sampler);
2542 }
2543
2544 /**
2545 * Apply workarounds for Gen6 gather with UINT/SINT
2546 */
2547 void
2548 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2549 {
2550 if (!wa)
2551 return;
2552
2553 int width = (wa & WA_8BIT) ? 8 : 16;
2554 dst_reg dst_f = dst;
2555 dst_f.type = BRW_REGISTER_TYPE_F;
2556
2557 /* Convert from UNORM to UINT */
2558 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2559 emit(MOV(dst, src_reg(dst_f)));
2560
2561 if (wa & WA_SIGN) {
2562 /* Reinterpret the UINT value as a signed INT value by
2563 * shifting the sign bit into place, then shifting back
2564 * preserving sign.
2565 */
2566 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2567 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2568 }
2569 }
2570
2571 /**
2572 * Set up the gather channel based on the swizzle, for gather4.
2573 */
2574 uint32_t
2575 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2576 {
2577 ir_constant *chan = ir->lod_info.component->as_constant();
2578 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2579 switch (swiz) {
2580 case SWIZZLE_X: return 0;
2581 case SWIZZLE_Y:
2582 /* gather4 sampler is broken for green channel on RG32F --
2583 * we must ask for blue instead.
2584 */
2585 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2586 return 2;
2587 return 1;
2588 case SWIZZLE_Z: return 2;
2589 case SWIZZLE_W: return 3;
2590 default:
2591 assert(!"Not reached"); /* zero, one swizzles handled already */
2592 return 0;
2593 }
2594 }
2595
2596 void
2597 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2598 {
2599 int s = key->tex.swizzles[sampler];
2600
2601 this->result = src_reg(this, ir->type);
2602 dst_reg swizzled_result(this->result);
2603
2604 if (ir->op == ir_query_levels) {
2605 /* # levels is in .w */
2606 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2607 emit(MOV(swizzled_result, orig_val));
2608 return;
2609 }
2610
2611 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2612 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2613 emit(MOV(swizzled_result, orig_val));
2614 return;
2615 }
2616
2617
2618 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2619 int swizzle[4] = {0};
2620
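/* Sort the destination channels into those copied from the texture result,
 * those forced to zero, and those forced to one by the swizzle key.
 */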
2621 for (int i = 0; i < 4; i++) {
2622 switch (GET_SWZ(s, i)) {
2623 case SWIZZLE_ZERO:
2624 zero_mask |= (1 << i);
2625 break;
2626 case SWIZZLE_ONE:
2627 one_mask |= (1 << i);
2628 break;
2629 default:
2630 copy_mask |= (1 << i);
2631 swizzle[i] = GET_SWZ(s, i);
2632 break;
2633 }
2634 }
2635
2636 if (copy_mask) {
2637 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2638 swizzled_result.writemask = copy_mask;
2639 emit(MOV(swizzled_result, orig_val));
2640 }
2641
2642 if (zero_mask) {
2643 swizzled_result.writemask = zero_mask;
2644 emit(MOV(swizzled_result, src_reg(0.0f)));
2645 }
2646
2647 if (one_mask) {
2648 swizzled_result.writemask = one_mask;
2649 emit(MOV(swizzled_result, src_reg(1.0f)));
2650 }
2651 }
2652
2653 void
2654 vec4_visitor::visit(ir_return *ir)
2655 {
2656 assert(!"not reached");
2657 }
2658
2659 void
2660 vec4_visitor::visit(ir_discard *ir)
2661 {
2662 assert(!"not reached");
2663 }
2664
2665 void
2666 vec4_visitor::visit(ir_if *ir)
2667 {
2668 /* Don't point the annotation at the if statement, because then it plus
2669 * the then and else blocks get printed.
2670 */
2671 this->base_ir = ir->condition;
2672
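/* Gen6 can embed the comparison directly in the IF instruction; on other
 * generations we evaluate the condition into a flag predicate first.
 */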
2673 if (brw->gen == 6) {
2674 emit_if_gen6(ir);
2675 } else {
2676 uint32_t predicate;
2677 emit_bool_to_cond_code(ir->condition, &predicate);
2678 emit(IF(predicate));
2679 }
2680
2681 visit_instructions(&ir->then_instructions);
2682
2683 if (!ir->else_instructions.is_empty()) {
2684 this->base_ir = ir->condition;
2685 emit(BRW_OPCODE_ELSE);
2686
2687 visit_instructions(&ir->else_instructions);
2688 }
2689
2690 this->base_ir = ir->condition;
2691 emit(BRW_OPCODE_ENDIF);
2692 }
2693
2694 void
2695 vec4_visitor::visit(ir_emit_vertex *)
2696 {
2697 assert(!"not reached");
2698 }
2699
2700 void
2701 vec4_visitor::visit(ir_end_primitive *)
2702 {
2703 assert(!"not reached");
2704 }
2705
2706 void
2707 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2708 dst_reg dst, src_reg offset,
2709 src_reg src0, src_reg src1)
2710 {
2711 unsigned mlen = 0;
2712
2713 /* Set the atomic operation offset. */
2714 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2715 mlen++;
2716
2717 /* Set the atomic operation arguments. */
2718 if (src0.file != BAD_FILE) {
2719 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2720 mlen++;
2721 }
2722
2723 if (src1.file != BAD_FILE) {
2724 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2725 mlen++;
2726 }
2727
2728 /* Emit the instruction. Note that this maps to the normal SIMD8
2729 * untyped atomic message on Ivy Bridge, but that's OK because
2730 * unused channels will be masked out.
2731 */
2732 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2733 src_reg(atomic_op), src_reg(surf_index));
2734 inst->base_mrf = 0;
2735 inst->mlen = mlen;
2736 }
2737
2738 void
2739 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2740 src_reg offset)
2741 {
2742 /* Set the surface read offset. */
2743 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2744
2745 /* Emit the instruction. Note that this maps to the normal SIMD8
2746 * untyped surface read message, but that's OK because unused
2747 * channels will be masked out.
2748 */
2749 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2750 dst, src_reg(surf_index));
2751 inst->base_mrf = 0;
2752 inst->mlen = 1;
2753 }
2754
2755 void
2756 vec4_visitor::emit_ndc_computation()
2757 {
2758 /* Get the position */
2759 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2760
2761 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2762 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2763 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2764
2765 current_annotation = "NDC";
2766 dst_reg ndc_w = ndc;
2767 ndc_w.writemask = WRITEMASK_W;
2768 src_reg pos_w = pos;
2769 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2770 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2771
2772 dst_reg ndc_xyz = ndc;
2773 ndc_xyz.writemask = WRITEMASK_XYZ;
2774
2775 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2776 }
2777
2778 void
2779 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2780 {
2781 if (brw->gen < 6 &&
2782 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2783 key->userclip_active || brw->has_negative_rhw_bug)) {
2784 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2785 dst_reg header1_w = header1;
2786 header1_w.writemask = WRITEMASK_W;
2787
2788 emit(MOV(header1, 0u));
2789
2790 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2791 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2792
2793 current_annotation = "Point size";
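/* Convert the point size to the header's fixed-point field: scale by 2^11
 * and keep the 11-bit value that lands in bits 8..18.
 */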
2794 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2795 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2796 }
2797
2798 if (key->userclip_active) {
2799 current_annotation = "Clipping flags";
2800 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2801 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2802
2803 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2804 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2805 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2806
2807 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2808 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2809 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2810 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2811 }
2812
2813 /* i965 clipping workaround:
2814 * 1) Test for negative RHW
2815 * 2) If set,
2816 * set ndc = (0,0,0,0)
2817 * set ucp[6] = 1
2818 *
2819 * Later, clipping will detect ucp[6] and ensure the primitive is
2820 * clipped against all fixed planes.
2821 */
2822 if (brw->has_negative_rhw_bug) {
2823 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2824 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2825 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2826 vec4_instruction *inst;
2827 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2828 inst->predicate = BRW_PREDICATE_NORMAL;
2829 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2830 inst->predicate = BRW_PREDICATE_NORMAL;
2831 }
2832
2833 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2834 } else if (brw->gen < 6) {
2835 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2836 } else {
2837 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2838 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2839 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2840 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2841 }
2842 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2843 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2844 src_reg(output_reg[VARYING_SLOT_LAYER])));
2845 }
2846 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2847 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2848 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2849 }
2850 }
2851 }
2852
2853 void
2854 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2855 {
2856 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2857 *
2858 * "If a linked set of shaders forming the vertex stage contains no
2859 * static write to gl_ClipVertex or gl_ClipDistance, but the
2860 * application has requested clipping against user clip planes through
2861 * the API, then the coordinate written to gl_Position is used for
2862 * comparison against the user clip planes."
2863 *
2864 * This function is only called if the shader didn't write to
2865 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2866 * if the user wrote to it; otherwise we use gl_Position.
2867 */
2868 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2869 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2870 clip_vertex = VARYING_SLOT_POS;
2871 }
2872
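/* Each clip distance is the dot product of the selected clip vertex
 * (gl_ClipVertex or gl_Position) with the corresponding user clip plane.
 */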
2873 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2874 ++i) {
2875 reg.writemask = 1 << i;
2876 emit(DP4(reg,
2877 src_reg(output_reg[clip_vertex]),
2878 src_reg(this->userplane[i + offset])));
2879 }
2880 }
2881
2882 void
2883 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2884 {
2885 assert (varying < VARYING_SLOT_MAX);
2886 reg.type = output_reg[varying].type;
2887 current_annotation = output_reg_annotation[varying];
2888 /* Copy the register, saturating if necessary */
2889 vec4_instruction *inst = emit(MOV(reg,
2890 src_reg(output_reg[varying])));
2891 if ((varying == VARYING_SLOT_COL0 ||
2892 varying == VARYING_SLOT_COL1 ||
2893 varying == VARYING_SLOT_BFC0 ||
2894 varying == VARYING_SLOT_BFC1) &&
2895 key->clamp_vertex_color) {
2896 inst->saturate = true;
2897 }
2898 }
2899
2900 void
2901 vec4_visitor::emit_urb_slot(int mrf, int varying)
2902 {
2903 struct brw_reg hw_reg = brw_message_reg(mrf);
2904 dst_reg reg = dst_reg(MRF, mrf);
2905 reg.type = BRW_REGISTER_TYPE_F;
2906
2907 switch (varying) {
2908 case VARYING_SLOT_PSIZ:
2909 /* PSIZ is always in slot 0, and is coupled with other flags. */
2910 current_annotation = "indices, point width, clip flags";
2911 emit_psiz_and_flags(hw_reg);
2912 break;
2913 case BRW_VARYING_SLOT_NDC:
2914 current_annotation = "NDC";
2915 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2916 break;
2917 case VARYING_SLOT_POS:
2918 current_annotation = "gl_Position";
2919 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2920 break;
2921 case VARYING_SLOT_EDGE:
2922 /* This is present when doing unfilled polygons. We're supposed to copy
2923 * the edge flag from the user-provided vertex array
2924 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2925 * of that attribute (starts as 1.0f). This is then used in clipping to
2926 * determine which edges should be drawn as wireframe.
2927 */
2928 current_annotation = "edge flag";
2929 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2930 glsl_type::float_type, WRITEMASK_XYZW))));
2931 break;
2932 case BRW_VARYING_SLOT_PAD:
2933 /* No need to write to this slot */
2934 break;
2935 default:
2936 emit_generic_urb_slot(reg, varying);
2937 break;
2938 }
2939 }
2940
2941 static int
2942 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2943 {
2944 if (brw->gen >= 6) {
2945 /* URB data written (does not include the message header reg) must
2946 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2947 * section 5.4.3.2.2: URB_INTERLEAVED.
2948 *
2949 * URB entries are allocated on a multiple of 1024 bits, so an
2950 * extra 128 bits written here to make the end align to 256 is
2951 * no problem.
2952 */
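/* mlen includes the message header register, so the data payload
 * (mlen - 1) is an even number of registers exactly when mlen is odd.
 */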
2953 if ((mlen % 2) != 1)
2954 mlen++;
2955 }
2956
2957 return mlen;
2958 }
2959
2960
2961 /**
2962 * Generates the VUE payload plus the necessary URB write instructions to
2963 * output it.
2964 *
2965 * The VUE layout is documented in Volume 2a.
2966 */
2967 void
2968 vec4_visitor::emit_vertex()
2969 {
2970 /* MRF 0 is reserved for the debugger, so start with message header
2971 * in MRF 1.
2972 */
2973 int base_mrf = 1;
2974 int mrf = base_mrf;
2975 /* In the process of generating our URB write message contents, we
2976 * may need to unspill a register or load from an array. Those
2977 * reads would use MRFs 14-15.
2978 */
2979 int max_usable_mrf = 13;
2980
2981 /* The following assertion verifies that max_usable_mrf causes an
2982 * even number of URB write data registers, which will meet gen6's
2983 * requirements for length alignment.
2984 */
2985 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2986
2987 /* First mrf is the g0-based message header containing URB handles and
2988 * such.
2989 */
2990 emit_urb_write_header(mrf++);
2991
2992 if (brw->gen < 6) {
2993 emit_ndc_computation();
2994 }
2995
2996 /* Lower legacy ff and ClipVertex clipping to clip distances */
2997 if (key->userclip_active && !prog->UsesClipDistanceOut) {
2998 current_annotation = "user clip distances";
2999
3000 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3001 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3002
3003 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3004 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3005 }
3006
3007 /* We may need to split this up into several URB writes, so do them in a
3008 * loop.
3009 */
3010 int slot = 0;
3011 bool complete = false;
3012 do {
3013 /* URB offset is in URB row increments, and each of our MRFs is half of
3014 * one of those, since we're doing interleaved writes.
3015 */
3016 int offset = slot / 2;
3017
3018 mrf = base_mrf + 1;
3019 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3020 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3021
3022 /* If this was max_usable_mrf, we can't fit anything more into this
3023 * URB WRITE.
3024 */
3025 if (mrf > max_usable_mrf) {
3026 slot++;
3027 break;
3028 }
3029 }
3030
3031 complete = slot >= prog_data->vue_map.num_slots;
3032 current_annotation = "URB write";
3033 vec4_instruction *inst = emit_urb_write_opcode(complete);
3034 inst->base_mrf = base_mrf;
3035 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3036 inst->offset += offset;
3037 } while(!complete);
3038 }
3039
3040
3041 src_reg
3042 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3043 src_reg *reladdr, int reg_offset)
3044 {
3045 /* Because we store the values to scratch interleaved like our
3046 * vertex data, we need to scale the vec4 index by 2.
3047 */
3048 int message_header_scale = 2;
3049
3050 /* Pre-gen6, the message header uses byte offsets instead of vec4
3051 * (16-byte) offset units.
3052 */
3053 if (brw->gen < 6)
3054 message_header_scale *= 16;
3055
3056 if (reladdr) {
3057 src_reg index = src_reg(this, glsl_type::int_type);
3058
3059 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3060 emit_before(inst, MUL(dst_reg(index),
3061 index, src_reg(message_header_scale)));
3062
3063 return index;
3064 } else {
3065 return src_reg(reg_offset * message_header_scale);
3066 }
3067 }
3068
3069 src_reg
3070 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3071 src_reg *reladdr, int reg_offset)
3072 {
3073 if (reladdr) {
3074 src_reg index = src_reg(this, glsl_type::int_type);
3075
3076 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3077
3078 /* Pre-gen6, the message header uses byte offsets instead of vec4
3079 * (16-byte) offset units.
3080 */
3081 if (brw->gen < 6) {
3082 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3083 }
3084
3085 return index;
3086 } else if (brw->gen >= 8) {
3087 /* Store the offset in a GRF so we can send-from-GRF. */
3088 src_reg offset = src_reg(this, glsl_type::int_type);
3089 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3090 return offset;
3091 } else {
3092 int message_header_scale = brw->gen < 6 ? 16 : 1;
3093 return src_reg(reg_offset * message_header_scale);
3094 }
3095 }
3096
3097 /**
3098 * Emits an instruction before @inst to load the value named by @orig_src
3099 * from scratch space at @base_offset to @temp.
3100 *
3101 * @base_offset is measured in 32-byte units (the size of a register).
3102 */
3103 void
3104 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3105 dst_reg temp, src_reg orig_src,
3106 int base_offset)
3107 {
3108 int reg_offset = base_offset + orig_src.reg_offset;
3109 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3110
3111 emit_before(inst, SCRATCH_READ(temp, index));
3112 }
3113
3114 /**
3115 * Emits an instruction after @inst to store the value to be written
3116 * to @orig_dst to scratch space at @base_offset, from @temp.
3117 *
3118 * @base_offset is measured in 32-byte units (the size of a register).
3119 */
3120 void
3121 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3122 {
3123 int reg_offset = base_offset + inst->dst.reg_offset;
3124 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3125
3126 /* Create a temporary register to store *inst's result in.
3127 *
3128 * We have to be careful in MOVing from our temporary result register in
3129 * the scratch write. If we swizzle from channels of the temporary that
3130 * weren't initialized, it will confuse live interval analysis, which will
3131 * make spilling fail to make progress.
3132 */
3133 src_reg temp = src_reg(this, glsl_type::vec4_type);
3134 temp.type = inst->dst.type;
3135 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3136 int swizzles[4];
3137 for (int i = 0; i < 4; i++)
3138 if (inst->dst.writemask & (1 << i))
3139 swizzles[i] = i;
3140 else
3141 swizzles[i] = first_writemask_chan;
3142 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3143 swizzles[2], swizzles[3]);
3144
3145 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3146 inst->dst.writemask));
3147 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3148 write->predicate = inst->predicate;
3149 write->ir = inst->ir;
3150 write->annotation = inst->annotation;
3151 inst->insert_after(write);
3152
3153 inst->dst.file = temp.file;
3154 inst->dst.reg = temp.reg;
3155 inst->dst.reg_offset = temp.reg_offset;
3156 inst->dst.reladdr = NULL;
3157 }
3158
3159 /**
3160 * We can't generally support array access in GRF space, because a
3161 * single instruction's destination can only span 2 contiguous
3162 * registers. So, we send all GRF arrays that get variable index
3163 * access to scratch space.
3164 */
3165 void
3166 vec4_visitor::move_grf_array_access_to_scratch()
3167 {
3168 int scratch_loc[this->virtual_grf_count];
3169
3170 for (int i = 0; i < this->virtual_grf_count; i++) {
3171 scratch_loc[i] = -1;
3172 }
3173
3174 /* First, calculate the set of virtual GRFs that need to be punted
3175 * to scratch due to having any array access on them, and where in
3176 * scratch.
3177 */
3178 foreach_list(node, &this->instructions) {
3179 vec4_instruction *inst = (vec4_instruction *)node;
3180
3181 if (inst->dst.file == GRF && inst->dst.reladdr &&
3182 scratch_loc[inst->dst.reg] == -1) {
3183 scratch_loc[inst->dst.reg] = c->last_scratch;
3184 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3185 }
3186
3187 for (int i = 0 ; i < 3; i++) {
3188 src_reg *src = &inst->src[i];
3189
3190 if (src->file == GRF && src->reladdr &&
3191 scratch_loc[src->reg] == -1) {
3192 scratch_loc[src->reg] = c->last_scratch;
3193 c->last_scratch += this->virtual_grf_sizes[src->reg];
3194 }
3195 }
3196 }
3197
3198 /* Now, for anything that will be accessed through scratch, rewrite
3199 * it to load/store. Note that this is a _safe list walk, because
3200 * we may generate a new scratch_write instruction after the one
3201 * we're processing.
3202 */
3203 foreach_list_safe(node, &this->instructions) {
3204 vec4_instruction *inst = (vec4_instruction *)node;
3205
3206 /* Set up the annotation tracking for new generated instructions. */
3207 base_ir = inst->ir;
3208 current_annotation = inst->annotation;
3209
3210 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3211 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3212 }
3213
3214 for (int i = 0 ; i < 3; i++) {
3215 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3216 continue;
3217
3218 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3219
3220 emit_scratch_read(inst, temp, inst->src[i],
3221 scratch_loc[inst->src[i].reg]);
3222
3223 inst->src[i].file = temp.file;
3224 inst->src[i].reg = temp.reg;
3225 inst->src[i].reg_offset = temp.reg_offset;
3226 inst->src[i].reladdr = NULL;
3227 }
3228 }
3229 }
3230
3231 /**
3232 * Emits an instruction before @inst to load the value named by @orig_src
3233 * from the pull constant buffer (surface) at @base_offset to @temp.
3234 */
3235 void
3236 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3237 dst_reg temp, src_reg orig_src,
3238 int base_offset)
3239 {
3240 int reg_offset = base_offset + orig_src.reg_offset;
3241 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3242 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3243 vec4_instruction *load;
3244
3245 if (brw->gen >= 7) {
3246 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3247 grf_offset.type = offset.type;
3248 emit_before(inst, MOV(grf_offset, offset));
3249
3250 load = new(mem_ctx) vec4_instruction(this,
3251 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3252 temp, index, src_reg(grf_offset));
3253 } else {
3254 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3255 temp, index, offset);
3256 load->base_mrf = 14;
3257 load->mlen = 1;
3258 }
3259 emit_before(inst, load);
3260 }
3261
3262 /**
3263 * Implements array access of uniforms by inserting a
3264 * PULL_CONSTANT_LOAD instruction.
3265 *
3266 * Unlike temporary GRF array access (which we don't support, due to
3267 * the difficulty of doing relative addressing on instruction
3268 * destinations), we could potentially do array access of uniforms
3269 * that were loaded in GRF space as push constants. In real-world
3270 * usage we've seen, though, the arrays being used are always larger
3271 * than we could load as push constants, so just always move all
3272 * uniform array access out to a pull constant buffer.
3273 */
3274 void
3275 vec4_visitor::move_uniform_array_access_to_pull_constants()
3276 {
3277 int pull_constant_loc[this->uniforms];
3278
3279 for (int i = 0; i < this->uniforms; i++) {
3280 pull_constant_loc[i] = -1;
3281 }
3282
3283 /* Walk through and find array access of uniforms. Put a copy of that
3284 * uniform in the pull constant buffer.
3285 *
3286 * Note that we don't move constant-indexed accesses to arrays. No
3287 * testing has been done of the performance impact of this choice.
3288 */
3289 foreach_list_safe(node, &this->instructions) {
3290 vec4_instruction *inst = (vec4_instruction *)node;
3291
3292 for (int i = 0 ; i < 3; i++) {
3293 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3294 continue;
3295
3296 int uniform = inst->src[i].reg;
3297
3298 /* If this array isn't already present in the pull constant buffer,
3299 * add it.
3300 */
3301 if (pull_constant_loc[uniform] == -1) {
3302 const float **values = &stage_prog_data->param[uniform * 4];
3303
3304 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3305
3306 assert(uniform < uniform_array_size);
3307 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3308 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3309 = values[j];
3310 }
3311 }
3312
3313 /* Set up the annotation tracking for new generated instructions. */
3314 base_ir = inst->ir;
3315 current_annotation = inst->annotation;
3316
3317 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3318
3319 emit_pull_constant_load(inst, temp, inst->src[i],
3320 pull_constant_loc[uniform]);
3321
3322 inst->src[i].file = temp.file;
3323 inst->src[i].reg = temp.reg;
3324 inst->src[i].reg_offset = temp.reg_offset;
3325 inst->src[i].reladdr = NULL;
3326 }
3327 }
3328
3329 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3330 * no need to track them as larger-than-vec4 objects. This will be
3331 * relied on in cutting out unused uniform vectors from push
3332 * constants.
3333 */
3334 split_uniform_registers();
3335 }
3336
3337 void
3338 vec4_visitor::resolve_ud_negate(src_reg *reg)
3339 {
3340 if (reg->type != BRW_REGISTER_TYPE_UD ||
3341 !reg->negate)
3342 return;
3343
3344 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3345 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3346 *reg = temp;
3347 }
3348
3349 vec4_visitor::vec4_visitor(struct brw_context *brw,
3350 struct brw_vec4_compile *c,
3351 struct gl_program *prog,
3352 const struct brw_vec4_prog_key *key,
3353 struct brw_vec4_prog_data *prog_data,
3354 struct gl_shader_program *shader_prog,
3355 gl_shader_stage stage,
3356 void *mem_ctx,
3357 bool debug_flag,
3358 bool no_spills,
3359 shader_time_shader_type st_base,
3360 shader_time_shader_type st_written,
3361 shader_time_shader_type st_reset)
3362 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3363 c(c),
3364 key(key),
3365 prog_data(prog_data),
3366 sanity_param_count(0),
3367 fail_msg(NULL),
3368 first_non_payload_grf(0),
3369 need_all_constants_in_pull_buffer(false),
3370 debug_flag(debug_flag),
3371 no_spills(no_spills),
3372 st_base(st_base),
3373 st_written(st_written),
3374 st_reset(st_reset)
3375 {
3376 this->mem_ctx = mem_ctx;
3377 this->failed = false;
3378
3379 this->base_ir = NULL;
3380 this->current_annotation = NULL;
3381 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3382
3383 this->variable_ht = hash_table_ctor(0,
3384 hash_table_pointer_hash,
3385 hash_table_pointer_compare);
3386
3387 this->virtual_grf_start = NULL;
3388 this->virtual_grf_end = NULL;
3389 this->virtual_grf_sizes = NULL;
3390 this->virtual_grf_count = 0;
3391 this->virtual_grf_reg_map = NULL;
3392 this->virtual_grf_reg_count = 0;
3393 this->virtual_grf_array_size = 0;
3394 this->live_intervals_valid = false;
3395
3396 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3397
3398 this->uniforms = 0;
3399
3400 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3401 * at least one. See setup_uniforms() in brw_vec4.cpp.
3402 */
3403 this->uniform_array_size = 1;
3404 if (prog_data) {
3405 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3406 }
3407
3408 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3409 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3410 }
3411
3412 vec4_visitor::~vec4_visitor()
3413 {
3414 hash_table_dtor(this->variable_ht);
3415 }
3416
3417
3418 void
3419 vec4_visitor::fail(const char *format, ...)
3420 {
3421 va_list va;
3422 char *msg;
3423
3424 if (failed)
3425 return;
3426
3427 failed = true;
3428
3429 va_start(va, format);
3430 msg = ralloc_vasprintf(mem_ctx, format, va);
3431 va_end(va);
3432 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3433
3434 this->fail_msg = msg;
3435
3436 if (debug_flag) {
3437 fprintf(stderr, "%s", msg);
3438 }
3439 }
3440
3441 } /* namespace brw */