i965/vec4: Collect all emits of texture ops into one place
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(enum brw_predicate predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1,
199 enum brw_conditional_mod condition)
200 {
201 assert(brw->gen == 6);
202
203 vec4_instruction *inst;
204
205 resolve_ud_negate(&src0);
206 resolve_ud_negate(&src1);
207
208 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
209 src0, src1);
210 inst->conditional_mod = condition;
211
212 return inst;
213 }
214
215 /**
216 * CMP: Sets the low bit of the destination channels with the result
217 * of the comparison, while the upper bits are undefined, and updates
218 * the flag register with the packed 16 bits of the result.
219 */
220 vec4_instruction *
221 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
222 enum brw_conditional_mod condition)
223 {
224 vec4_instruction *inst;
225
226 /* original gen4 does type conversion to the destination type
227 * before comparison, producing garbage results for floating
228 * point comparisons.
229 */
230 if (brw->gen == 4) {
231 dst.type = src0.type;
232 if (dst.file == HW_REG)
233 dst.fixed_hw_reg.type = dst.type;
234 }
235
236 resolve_ud_negate(&src0);
237 resolve_ud_negate(&src1);
238
239 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
240 inst->conditional_mod = condition;
241
242 return inst;
243 }
244
245 vec4_instruction *
246 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
247 {
248 vec4_instruction *inst;
249
250 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
251 dst, index);
252 inst->base_mrf = 14;
253 inst->mlen = 2;
254
255 return inst;
256 }
257
258 vec4_instruction *
259 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
260 const src_reg &index)
261 {
262 vec4_instruction *inst;
263
264 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
265 dst, src, index);
266 inst->base_mrf = 13;
267 inst->mlen = 3;
268
269 return inst;
270 }
271
272 void
273 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
274 {
275 static enum opcode dot_opcodes[] = {
276 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
277 };
278
279 emit(dot_opcodes[elements - 2], dst, src0, src1);
280 }
281
282 src_reg
283 vec4_visitor::fix_3src_operand(src_reg src)
284 {
285 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
286 * able to use vertical stride of zero to replicate the vec4 uniform, like
287 *
288 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
289 *
290 * But you can't, since vertical stride is always four in three-source
291 * instructions. Instead, insert a MOV instruction to do the replication so
292 * that the three-source instruction can consume it.
293 */
294
295 /* The MOV is only needed if the source is a uniform or immediate. */
296 if (src.file != UNIFORM && src.file != IMM)
297 return src;
298
299 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
300 return src;
301
302 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
303 expanded.type = src.type;
304 emit(MOV(expanded, src));
305 return src_reg(expanded);
306 }
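/* Illustrative sketch (not from the driver): the effect of the fix-up above on
 * a three-source instruction whose second source is a vec4 uniform, written as
 * vec4 IR.  Register names are hypothetical.
 *
 *   before:  mad dst, src0, u1.xyzw, src2     (uniform can't be replicated in SIMD4x2)
 *   after:   mov tmp, u1.xyzw
 *            mad dst, src0, tmp, src2
 */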
307
308 src_reg
309 vec4_visitor::fix_math_operand(src_reg src)
310 {
311 /* The gen6 math instruction ignores the source modifiers --
312 * swizzle, abs, negate, and at least some parts of the register
313 * region description.
314 *
315 * Rather than trying to enumerate all these cases, *always* expand the
316 * operand to a temp GRF for gen6.
317 *
318 * For gen7, keep the operand as-is, except if immediate, which gen7 still
319 * can't use.
320 */
321
322 if (brw->gen == 7 && src.file != IMM)
323 return src;
324
325 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
326 expanded.type = src.type;
327 emit(MOV(expanded, src));
328 return src_reg(expanded);
329 }
330
331 void
332 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
333 {
334 src = fix_math_operand(src);
335
336 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
337 /* The gen6 math instruction must be align1, so we can't do
338 * writemasks.
339 */
340 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
341
342 emit(opcode, temp_dst, src);
343
344 emit(MOV(dst, src_reg(temp_dst)));
345 } else {
346 emit(opcode, dst, src);
347 }
348 }
349
350 void
351 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
352 {
353 vec4_instruction *inst = emit(opcode, dst, src);
354 inst->base_mrf = 1;
355 inst->mlen = 1;
356 }
357
358 void
359 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
360 {
361 switch (opcode) {
362 case SHADER_OPCODE_RCP:
363 case SHADER_OPCODE_RSQ:
364 case SHADER_OPCODE_SQRT:
365 case SHADER_OPCODE_EXP2:
366 case SHADER_OPCODE_LOG2:
367 case SHADER_OPCODE_SIN:
368 case SHADER_OPCODE_COS:
369 break;
370 default:
371 unreachable("not reached: bad math opcode");
372 }
373
374 if (brw->gen >= 8) {
375 emit(opcode, dst, src);
376 } else if (brw->gen >= 6) {
377 emit_math1_gen6(opcode, dst, src);
378 } else {
379 emit_math1_gen4(opcode, dst, src);
380 }
381 }
382
383 void
384 vec4_visitor::emit_math2_gen6(enum opcode opcode,
385 dst_reg dst, src_reg src0, src_reg src1)
386 {
387 src0 = fix_math_operand(src0);
388 src1 = fix_math_operand(src1);
389
390 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
391 /* The gen6 math instruction must be align1, so we can't do
392 * writemasks.
393 */
394 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
395 temp_dst.type = dst.type;
396
397 emit(opcode, temp_dst, src0, src1);
398
399 emit(MOV(dst, src_reg(temp_dst)));
400 } else {
401 emit(opcode, dst, src0, src1);
402 }
403 }
404
405 void
406 vec4_visitor::emit_math2_gen4(enum opcode opcode,
407 dst_reg dst, src_reg src0, src_reg src1)
408 {
409 vec4_instruction *inst = emit(opcode, dst, src0, src1);
410 inst->base_mrf = 1;
411 inst->mlen = 2;
412 }
413
414 void
415 vec4_visitor::emit_math(enum opcode opcode,
416 dst_reg dst, src_reg src0, src_reg src1)
417 {
418 switch (opcode) {
419 case SHADER_OPCODE_POW:
420 case SHADER_OPCODE_INT_QUOTIENT:
421 case SHADER_OPCODE_INT_REMAINDER:
422 break;
423 default:
424 unreachable("not reached: unsupported binary math opcode");
425 }
426
427 if (brw->gen >= 8) {
428 emit(opcode, dst, src0, src1);
429 } else if (brw->gen >= 6) {
430 emit_math2_gen6(opcode, dst, src0, src1);
431 } else {
432 emit_math2_gen4(opcode, dst, src0, src1);
433 }
434 }
435
436 void
437 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
438 {
439 if (brw->gen < 7) {
440 unreachable("ir_unop_pack_half_2x16 should be lowered");
441 }
442
443 assert(dst.type == BRW_REGISTER_TYPE_UD);
444 assert(src0.type == BRW_REGISTER_TYPE_F);
445
446 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
447 *
448 * Because this instruction does not have a 16-bit floating-point type,
449 * the destination data type must be Word (W).
450 *
451 * The destination must be DWord-aligned and specify a horizontal stride
452 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
453 * each destination channel and the upper word is not modified.
454 *
455 * The above restriction implies that the f32to16 instruction must use
456 * align1 mode, because only in align1 mode is it possible to specify
457 * horizontal stride. We choose here to defy the hardware docs and emit
458 * align16 instructions.
459 *
460 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
461 * instructions. I was partially successful in that the code passed all
462 * tests. However, the code was dubiously correct and fragile, and the
463 * tests were not harsh enough to probe that frailty. Not trusting the
464 * code, I chose instead to remain in align16 mode in defiance of the hw
465 * docs).
466 *
467 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
468 * simulator, emitting a f32to16 in align16 mode with UD as destination
469 * data type is safe. The behavior differs from that specified in the PRM
470 * in that the upper word of each destination channel is cleared to 0.
471 */
472
473 dst_reg tmp_dst(this, glsl_type::uvec2_type);
474 src_reg tmp_src(tmp_dst);
475
476 #if 0
477 /* Verify the undocumented behavior on which the following instructions
478 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
479 * then the result of the bit-or instruction below will be incorrect.
480 *
481 * You should inspect the disasm output in order to verify that the MOV is
482 * not optimized away.
483 */
484 emit(MOV(tmp_dst, src_reg(0x12345678u)));
485 #endif
486
487 /* Give tmp the form below, where "." means untouched.
488 *
489 * w z y x w z y x
490 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
491 *
492 * That the upper word of each write-channel be 0 is required for the
493 * following bit-shift and bit-or instructions to work. Note that this
494 * relies on the undocumented hardware behavior mentioned above.
495 */
496 tmp_dst.writemask = WRITEMASK_XY;
497 emit(F32TO16(tmp_dst, src0));
498
499 /* Give the write-channels of dst the form:
500 * 0xhhhh0000
501 */
502 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
503 emit(SHL(dst, tmp_src, src_reg(16u)));
504
505 /* Finally, give the write-channels of dst the form of packHalf2x16's
506 * output:
507 * 0xhhhhllll
508 */
509 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
510 emit(OR(dst, src_reg(dst), tmp_src));
511 }
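/* Illustrative sketch (not part of the driver): the shift-and-or step above,
 * done on the CPU with already-converted half-float bit patterns for the
 * hypothetical input vec2(1.0, -2.0).
 */
#if 0
   /* After F32TO16 with writemask .xy, tmp conceptually holds:
    *   tmp.x = 0x00003c00   (1.0 as a half, upper word cleared)
    *   tmp.y = 0x0000c000   (-2.0 as a half, upper word cleared)
    */
   uint32_t tmp_x = 0x00003c00;
   uint32_t tmp_y = 0x0000c000;
   uint32_t packed = (tmp_y << 16) | tmp_x;   /* 0xc0003c00 == packHalf2x16(vec2(1.0, -2.0)) */
#endif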
512
513 void
514 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
515 {
516 if (brw->gen < 7) {
517 unreachable("ir_unop_unpack_half_2x16 should be lowered");
518 }
519
520 assert(dst.type == BRW_REGISTER_TYPE_F);
521 assert(src0.type == BRW_REGISTER_TYPE_UD);
522
523 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
524 *
525 * Because this instruction does not have a 16-bit floating-point type,
526 * the source data type must be Word (W). The destination type must be
527 * F (Float).
528 *
529 * To use W as the source data type, we must adjust horizontal strides,
530 * which is only possible in align1 mode. All my [chadv] attempts at
531 * emitting align1 instructions for unpackHalf2x16 failed to pass the
532 * Piglit tests, so I gave up.
533 *
534 * I've verified that, on gen7 hardware and the simulator, it is safe to
535 * emit f16to32 in align16 mode with UD as source data type.
536 */
537
538 dst_reg tmp_dst(this, glsl_type::uvec2_type);
539 src_reg tmp_src(tmp_dst);
540
541 tmp_dst.writemask = WRITEMASK_X;
542 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
543
544 tmp_dst.writemask = WRITEMASK_Y;
545 emit(SHR(tmp_dst, src0, src_reg(16u)));
546
547 dst.writemask = WRITEMASK_XY;
548 emit(F16TO32(dst, tmp_src));
549 }
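/* Illustrative sketch (not part of the driver): the mask-and-shift step above,
 * done on the CPU for the hypothetical packed input 0xc0003c00.
 */
#if 0
   uint32_t packed = 0xc0003c00;      /* packHalf2x16(vec2(1.0, -2.0)) */
   uint32_t lo = packed & 0xffffu;    /* 0x3c00 -> F16TO32 -> dst.x = 1.0  */
   uint32_t hi = packed >> 16;        /* 0xc000 -> F16TO32 -> dst.y = -2.0 */
#endif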
550
551 void
552 vec4_visitor::visit_instructions(const exec_list *list)
553 {
554 foreach_in_list(ir_instruction, ir, list) {
555 base_ir = ir;
556 ir->accept(this);
557 }
558 }
559
560
561 static int
562 type_size(const struct glsl_type *type)
563 {
564 unsigned int i;
565 int size;
566
567 switch (type->base_type) {
568 case GLSL_TYPE_UINT:
569 case GLSL_TYPE_INT:
570 case GLSL_TYPE_FLOAT:
571 case GLSL_TYPE_BOOL:
572 if (type->is_matrix()) {
573 return type->matrix_columns;
574 } else {
575 /* Regardless of the size of the vector, it gets a vec4. This is bad
576 * packing for things like floats, but otherwise arrays become a
577 * mess. Hopefully a later pass over the code can pack scalars
578 * down if appropriate.
579 */
580 return 1;
581 }
582 case GLSL_TYPE_ARRAY:
583 assert(type->length > 0);
584 return type_size(type->fields.array) * type->length;
585 case GLSL_TYPE_STRUCT:
586 size = 0;
587 for (i = 0; i < type->length; i++) {
588 size += type_size(type->fields.structure[i].type);
589 }
590 return size;
591 case GLSL_TYPE_SAMPLER:
592 /* Samplers take up one slot in UNIFORMS[], but they're baked in
593 * at link time.
594 */
595 return 1;
596 case GLSL_TYPE_ATOMIC_UINT:
597 return 0;
598 case GLSL_TYPE_IMAGE:
599 case GLSL_TYPE_VOID:
600 case GLSL_TYPE_ERROR:
601 case GLSL_TYPE_INTERFACE:
602 unreachable("not reached");
603 }
604
605 return 0;
606 }
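/* Worked examples of the slot counts above (illustrative, in vec4 slots):
 *   float, vec2, vec3, vec4           -> 1   (every scalar/vector pads out to a vec4)
 *   mat3                              -> 3   (one slot per column)
 *   vec4[8]                           -> 8   (element size 1 * array length 8)
 *   struct { vec3 a; float b[2]; }    -> 1 + 2 = 3
 */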
607
608 int
609 vec4_visitor::virtual_grf_alloc(int size)
610 {
611 if (virtual_grf_array_size <= virtual_grf_count) {
612 if (virtual_grf_array_size == 0)
613 virtual_grf_array_size = 16;
614 else
615 virtual_grf_array_size *= 2;
616 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
617 virtual_grf_array_size);
618 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
619 virtual_grf_array_size);
620 }
621 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
622 virtual_grf_reg_count += size;
623 virtual_grf_sizes[virtual_grf_count] = size;
624 return virtual_grf_count++;
625 }
626
627 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
628 {
629 init();
630
631 this->file = GRF;
632 this->reg = v->virtual_grf_alloc(type_size(type));
633
634 if (type->is_array() || type->is_record()) {
635 this->swizzle = BRW_SWIZZLE_NOOP;
636 } else {
637 this->swizzle = swizzle_for_size(type->vector_elements);
638 }
639
640 this->type = brw_type_for_base_type(type);
641 }
642
643 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
644 {
645 init();
646
647 this->file = GRF;
648 this->reg = v->virtual_grf_alloc(type_size(type));
649
650 if (type->is_array() || type->is_record()) {
651 this->writemask = WRITEMASK_XYZW;
652 } else {
653 this->writemask = (1 << type->vector_elements) - 1;
654 }
655
656 this->type = brw_type_for_base_type(type);
657 }
658
659 /* Our support for uniforms is piggy-backed on the struct
660 * gl_fragment_program, because that's where the values actually
661 * get stored, rather than in some global gl_shader_program uniform
662 * store.
663 */
664 void
665 vec4_visitor::setup_uniform_values(ir_variable *ir)
666 {
667 int namelen = strlen(ir->name);
668
669 /* The data for our (non-builtin) uniforms is stored in a series of
670 * gl_uniform_driver_storage structs for each subcomponent that
671 * glGetUniformLocation() could name. We know it's been set up in the same
672 * order we'd walk the type, so walk the list of storage and find anything
673 * with our name, or the prefix of a component that starts with our name.
674 */
675 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
676 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
677
678 if (strncmp(ir->name, storage->name, namelen) != 0 ||
679 (storage->name[namelen] != 0 &&
680 storage->name[namelen] != '.' &&
681 storage->name[namelen] != '[')) {
682 continue;
683 }
684
685 gl_constant_value *components = storage->storage;
686 unsigned vector_count = (MAX2(storage->array_elements, 1) *
687 storage->type->matrix_columns);
688
689 for (unsigned s = 0; s < vector_count; s++) {
690 assert(uniforms < uniform_array_size);
691 uniform_vector_size[uniforms] = storage->type->vector_elements;
692
693 int i;
694 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
695 stage_prog_data->param[uniforms * 4 + i] = &components->f;
696 components++;
697 }
698 for (; i < 4; i++) {
699 static float zero = 0;
700 stage_prog_data->param[uniforms * 4 + i] = &zero;
701 }
702
703 uniforms++;
704 }
705 }
706 }
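/* Illustrative sketch (hypothetical helper, not part of the driver): the
 * name-matching rule used in the loop above.  An ir name of "lights" matches
 * storage entries named "lights", "lights[0].position" or "lights.count",
 * but not "lightscale".
 */
#if 0
static bool
matches_uniform_storage_name(const char *ir_name, const char *storage_name)
{
   size_t namelen = strlen(ir_name);
   return strncmp(ir_name, storage_name, namelen) == 0 &&
          (storage_name[namelen] == '\0' ||
           storage_name[namelen] == '.' ||
           storage_name[namelen] == '[');
}
#endif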
707
708 void
709 vec4_visitor::setup_uniform_clipplane_values()
710 {
711 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
712
713 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
714 assert(this->uniforms < uniform_array_size);
715 this->uniform_vector_size[this->uniforms] = 4;
716 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
717 this->userplane[i].type = BRW_REGISTER_TYPE_F;
718 for (int j = 0; j < 4; ++j) {
719 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
744
745 assert(this->uniforms < uniform_array_size);
746 this->uniform_vector_size[this->uniforms] = 0;
747 /* Add each of the unique swizzled channels of the element.
748 * This will end up matching the size of the glsl_type of this field.
749 */
750 int last_swiz = -1;
751 for (unsigned int j = 0; j < 4; j++) {
752 int swiz = GET_SWZ(slots[i].swizzle, j);
753 last_swiz = swiz;
754
755 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
756 assert(this->uniforms < uniform_array_size);
757 if (swiz <= last_swiz)
758 this->uniform_vector_size[this->uniforms]++;
759 }
760 this->uniforms++;
761 }
762 }
763
764 dst_reg *
765 vec4_visitor::variable_storage(ir_variable *var)
766 {
767 return (dst_reg *)hash_table_find(this->variable_ht, var);
768 }
769
770 void
771 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
772 enum brw_predicate *predicate)
773 {
774 ir_expression *expr = ir->as_expression();
775
776 *predicate = BRW_PREDICATE_NORMAL;
777
778 if (expr) {
779 src_reg op[2];
780 vec4_instruction *inst;
781
782 assert(expr->get_num_operands() <= 2);
783 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
784 expr->operands[i]->accept(this);
785 op[i] = this->result;
786
787 resolve_ud_negate(&op[i]);
788 }
789
790 switch (expr->operation) {
791 case ir_unop_logic_not:
792 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
793 inst->conditional_mod = BRW_CONDITIONAL_Z;
794 break;
795
796 case ir_binop_logic_xor:
797 inst = emit(XOR(dst_null_d(), op[0], op[1]));
798 inst->conditional_mod = BRW_CONDITIONAL_NZ;
799 break;
800
801 case ir_binop_logic_or:
802 inst = emit(OR(dst_null_d(), op[0], op[1]));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 break;
805
806 case ir_binop_logic_and:
807 inst = emit(AND(dst_null_d(), op[0], op[1]));
808 inst->conditional_mod = BRW_CONDITIONAL_NZ;
809 break;
810
811 case ir_unop_f2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_f(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_unop_i2b:
821 if (brw->gen >= 6) {
822 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
823 } else {
824 inst = emit(MOV(dst_null_d(), op[0]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 }
827 break;
828
829 case ir_binop_all_equal:
830 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
831 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
832 break;
833
834 case ir_binop_any_nequal:
835 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
836 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
837 break;
838
839 case ir_unop_any:
840 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
841 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
842 break;
843
844 case ir_binop_greater:
845 case ir_binop_gequal:
846 case ir_binop_less:
847 case ir_binop_lequal:
848 case ir_binop_equal:
849 case ir_binop_nequal:
850 emit(CMP(dst_null_d(), op[0], op[1],
851 brw_conditional_for_comparison(expr->operation)));
852 break;
853
854 default:
855 unreachable("not reached");
856 }
857 return;
858 }
859
860 ir->accept(this);
861
862 resolve_ud_negate(&this->result);
863
864 if (brw->gen >= 6) {
865 vec4_instruction *inst = emit(AND(dst_null_d(),
866 this->result, src_reg(1)));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
868 } else {
869 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
870 inst->conditional_mod = BRW_CONDITIONAL_NZ;
871 }
872 }
873
874 /**
875 * Emit a gen6 IF statement with the comparison folded into the IF
876 * instruction.
877 */
878 void
879 vec4_visitor::emit_if_gen6(ir_if *ir)
880 {
881 ir_expression *expr = ir->condition->as_expression();
882
883 if (expr) {
884 src_reg op[2];
885 dst_reg temp;
886
887 assert(expr->get_num_operands() <= 2);
888 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
889 expr->operands[i]->accept(this);
890 op[i] = this->result;
891 }
892
893 switch (expr->operation) {
894 case ir_unop_logic_not:
895 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
896 return;
897
898 case ir_binop_logic_xor:
899 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
900 return;
901
902 case ir_binop_logic_or:
903 temp = dst_reg(this, glsl_type::bool_type);
904 emit(OR(temp, op[0], op[1]));
905 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
906 return;
907
908 case ir_binop_logic_and:
909 temp = dst_reg(this, glsl_type::bool_type);
910 emit(AND(temp, op[0], op[1]));
911 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
912 return;
913
914 case ir_unop_f2b:
915 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
916 return;
917
918 case ir_unop_i2b:
919 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
920 return;
921
922 case ir_binop_greater:
923 case ir_binop_gequal:
924 case ir_binop_less:
925 case ir_binop_lequal:
926 case ir_binop_equal:
927 case ir_binop_nequal:
928 emit(IF(op[0], op[1],
929 brw_conditional_for_comparison(expr->operation)));
930 return;
931
932 case ir_binop_all_equal:
933 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
934 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
935 return;
936
937 case ir_binop_any_nequal:
938 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
939 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
940 return;
941
942 case ir_unop_any:
943 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
944 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
945 return;
946
947 default:
948 unreachable("not reached");
949 }
950 return;
951 }
952
953 ir->condition->accept(this);
954
955 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
956 }
957
958 void
959 vec4_visitor::visit(ir_variable *ir)
960 {
961 dst_reg *reg = NULL;
962
963 if (variable_storage(ir))
964 return;
965
966 switch (ir->data.mode) {
967 case ir_var_shader_in:
968 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
969 break;
970
971 case ir_var_shader_out:
972 reg = new(mem_ctx) dst_reg(this, ir->type);
973
974 for (int i = 0; i < type_size(ir->type); i++) {
975 output_reg[ir->data.location + i] = *reg;
976 output_reg[ir->data.location + i].reg_offset = i;
977 output_reg[ir->data.location + i].type =
978 brw_type_for_base_type(ir->type->get_scalar_type());
979 output_reg_annotation[ir->data.location + i] = ir->name;
980 }
981 break;
982
983 case ir_var_auto:
984 case ir_var_temporary:
985 reg = new(mem_ctx) dst_reg(this, ir->type);
986 break;
987
988 case ir_var_uniform:
989 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
990
991 /* Thanks to the lower_ubo_reference pass, we will see only
992 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
993 * variables, so no need for them to be in variable_ht.
994 *
995 * Atomic counters take no uniform storage, no need to do
996 * anything here.
997 */
998 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
999 return;
1000
1001 /* Track how big the whole uniform variable is, in case we need to put a
1002 * copy of its data into pull constants for array access.
1003 */
1004 assert(this->uniforms < uniform_array_size);
1005 this->uniform_size[this->uniforms] = type_size(ir->type);
1006
1007 if (!strncmp(ir->name, "gl_", 3)) {
1008 setup_builtin_uniform_values(ir);
1009 } else {
1010 setup_uniform_values(ir);
1011 }
1012 break;
1013
1014 case ir_var_system_value:
1015 reg = make_reg_for_system_value(ir);
1016 break;
1017
1018 default:
1019 unreachable("not reached");
1020 }
1021
1022 reg->type = brw_type_for_base_type(ir->type);
1023 hash_table_insert(this->variable_ht, reg, ir);
1024 }
1025
1026 void
1027 vec4_visitor::visit(ir_loop *ir)
1028 {
1029 /* We don't want debugging output to print the whole body of the
1030 * loop as the annotation.
1031 */
1032 this->base_ir = NULL;
1033
1034 emit(BRW_OPCODE_DO);
1035
1036 visit_instructions(&ir->body_instructions);
1037
1038 emit(BRW_OPCODE_WHILE);
1039 }
1040
1041 void
1042 vec4_visitor::visit(ir_loop_jump *ir)
1043 {
1044 switch (ir->mode) {
1045 case ir_loop_jump::jump_break:
1046 emit(BRW_OPCODE_BREAK);
1047 break;
1048 case ir_loop_jump::jump_continue:
1049 emit(BRW_OPCODE_CONTINUE);
1050 break;
1051 }
1052 }
1053
1054
1055 void
1056 vec4_visitor::visit(ir_function_signature *)
1057 {
1058 unreachable("not reached");
1059 }
1060
1061 void
1062 vec4_visitor::visit(ir_function *ir)
1063 {
1064 /* Ignore function bodies other than main() -- we shouldn't see calls to
1065 * them since they should all be inlined.
1066 */
1067 if (strcmp(ir->name, "main") == 0) {
1068 const ir_function_signature *sig;
1069 exec_list empty;
1070
1071 sig = ir->matching_signature(NULL, &empty, false);
1072
1073 assert(sig);
1074
1075 visit_instructions(&sig->body);
1076 }
1077 }
1078
1079 bool
1080 vec4_visitor::try_emit_sat(ir_expression *ir)
1081 {
1082 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1083 if (!sat_src)
1084 return false;
1085
1086 sat_src->accept(this);
1087 src_reg src = this->result;
1088
1089 this->result = src_reg(this, ir->type);
1090 vec4_instruction *inst;
1091 inst = emit(MOV(dst_reg(this->result), src));
1092 inst->saturate = true;
1093
1094 return true;
1095 }
1096
1097 bool
1098 vec4_visitor::try_emit_mad(ir_expression *ir)
1099 {
1100 /* 3-src instructions were introduced in gen6. */
1101 if (brw->gen < 6)
1102 return false;
1103
1104 /* MAD can only handle floating-point data. */
1105 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1106 return false;
1107
1108 ir_rvalue *nonmul = ir->operands[1];
1109 ir_expression *mul = ir->operands[0]->as_expression();
1110
1111 if (!mul || mul->operation != ir_binop_mul) {
1112 nonmul = ir->operands[0];
1113 mul = ir->operands[1]->as_expression();
1114
1115 if (!mul || mul->operation != ir_binop_mul)
1116 return false;
1117 }
1118
1119 nonmul->accept(this);
1120 src_reg src0 = fix_3src_operand(this->result);
1121
1122 mul->operands[0]->accept(this);
1123 src_reg src1 = fix_3src_operand(this->result);
1124
1125 mul->operands[1]->accept(this);
1126 src_reg src2 = fix_3src_operand(this->result);
1127
1128 this->result = src_reg(this, ir->type);
1129 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1130
1131 return true;
1132 }
1133
1134 bool
1135 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1136 {
1137 ir_expression *const cmp = ir->operands[0]->as_expression();
1138
1139 if (cmp == NULL)
1140 return false;
1141
1142 switch (cmp->operation) {
1143 case ir_binop_less:
1144 case ir_binop_greater:
1145 case ir_binop_lequal:
1146 case ir_binop_gequal:
1147 case ir_binop_equal:
1148 case ir_binop_nequal:
1149 break;
1150
1151 default:
1152 return false;
1153 }
1154
1155 cmp->operands[0]->accept(this);
1156 const src_reg cmp_src0 = this->result;
1157
1158 cmp->operands[1]->accept(this);
1159 const src_reg cmp_src1 = this->result;
1160
1161 this->result = src_reg(this, ir->type);
1162
1163 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1164 brw_conditional_for_comparison(cmp->operation)));
1165
1166 /* If the comparison is false, this->result will just happen to be zero.
1167 */
1168 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1169 this->result, src_reg(1.0f));
1170 inst->predicate = BRW_PREDICATE_NORMAL;
1171 inst->predicate_inverse = true;
1172
1173 return true;
1174 }
1175
1176 void
1177 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1178 src_reg src0, src_reg src1)
1179 {
1180 vec4_instruction *inst;
1181
1182 if (brw->gen >= 6) {
1183 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1184 inst->conditional_mod = conditionalmod;
1185 } else {
1186 emit(CMP(dst, src0, src1, conditionalmod));
1187
1188 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1189 inst->predicate = BRW_PREDICATE_NORMAL;
1190 }
1191 }
1192
1193 void
1194 vec4_visitor::emit_lrp(const dst_reg &dst,
1195 const src_reg &x, const src_reg &y, const src_reg &a)
1196 {
1197 if (brw->gen >= 6) {
1198 /* Note that the instruction's argument order is reversed from GLSL
1199 * and the IR.
1200 */
1201 emit(LRP(dst,
1202 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1203 } else {
1204 /* Earlier generations don't support three source operations, so we
1205 * need to emit x*(1-a) + y*a.
1206 */
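      /* Worked example (illustrative): lrp(x=2.0, y=6.0, a=0.25)
       *   y*a = 1.5,  1-a = 0.75,  x*(1-a) = 1.5,  result = 1.5 + 1.5 = 3.0,
       * matching the usual form x + a*(y-x) = 2.0 + 0.25*4.0 = 3.0.
       */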
1207 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1208 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1209 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1210 y_times_a.writemask = dst.writemask;
1211 one_minus_a.writemask = dst.writemask;
1212 x_times_one_minus_a.writemask = dst.writemask;
1213
1214 emit(MUL(y_times_a, y, a));
1215 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1216 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1217 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1218 }
1219 }
1220
1221 void
1222 vec4_visitor::visit(ir_expression *ir)
1223 {
1224 unsigned int operand;
1225 src_reg op[Elements(ir->operands)];
1226 src_reg result_src;
1227 dst_reg result_dst;
1228 vec4_instruction *inst;
1229
1230 if (try_emit_sat(ir))
1231 return;
1232
1233 if (ir->operation == ir_binop_add) {
1234 if (try_emit_mad(ir))
1235 return;
1236 }
1237
1238 if (ir->operation == ir_unop_b2f) {
1239 if (try_emit_b2f_of_compare(ir))
1240 return;
1241 }
1242
1243 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1244 this->result.file = BAD_FILE;
1245 ir->operands[operand]->accept(this);
1246 if (this->result.file == BAD_FILE) {
1247 fprintf(stderr, "Failed to get tree for expression operand:\n");
1248 ir->operands[operand]->fprint(stderr);
1249 exit(1);
1250 }
1251 op[operand] = this->result;
1252
1253 /* Matrix expression operands should have been broken down to vector
1254 * operations already.
1255 */
1256 assert(!ir->operands[operand]->type->is_matrix());
1257 }
1258
1259 int vector_elements = ir->operands[0]->type->vector_elements;
1260 if (ir->operands[1]) {
1261 vector_elements = MAX2(vector_elements,
1262 ir->operands[1]->type->vector_elements);
1263 }
1264
1265 this->result.file = BAD_FILE;
1266
1267 /* Storage for our result. Ideally for an assignment we'd be using
1268 * the actual storage for the result here, instead.
1269 */
1270 result_src = src_reg(this, ir->type);
1271 /* convenience for the emit functions below. */
1272 result_dst = dst_reg(result_src);
1273 /* If nothing special happens, this is the result. */
1274 this->result = result_src;
1275 /* Limit writes to the channels that will be used by result_src later.
1276 * This does limit this temp's use as a temporary for multi-instruction
1277 * sequences.
1278 */
1279 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1280
1281 switch (ir->operation) {
1282 case ir_unop_logic_not:
1283 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1284 * the ones' complement of the whole register, not just bit 0.
1285 */
1286 emit(XOR(result_dst, op[0], src_reg(1)));
1287 break;
1288 case ir_unop_neg:
1289 op[0].negate = !op[0].negate;
1290 emit(MOV(result_dst, op[0]));
1291 break;
1292 case ir_unop_abs:
1293 op[0].abs = true;
1294 op[0].negate = false;
1295 emit(MOV(result_dst, op[0]));
1296 break;
1297
1298 case ir_unop_sign:
1299 if (ir->type->is_float()) {
1300 /* AND(val, 0x80000000) gives the sign bit.
1301 *
1302 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1303 * zero.
1304 */
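      /* Worked example (illustrative): sign(-3.5f)
       *   bits(-3.5f) = 0xc0600000;  AND 0x80000000 -> 0x80000000 (sign bit);
       *   the value is nonzero, so the predicated OR adds 0x3f800000, giving
       *   0xbf800000 == -1.0f.  For 0.0f the OR is skipped and the result
       *   stays zero.
       */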
1305 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1306
1307 op[0].type = BRW_REGISTER_TYPE_UD;
1308 result_dst.type = BRW_REGISTER_TYPE_UD;
1309 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1310
1311 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1312 inst->predicate = BRW_PREDICATE_NORMAL;
1313
1314 this->result.type = BRW_REGISTER_TYPE_F;
1315 } else {
1316 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1317 * -> non-negative val generates 0x00000000.
1318 * Predicated OR sets 1 if val is positive.
1319 */
1320 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1321
1322 emit(ASR(result_dst, op[0], src_reg(31)));
1323
1324 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1325 inst->predicate = BRW_PREDICATE_NORMAL;
1326 }
1327 break;
1328
1329 case ir_unop_rcp:
1330 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1331 break;
1332
1333 case ir_unop_exp2:
1334 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1335 break;
1336 case ir_unop_log2:
1337 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1338 break;
1339 case ir_unop_exp:
1340 case ir_unop_log:
1341 unreachable("not reached: should be handled by ir_explog_to_explog2");
1342 case ir_unop_sin:
1343 case ir_unop_sin_reduced:
1344 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1345 break;
1346 case ir_unop_cos:
1347 case ir_unop_cos_reduced:
1348 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1349 break;
1350
1351 case ir_unop_dFdx:
1352 case ir_unop_dFdy:
1353 unreachable("derivatives not valid in vertex shader");
1354
1355 case ir_unop_bitfield_reverse:
1356 emit(BFREV(result_dst, op[0]));
1357 break;
1358 case ir_unop_bit_count:
1359 emit(CBIT(result_dst, op[0]));
1360 break;
1361 case ir_unop_find_msb: {
1362 src_reg temp = src_reg(this, glsl_type::uint_type);
1363
1364 inst = emit(FBH(dst_reg(temp), op[0]));
1365 inst->dst.writemask = WRITEMASK_XYZW;
1366
1367 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1368 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1369 * subtract the result from 31 to convert the MSB count into an LSB count.
1370 */
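      /* Worked example (illustrative): findMSB(0x00000100u)
       *   FBH returns 23 (the position counted from the MSB side, i.e. 23
       *   leading zero bits), and 31 - 23 = 8, the LSB-side index GLSL expects.
       */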
1371
1372 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1373 temp.swizzle = BRW_SWIZZLE_NOOP;
1374 emit(MOV(result_dst, temp));
1375
1376 src_reg src_tmp = src_reg(result_dst);
1377 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1378
1379 src_tmp.negate = true;
1380 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1381 inst->predicate = BRW_PREDICATE_NORMAL;
1382 break;
1383 }
1384 case ir_unop_find_lsb:
1385 emit(FBL(result_dst, op[0]));
1386 break;
1387
1388 case ir_unop_noise:
1389 unreachable("not reached: should be handled by lower_noise");
1390
1391 case ir_binop_add:
1392 emit(ADD(result_dst, op[0], op[1]));
1393 break;
1394 case ir_binop_sub:
1395 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1396
1397 case ir_binop_mul:
1398 if (brw->gen < 8 && ir->type->is_integer()) {
1399 /* For integer multiplication, the MUL uses the low 16 bits of one of
1400 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1401 * accumulates the contribution of the upper 16 bits of that
1402 * operand. If we can determine that one of the args is in the low
1403 * 16 bits, though, we can just emit a single MUL.
1404 */
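      /* In other words (illustrative): writing b = b_hi * 0x10000 + b_lo,
       *   a * b = a * b_lo + ((a * b_hi) << 16)
       * The MUL supplies the first term and the MACH folds in the second via
       * the accumulator; when b is known to fit in 16 bits, b_hi == 0 and the
       * MUL alone is enough.
       */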
1405 if (ir->operands[0]->is_uint16_constant()) {
1406 if (brw->gen < 7)
1407 emit(MUL(result_dst, op[0], op[1]));
1408 else
1409 emit(MUL(result_dst, op[1], op[0]));
1410 } else if (ir->operands[1]->is_uint16_constant()) {
1411 if (brw->gen < 7)
1412 emit(MUL(result_dst, op[1], op[0]));
1413 else
1414 emit(MUL(result_dst, op[0], op[1]));
1415 } else {
1416 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1417
1418 emit(MUL(acc, op[0], op[1]));
1419 emit(MACH(dst_null_d(), op[0], op[1]));
1420 emit(MOV(result_dst, src_reg(acc)));
1421 }
1422 } else {
1423 emit(MUL(result_dst, op[0], op[1]));
1424 }
1425 break;
1426 case ir_binop_imul_high: {
1427 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1428
1429 emit(MUL(acc, op[0], op[1]));
1430 emit(MACH(result_dst, op[0], op[1]));
1431 break;
1432 }
1433 case ir_binop_div:
1434 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1435 assert(ir->type->is_integer());
1436 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1437 break;
1438 case ir_binop_carry: {
1439 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1440
1441 emit(ADDC(dst_null_ud(), op[0], op[1]));
1442 emit(MOV(result_dst, src_reg(acc)));
1443 break;
1444 }
1445 case ir_binop_borrow: {
1446 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1447
1448 emit(SUBB(dst_null_ud(), op[0], op[1]));
1449 emit(MOV(result_dst, src_reg(acc)));
1450 break;
1451 }
1452 case ir_binop_mod:
1453 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1454 assert(ir->type->is_integer());
1455 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1456 break;
1457
1458 case ir_binop_less:
1459 case ir_binop_greater:
1460 case ir_binop_lequal:
1461 case ir_binop_gequal:
1462 case ir_binop_equal:
1463 case ir_binop_nequal: {
1464 emit(CMP(result_dst, op[0], op[1],
1465 brw_conditional_for_comparison(ir->operation)));
1466 emit(AND(result_dst, result_src, src_reg(0x1)));
1467 break;
1468 }
1469
1470 case ir_binop_all_equal:
1471 /* "==" operator producing a scalar boolean. */
1472 if (ir->operands[0]->type->is_vector() ||
1473 ir->operands[1]->type->is_vector()) {
1474 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1475 emit(MOV(result_dst, src_reg(0)));
1476 inst = emit(MOV(result_dst, src_reg(1)));
1477 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1478 } else {
1479 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1480 emit(AND(result_dst, result_src, src_reg(0x1)));
1481 }
1482 break;
1483 case ir_binop_any_nequal:
1484 /* "!=" operator producing a scalar boolean. */
1485 if (ir->operands[0]->type->is_vector() ||
1486 ir->operands[1]->type->is_vector()) {
1487 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1488
1489 emit(MOV(result_dst, src_reg(0)));
1490 inst = emit(MOV(result_dst, src_reg(1)));
1491 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1492 } else {
1493 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1494 emit(AND(result_dst, result_src, src_reg(0x1)));
1495 }
1496 break;
1497
1498 case ir_unop_any:
1499 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1500 emit(MOV(result_dst, src_reg(0)));
1501
1502 inst = emit(MOV(result_dst, src_reg(1)));
1503 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1504 break;
1505
1506 case ir_binop_logic_xor:
1507 emit(XOR(result_dst, op[0], op[1]));
1508 break;
1509
1510 case ir_binop_logic_or:
1511 emit(OR(result_dst, op[0], op[1]));
1512 break;
1513
1514 case ir_binop_logic_and:
1515 emit(AND(result_dst, op[0], op[1]));
1516 break;
1517
1518 case ir_binop_dot:
1519 assert(ir->operands[0]->type->is_vector());
1520 assert(ir->operands[0]->type == ir->operands[1]->type);
1521 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1522 break;
1523
1524 case ir_unop_sqrt:
1525 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1526 break;
1527 case ir_unop_rsq:
1528 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1529 break;
1530
1531 case ir_unop_bitcast_i2f:
1532 case ir_unop_bitcast_u2f:
1533 this->result = op[0];
1534 this->result.type = BRW_REGISTER_TYPE_F;
1535 break;
1536
1537 case ir_unop_bitcast_f2i:
1538 this->result = op[0];
1539 this->result.type = BRW_REGISTER_TYPE_D;
1540 break;
1541
1542 case ir_unop_bitcast_f2u:
1543 this->result = op[0];
1544 this->result.type = BRW_REGISTER_TYPE_UD;
1545 break;
1546
1547 case ir_unop_i2f:
1548 case ir_unop_i2u:
1549 case ir_unop_u2i:
1550 case ir_unop_u2f:
1551 case ir_unop_b2f:
1552 case ir_unop_b2i:
1553 case ir_unop_f2i:
1554 case ir_unop_f2u:
1555 emit(MOV(result_dst, op[0]));
1556 break;
1557 case ir_unop_f2b:
1558 case ir_unop_i2b: {
1559 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1560 emit(AND(result_dst, result_src, src_reg(1)));
1561 break;
1562 }
1563
1564 case ir_unop_trunc:
1565 emit(RNDZ(result_dst, op[0]));
1566 break;
1567 case ir_unop_ceil:
1568 op[0].negate = !op[0].negate;
1569 inst = emit(RNDD(result_dst, op[0]));
1570 this->result.negate = true;
1571 break;
1572 case ir_unop_floor:
1573 inst = emit(RNDD(result_dst, op[0]));
1574 break;
1575 case ir_unop_fract:
1576 inst = emit(FRC(result_dst, op[0]));
1577 break;
1578 case ir_unop_round_even:
1579 emit(RNDE(result_dst, op[0]));
1580 break;
1581
1582 case ir_binop_min:
1583 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1584 break;
1585 case ir_binop_max:
1586 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1587 break;
1588
1589 case ir_binop_pow:
1590 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1591 break;
1592
1593 case ir_unop_bit_not:
1594 inst = emit(NOT(result_dst, op[0]));
1595 break;
1596 case ir_binop_bit_and:
1597 inst = emit(AND(result_dst, op[0], op[1]));
1598 break;
1599 case ir_binop_bit_xor:
1600 inst = emit(XOR(result_dst, op[0], op[1]));
1601 break;
1602 case ir_binop_bit_or:
1603 inst = emit(OR(result_dst, op[0], op[1]));
1604 break;
1605
1606 case ir_binop_lshift:
1607 inst = emit(SHL(result_dst, op[0], op[1]));
1608 break;
1609
1610 case ir_binop_rshift:
1611 if (ir->type->base_type == GLSL_TYPE_INT)
1612 inst = emit(ASR(result_dst, op[0], op[1]));
1613 else
1614 inst = emit(SHR(result_dst, op[0], op[1]));
1615 break;
1616
1617 case ir_binop_bfm:
1618 emit(BFI1(result_dst, op[0], op[1]));
1619 break;
1620
1621 case ir_binop_ubo_load: {
1622 ir_constant *uniform_block = ir->operands[0]->as_constant();
1623 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1624 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1625 src_reg offset;
1626
1627 /* Now, load the vector from that offset. */
1628 assert(ir->type->is_vector() || ir->type->is_scalar());
1629
1630 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1631 packed_consts.type = result.type;
1632 src_reg surf_index =
1633 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1634 if (const_offset_ir) {
1635 if (brw->gen >= 8) {
1636 /* Store the offset in a GRF so we can send-from-GRF. */
1637 offset = src_reg(this, glsl_type::int_type);
1638 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1639 } else {
1640 /* Immediates are fine on older generations since they'll be moved
1641 * to a (potentially fake) MRF at the generator level.
1642 */
1643 offset = src_reg(const_offset / 16);
1644 }
1645 } else {
1646 offset = src_reg(this, glsl_type::uint_type);
1647 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1648 }
1649
1650 if (brw->gen >= 7) {
1651 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1652 grf_offset.type = offset.type;
1653
1654 emit(MOV(grf_offset, offset));
1655
1656 emit(new(mem_ctx) vec4_instruction(this,
1657 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1658 dst_reg(packed_consts),
1659 surf_index,
1660 src_reg(grf_offset)));
1661 } else {
1662 vec4_instruction *pull =
1663 emit(new(mem_ctx) vec4_instruction(this,
1664 VS_OPCODE_PULL_CONSTANT_LOAD,
1665 dst_reg(packed_consts),
1666 surf_index,
1667 offset));
1668 pull->base_mrf = 14;
1669 pull->mlen = 1;
1670 }
1671
1672 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1673 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1674 const_offset % 16 / 4,
1675 const_offset % 16 / 4,
1676 const_offset % 16 / 4);
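      /* Worked example (illustrative): a scalar float UBO load at
       * const_offset == 20 reads the second 16-byte row (20 / 16 == 1) and
       * then broadcasts component 1 of it, since (20 % 16) / 4 == 1 turns the
       * .xxxx swizzle into .yyyy.
       */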
1677
1678 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1679 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1680 emit(CMP(result_dst, packed_consts, src_reg(0u),
1681 BRW_CONDITIONAL_NZ));
1682 emit(AND(result_dst, result, src_reg(0x1)));
1683 } else {
1684 emit(MOV(result_dst, packed_consts));
1685 }
1686 break;
1687 }
1688
1689 case ir_binop_vector_extract:
1690 unreachable("should have been lowered by vec_index_to_cond_assign");
1691
1692 case ir_triop_fma:
1693 op[0] = fix_3src_operand(op[0]);
1694 op[1] = fix_3src_operand(op[1]);
1695 op[2] = fix_3src_operand(op[2]);
1696 /* Note that the instruction's argument order is reversed from GLSL
1697 * and the IR.
1698 */
1699 emit(MAD(result_dst, op[2], op[1], op[0]));
1700 break;
1701
1702 case ir_triop_lrp:
1703 emit_lrp(result_dst, op[0], op[1], op[2]);
1704 break;
1705
1706 case ir_triop_csel:
1707 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1708 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1709 inst->predicate = BRW_PREDICATE_NORMAL;
1710 break;
1711
1712 case ir_triop_bfi:
1713 op[0] = fix_3src_operand(op[0]);
1714 op[1] = fix_3src_operand(op[1]);
1715 op[2] = fix_3src_operand(op[2]);
1716 emit(BFI2(result_dst, op[0], op[1], op[2]));
1717 break;
1718
1719 case ir_triop_bitfield_extract:
1720 op[0] = fix_3src_operand(op[0]);
1721 op[1] = fix_3src_operand(op[1]);
1722 op[2] = fix_3src_operand(op[2]);
1723 /* Note that the instruction's argument order is reversed from GLSL
1724 * and the IR.
1725 */
1726 emit(BFE(result_dst, op[2], op[1], op[0]));
1727 break;
1728
1729 case ir_triop_vector_insert:
1730 unreachable("should have been lowered by lower_vector_insert");
1731
1732 case ir_quadop_bitfield_insert:
1733 unreachable("not reached: should be handled by "
1734 "bitfield_insert_to_bfm_bfi\n");
1735
1736 case ir_quadop_vector:
1737 unreachable("not reached: should be handled by lower_quadop_vector");
1738
1739 case ir_unop_pack_half_2x16:
1740 emit_pack_half_2x16(result_dst, op[0]);
1741 break;
1742 case ir_unop_unpack_half_2x16:
1743 emit_unpack_half_2x16(result_dst, op[0]);
1744 break;
1745 case ir_unop_pack_snorm_2x16:
1746 case ir_unop_pack_snorm_4x8:
1747 case ir_unop_pack_unorm_2x16:
1748 case ir_unop_pack_unorm_4x8:
1749 case ir_unop_unpack_snorm_2x16:
1750 case ir_unop_unpack_snorm_4x8:
1751 case ir_unop_unpack_unorm_2x16:
1752 case ir_unop_unpack_unorm_4x8:
1753 unreachable("not reached: should be handled by lower_packing_builtins");
1754 case ir_unop_unpack_half_2x16_split_x:
1755 case ir_unop_unpack_half_2x16_split_y:
1756 case ir_binop_pack_half_2x16_split:
1757 case ir_unop_interpolate_at_centroid:
1758 case ir_binop_interpolate_at_sample:
1759 case ir_binop_interpolate_at_offset:
1760 unreachable("not reached: should not occur in vertex shader");
1761 case ir_binop_ldexp:
1762 unreachable("not reached: should be handled by ldexp_to_arith()");
1763 }
1764 }
1765
1766
1767 void
1768 vec4_visitor::visit(ir_swizzle *ir)
1769 {
1770 src_reg src;
1771 int i = 0;
1772 int swizzle[4];
1773
1774 /* Note that this is only swizzles in expressions, not those on the left
1775 * hand side of an assignment, which do write masking. See ir_assignment
1776 * for that.
1777 */
1778
1779 ir->val->accept(this);
1780 src = this->result;
1781 assert(src.file != BAD_FILE);
1782
1783 for (i = 0; i < ir->type->vector_elements; i++) {
1784 switch (i) {
1785 case 0:
1786 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1787 break;
1788 case 1:
1789 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1790 break;
1791 case 2:
1792 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1793 break;
1794 case 3:
1795 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1796 break;
1797 }
1798 }
1799 for (; i < 4; i++) {
1800 /* Replicate the last channel out. */
1801 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1802 }
1803
1804 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1805
1806 this->result = src;
1807 }
1808
1809 void
1810 vec4_visitor::visit(ir_dereference_variable *ir)
1811 {
1812 const struct glsl_type *type = ir->type;
1813 dst_reg *reg = variable_storage(ir->var);
1814
1815 if (!reg) {
1816 fail("Failed to find variable storage for %s\n", ir->var->name);
1817 this->result = src_reg(brw_null_reg());
1818 return;
1819 }
1820
1821 this->result = src_reg(*reg);
1822
1823 /* System values get their swizzle from the dst_reg writemask */
1824 if (ir->var->data.mode == ir_var_system_value)
1825 return;
1826
1827 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1828 this->result.swizzle = swizzle_for_size(type->vector_elements);
1829 }
1830
1831
1832 int
1833 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1834 {
1835 /* Under normal circumstances array elements are stored consecutively, so
1836 * the stride is equal to the size of the array element.
1837 */
1838 return type_size(ir->type);
1839 }
1840
1841
1842 void
1843 vec4_visitor::visit(ir_dereference_array *ir)
1844 {
1845 ir_constant *constant_index;
1846 src_reg src;
1847 int array_stride = compute_array_stride(ir);
1848
1849 constant_index = ir->array_index->constant_expression_value();
1850
1851 ir->array->accept(this);
1852 src = this->result;
1853
1854 if (constant_index) {
1855 src.reg_offset += constant_index->value.i[0] * array_stride;
1856 } else {
1857 /* Variable index array dereference. It eats the "vec4" of the
1858 * base of the array and an index that offsets the Mesa register
1859 * index.
1860 */
1861 ir->array_index->accept(this);
1862
1863 src_reg index_reg;
1864
1865 if (array_stride == 1) {
1866 index_reg = this->result;
1867 } else {
1868 index_reg = src_reg(this, glsl_type::int_type);
1869
1870 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1871 }
1872
1873 if (src.reladdr) {
1874 src_reg temp = src_reg(this, glsl_type::int_type);
1875
1876 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1877
1878 index_reg = temp;
1879 }
1880
1881 src.reladdr = ralloc(mem_ctx, src_reg);
1882 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1883 }
1884
1885 /* If the type is smaller than a vec4, replicate the last channel out. */
1886 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1887 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1888 else
1889 src.swizzle = BRW_SWIZZLE_NOOP;
1890 src.type = brw_type_for_base_type(ir->type);
1891
1892 this->result = src;
1893 }
1894
1895 void
1896 vec4_visitor::visit(ir_dereference_record *ir)
1897 {
1898 unsigned int i;
1899 const glsl_type *struct_type = ir->record->type;
1900 int offset = 0;
1901
1902 ir->record->accept(this);
1903
1904 for (i = 0; i < struct_type->length; i++) {
1905 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1906 break;
1907 offset += type_size(struct_type->fields.structure[i].type);
1908 }
1909
1910 /* If the type is smaller than a vec4, replicate the last channel out. */
1911 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1912 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1913 else
1914 this->result.swizzle = BRW_SWIZZLE_NOOP;
1915 this->result.type = brw_type_for_base_type(ir->type);
1916
1917 this->result.reg_offset += offset;
1918 }
1919
1920 /**
1921 * We want to be careful in assignment setup to hit the actual storage
1922 * instead of potentially using a temporary like we might with the
1923 * ir_dereference handler.
1924 */
1925 static dst_reg
1926 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1927 {
1928 /* The LHS must be a dereference. If the LHS is a variable indexed array
1929 * access of a vector, it must be separated into a series of conditional
1930 * moves before reaching this point (see ir_vec_index_to_cond_assign).
1931 */
1932 assert(ir->as_dereference());
1933 ir_dereference_array *deref_array = ir->as_dereference_array();
1934 if (deref_array) {
1935 assert(!deref_array->array->type->is_vector());
1936 }
1937
1938 /* Use the rvalue deref handler for the most part. We'll ignore
1939 * swizzles in it and write swizzles using writemask, though.
1940 */
1941 ir->accept(v);
1942 return dst_reg(v->result);
1943 }
1944
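/* Copy a value of any type from *src to *dst one vec4 at a time, recursing
* through struct fields, array elements and matrix columns.  Both registers'
* reg_offsets are advanced as the copy proceeds, so on return they point one
* past the region that was moved.
*/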
1945 void
1946 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1947 const struct glsl_type *type,
1948 enum brw_predicate predicate)
1949 {
1950 if (type->base_type == GLSL_TYPE_STRUCT) {
1951 for (unsigned int i = 0; i < type->length; i++) {
1952 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1953 }
1954 return;
1955 }
1956
1957 if (type->is_array()) {
1958 for (unsigned int i = 0; i < type->length; i++) {
1959 emit_block_move(dst, src, type->fields.array, predicate);
1960 }
1961 return;
1962 }
1963
1964 if (type->is_matrix()) {
1965 const struct glsl_type *vec_type;
1966
1967 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1968 type->vector_elements, 1);
1969
1970 for (int i = 0; i < type->matrix_columns; i++) {
1971 emit_block_move(dst, src, vec_type, predicate);
1972 }
1973 return;
1974 }
1975
1976 assert(type->is_scalar() || type->is_vector());
1977
1978 dst->type = brw_type_for_base_type(type);
1979 src->type = dst->type;
1980
1981 dst->writemask = (1 << type->vector_elements) - 1;
1982
1983 src->swizzle = swizzle_for_size(type->vector_elements);
1984
1985 vec4_instruction *inst = emit(MOV(*dst, *src));
1986 inst->predicate = predicate;
1987
1988 dst->reg_offset++;
1989 src->reg_offset++;
1990 }
1991
1992
1993 /* If the RHS processing resulted in an instruction generating a
1994 * temporary value, and it would be easy to rewrite the instruction to
1995 * generate its result right into the LHS instead, do so. This ends
1996 * up reliably removing instructions where it can be tricky to do so
1997 * later without real UD chain information.
1998 */
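/* For example (illustrative): instead of emitting "ADD tmp, a, b" for the RHS
* and then "MOV dst, tmp" for the assignment, the ADD's destination is
* rewritten to dst (masked by the LHS writemask) and the copy is never
* emitted, provided the checks below succeed.
*/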
1999 bool
2000 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2001 dst_reg dst,
2002 src_reg src,
2003 vec4_instruction *pre_rhs_inst,
2004 vec4_instruction *last_rhs_inst)
2005 {
2006 /* This could be supported, but it would take more smarts. */
2007 if (ir->condition)
2008 return false;
2009
2010 if (pre_rhs_inst == last_rhs_inst)
2011 return false; /* No instructions generated to work with. */
2012
2013 /* Make sure the last instruction generated our source reg. */
2014 if (src.file != GRF ||
2015 src.file != last_rhs_inst->dst.file ||
2016 src.reg != last_rhs_inst->dst.reg ||
2017 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2018 src.reladdr ||
2019 src.abs ||
2020 src.negate ||
2021 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2022 return false;
2023
2024 /* Check that the last instruction fully initialized the channels
2025 * we want to use, in the order we want to use them. We could
2026 * potentially reswizzle the operands of many instructions so that
2027 * we could handle out of order channels, but don't yet.
2028 */
2029
2030 for (unsigned i = 0; i < 4; i++) {
2031 if (dst.writemask & (1 << i)) {
2032 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2033 return false;
2034
2035 if (BRW_GET_SWZ(src.swizzle, i) != i)
2036 return false;
2037 }
2038 }
2039
2040 /* Success! Rewrite the instruction. */
2041 last_rhs_inst->dst.file = dst.file;
2042 last_rhs_inst->dst.reg = dst.reg;
2043 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2044 last_rhs_inst->dst.reladdr = dst.reladdr;
2045 last_rhs_inst->dst.writemask &= dst.writemask;
2046
2047 return true;
2048 }
2049
2050 void
2051 vec4_visitor::visit(ir_assignment *ir)
2052 {
2053 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2054 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2055
2056 if (!ir->lhs->type->is_scalar() &&
2057 !ir->lhs->type->is_vector()) {
2058 ir->rhs->accept(this);
2059 src_reg src = this->result;
2060
2061 if (ir->condition) {
2062 emit_bool_to_cond_code(ir->condition, &predicate);
2063 }
2064
2065 /* emit_block_move doesn't account for swizzles in the source register.
2066 * This should be ok, since the source register is a structure or an
2067 * array, and those can't be swizzled. But double-check to be sure.
2068 */
2069 assert(src.swizzle ==
2070 (ir->rhs->type->is_matrix()
2071 ? swizzle_for_size(ir->rhs->type->vector_elements)
2072 : BRW_SWIZZLE_NOOP));
2073
2074 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2075 return;
2076 }
2077
2078 /* Now we're down to just a scalar/vector with writemasks. */
2079 int i;
2080
2081 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2082 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2083
2084 ir->rhs->accept(this);
2085
2086 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2087
2088 src_reg src = this->result;
2089
2090 int swizzles[4];
2091 int first_enabled_chan = 0;
2092 int src_chan = 0;
2093
2094 assert(ir->lhs->type->is_vector() ||
2095 ir->lhs->type->is_scalar());
2096 dst.writemask = ir->write_mask;
2097
2098 for (int i = 0; i < 4; i++) {
2099 if (dst.writemask & (1 << i)) {
2100 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2101 break;
2102 }
2103 }
2104
2105 /* Swizzle a small RHS vector into the channels being written.
2106 *
2107 * glsl ir treats write_mask as dictating how many channels are
2108 * present on the RHS while in our instructions we need to make
2109 * those channels appear in the slots of the vec4 they're written to.
2110 */
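/* Worked example (illustrative): for "v.yz = someVec2", write_mask covers the
* y and z channels and the vec2 RHS arrives swizzled as .xyyy, so the loop
* below builds the swizzle .yxyy -- the RHS's two channels land in the y and
* z slots, and the unwritten slots merely replicate a channel that is really
* part of the RHS.
*/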
2111 for (int i = 0; i < 4; i++) {
2112 if (dst.writemask & (1 << i))
2113 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2114 else
2115 swizzles[i] = first_enabled_chan;
2116 }
2117 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2118 swizzles[2], swizzles[3]);
2119
2120 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2121 return;
2122 }
2123
2124 if (ir->condition) {
2125 emit_bool_to_cond_code(ir->condition, &predicate);
2126 }
2127
2128 for (i = 0; i < type_size(ir->lhs->type); i++) {
2129 vec4_instruction *inst = emit(MOV(dst, src));
2130 inst->predicate = predicate;
2131
2132 dst.reg_offset++;
2133 src.reg_offset++;
2134 }
2135 }
2136
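/* Emit MOVs of immediates to materialize the constant @ir into *dst,
* recursing through struct fields, array elements and matrix columns.
* Vector channels that share the same value are folded into a single
* writemasked MOV.
*/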
2137 void
2138 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2139 {
2140 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2141 foreach_in_list(ir_constant, field_value, &ir->components) {
2142 emit_constant_values(dst, field_value);
2143 }
2144 return;
2145 }
2146
2147 if (ir->type->is_array()) {
2148 for (unsigned int i = 0; i < ir->type->length; i++) {
2149 emit_constant_values(dst, ir->array_elements[i]);
2150 }
2151 return;
2152 }
2153
2154 if (ir->type->is_matrix()) {
2155 for (int i = 0; i < ir->type->matrix_columns; i++) {
2156 float *vec = &ir->value.f[i * ir->type->vector_elements];
2157
2158 for (int j = 0; j < ir->type->vector_elements; j++) {
2159 dst->writemask = 1 << j;
2160 dst->type = BRW_REGISTER_TYPE_F;
2161
2162 emit(MOV(*dst, src_reg(vec[j])));
2163 }
2164 dst->reg_offset++;
2165 }
2166 return;
2167 }
2168
2169 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2170
2171 for (int i = 0; i < ir->type->vector_elements; i++) {
2172 if (!(remaining_writemask & (1 << i)))
2173 continue;
2174
2175 dst->writemask = 1 << i;
2176 dst->type = brw_type_for_base_type(ir->type);
2177
2178 /* Find other components that match the one we're about to
2179 * write. Emits fewer instructions for things like vec4(0.5,
2180 * 1.5, 1.5, 1.5).
2181 */
2182 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2183 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2184 if (ir->value.b[i] == ir->value.b[j])
2185 dst->writemask |= (1 << j);
2186 } else {
2187 /* u, i, and f storage all line up, so no need for a
2188 * switch case for comparing each type.
2189 */
2190 if (ir->value.u[i] == ir->value.u[j])
2191 dst->writemask |= (1 << j);
2192 }
2193 }
2194
2195 switch (ir->type->base_type) {
2196 case GLSL_TYPE_FLOAT:
2197 emit(MOV(*dst, src_reg(ir->value.f[i])));
2198 break;
2199 case GLSL_TYPE_INT:
2200 emit(MOV(*dst, src_reg(ir->value.i[i])));
2201 break;
2202 case GLSL_TYPE_UINT:
2203 emit(MOV(*dst, src_reg(ir->value.u[i])));
2204 break;
2205 case GLSL_TYPE_BOOL:
2206 emit(MOV(*dst, src_reg(ir->value.b[i])));
2207 break;
2208 default:
2209 unreachable("Non-float/uint/int/bool constant");
2210 }
2211
2212 remaining_writemask &= ~dst->writemask;
2213 }
2214 dst->reg_offset++;
2215 }
2216
2217 void
2218 vec4_visitor::visit(ir_constant *ir)
2219 {
2220 dst_reg dst = dst_reg(this, ir->type);
2221 this->result = src_reg(dst);
2222
2223 emit_constant_values(&dst, ir);
2224 }
2225
2226 void
2227 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2228 {
2229 ir_dereference *deref = static_cast<ir_dereference *>(
2230 ir->actual_parameters.get_head());
2231 ir_variable *location = deref->variable_referenced();
2232 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2233 location->data.atomic.buffer_index);
2234
2235 /* Calculate the surface offset */
2236 src_reg offset(this, glsl_type::uint_type);
2237 ir_dereference_array *deref_array = deref->as_dereference_array();
2238 if (deref_array) {
2239 deref_array->array_index->accept(this);
2240
2241 src_reg tmp(this, glsl_type::uint_type);
2242 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2243 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2244 } else {
2245 offset = location->data.atomic.offset;
2246 }
2247
2248 /* Emit the appropriate machine instruction */
2249 const char *callee = ir->callee->function_name();
2250 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2251
2252 if (!strcmp("__intrinsic_atomic_read", callee)) {
2253 emit_untyped_surface_read(surf_index, dst, offset);
2254
2255 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2256 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2257 src_reg(), src_reg());
2258
2259 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2260 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2261 src_reg(), src_reg());
2262 }
2263 }
2264
2265 void
2266 vec4_visitor::visit(ir_call *ir)
2267 {
2268 const char *callee = ir->callee->function_name();
2269
2270 if (!strcmp("__intrinsic_atomic_read", callee) ||
2271 !strcmp("__intrinsic_atomic_increment", callee) ||
2272 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2273 visit_atomic_counter_intrinsic(ir);
2274 } else {
2275 unreachable("Unsupported intrinsic.");
2276 }
2277 }
2278
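/* Fetch the MCS (multisample control surface) word for a texel, used as an
* extra argument of the gen7+ compressed multisample texel fetch: emits a
* TXF_MCS message carrying the coordinate (with LOD forced to zero, per the
* comment below) and returns its uvec4 destination.
*/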
2279 src_reg
2280 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2281 {
2282 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2283 inst->base_mrf = 2;
2284 inst->mlen = 1;
2285 inst->sampler = sampler;
2286 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2287 inst->dst.writemask = WRITEMASK_XYZW;
2288
2289 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2290 int param_base = inst->base_mrf;
2291 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2292 int zero_mask = 0xf & ~coord_mask;
2293
2294 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2295 coordinate));
2296
2297 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2298 src_reg(0)));
2299
2300 emit(inst);
2301 return src_reg(inst->dst);
2302 }
2303
2304 void
2305 vec4_visitor::visit(ir_texture *ir)
2306 {
2307 int sampler =
2308 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2309
2310 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2311 * emitting anything other than setting up the constant result.
2312 */
2313 if (ir->op == ir_tg4) {
2314 ir_constant *chan = ir->lod_info.component->as_constant();
2315 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2316 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2317 dst_reg result(this, ir->type);
2318 this->result = src_reg(result);
2319 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2320 return;
2321 }
2322 }
2323
2324 /* Should be lowered by do_lower_texture_projection */
2325 assert(!ir->projector);
2326
2327 /* Should be lowered */
2328 assert(!ir->offset || !ir->offset->type->is_array());
2329
2330 /* Generate code to compute all the subexpression trees. This has to be
2331 * done before loading any values into MRFs for the sampler message since
2332 * generating these values may involve SEND messages that need the MRFs.
2333 */
2334 src_reg coordinate;
2335 if (ir->coordinate) {
2336 ir->coordinate->accept(this);
2337 coordinate = this->result;
2338 }
2339
2340 src_reg shadow_comparitor;
2341 if (ir->shadow_comparitor) {
2342 ir->shadow_comparitor->accept(this);
2343 shadow_comparitor = this->result;
2344 }
2345
2346 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2347 src_reg offset_value;
2348 if (has_nonconstant_offset) {
2349 ir->offset->accept(this);
2350 offset_value = src_reg(this->result);
2351 }
2352
2353 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2354 src_reg lod, dPdx, dPdy, sample_index, mcs;
2355 switch (ir->op) {
2356 case ir_tex:
2357 lod = src_reg(0.0f);
2358 lod_type = glsl_type::float_type;
2359 break;
2360 case ir_txf:
2361 case ir_txl:
2362 case ir_txs:
2363 ir->lod_info.lod->accept(this);
2364 lod = this->result;
2365 lod_type = ir->lod_info.lod->type;
2366 break;
2367 case ir_query_levels:
2368 lod = src_reg(0);
2369 lod_type = glsl_type::int_type;
2370 break;
2371 case ir_txf_ms:
2372 ir->lod_info.sample_index->accept(this);
2373 sample_index = this->result;
2374 sample_index_type = ir->lod_info.sample_index->type;
2375
2376 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2377 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2378 else
2379 mcs = src_reg(0u);
2380 break;
2381 case ir_txd:
2382 ir->lod_info.grad.dPdx->accept(this);
2383 dPdx = this->result;
2384
2385 ir->lod_info.grad.dPdy->accept(this);
2386 dPdy = this->result;
2387
2388 lod_type = ir->lod_info.grad.dPdx->type;
2389 break;
2390 case ir_txb:
2391 case ir_lod:
2392 case ir_tg4:
2393 break;
2394 }
2395
2396 enum opcode opcode;
2397 switch (ir->op) {
2398 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2399 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2400 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2401 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2402 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2403 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2404 case ir_tg4: opcode = has_nonconstant_offset
2405 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2406 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2407 case ir_txb:
2408 unreachable("TXB is not valid for vertex shaders.");
2409 case ir_lod:
2410 unreachable("LOD is not valid for vertex shaders.");
2411 default:
2412 unreachable("Unrecognized tex op");
2413 }
2414
2415 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2416
2417 if (ir->offset != NULL && ir->op != ir_txf)
2418 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2419
2420 /* Stuff the channel select bits in the top of the texture offset */
2421 if (ir->op == ir_tg4)
2422 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2423
2424 /* The message header is necessary for:
2425 * - Gen4 (always)
2426 * - Texel offsets
2427 * - Gather channel selection
2428 * - Sampler indices too large to fit in a 4-bit value.
2429 */
2430 inst->header_present =
2431 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2432 sampler >= 16;
2433 inst->base_mrf = 2;
2434 inst->mlen = inst->header_present + 1; /* always at least one */
2435 inst->sampler = sampler;
2436 inst->dst = dst_reg(this, ir->type);
2437 inst->dst.writemask = WRITEMASK_XYZW;
2438 inst->shadow_compare = ir->shadow_comparitor != NULL;
2439
2440 /* MRF for the first parameter */
2441 int param_base = inst->base_mrf + inst->header_present;
2442
2443 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2444 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2445 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2446 } else {
2447 /* Load the coordinate */
2448 /* FINISHME: gl_clamp_mask and saturate */
2449 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2450 int zero_mask = 0xf & ~coord_mask;
2451
2452 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2453 coordinate));
2454
2455 if (zero_mask != 0) {
2456 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2457 src_reg(0)));
2458 }
2459 /* Load the shadow comparitor */
2460 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2461 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2462 WRITEMASK_X),
2463 shadow_comparitor));
2464 inst->mlen++;
2465 }
2466
2467 /* Load the LOD info */
2468 if (ir->op == ir_tex || ir->op == ir_txl) {
2469 int mrf, writemask;
2470 if (brw->gen >= 5) {
2471 mrf = param_base + 1;
2472 if (ir->shadow_comparitor) {
2473 writemask = WRITEMASK_Y;
2474 /* mlen already incremented */
2475 } else {
2476 writemask = WRITEMASK_X;
2477 inst->mlen++;
2478 }
2479 } else /* brw->gen == 4 */ {
2480 mrf = param_base;
2481 writemask = WRITEMASK_W;
2482 }
2483 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2484 } else if (ir->op == ir_txf) {
2485 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2486 } else if (ir->op == ir_txf_ms) {
2487 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2488 sample_index));
2489 if (brw->gen >= 7) {
2490 /* MCS data is in the first channel of `mcs`, but we need to get it into
2491 * the .y channel of the second vec4 of params, so replicate .x across
2492 * the whole vec4 and then mask off everything except .y
2493 */
2494 mcs.swizzle = BRW_SWIZZLE_XXXX;
2495 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2496 mcs));
}
2497 inst->mlen++;
2498 } else if (ir->op == ir_txd) {
2499 const glsl_type *type = lod_type;
2500
2501 if (brw->gen >= 5) {
2502 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2503 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2504 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2505 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2506 inst->mlen++;
2507
2508 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2509 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2510 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2511 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2512 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2513 inst->mlen++;
2514
2515 if (ir->shadow_comparitor) {
2516 emit(MOV(dst_reg(MRF, param_base + 2,
2517 ir->shadow_comparitor->type, WRITEMASK_Z),
2518 shadow_comparitor));
2519 }
2520 }
2521 } else /* brw->gen == 4 */ {
2522 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2523 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2524 inst->mlen += 2;
2525 }
2526 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2527 if (ir->shadow_comparitor) {
2528 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2529 shadow_comparitor));
2530 }
2531
2532 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2533 offset_value));
2534 inst->mlen++;
2535 }
2536 }
2537
2538 emit(inst);
2539
2540 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2541 * spec requires layers.
2542 */
2543 if (ir->op == ir_txs) {
2544 glsl_type const *type = ir->sampler->type;
2545 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2546 type->sampler_array) {
2547 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2548 writemask(inst->dst, WRITEMASK_Z),
2549 src_reg(inst->dst), src_reg(6));
2550 }
2551 }
2552
2553 if (brw->gen == 6 && ir->op == ir_tg4) {
2554 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2555 }
2556
2557 swizzle_result(ir, src_reg(inst->dst), sampler);
2558 }
2559
2560 /**
2561 * Apply workarounds for Gen6 gather with UINT/SINT
2562 */
2563 void
2564 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2565 {
2566 if (!wa)
2567 return;
2568
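/* Worked example (illustrative): with WA_8BIT | WA_SIGN the sampler returned
* the texel as UNORM8, so a stored value of -1 (0xff) comes back as 1.0.
* Multiplying by 255 recovers 255, and the SHL/ASR pair below shifts by 24
* bits to sign-extend that back to -1.
*/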
2569 int width = (wa & WA_8BIT) ? 8 : 16;
2570 dst_reg dst_f = dst;
2571 dst_f.type = BRW_REGISTER_TYPE_F;
2572
2573 /* Convert from UNORM to UINT */
2574 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2575 emit(MOV(dst, src_reg(dst_f)));
2576
2577 if (wa & WA_SIGN) {
2578 /* Reinterpret the UINT value as a signed INT value by
2579 * shifting the sign bit into place, then shifting back
2580 * preserving sign.
2581 */
2582 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2583 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2584 }
2585 }
2586
2587 /**
2588 * Set up the gather channel based on the swizzle, for gather4.
2589 */
2590 uint32_t
2591 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2592 {
2593 ir_constant *chan = ir->lod_info.component->as_constant();
2594 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2595 switch (swiz) {
2596 case SWIZZLE_X: return 0;
2597 case SWIZZLE_Y:
2598 /* gather4 sampler is broken for green channel on RG32F --
2599 * we must ask for blue instead.
2600 */
2601 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2602 return 2;
2603 return 1;
2604 case SWIZZLE_Z: return 2;
2605 case SWIZZLE_W: return 3;
2606 default:
2607 unreachable("Not reached"); /* zero, one swizzles handled already */
2608 }
2609 }
2610
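/* Apply the GL texture swizzle for this sampler to the raw texture result:
* channels that copy from the result become a source swizzle on one MOV,
* while SWIZZLE_ZERO/SWIZZLE_ONE channels are written with immediate 0.0f or
* 1.0f MOVs.  txs, tg4, shadow (float) results and no-op swizzles are passed
* through unchanged.
*/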
2611 void
2612 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2613 {
2614 int s = key->tex.swizzles[sampler];
2615
2616 this->result = src_reg(this, ir->type);
2617 dst_reg swizzled_result(this->result);
2618
2619 if (ir->op == ir_query_levels) {
2620 /* # levels is in .w */
2621 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2622 emit(MOV(swizzled_result, orig_val));
2623 return;
2624 }
2625
2626 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2627 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2628 emit(MOV(swizzled_result, orig_val));
2629 return;
2630 }
2631
2632
2633 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2634 int swizzle[4] = {0};
2635
2636 for (int i = 0; i < 4; i++) {
2637 switch (GET_SWZ(s, i)) {
2638 case SWIZZLE_ZERO:
2639 zero_mask |= (1 << i);
2640 break;
2641 case SWIZZLE_ONE:
2642 one_mask |= (1 << i);
2643 break;
2644 default:
2645 copy_mask |= (1 << i);
2646 swizzle[i] = GET_SWZ(s, i);
2647 break;
2648 }
2649 }
2650
2651 if (copy_mask) {
2652 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2653 swizzled_result.writemask = copy_mask;
2654 emit(MOV(swizzled_result, orig_val));
2655 }
2656
2657 if (zero_mask) {
2658 swizzled_result.writemask = zero_mask;
2659 emit(MOV(swizzled_result, src_reg(0.0f)));
2660 }
2661
2662 if (one_mask) {
2663 swizzled_result.writemask = one_mask;
2664 emit(MOV(swizzled_result, src_reg(1.0f)));
2665 }
2666 }
2667
2668 void
2669 vec4_visitor::visit(ir_return *)
2670 {
2671 unreachable("not reached");
2672 }
2673
2674 void
2675 vec4_visitor::visit(ir_discard *)
2676 {
2677 unreachable("not reached");
2678 }
2679
2680 void
2681 vec4_visitor::visit(ir_if *ir)
2682 {
2683 /* Don't point the annotation at the if statement, because then it plus
2684 * the then and else blocks get printed.
2685 */
2686 this->base_ir = ir->condition;
2687
2688 if (brw->gen == 6) {
2689 emit_if_gen6(ir);
2690 } else {
2691 enum brw_predicate predicate;
2692 emit_bool_to_cond_code(ir->condition, &predicate);
2693 emit(IF(predicate));
2694 }
2695
2696 visit_instructions(&ir->then_instructions);
2697
2698 if (!ir->else_instructions.is_empty()) {
2699 this->base_ir = ir->condition;
2700 emit(BRW_OPCODE_ELSE);
2701
2702 visit_instructions(&ir->else_instructions);
2703 }
2704
2705 this->base_ir = ir->condition;
2706 emit(BRW_OPCODE_ENDIF);
2707 }
2708
2709 void
2710 vec4_visitor::visit(ir_emit_vertex *)
2711 {
2712 unreachable("not reached");
2713 }
2714
2715 void
2716 vec4_visitor::visit(ir_end_primitive *)
2717 {
2718 unreachable("not reached");
2719 }
2720
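/* Build the MRF payload for an untyped atomic: one register for the surface
* offset plus one for each operand that is present, then emit the
* SHADER_OPCODE_UNTYPED_ATOMIC send with the atomic opcode and surface index
* as immediate sources.
*/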
2721 void
2722 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2723 dst_reg dst, src_reg offset,
2724 src_reg src0, src_reg src1)
2725 {
2726 unsigned mlen = 0;
2727
2728 /* Set the atomic operation offset. */
2729 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2730 mlen++;
2731
2732 /* Set the atomic operation arguments. */
2733 if (src0.file != BAD_FILE) {
2734 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2735 mlen++;
2736 }
2737
2738 if (src1.file != BAD_FILE) {
2739 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2740 mlen++;
2741 }
2742
2743 /* Emit the instruction. Note that this maps to the normal SIMD8
2744 * untyped atomic message on Ivy Bridge, but that's OK because
2745 * unused channels will be masked out.
2746 */
2747 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2748 src_reg(atomic_op), src_reg(surf_index));
2749 inst->base_mrf = 0;
2750 inst->mlen = mlen;
2751 }
2752
2753 void
2754 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2755 src_reg offset)
2756 {
2757 /* Set the surface read offset. */
2758 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2759
2760 /* Emit the instruction. Note that this maps to the normal SIMD8
2761 * untyped surface read message, but that's OK because unused
2762 * channels will be masked out.
2763 */
2764 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2765 dst, src_reg(surf_index));
2766 inst->base_mrf = 0;
2767 inst->mlen = 1;
2768 }
2769
2770 void
2771 vec4_visitor::emit_ndc_computation()
2772 {
2773 /* Get the position */
2774 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2775
2776 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2777 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2778 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2779
2780 current_annotation = "NDC";
2781 dst_reg ndc_w = ndc;
2782 ndc_w.writemask = WRITEMASK_W;
2783 src_reg pos_w = pos;
2784 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2785 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2786
2787 dst_reg ndc_xyz = ndc;
2788 ndc_xyz.writemask = WRITEMASK_XYZ;
2789
2790 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2791 }
2792
2793 void
2794 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2795 {
2796 if (brw->gen < 6 &&
2797 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2798 key->userclip_active || brw->has_negative_rhw_bug)) {
2799 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2800 dst_reg header1_w = header1;
2801 header1_w.writemask = WRITEMASK_W;
2802
2803 emit(MOV(header1, 0u));
2804
2805 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2806 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2807
2808 current_annotation = "Point size";
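/* Scale by 2^11 and mask so the point width lands in the 11-bit field at
* bits 8..18 of the header dword, keeping it clear of the clip-flag bits
* that are ORed into the low byte below.
*/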
2809 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2810 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2811 }
2812
2813 if (key->userclip_active) {
2814 current_annotation = "Clipping flags";
2815 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2816 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2817
2818 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2819 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2820 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2821
2822 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2823 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2824 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2825 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2826 }
2827
2828 /* i965 clipping workaround:
2829 * 1) Test for -ve rhw
2830 * 2) If set,
2831 * set ndc = (0,0,0,0)
2832 * set ucp[6] = 1
2833 *
2834 * Later, clipping will detect ucp[6] and ensure the primitive is
2835 * clipped against all fixed planes.
2836 */
2837 if (brw->has_negative_rhw_bug) {
2838 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2839 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2840 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2841 vec4_instruction *inst;
2842 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2843 inst->predicate = BRW_PREDICATE_NORMAL;
2844 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2845 inst->predicate = BRW_PREDICATE_NORMAL;
2846 }
2847
2848 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2849 } else if (brw->gen < 6) {
2850 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2851 } else {
2852 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2853 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2854 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2855 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2856 }
2857 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2858 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2859 src_reg(output_reg[VARYING_SLOT_LAYER])));
2860 }
2861 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2862 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2863 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2864 }
2865 }
2866 }
2867
2868 void
2869 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2870 {
2871 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2872 *
2873 * "If a linked set of shaders forming the vertex stage contains no
2874 * static write to gl_ClipVertex or gl_ClipDistance, but the
2875 * application has requested clipping against user clip planes through
2876 * the API, then the coordinate written to gl_Position is used for
2877 * comparison against the user clip planes."
2878 *
2879 * This function is only called if the shader didn't write to
2880 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2881 * if the user wrote to it; otherwise we use gl_Position.
2882 */
2883 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2884 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2885 clip_vertex = VARYING_SLOT_POS;
2886 }
2887
2888 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2889 ++i) {
2890 reg.writemask = 1 << i;
2891 emit(DP4(reg,
2892 src_reg(output_reg[clip_vertex]),
2893 src_reg(this->userplane[i + offset])));
2894 }
2895 }
2896
2897 void
2898 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2899 {
2900 assert (varying < VARYING_SLOT_MAX);
2901 reg.type = output_reg[varying].type;
2902 current_annotation = output_reg_annotation[varying];
2903 /* Copy the register, saturating if necessary */
2904 vec4_instruction *inst = emit(MOV(reg,
2905 src_reg(output_reg[varying])));
2906 if ((varying == VARYING_SLOT_COL0 ||
2907 varying == VARYING_SLOT_COL1 ||
2908 varying == VARYING_SLOT_BFC0 ||
2909 varying == VARYING_SLOT_BFC1) &&
2910 key->clamp_vertex_color) {
2911 inst->saturate = true;
2912 }
2913 }
2914
2915 void
2916 vec4_visitor::emit_urb_slot(int mrf, int varying)
2917 {
2918 struct brw_reg hw_reg = brw_message_reg(mrf);
2919 dst_reg reg = dst_reg(MRF, mrf);
2920 reg.type = BRW_REGISTER_TYPE_F;
2921
2922 switch (varying) {
2923 case VARYING_SLOT_PSIZ:
2924 /* PSIZ is always in slot 0, and is coupled with other flags. */
2925 current_annotation = "indices, point width, clip flags";
2926 emit_psiz_and_flags(hw_reg);
2927 break;
2928 case BRW_VARYING_SLOT_NDC:
2929 current_annotation = "NDC";
2930 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2931 break;
2932 case VARYING_SLOT_POS:
2933 current_annotation = "gl_Position";
2934 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2935 break;
2936 case VARYING_SLOT_EDGE:
2937 /* This is present when doing unfilled polygons. We're supposed to copy
2938 * the edge flag from the user-provided vertex array
2939 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2940 * of that attribute (starts as 1.0f). This is then used in clipping to
2941 * determine which edges should be drawn as wireframe.
2942 */
2943 current_annotation = "edge flag";
2944 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2945 glsl_type::float_type, WRITEMASK_XYZW))));
2946 break;
2947 case BRW_VARYING_SLOT_PAD:
2948 /* No need to write to this slot */
2949 break;
2950 default:
2951 emit_generic_urb_slot(reg, varying);
2952 break;
2953 }
2954 }
2955
2956 static int
2957 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2958 {
2959 if (brw->gen >= 6) {
2960 /* URB data written (does not include the message header reg) must
2961 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2962 * section 5.4.3.2.2: URB_INTERLEAVED.
2963 *
2964 * URB entries are allocated on a multiple of 1024 bits, so an
2965 * extra 128 bits written here to make the end align to 256 is
2966 * no problem.
2967 */
2968 if ((mlen % 2) != 1)
2969 mlen++;
2970 }
2971
2972 return mlen;
2973 }
2974
2975
2976 /**
2977 * Generates the VUE payload plus the necessary URB write instructions to
2978 * output it.
2979 *
2980 * The VUE layout is documented in Volume 2a.
2981 */
2982 void
2983 vec4_visitor::emit_vertex()
2984 {
2985 /* MRF 0 is reserved for the debugger, so start with message header
2986 * in MRF 1.
2987 */
2988 int base_mrf = 1;
2989 int mrf = base_mrf;
2990 /* In the process of generating our URB write message contents, we
2991 * may need to unspill a register or load from an array. Those
2992 * reads would use MRFs 14-15.
2993 */
2994 int max_usable_mrf = 13;
2995
2996 /* The following assertion verifies that max_usable_mrf causes an
2997 * even-numbered amount of URB write data, which will meet gen6's
2998 * requirements for length alignment.
2999 */
3000 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3001
3002 /* First mrf is the g0-based message header containing URB handles and
3003 * such.
3004 */
3005 emit_urb_write_header(mrf++);
3006
3007 if (brw->gen < 6) {
3008 emit_ndc_computation();
3009 }
3010
3011 /* Lower legacy ff and ClipVertex clipping to clip distances */
3012 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3013 current_annotation = "user clip distances";
3014
3015 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3016 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3017
3018 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3019 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3020 }
3021
3022 /* We may need to split this up into several URB writes, so do them in a
3023 * loop.
3024 */
3025 int slot = 0;
3026 bool complete = false;
3027 do {
3028 /* URB offset is in URB row increments, and each of our MRFs is half of
3029 * one of those, since we're doing interleaved writes.
3030 */
3031 int offset = slot / 2;
3032
3033 mrf = base_mrf + 1;
3034 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3035 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3036
3037 /* If this was max_usable_mrf, we can't fit anything more into this
3038 * URB WRITE.
3039 */
3040 if (mrf > max_usable_mrf) {
3041 slot++;
3042 break;
3043 }
3044 }
3045
3046 complete = slot >= prog_data->vue_map.num_slots;
3047 current_annotation = "URB write";
3048 vec4_instruction *inst = emit_urb_write_opcode(complete);
3049 inst->base_mrf = base_mrf;
3050 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3051 inst->offset += offset;
3052 } while (!complete);
3053 }
3054
3055
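/* Compute the message source describing which scratch slot to access:
* reg_offset (plus the optional dynamic *reladdr) scaled into the units the
* scratch message expects -- rows of two interleaved vec4s, and byte units
* rather than vec4s on pre-gen6 hardware.
*/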
3056 src_reg
3057 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3058 src_reg *reladdr, int reg_offset)
3059 {
3060 /* Because we store the values to scratch interleaved like our
3061 * vertex data, we need to scale the vec4 index by 2.
3062 */
3063 int message_header_scale = 2;
3064
3065 /* Pre-gen6, the message header uses byte offsets instead of vec4
3066 * (16-byte) offset units.
3067 */
3068 if (brw->gen < 6)
3069 message_header_scale *= 16;
3070
3071 if (reladdr) {
3072 src_reg index = src_reg(this, glsl_type::int_type);
3073
3074 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3075 emit_before(inst, MUL(dst_reg(index),
3076 index, src_reg(message_header_scale)));
3077
3078 return index;
3079 } else {
3080 return src_reg(reg_offset * message_header_scale);
3081 }
3082 }
3083
3084 src_reg
3085 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3086 src_reg *reladdr, int reg_offset)
3087 {
3088 if (reladdr) {
3089 src_reg index = src_reg(this, glsl_type::int_type);
3090
3091 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3092
3093 /* Pre-gen6, the message header uses byte offsets instead of vec4
3094 * (16-byte) offset units.
3095 */
3096 if (brw->gen < 6) {
3097 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3098 }
3099
3100 return index;
3101 } else if (brw->gen >= 8) {
3102 /* Store the offset in a GRF so we can send-from-GRF. */
3103 src_reg offset = src_reg(this, glsl_type::int_type);
3104 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3105 return offset;
3106 } else {
3107 int message_header_scale = brw->gen < 6 ? 16 : 1;
3108 return src_reg(reg_offset * message_header_scale);
3109 }
3110 }
3111
3112 /**
3113 * Emits an instruction before @inst to load the value named by @orig_src
3114 * from scratch space at @base_offset to @temp.
3115 *
3116 * @base_offset is measured in 32-byte units (the size of a register).
3117 */
3118 void
3119 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3120 dst_reg temp, src_reg orig_src,
3121 int base_offset)
3122 {
3123 int reg_offset = base_offset + orig_src.reg_offset;
3124 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3125
3126 emit_before(inst, SCRATCH_READ(temp, index));
3127 }
3128
3129 /**
3130 * Emits an instruction after @inst to store the value to be written
3131 * to @orig_dst to scratch space at @base_offset, from @temp.
3132 *
3133 * @base_offset is measured in 32-byte units (the size of a register).
3134 */
3135 void
3136 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3137 {
3138 int reg_offset = base_offset + inst->dst.reg_offset;
3139 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3140
3141 /* Create a temporary register to store *inst's result in.
3142 *
3143 * We have to be careful in MOVing from our temporary result register in
3144 * the scratch write. If we swizzle from channels of the temporary that
3145 * weren't initialized, it will confuse live interval analysis, which will
3146 * make spilling fail to make progress.
3147 */
3148 src_reg temp = src_reg(this, glsl_type::vec4_type);
3149 temp.type = inst->dst.type;
3150 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3151 int swizzles[4];
3152 for (int i = 0; i < 4; i++)
3153 if (inst->dst.writemask & (1 << i))
3154 swizzles[i] = i;
3155 else
3156 swizzles[i] = first_writemask_chan;
3157 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3158 swizzles[2], swizzles[3]);
3159
3160 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3161 inst->dst.writemask));
3162 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3163 write->predicate = inst->predicate;
3164 write->ir = inst->ir;
3165 write->annotation = inst->annotation;
3166 inst->insert_after(write);
3167
3168 inst->dst.file = temp.file;
3169 inst->dst.reg = temp.reg;
3170 inst->dst.reg_offset = temp.reg_offset;
3171 inst->dst.reladdr = NULL;
3172 }
3173
3174 /**
3175 * We can't generally support array access in GRF space, because a
3176 * single instruction's destination can only span 2 contiguous
3177 * registers. So, we send all GRF arrays that get variable index
3178 * access to scratch space.
3179 */
3180 void
3181 vec4_visitor::move_grf_array_access_to_scratch()
3182 {
3183 int scratch_loc[this->virtual_grf_count];
3184
3185 for (int i = 0; i < this->virtual_grf_count; i++) {
3186 scratch_loc[i] = -1;
3187 }
3188
3189 /* First, calculate the set of virtual GRFs that need to be punted
3190 * to scratch due to having any array access on them, and where in
3191 * scratch.
3192 */
3193 foreach_in_list(vec4_instruction, inst, &instructions) {
3194 if (inst->dst.file == GRF && inst->dst.reladdr &&
3195 scratch_loc[inst->dst.reg] == -1) {
3196 scratch_loc[inst->dst.reg] = c->last_scratch;
3197 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3198 }
3199
3200 for (int i = 0 ; i < 3; i++) {
3201 src_reg *src = &inst->src[i];
3202
3203 if (src->file == GRF && src->reladdr &&
3204 scratch_loc[src->reg] == -1) {
3205 scratch_loc[src->reg] = c->last_scratch;
3206 c->last_scratch += this->virtual_grf_sizes[src->reg];
3207 }
3208 }
3209 }
3210
3211 /* Now, for anything that will be accessed through scratch, rewrite
3212 * it to load/store. Note that this is a _safe list walk, because
3213 * we may generate a new scratch_write instruction after the one
3214 * we're processing.
3215 */
3216 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3217 /* Set up the annotation tracking for new generated instructions. */
3218 base_ir = inst->ir;
3219 current_annotation = inst->annotation;
3220
3221 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3222 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3223 }
3224
3225 for (int i = 0 ; i < 3; i++) {
3226 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3227 continue;
3228
3229 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3230
3231 emit_scratch_read(inst, temp, inst->src[i],
3232 scratch_loc[inst->src[i].reg]);
3233
3234 inst->src[i].file = temp.file;
3235 inst->src[i].reg = temp.reg;
3236 inst->src[i].reg_offset = temp.reg_offset;
3237 inst->src[i].reladdr = NULL;
3238 }
3239 }
3240 }
3241
3242 /**
3243 * Emits an instruction before @inst to load the value named by @orig_src
3244 * from the pull constant buffer (surface) at @base_offset to @temp.
3245 */
3246 void
3247 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3248 dst_reg temp, src_reg orig_src,
3249 int base_offset)
3250 {
3251 int reg_offset = base_offset + orig_src.reg_offset;
3252 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3253 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3254 vec4_instruction *load;
3255
3256 if (brw->gen >= 7) {
3257 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3258 grf_offset.type = offset.type;
3259 emit_before(inst, MOV(grf_offset, offset));
3260
3261 load = new(mem_ctx) vec4_instruction(this,
3262 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3263 temp, index, src_reg(grf_offset));
3264 } else {
3265 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3266 temp, index, offset);
3267 load->base_mrf = 14;
3268 load->mlen = 1;
3269 }
3270 emit_before(inst, load);
3271 }
3272
3273 /**
3274 * Implements array access of uniforms by inserting a
3275 * PULL_CONSTANT_LOAD instruction.
3276 *
3277 * Unlike temporary GRF array access (where we don't support it due to
3278 * the difficulty of doing relative addressing on instruction
3279 * destinations), we could potentially do array access of uniforms
3280 * that were loaded in GRF space as push constants. In real-world
3281 * usage we've seen, though, the arrays being used are always larger
3282 * than we could load as push constants, so just always move all
3283 * uniform array access out to a pull constant buffer.
3284 */
3285 void
3286 vec4_visitor::move_uniform_array_access_to_pull_constants()
3287 {
3288 int pull_constant_loc[this->uniforms];
3289
3290 for (int i = 0; i < this->uniforms; i++) {
3291 pull_constant_loc[i] = -1;
3292 }
3293
3294 /* Walk through and find array access of uniforms. Put a copy of that
3295 * uniform in the pull constant buffer.
3296 *
3297 * Note that we don't move constant-indexed accesses to arrays. No
3298 * testing has been done of the performance impact of this choice.
3299 */
3300 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3301 for (int i = 0 ; i < 3; i++) {
3302 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3303 continue;
3304
3305 int uniform = inst->src[i].reg;
3306
3307 /* If this array isn't already present in the pull constant buffer,
3308 * add it.
3309 */
3310 if (pull_constant_loc[uniform] == -1) {
3311 const float **values = &stage_prog_data->param[uniform * 4];
3312
3313 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3314
3315 assert(uniform < uniform_array_size);
3316 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3317 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3318 = values[j];
3319 }
3320 }
3321
3322 /* Set up the annotation tracking for new generated instructions. */
3323 base_ir = inst->ir;
3324 current_annotation = inst->annotation;
3325
3326 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3327
3328 emit_pull_constant_load(inst, temp, inst->src[i],
3329 pull_constant_loc[uniform]);
3330
3331 inst->src[i].file = temp.file;
3332 inst->src[i].reg = temp.reg;
3333 inst->src[i].reg_offset = temp.reg_offset;
3334 inst->src[i].reladdr = NULL;
3335 }
3336 }
3337
3338 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3339 * no need to track them as larger-than-vec4 objects. This will be
3340 * relied on in cutting out unused uniform vectors from push
3341 * constants.
3342 */
3343 split_uniform_registers();
3344 }
3345
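/* If a UD source has its negate flag set, materialize the negation with an
* explicit MOV into a temporary and use that instead, so the consuming
* instruction sees a plain unsigned source without the modifier.
*/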
3346 void
3347 vec4_visitor::resolve_ud_negate(src_reg *reg)
3348 {
3349 if (reg->type != BRW_REGISTER_TYPE_UD ||
3350 !reg->negate)
3351 return;
3352
3353 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3354 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3355 *reg = temp;
3356 }
3357
3358 vec4_visitor::vec4_visitor(struct brw_context *brw,
3359 struct brw_vec4_compile *c,
3360 struct gl_program *prog,
3361 const struct brw_vec4_prog_key *key,
3362 struct brw_vec4_prog_data *prog_data,
3363 struct gl_shader_program *shader_prog,
3364 gl_shader_stage stage,
3365 void *mem_ctx,
3366 bool debug_flag,
3367 bool no_spills,
3368 shader_time_shader_type st_base,
3369 shader_time_shader_type st_written,
3370 shader_time_shader_type st_reset)
3371 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3372 c(c),
3373 key(key),
3374 prog_data(prog_data),
3375 sanity_param_count(0),
3376 fail_msg(NULL),
3377 first_non_payload_grf(0),
3378 need_all_constants_in_pull_buffer(false),
3379 debug_flag(debug_flag),
3380 no_spills(no_spills),
3381 st_base(st_base),
3382 st_written(st_written),
3383 st_reset(st_reset)
3384 {
3385 this->mem_ctx = mem_ctx;
3386 this->failed = false;
3387
3388 this->base_ir = NULL;
3389 this->current_annotation = NULL;
3390 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3391
3392 this->variable_ht = hash_table_ctor(0,
3393 hash_table_pointer_hash,
3394 hash_table_pointer_compare);
3395
3396 this->virtual_grf_start = NULL;
3397 this->virtual_grf_end = NULL;
3398 this->virtual_grf_sizes = NULL;
3399 this->virtual_grf_count = 0;
3400 this->virtual_grf_reg_map = NULL;
3401 this->virtual_grf_reg_count = 0;
3402 this->virtual_grf_array_size = 0;
3403 this->live_intervals_valid = false;
3404
3405 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3406
3407 this->uniforms = 0;
3408
3409 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3410 * at least one. See setup_uniforms() in brw_vec4.cpp.
3411 */
3412 this->uniform_array_size = 1;
3413 if (prog_data) {
3414 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3415 }
3416
3417 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3418 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3419 }
3420
3421 vec4_visitor::~vec4_visitor()
3422 {
3423 hash_table_dtor(this->variable_ht);
3424 }
3425
3426
3427 void
3428 vec4_visitor::fail(const char *format, ...)
3429 {
3430 va_list va;
3431 char *msg;
3432
3433 if (failed)
3434 return;
3435
3436 failed = true;
3437
3438 va_start(va, format);
3439 msg = ralloc_vasprintf(mem_ctx, format, va);
3440 va_end(va);
3441 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3442
3443 this->fail_msg = msg;
3444
3445 if (debug_flag) {
3446 fprintf(stderr, "%s", msg);
3447 }
3448 }
3449
3450 } /* namespace brw */