i965: Store uniform constant values in a gl_constant_value instead of float
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
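/* For reference, each invocation below simply stamps out an emit helper.
 * ALU2(ADD), for instance, expands to (whitespace aside):
 *
 *    vec4_instruction *
 *    vec4_visitor::ADD(const dst_reg &dst, const src_reg &src0,
 *                      const src_reg &src1)
 *    {
 *       return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_ADD, dst,
 *                                            src0, src1);
 *    }
 */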
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
225 /* original gen4 does type conversion to the destination type
226 * before comparison, producing garbage results for floating
227 * point comparisons.
228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
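/* Worked example of the sequence below (values chosen for illustration):
 * packHalf2x16(vec2(1.0, -2.0)) -- half(1.0) = 0x3C00 and half(-2.0) = 0xC000.
 * F32TO16 leaves tmp.x = 0x00003C00 and tmp.y = 0x0000C000, the SHL of
 * tmp.yyyy by 16 puts 0xC0000000 in dst, and the final OR with tmp.xxxx
 * produces 0xC0003C00: the Y half in the high word, the X half in the low.
 */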
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 *   w z          y          x w z          y          x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
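/* Worked example of the sequence below (values chosen for illustration):
 * unpackHalf2x16(0xC0003C00) -- the AND leaves tmp.x = 0x3C00, the SHR
 * leaves tmp.y = 0xC000, and F16TO32 converts those halves to
 * dst.x = 1.0 and dst.y = -2.0.
 */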
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
574 /* Regardless of size of vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
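/* A few examples of the slot counts computed above: a float or vec3 takes
 * one vec4 slot, a mat3 takes matrix_columns == 3 slots, float[10] takes
 * 10 slots, and struct { vec3 a; float b; } takes 1 + 1 == 2 slots.
 */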
606
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = components;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static gl_constant_value zero = { .f = 0.0 };
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
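/* Layout example for the loop above: a (non-array) mat2 uniform has
 * vector_count == 1 * 2 and vector_elements == 2, so it occupies two
 * uniform slots whose param[0..1] entries point at the stored components
 * and whose param[2..3] entries point at `zero`.
 */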
706
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] =
719 (gl_constant_value *) &clip_planes[i][j];
720 }
721 ++this->uniforms;
722 }
723 }
724
725 /* Our support for builtin uniforms is even scarier than non-builtin.
726 * It sits on top of the PROG_STATE_VAR parameters that are
727 * automatically updated from GL context state.
728 */
729 void
730 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
731 {
732 const ir_state_slot *const slots = ir->state_slots;
733 assert(ir->state_slots != NULL);
734
735 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
736 /* This state reference has already been setup by ir_to_mesa,
737 * but we'll get the same index back here. We can reference
738 * ParameterValues directly, since unlike brw_fs.cpp, we never
739 * add new state references during compile.
740 */
741 int index = _mesa_add_state_reference(this->prog->Parameters,
742 (gl_state_index *)slots[i].tokens);
743 gl_constant_value *values =
744 &this->prog->Parameters->ParameterValues[index][0];
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
773 enum brw_predicate *predicate)
774 {
775 ir_expression *expr = ir->as_expression();
776
777 *predicate = BRW_PREDICATE_NORMAL;
778
779 if (expr) {
780 src_reg op[2];
781 vec4_instruction *inst;
782
783 assert(expr->get_num_operands() <= 2);
784 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
785 expr->operands[i]->accept(this);
786 op[i] = this->result;
787
788 resolve_ud_negate(&op[i]);
789 }
790
791 switch (expr->operation) {
792 case ir_unop_logic_not:
793 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
794 inst->conditional_mod = BRW_CONDITIONAL_Z;
795 break;
796
797 case ir_binop_logic_xor:
798 inst = emit(XOR(dst_null_d(), op[0], op[1]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 break;
801
802 case ir_binop_logic_or:
803 inst = emit(OR(dst_null_d(), op[0], op[1]));
804 inst->conditional_mod = BRW_CONDITIONAL_NZ;
805 break;
806
807 case ir_binop_logic_and:
808 inst = emit(AND(dst_null_d(), op[0], op[1]));
809 inst->conditional_mod = BRW_CONDITIONAL_NZ;
810 break;
811
812 case ir_unop_f2b:
813 if (brw->gen >= 6) {
814 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
815 } else {
816 inst = emit(MOV(dst_null_f(), op[0]));
817 inst->conditional_mod = BRW_CONDITIONAL_NZ;
818 }
819 break;
820
821 case ir_unop_i2b:
822 if (brw->gen >= 6) {
823 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
824 } else {
825 inst = emit(MOV(dst_null_d(), op[0]));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 }
828 break;
829
830 case ir_binop_all_equal:
831 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
832 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
833 break;
834
835 case ir_binop_any_nequal:
836 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
837 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
838 break;
839
840 case ir_unop_any:
841 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
842 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
843 break;
844
845 case ir_binop_greater:
846 case ir_binop_gequal:
847 case ir_binop_less:
848 case ir_binop_lequal:
849 case ir_binop_equal:
850 case ir_binop_nequal:
851 emit(CMP(dst_null_d(), op[0], op[1],
852 brw_conditional_for_comparison(expr->operation)));
853 break;
854
855 default:
856 unreachable("not reached");
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 unreachable("not reached");
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 unreachable("not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *)
1058 {
1059 unreachable("not reached");
1060 }
1061
1062 void
1063 vec4_visitor::visit(ir_function *ir)
1064 {
1065 /* Ignore function bodies other than main() -- we shouldn't see calls to
1066 * them since they should all be inlined.
1067 */
1068 if (strcmp(ir->name, "main") == 0) {
1069 const ir_function_signature *sig;
1070 exec_list empty;
1071
1072 sig = ir->matching_signature(NULL, &empty, false);
1073
1074 assert(sig);
1075
1076 visit_instructions(&sig->body);
1077 }
1078 }
1079
1080 bool
1081 vec4_visitor::try_emit_sat(ir_expression *ir)
1082 {
1083 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1084 if (!sat_src)
1085 return false;
1086
1087 sat_src->accept(this);
1088 src_reg src = this->result;
1089
1090 this->result = src_reg(this, ir->type);
1091 vec4_instruction *inst;
1092 inst = emit(MOV(dst_reg(this->result), src));
1093 inst->saturate = true;
1094
1095 return true;
1096 }
1097
1098 bool
1099 vec4_visitor::try_emit_mad(ir_expression *ir)
1100 {
1101 /* 3-src instructions were introduced in gen6. */
1102 if (brw->gen < 6)
1103 return false;
1104
1105 /* MAD can only handle floating-point data. */
1106 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1107 return false;
1108
1109 ir_rvalue *nonmul = ir->operands[1];
1110 ir_expression *mul = ir->operands[0]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul) {
1113 nonmul = ir->operands[0];
1114 mul = ir->operands[1]->as_expression();
1115
1116 if (!mul || mul->operation != ir_binop_mul)
1117 return false;
1118 }
1119
1120 nonmul->accept(this);
1121 src_reg src0 = fix_3src_operand(this->result);
1122
1123 mul->operands[0]->accept(this);
1124 src_reg src1 = fix_3src_operand(this->result);
1125
1126 mul->operands[1]->accept(this);
1127 src_reg src2 = fix_3src_operand(this->result);
1128
1129 this->result = src_reg(this, ir->type);
1130 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1131
1132 return true;
1133 }
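/* Example of the matching above: for `a + b * c`, operands[0] is not a
 * multiply, so nonmul becomes `a` and mul becomes `b * c`; the emitted
 * MAD(dst, a, b, c) then adds `a` to the product `b * c` (note the
 * reversed argument order, as in the ir_triop_fma case further down).
 */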
1134
1135 bool
1136 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1137 {
1138 ir_expression *const cmp = ir->operands[0]->as_expression();
1139
1140 if (cmp == NULL)
1141 return false;
1142
1143 switch (cmp->operation) {
1144 case ir_binop_less:
1145 case ir_binop_greater:
1146 case ir_binop_lequal:
1147 case ir_binop_gequal:
1148 case ir_binop_equal:
1149 case ir_binop_nequal:
1150 break;
1151
1152 default:
1153 return false;
1154 }
1155
1156 cmp->operands[0]->accept(this);
1157 const src_reg cmp_src0 = this->result;
1158
1159 cmp->operands[1]->accept(this);
1160 const src_reg cmp_src1 = this->result;
1161
1162 this->result = src_reg(this, ir->type);
1163
1164 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1165 brw_conditional_for_comparison(cmp->operation)));
1166
1167 /* If the comparison is false, this->result will just happen to be zero.
1168 */
1169 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1170 this->result, src_reg(1.0f));
1171 inst->predicate = BRW_PREDICATE_NORMAL;
1172 inst->predicate_inverse = true;
1173
1174 return true;
1175 }
1176
1177 void
1178 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1179 src_reg src0, src_reg src1)
1180 {
1181 vec4_instruction *inst;
1182
1183 if (brw->gen >= 6) {
1184 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1185 inst->conditional_mod = conditionalmod;
1186 } else {
1187 emit(CMP(dst, src0, src1, conditionalmod));
1188
1189 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1190 inst->predicate = BRW_PREDICATE_NORMAL;
1191 }
1192 }
1193
1194 void
1195 vec4_visitor::emit_lrp(const dst_reg &dst,
1196 const src_reg &x, const src_reg &y, const src_reg &a)
1197 {
1198 if (brw->gen >= 6) {
1199 /* Note that the instruction's argument order is reversed from GLSL
1200 * and the IR.
1201 */
1202 emit(LRP(dst,
1203 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1204 } else {
1205 /* Earlier generations don't support three source operations, so we
1206 * need to emit x*(1-a) + y*a.
1207 */
1208 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1209 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1210 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1211 y_times_a.writemask = dst.writemask;
1212 one_minus_a.writemask = dst.writemask;
1213 x_times_one_minus_a.writemask = dst.writemask;
1214
1215 emit(MUL(y_times_a, y, a));
1216 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1217 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1218 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1219 }
1220 }
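/* Worked example of the pre-gen6 fallback above: mix(2.0, 10.0, 0.25)
 * computes one_minus_a = 0.75, x_times_one_minus_a = 1.5, y_times_a = 2.5,
 * and the final ADD yields 4.0 -- the same value the gen6+ LRP path gives.
 */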
1221
1222 void
1223 vec4_visitor::visit(ir_expression *ir)
1224 {
1225 unsigned int operand;
1226 src_reg op[Elements(ir->operands)];
1227 src_reg result_src;
1228 dst_reg result_dst;
1229 vec4_instruction *inst;
1230
1231 if (try_emit_sat(ir))
1232 return;
1233
1234 if (ir->operation == ir_binop_add) {
1235 if (try_emit_mad(ir))
1236 return;
1237 }
1238
1239 if (ir->operation == ir_unop_b2f) {
1240 if (try_emit_b2f_of_compare(ir))
1241 return;
1242 }
1243
1244 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1245 this->result.file = BAD_FILE;
1246 ir->operands[operand]->accept(this);
1247 if (this->result.file == BAD_FILE) {
1248 fprintf(stderr, "Failed to get tree for expression operand:\n");
1249 ir->operands[operand]->fprint(stderr);
1250 exit(1);
1251 }
1252 op[operand] = this->result;
1253
1254 /* Matrix expression operands should have been broken down to vector
1255 * operations already.
1256 */
1257 assert(!ir->operands[operand]->type->is_matrix());
1258 }
1259
1260 int vector_elements = ir->operands[0]->type->vector_elements;
1261 if (ir->operands[1]) {
1262 vector_elements = MAX2(vector_elements,
1263 ir->operands[1]->type->vector_elements);
1264 }
1265
1266 this->result.file = BAD_FILE;
1267
1268 /* Storage for our result. Ideally for an assignment we'd be using
1269 * the actual storage for the result here, instead.
1270 */
1271 result_src = src_reg(this, ir->type);
1272 /* convenience for the emit functions below. */
1273 result_dst = dst_reg(result_src);
1274 /* If nothing special happens, this is the result. */
1275 this->result = result_src;
1276 /* Limit writes to the channels that will be used by result_src later.
1277 * This does limit this temp's use as a temporary for multi-instruction
1278 * sequences.
1279 */
1280 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1281
1282 switch (ir->operation) {
1283 case ir_unop_logic_not:
1284 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1285 * the ones' complement of the whole register, not just bit 0.
1286 */
1287 emit(XOR(result_dst, op[0], src_reg(1)));
1288 break;
1289 case ir_unop_neg:
1290 op[0].negate = !op[0].negate;
1291 emit(MOV(result_dst, op[0]));
1292 break;
1293 case ir_unop_abs:
1294 op[0].abs = true;
1295 op[0].negate = false;
1296 emit(MOV(result_dst, op[0]));
1297 break;
1298
1299 case ir_unop_sign:
1300 if (ir->type->is_float()) {
1301 /* AND(val, 0x80000000) gives the sign bit.
1302 *
1303 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1304 * zero.
1305 */
1306 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1307
1308 op[0].type = BRW_REGISTER_TYPE_UD;
1309 result_dst.type = BRW_REGISTER_TYPE_UD;
1310 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1311
1312 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1313 inst->predicate = BRW_PREDICATE_NORMAL;
1314
1315 this->result.type = BRW_REGISTER_TYPE_F;
1316 } else {
1317 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1318 * -> non-negative val generates 0x00000000.
1319 * Predicated OR sets 1 if val is positive.
1320 */
1321 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1322
1323 emit(ASR(result_dst, op[0], src_reg(31)));
1324
1325 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1326 inst->predicate = BRW_PREDICATE_NORMAL;
1327 }
1328 break;
1329
1330 case ir_unop_rcp:
1331 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1332 break;
1333
1334 case ir_unop_exp2:
1335 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1336 break;
1337 case ir_unop_log2:
1338 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1339 break;
1340 case ir_unop_exp:
1341 case ir_unop_log:
1342 unreachable("not reached: should be handled by ir_explog_to_explog2");
1343 case ir_unop_sin:
1344 case ir_unop_sin_reduced:
1345 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1346 break;
1347 case ir_unop_cos:
1348 case ir_unop_cos_reduced:
1349 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1350 break;
1351
1352 case ir_unop_dFdx:
1353 case ir_unop_dFdy:
1354 unreachable("derivatives not valid in vertex shader");
1355
1356 case ir_unop_bitfield_reverse:
1357 emit(BFREV(result_dst, op[0]));
1358 break;
1359 case ir_unop_bit_count:
1360 emit(CBIT(result_dst, op[0]));
1361 break;
1362 case ir_unop_find_msb: {
1363 src_reg temp = src_reg(this, glsl_type::uint_type);
1364
1365 inst = emit(FBH(dst_reg(temp), op[0]));
1366 inst->dst.writemask = WRITEMASK_XYZW;
1367
1368 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1369 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1370 * subtract the result from 31 to convert the MSB count into an LSB count.
1371 */
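/* For example, findMSB(0x00000100): FBH returns 23 (the set bit counted
 * from the MSB side), the CMP against -1 passes, and the predicated ADD
 * below computes 31 - 23 = 8. For findMSB(0), FBH returns 0xFFFFFFFF, the
 * CMP fails, the ADD is skipped, and the result stays -1 as GLSL requires.
 */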
1372
1373 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1374 temp.swizzle = BRW_SWIZZLE_NOOP;
1375 emit(MOV(result_dst, temp));
1376
1377 src_reg src_tmp = src_reg(result_dst);
1378 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1379
1380 src_tmp.negate = true;
1381 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1382 inst->predicate = BRW_PREDICATE_NORMAL;
1383 break;
1384 }
1385 case ir_unop_find_lsb:
1386 emit(FBL(result_dst, op[0]));
1387 break;
1388
1389 case ir_unop_noise:
1390 unreachable("not reached: should be handled by lower_noise");
1391
1392 case ir_binop_add:
1393 emit(ADD(result_dst, op[0], op[1]));
1394 break;
1395 case ir_binop_sub:
1396 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1397
1398 case ir_binop_mul:
1399 if (brw->gen < 8 && ir->type->is_integer()) {
1400 /* For integer multiplication, the MUL uses the low 16 bits of one of
1401 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1402 * accumulates in the contribution of the upper 16 bits of that
1403 * operand. If we can determine that one of the args is in the low
1404 * 16 bits, though, we can just emit a single MUL.
1405 */
1406 if (ir->operands[0]->is_uint16_constant()) {
1407 if (brw->gen < 7)
1408 emit(MUL(result_dst, op[0], op[1]));
1409 else
1410 emit(MUL(result_dst, op[1], op[0]));
1411 } else if (ir->operands[1]->is_uint16_constant()) {
1412 if (brw->gen < 7)
1413 emit(MUL(result_dst, op[1], op[0]));
1414 else
1415 emit(MUL(result_dst, op[0], op[1]));
1416 } else {
1417 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1418
1419 emit(MUL(acc, op[0], op[1]));
1420 emit(MACH(dst_null_d(), op[0], op[1]));
1421 emit(MOV(result_dst, src_reg(acc)));
1422 }
1423 } else {
1424 emit(MUL(result_dst, op[0], op[1]));
1425 }
1426 break;
1427 case ir_binop_imul_high: {
1428 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1429
1430 emit(MUL(acc, op[0], op[1]));
1431 emit(MACH(result_dst, op[0], op[1]));
1432 break;
1433 }
1434 case ir_binop_div:
1435 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1436 assert(ir->type->is_integer());
1437 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1438 break;
1439 case ir_binop_carry: {
1440 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1441
1442 emit(ADDC(dst_null_ud(), op[0], op[1]));
1443 emit(MOV(result_dst, src_reg(acc)));
1444 break;
1445 }
1446 case ir_binop_borrow: {
1447 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1448
1449 emit(SUBB(dst_null_ud(), op[0], op[1]));
1450 emit(MOV(result_dst, src_reg(acc)));
1451 break;
1452 }
1453 case ir_binop_mod:
1454 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1455 assert(ir->type->is_integer());
1456 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1457 break;
1458
1459 case ir_binop_less:
1460 case ir_binop_greater:
1461 case ir_binop_lequal:
1462 case ir_binop_gequal:
1463 case ir_binop_equal:
1464 case ir_binop_nequal: {
1465 emit(CMP(result_dst, op[0], op[1],
1466 brw_conditional_for_comparison(ir->operation)));
1467 emit(AND(result_dst, result_src, src_reg(0x1)));
1468 break;
1469 }
1470
1471 case ir_binop_all_equal:
1472 /* "==" operator producing a scalar boolean. */
1473 if (ir->operands[0]->type->is_vector() ||
1474 ir->operands[1]->type->is_vector()) {
1475 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1476 emit(MOV(result_dst, src_reg(0)));
1477 inst = emit(MOV(result_dst, src_reg(1)));
1478 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1479 } else {
1480 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1481 emit(AND(result_dst, result_src, src_reg(0x1)));
1482 }
1483 break;
1484 case ir_binop_any_nequal:
1485 /* "!=" operator producing a scalar boolean. */
1486 if (ir->operands[0]->type->is_vector() ||
1487 ir->operands[1]->type->is_vector()) {
1488 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1489
1490 emit(MOV(result_dst, src_reg(0)));
1491 inst = emit(MOV(result_dst, src_reg(1)));
1492 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1493 } else {
1494 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1495 emit(AND(result_dst, result_src, src_reg(0x1)));
1496 }
1497 break;
1498
1499 case ir_unop_any:
1500 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1501 emit(MOV(result_dst, src_reg(0)));
1502
1503 inst = emit(MOV(result_dst, src_reg(1)));
1504 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1505 break;
1506
1507 case ir_binop_logic_xor:
1508 emit(XOR(result_dst, op[0], op[1]));
1509 break;
1510
1511 case ir_binop_logic_or:
1512 emit(OR(result_dst, op[0], op[1]));
1513 break;
1514
1515 case ir_binop_logic_and:
1516 emit(AND(result_dst, op[0], op[1]));
1517 break;
1518
1519 case ir_binop_dot:
1520 assert(ir->operands[0]->type->is_vector());
1521 assert(ir->operands[0]->type == ir->operands[1]->type);
1522 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1523 break;
1524
1525 case ir_unop_sqrt:
1526 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1527 break;
1528 case ir_unop_rsq:
1529 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1530 break;
1531
1532 case ir_unop_bitcast_i2f:
1533 case ir_unop_bitcast_u2f:
1534 this->result = op[0];
1535 this->result.type = BRW_REGISTER_TYPE_F;
1536 break;
1537
1538 case ir_unop_bitcast_f2i:
1539 this->result = op[0];
1540 this->result.type = BRW_REGISTER_TYPE_D;
1541 break;
1542
1543 case ir_unop_bitcast_f2u:
1544 this->result = op[0];
1545 this->result.type = BRW_REGISTER_TYPE_UD;
1546 break;
1547
1548 case ir_unop_i2f:
1549 case ir_unop_i2u:
1550 case ir_unop_u2i:
1551 case ir_unop_u2f:
1552 case ir_unop_b2f:
1553 case ir_unop_b2i:
1554 case ir_unop_f2i:
1555 case ir_unop_f2u:
1556 emit(MOV(result_dst, op[0]));
1557 break;
1558 case ir_unop_f2b:
1559 case ir_unop_i2b: {
1560 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1561 emit(AND(result_dst, result_src, src_reg(1)));
1562 break;
1563 }
1564
1565 case ir_unop_trunc:
1566 emit(RNDZ(result_dst, op[0]));
1567 break;
1568 case ir_unop_ceil:
1569 op[0].negate = !op[0].negate;
1570 inst = emit(RNDD(result_dst, op[0]));
1571 this->result.negate = true;
1572 break;
1573 case ir_unop_floor:
1574 inst = emit(RNDD(result_dst, op[0]));
1575 break;
1576 case ir_unop_fract:
1577 inst = emit(FRC(result_dst, op[0]));
1578 break;
1579 case ir_unop_round_even:
1580 emit(RNDE(result_dst, op[0]));
1581 break;
1582
1583 case ir_binop_min:
1584 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1585 break;
1586 case ir_binop_max:
1587 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1588 break;
1589
1590 case ir_binop_pow:
1591 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1592 break;
1593
1594 case ir_unop_bit_not:
1595 inst = emit(NOT(result_dst, op[0]));
1596 break;
1597 case ir_binop_bit_and:
1598 inst = emit(AND(result_dst, op[0], op[1]));
1599 break;
1600 case ir_binop_bit_xor:
1601 inst = emit(XOR(result_dst, op[0], op[1]));
1602 break;
1603 case ir_binop_bit_or:
1604 inst = emit(OR(result_dst, op[0], op[1]));
1605 break;
1606
1607 case ir_binop_lshift:
1608 inst = emit(SHL(result_dst, op[0], op[1]));
1609 break;
1610
1611 case ir_binop_rshift:
1612 if (ir->type->base_type == GLSL_TYPE_INT)
1613 inst = emit(ASR(result_dst, op[0], op[1]));
1614 else
1615 inst = emit(SHR(result_dst, op[0], op[1]));
1616 break;
1617
1618 case ir_binop_bfm:
1619 emit(BFI1(result_dst, op[0], op[1]));
1620 break;
1621
1622 case ir_binop_ubo_load: {
1623 ir_constant *uniform_block = ir->operands[0]->as_constant();
1624 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1625 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1626 src_reg offset;
1627
1628 /* Now, load the vector from that offset. */
1629 assert(ir->type->is_vector() || ir->type->is_scalar());
1630
1631 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1632 packed_consts.type = result.type;
1633 src_reg surf_index =
1634 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1635 if (const_offset_ir) {
1636 if (brw->gen >= 8) {
1637 /* Store the offset in a GRF so we can send-from-GRF. */
1638 offset = src_reg(this, glsl_type::int_type);
1639 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1640 } else {
1641 /* Immediates are fine on older generations since they'll be moved
1642 * to a (potentially fake) MRF at the generator level.
1643 */
1644 offset = src_reg(const_offset / 16);
1645 }
1646 } else {
1647 offset = src_reg(this, glsl_type::uint_type);
1648 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1649 }
1650
1651 if (brw->gen >= 7) {
1652 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1653 grf_offset.type = offset.type;
1654
1655 emit(MOV(grf_offset, offset));
1656
1657 emit(new(mem_ctx) vec4_instruction(this,
1658 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1659 dst_reg(packed_consts),
1660 surf_index,
1661 src_reg(grf_offset)));
1662 } else {
1663 vec4_instruction *pull =
1664 emit(new(mem_ctx) vec4_instruction(this,
1665 VS_OPCODE_PULL_CONSTANT_LOAD,
1666 dst_reg(packed_consts),
1667 surf_index,
1668 offset));
1669 pull->base_mrf = 14;
1670 pull->mlen = 1;
1671 }
1672
1673 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1674 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1675 const_offset % 16 / 4,
1676 const_offset % 16 / 4,
1677 const_offset % 16 / 4);
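/* Offset arithmetic example (illustrative): a scalar UBO load at constant
 * byte offset 20 fetches the 16-byte slot at offset 20 / 16 == 1 and then
 * bumps the swizzle by (20 % 16) / 4 == 1, selecting the .y channel of
 * the fetched vec4.
 */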
1678
1679 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1680 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1681 emit(CMP(result_dst, packed_consts, src_reg(0u),
1682 BRW_CONDITIONAL_NZ));
1683 emit(AND(result_dst, result, src_reg(0x1)));
1684 } else {
1685 emit(MOV(result_dst, packed_consts));
1686 }
1687 break;
1688 }
1689
1690 case ir_binop_vector_extract:
1691 unreachable("should have been lowered by vec_index_to_cond_assign");
1692
1693 case ir_triop_fma:
1694 op[0] = fix_3src_operand(op[0]);
1695 op[1] = fix_3src_operand(op[1]);
1696 op[2] = fix_3src_operand(op[2]);
1697 /* Note that the instruction's argument order is reversed from GLSL
1698 * and the IR.
1699 */
1700 emit(MAD(result_dst, op[2], op[1], op[0]));
1701 break;
1702
1703 case ir_triop_lrp:
1704 emit_lrp(result_dst, op[0], op[1], op[2]);
1705 break;
1706
1707 case ir_triop_csel:
1708 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1709 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1710 inst->predicate = BRW_PREDICATE_NORMAL;
1711 break;
1712
1713 case ir_triop_bfi:
1714 op[0] = fix_3src_operand(op[0]);
1715 op[1] = fix_3src_operand(op[1]);
1716 op[2] = fix_3src_operand(op[2]);
1717 emit(BFI2(result_dst, op[0], op[1], op[2]));
1718 break;
1719
1720 case ir_triop_bitfield_extract:
1721 op[0] = fix_3src_operand(op[0]);
1722 op[1] = fix_3src_operand(op[1]);
1723 op[2] = fix_3src_operand(op[2]);
1724 /* Note that the instruction's argument order is reversed from GLSL
1725 * and the IR.
1726 */
1727 emit(BFE(result_dst, op[2], op[1], op[0]));
1728 break;
1729
1730 case ir_triop_vector_insert:
1731 unreachable("should have been lowered by lower_vector_insert");
1732
1733 case ir_quadop_bitfield_insert:
1734 unreachable("not reached: should be handled by "
1735 "bitfield_insert_to_bfm_bfi\n");
1736
1737 case ir_quadop_vector:
1738 unreachable("not reached: should be handled by lower_quadop_vector");
1739
1740 case ir_unop_pack_half_2x16:
1741 emit_pack_half_2x16(result_dst, op[0]);
1742 break;
1743 case ir_unop_unpack_half_2x16:
1744 emit_unpack_half_2x16(result_dst, op[0]);
1745 break;
1746 case ir_unop_pack_snorm_2x16:
1747 case ir_unop_pack_snorm_4x8:
1748 case ir_unop_pack_unorm_2x16:
1749 case ir_unop_pack_unorm_4x8:
1750 case ir_unop_unpack_snorm_2x16:
1751 case ir_unop_unpack_snorm_4x8:
1752 case ir_unop_unpack_unorm_2x16:
1753 case ir_unop_unpack_unorm_4x8:
1754 unreachable("not reached: should be handled by lower_packing_builtins");
1755 case ir_unop_unpack_half_2x16_split_x:
1756 case ir_unop_unpack_half_2x16_split_y:
1757 case ir_binop_pack_half_2x16_split:
1758 case ir_unop_interpolate_at_centroid:
1759 case ir_binop_interpolate_at_sample:
1760 case ir_binop_interpolate_at_offset:
1761 unreachable("not reached: should not occur in vertex shader");
1762 case ir_binop_ldexp:
1763 unreachable("not reached: should be handled by ldexp_to_arith()");
1764 }
1765 }
1766
1767
1768 void
1769 vec4_visitor::visit(ir_swizzle *ir)
1770 {
1771 src_reg src;
1772 int i = 0;
1773 int swizzle[4];
1774
1775 /* Note that this is only swizzles in expressions, not those on the left
1776 * hand side of an assignment, which do write masking. See ir_assignment
1777 * for that.
1778 */
1779
1780 ir->val->accept(this);
1781 src = this->result;
1782 assert(src.file != BAD_FILE);
1783
1784 for (i = 0; i < ir->type->vector_elements; i++) {
1785 switch (i) {
1786 case 0:
1787 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1788 break;
1789 case 1:
1790 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1791 break;
1792 case 2:
1793 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1794 break;
1795 case 3:
1796 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1797 break;
1798 }
1799 }
1800 for (; i < 4; i++) {
1801 /* Replicate the last channel out. */
1802 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1803 }
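/* For instance, a .yx swizzle of an unswizzled vec2 yields
 * swizzle[] = { y, x, x, x }: the trailing channels just repeat x.
 */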
1804
1805 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1806
1807 this->result = src;
1808 }
1809
1810 void
1811 vec4_visitor::visit(ir_dereference_variable *ir)
1812 {
1813 const struct glsl_type *type = ir->type;
1814 dst_reg *reg = variable_storage(ir->var);
1815
1816 if (!reg) {
1817 fail("Failed to find variable storage for %s\n", ir->var->name);
1818 this->result = src_reg(brw_null_reg());
1819 return;
1820 }
1821
1822 this->result = src_reg(*reg);
1823
1824 /* System values get their swizzle from the dst_reg writemask */
1825 if (ir->var->data.mode == ir_var_system_value)
1826 return;
1827
1828 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1829 this->result.swizzle = swizzle_for_size(type->vector_elements);
1830 }
1831
1832
1833 int
1834 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1835 {
1836 /* Under normal circumstances array elements are stored consecutively, so
1837 * the stride is equal to the size of the array element.
1838 */
1839 return type_size(ir->type);
1840 }
1841
1842
1843 void
1844 vec4_visitor::visit(ir_dereference_array *ir)
1845 {
1846 ir_constant *constant_index;
1847 src_reg src;
1848 int array_stride = compute_array_stride(ir);
1849
1850 constant_index = ir->array_index->constant_expression_value();
1851
1852 ir->array->accept(this);
1853 src = this->result;
1854
1855 if (constant_index) {
1856 src.reg_offset += constant_index->value.i[0] * array_stride;
1857 } else {
1858 /* Variable index array dereference. It eats the "vec4" of the
1859 * base of the array and an index that offsets the Mesa register
1860 * index.
1861 */
1862 ir->array_index->accept(this);
1863
1864 src_reg index_reg;
1865
1866 if (array_stride == 1) {
1867 index_reg = this->result;
1868 } else {
1869 index_reg = src_reg(this, glsl_type::int_type);
1870
1871 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1872 }
1873
1874 if (src.reladdr) {
1875 src_reg temp = src_reg(this, glsl_type::int_type);
1876
1877 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1878
1879 index_reg = temp;
1880 }
1881
1882 src.reladdr = ralloc(mem_ctx, src_reg);
1883 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1884 }
1885
1886 /* If the type is smaller than a vec4, replicate the last channel out. */
1887 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1888 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1889 else
1890 src.swizzle = BRW_SWIZZLE_NOOP;
1891 src.type = brw_type_for_base_type(ir->type);
1892
1893 this->result = src;
1894 }
1895
1896 void
1897 vec4_visitor::visit(ir_dereference_record *ir)
1898 {
1899 unsigned int i;
1900 const glsl_type *struct_type = ir->record->type;
1901 int offset = 0;
1902
1903 ir->record->accept(this);
1904
1905 for (i = 0; i < struct_type->length; i++) {
1906 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1907 break;
1908 offset += type_size(struct_type->fields.structure[i].type);
1909 }
1910
1911 /* If the type is smaller than a vec4, replicate the last channel out. */
1912 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1913 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1914 else
1915 this->result.swizzle = BRW_SWIZZLE_NOOP;
1916 this->result.type = brw_type_for_base_type(ir->type);
1917
1918 this->result.reg_offset += offset;
1919 }
1920
1921 /**
1922 * We want to be careful in assignment setup to hit the actual storage
1923 * instead of potentially using a temporary like we might with the
1924 * ir_dereference handler.
1925 */
1926 static dst_reg
1927 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1928 {
1929 /* The LHS must be a dereference. If the LHS is a variable indexed array
1930 * access of a vector, it must be separated into a series of conditional moves
1931 * before reaching this point (see ir_vec_index_to_cond_assign).
1932 */
1933 assert(ir->as_dereference());
1934 ir_dereference_array *deref_array = ir->as_dereference_array();
1935 if (deref_array) {
1936 assert(!deref_array->array->type->is_vector());
1937 }
1938
1939 /* Use the rvalue deref handler for the most part. We'll ignore
1940 * swizzles in it and write swizzles using writemask, though.
1941 */
1942 ir->accept(v);
1943 return dst_reg(v->result);
1944 }
1945
1946 void
1947 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1948 const struct glsl_type *type,
1949 enum brw_predicate predicate)
1950 {
1951 if (type->base_type == GLSL_TYPE_STRUCT) {
1952 for (unsigned int i = 0; i < type->length; i++) {
1953 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1954 }
1955 return;
1956 }
1957
1958 if (type->is_array()) {
1959 for (unsigned int i = 0; i < type->length; i++) {
1960 emit_block_move(dst, src, type->fields.array, predicate);
1961 }
1962 return;
1963 }
1964
1965 if (type->is_matrix()) {
1966 const struct glsl_type *vec_type;
1967
1968 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1969 type->vector_elements, 1);
1970
1971 for (int i = 0; i < type->matrix_columns; i++) {
1972 emit_block_move(dst, src, vec_type, predicate);
1973 }
1974 return;
1975 }
1976
1977 assert(type->is_scalar() || type->is_vector());
1978
1979 dst->type = brw_type_for_base_type(type);
1980 src->type = dst->type;
1981
1982 dst->writemask = (1 << type->vector_elements) - 1;
1983
1984 src->swizzle = swizzle_for_size(type->vector_elements);
1985
1986 vec4_instruction *inst = emit(MOV(*dst, *src));
1987 inst->predicate = predicate;
1988
1989 dst->reg_offset++;
1990 src->reg_offset++;
1991 }
1992
1993
1994 /* If the RHS processing resulted in an instruction generating a
1995 * temporary value, and it would be easy to rewrite the instruction to
1996 * generate its result right into the LHS instead, do so. This ends
1997 * up reliably removing instructions where it can be tricky to do so
1998 * later without real UD chain information.
1999 */
2000 bool
2001 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2002 dst_reg dst,
2003 src_reg src,
2004 vec4_instruction *pre_rhs_inst,
2005 vec4_instruction *last_rhs_inst)
2006 {
2007 /* This could be supported, but it would take more smarts. */
2008 if (ir->condition)
2009 return false;
2010
2011 if (pre_rhs_inst == last_rhs_inst)
2012 return false; /* No instructions generated to work with. */
2013
2014 /* Make sure the last instruction generated our source reg. */
2015 if (src.file != GRF ||
2016 src.file != last_rhs_inst->dst.file ||
2017 src.reg != last_rhs_inst->dst.reg ||
2018 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2019 src.reladdr ||
2020 src.abs ||
2021 src.negate ||
2022 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2023 return false;
2024
2025 /* Check that the last instruction fully initialized the channels
2026 * we want to use, in the order we want to use them. We could
2027 * potentially reswizzle the operands of many instructions so that
2028 * we could handle out of order channels, but don't yet.
2029 */
2030
2031 for (unsigned i = 0; i < 4; i++) {
2032 if (dst.writemask & (1 << i)) {
2033 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2034 return false;
2035
2036 if (BRW_GET_SWZ(src.swizzle, i) != i)
2037 return false;
2038 }
2039 }
2040
2041 /* Success! Rewrite the instruction. */
2042 last_rhs_inst->dst.file = dst.file;
2043 last_rhs_inst->dst.reg = dst.reg;
2044 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2045 last_rhs_inst->dst.reladdr = dst.reladdr;
2046 last_rhs_inst->dst.writemask &= dst.writemask;
2047
2048 return true;
2049 }
2050
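/**
 * Handle an assignment: structures, arrays and matrices are copied with
 * emit_block_move(); scalar/vector assignments swizzle the RHS into the
 * channels named by the write mask and, where possible, fold the copy away
 * by retargeting the instruction that produced the RHS (see
 * try_rewrite_rhs_to_dst).
 */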
2051 void
2052 vec4_visitor::visit(ir_assignment *ir)
2053 {
2054 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2055 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2056
2057 if (!ir->lhs->type->is_scalar() &&
2058 !ir->lhs->type->is_vector()) {
2059 ir->rhs->accept(this);
2060 src_reg src = this->result;
2061
2062 if (ir->condition) {
2063 emit_bool_to_cond_code(ir->condition, &predicate);
2064 }
2065
2066 /* emit_block_move doesn't account for swizzles in the source register.
2067 * This should be ok, since the source register is a structure or an
2068 * array, and those can't be swizzled. But double-check to be sure.
2069 */
2070 assert(src.swizzle ==
2071 (ir->rhs->type->is_matrix()
2072 ? swizzle_for_size(ir->rhs->type->vector_elements)
2073 : BRW_SWIZZLE_NOOP));
2074
2075 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2076 return;
2077 }
2078
2079 /* Now we're down to just a scalar/vector with writemasks. */
2080 int i;
2081
2082 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2083 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2084
2085 ir->rhs->accept(this);
2086
2087 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2088
2089 src_reg src = this->result;
2090
2091 int swizzles[4];
2092 int first_enabled_chan = 0;
2093 int src_chan = 0;
2094
2095 assert(ir->lhs->type->is_vector() ||
2096 ir->lhs->type->is_scalar());
2097 dst.writemask = ir->write_mask;
2098
2099 for (int i = 0; i < 4; i++) {
2100 if (dst.writemask & (1 << i)) {
2101 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2102 break;
2103 }
2104 }
2105
2106 /* Swizzle a small RHS vector into the channels being written.
2107 *
2108 * GLSL IR treats write_mask as dictating how many channels are
2109 * present on the RHS, while in our instructions we need to make
2110 * those channels appear in the slots of the vec4 they're written to.
2111 */
2112 for (int i = 0; i < 4; i++) {
2113 if (dst.writemask & (1 << i))
2114 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2115 else
2116 swizzles[i] = first_enabled_chan;
2117 }
2118 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2119 swizzles[2], swizzles[3]);
2120
2121 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2122 return;
2123 }
2124
2125 if (ir->condition) {
2126 emit_bool_to_cond_code(ir->condition, &predicate);
2127 }
2128
2129 for (i = 0; i < type_size(ir->lhs->type); i++) {
2130 vec4_instruction *inst = emit(MOV(dst, src));
2131 inst->predicate = predicate;
2132
2133 dst.reg_offset++;
2134 src.reg_offset++;
2135 }
2136 }
2137
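/**
 * Emit immediate MOVs for an ir_constant, recursing through structure
 * fields, array elements and matrix columns.  For vectors, channels that
 * share the same value are coalesced into a single writemasked MOV.
 */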
2138 void
2139 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2140 {
2141 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2142 foreach_in_list(ir_constant, field_value, &ir->components) {
2143 emit_constant_values(dst, field_value);
2144 }
2145 return;
2146 }
2147
2148 if (ir->type->is_array()) {
2149 for (unsigned int i = 0; i < ir->type->length; i++) {
2150 emit_constant_values(dst, ir->array_elements[i]);
2151 }
2152 return;
2153 }
2154
2155 if (ir->type->is_matrix()) {
2156 for (int i = 0; i < ir->type->matrix_columns; i++) {
2157 float *vec = &ir->value.f[i * ir->type->vector_elements];
2158
2159 for (int j = 0; j < ir->type->vector_elements; j++) {
2160 dst->writemask = 1 << j;
2161 dst->type = BRW_REGISTER_TYPE_F;
2162
2163 emit(MOV(*dst, src_reg(vec[j])));
2164 }
2165 dst->reg_offset++;
2166 }
2167 return;
2168 }
2169
2170 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2171
2172 for (int i = 0; i < ir->type->vector_elements; i++) {
2173 if (!(remaining_writemask & (1 << i)))
2174 continue;
2175
2176 dst->writemask = 1 << i;
2177 dst->type = brw_type_for_base_type(ir->type);
2178
2179 /* Find other components that match the one we're about to
2180 * write. Emits fewer instructions for things like vec4(0.5,
2181 * 1.5, 1.5, 1.5).
2182 */
2183 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2184 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2185 if (ir->value.b[i] == ir->value.b[j])
2186 dst->writemask |= (1 << j);
2187 } else {
2188 /* u, i, and f storage all line up, so no need for a
2189 * switch case for comparing each type.
2190 */
2191 if (ir->value.u[i] == ir->value.u[j])
2192 dst->writemask |= (1 << j);
2193 }
2194 }
2195
2196 switch (ir->type->base_type) {
2197 case GLSL_TYPE_FLOAT:
2198 emit(MOV(*dst, src_reg(ir->value.f[i])));
2199 break;
2200 case GLSL_TYPE_INT:
2201 emit(MOV(*dst, src_reg(ir->value.i[i])));
2202 break;
2203 case GLSL_TYPE_UINT:
2204 emit(MOV(*dst, src_reg(ir->value.u[i])));
2205 break;
2206 case GLSL_TYPE_BOOL:
2207 emit(MOV(*dst, src_reg(ir->value.b[i])));
2208 break;
2209 default:
2210 unreachable("Non-float/uint/int/bool constant");
2211 }
2212
2213 remaining_writemask &= ~dst->writemask;
2214 }
2215 dst->reg_offset++;
2216 }
2217
2218 void
2219 vec4_visitor::visit(ir_constant *ir)
2220 {
2221 dst_reg dst = dst_reg(this, ir->type);
2222 this->result = src_reg(dst);
2223
2224 emit_constant_values(&dst, ir);
2225 }
2226
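/**
 * Translate the atomic counter intrinsics (read/increment/predecrement)
 * into untyped surface read / untyped atomic messages aimed at the
 * counter's binding table surface and byte offset within it.
 */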
2227 void
2228 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2229 {
2230 ir_dereference *deref = static_cast<ir_dereference *>(
2231 ir->actual_parameters.get_head());
2232 ir_variable *location = deref->variable_referenced();
2233 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2234 location->data.atomic.buffer_index);
2235
2236 /* Calculate the surface offset */
2237 src_reg offset(this, glsl_type::uint_type);
2238 ir_dereference_array *deref_array = deref->as_dereference_array();
2239 if (deref_array) {
2240 deref_array->array_index->accept(this);
2241
2242 src_reg tmp(this, glsl_type::uint_type);
2243 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2244 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2245 } else {
2246 offset = location->data.atomic.offset;
2247 }
2248
2249 /* Emit the appropriate machine instruction */
2250 const char *callee = ir->callee->function_name();
2251 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2252
2253 if (!strcmp("__intrinsic_atomic_read", callee)) {
2254 emit_untyped_surface_read(surf_index, dst, offset);
2255
2256 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2257 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2258 src_reg(), src_reg());
2259
2260 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2261 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2262 src_reg(), src_reg());
2263 }
2264 }
2265
2266 void
2267 vec4_visitor::visit(ir_call *ir)
2268 {
2269 const char *callee = ir->callee->function_name();
2270
2271 if (!strcmp("__intrinsic_atomic_read", callee) ||
2272 !strcmp("__intrinsic_atomic_increment", callee) ||
2273 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2274 visit_atomic_counter_intrinsic(ir);
2275 } else {
2276 unreachable("Unsupported intrinsic.");
2277 }
2278 }
2279
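/**
 * Emit a TXF_MCS message to fetch the multisample control surface data for
 * the given coordinate, as needed for compressed multisample surfaces on
 * Gen7+.  Returns the register holding the MCS value.
 */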
2280 src_reg
2281 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, uint32_t sampler)
2282 {
2283 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2284 inst->base_mrf = 2;
2285 inst->mlen = 1;
2286 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2287 inst->dst.writemask = WRITEMASK_XYZW;
2288
2289 inst->src[1] = src_reg(sampler);
2290
2291 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2292 int param_base = inst->base_mrf;
2293 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2294 int zero_mask = 0xf & ~coord_mask;
2295
2296 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2297 coordinate));
2298
2299 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2300 src_reg(0)));
2301
2302 emit(inst);
2303 return src_reg(inst->dst);
2304 }
2305
2306 void
2307 vec4_visitor::visit(ir_texture *ir)
2308 {
2309 uint32_t sampler =
2310 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2311
2312 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2313 * emitting anything other than setting up the constant result.
2314 */
2315 if (ir->op == ir_tg4) {
2316 ir_constant *chan = ir->lod_info.component->as_constant();
2317 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2318 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2319 dst_reg result(this, ir->type);
2320 this->result = src_reg(result);
2321 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2322 return;
2323 }
2324 }
2325
2326 /* Should be lowered by do_lower_texture_projection */
2327 assert(!ir->projector);
2328
2329 /* Should be lowered */
2330 assert(!ir->offset || !ir->offset->type->is_array());
2331
2332 /* Generate code to compute all the subexpression trees. This has to be
2333 * done before loading any values into MRFs for the sampler message since
2334 * generating these values may involve SEND messages that need the MRFs.
2335 */
2336 src_reg coordinate;
2337 if (ir->coordinate) {
2338 ir->coordinate->accept(this);
2339 coordinate = this->result;
2340 }
2341
2342 src_reg shadow_comparitor;
2343 if (ir->shadow_comparitor) {
2344 ir->shadow_comparitor->accept(this);
2345 shadow_comparitor = this->result;
2346 }
2347
2348 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2349 src_reg offset_value;
2350 if (has_nonconstant_offset) {
2351 ir->offset->accept(this);
2352 offset_value = src_reg(this->result);
2353 }
2354
2355 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2356 src_reg lod, dPdx, dPdy, sample_index, mcs;
2357 switch (ir->op) {
2358 case ir_tex:
2359 lod = src_reg(0.0f);
2360 lod_type = glsl_type::float_type;
2361 break;
2362 case ir_txf:
2363 case ir_txl:
2364 case ir_txs:
2365 ir->lod_info.lod->accept(this);
2366 lod = this->result;
2367 lod_type = ir->lod_info.lod->type;
2368 break;
2369 case ir_query_levels:
2370 lod = src_reg(0);
2371 lod_type = glsl_type::int_type;
2372 break;
2373 case ir_txf_ms:
2374 ir->lod_info.sample_index->accept(this);
2375 sample_index = this->result;
2376 sample_index_type = ir->lod_info.sample_index->type;
2377
2378 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2379 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2380 else
2381 mcs = src_reg(0u);
2382 break;
2383 case ir_txd:
2384 ir->lod_info.grad.dPdx->accept(this);
2385 dPdx = this->result;
2386
2387 ir->lod_info.grad.dPdy->accept(this);
2388 dPdy = this->result;
2389
2390 lod_type = ir->lod_info.grad.dPdx->type;
2391 break;
2392 case ir_txb:
2393 case ir_lod:
2394 case ir_tg4:
2395 break;
2396 }
2397
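/* Select the sampler message opcode.  Note that ir_tex maps to TXL:
 * vertex shaders cannot compute implicit derivatives, so an ordinary
 * texture() lookup is performed as an explicit-LOD lookup using the
 * lod of 0.0 loaded above.
 */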
2398 enum opcode opcode;
2399 switch (ir->op) {
2400 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2401 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2402 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2403 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2404 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2405 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2406 case ir_tg4: opcode = has_nonconstant_offset
2407 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2408 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2409 case ir_txb:
2410 unreachable("TXB is not valid for vertex shaders.");
2411 case ir_lod:
2412 unreachable("LOD is not valid for vertex shaders.");
2413 default:
2414 unreachable("Unrecognized tex op");
2415 }
2416
2417 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2418
2419 if (ir->offset != NULL && ir->op != ir_txf)
2420 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2421
2422 /* Stuff the channel select bits in the top of the texture offset */
2423 if (ir->op == ir_tg4)
2424 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2425
2426 /* The message header is necessary for:
2427 * - Gen4 (always)
2428 * - Texel offsets
2429 * - Gather channel selection
2430 * - Sampler indices too large to fit in a 4-bit value.
2431 */
2432 inst->header_present =
2433 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2434 sampler >= 16;
2435 inst->base_mrf = 2;
2436 inst->mlen = inst->header_present + 1; /* always at least one */
2437 inst->dst = dst_reg(this, ir->type);
2438 inst->dst.writemask = WRITEMASK_XYZW;
2439 inst->shadow_compare = ir->shadow_comparitor != NULL;
2440
2441 inst->src[1] = src_reg(sampler);
2442
2443 /* MRF for the first parameter */
2444 int param_base = inst->base_mrf + inst->header_present;
2445
2446 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2447 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2448 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2449 } else {
2450 /* Load the coordinate */
2451 /* FINISHME: gl_clamp_mask and saturate */
2452 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2453 int zero_mask = 0xf & ~coord_mask;
2454
2455 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2456 coordinate));
2457
2458 if (zero_mask != 0) {
2459 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2460 src_reg(0)));
2461 }
2462 /* Load the shadow comparitor */
2463 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2464 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2465 WRITEMASK_X),
2466 shadow_comparitor));
2467 inst->mlen++;
2468 }
2469
2470 /* Load the LOD info */
2471 if (ir->op == ir_tex || ir->op == ir_txl) {
2472 int mrf, writemask;
2473 if (brw->gen >= 5) {
2474 mrf = param_base + 1;
2475 if (ir->shadow_comparitor) {
2476 writemask = WRITEMASK_Y;
2477 /* mlen already incremented */
2478 } else {
2479 writemask = WRITEMASK_X;
2480 inst->mlen++;
2481 }
2482 } else /* brw->gen == 4 */ {
2483 mrf = param_base;
2484 writemask = WRITEMASK_W;
2485 }
2486 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2487 } else if (ir->op == ir_txf) {
2488 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2489 } else if (ir->op == ir_txf_ms) {
2490 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2491 sample_index));
2492 if (brw->gen >= 7) {
2493 /* MCS data is in the first channel of `mcs`, but we need to get it into
2494 * the .y channel of the second vec4 of params, so replicate .x across
2495 * the whole vec4 and then mask off everything except .y
2496 */
2497 mcs.swizzle = BRW_SWIZZLE_XXXX;
2498 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2499 mcs));
}
2500 inst->mlen++;
2501 } else if (ir->op == ir_txd) {
2502 const glsl_type *type = lod_type;
2503
2504 if (brw->gen >= 5) {
2505 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2506 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2507 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2508 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2509 inst->mlen++;
2510
2511 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2512 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2513 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2514 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2515 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2516 inst->mlen++;
2517
2518 if (ir->shadow_comparitor) {
2519 emit(MOV(dst_reg(MRF, param_base + 2,
2520 ir->shadow_comparitor->type, WRITEMASK_Z),
2521 shadow_comparitor));
2522 }
2523 }
2524 } else /* brw->gen == 4 */ {
2525 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2526 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2527 inst->mlen += 2;
2528 }
2529 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2530 if (ir->shadow_comparitor) {
2531 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2532 shadow_comparitor));
2533 }
2534
2535 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2536 offset_value));
2537 inst->mlen++;
2538 }
2539 }
2540
2541 emit(inst);
2542
2543 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2544 * spec requires layers.
2545 */
2546 if (ir->op == ir_txs) {
2547 glsl_type const *type = ir->sampler->type;
2548 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2549 type->sampler_array) {
2550 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2551 writemask(inst->dst, WRITEMASK_Z),
2552 src_reg(inst->dst), src_reg(6));
2553 }
2554 }
2555
2556 if (brw->gen == 6 && ir->op == ir_tg4) {
2557 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2558 }
2559
2560 swizzle_result(ir, src_reg(inst->dst), sampler);
2561 }
2562
2563 /**
2564 * Apply workarounds for Gen6 gather with UINT/SINT
2565 */
2566 void
2567 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2568 {
2569 if (!wa)
2570 return;
2571
2572 int width = (wa & WA_8BIT) ? 8 : 16;
2573 dst_reg dst_f = dst;
2574 dst_f.type = BRW_REGISTER_TYPE_F;
2575
2576 /* Convert from UNORM to UINT */
2577 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2578 emit(MOV(dst, src_reg(dst_f)));
2579
2580 if (wa & WA_SIGN) {
2581 /* Reinterpret the UINT value as a signed INT value by
2582 * shifting the sign bit into place, then shifting back
2583 * preserving sign.
2584 */
2585 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2586 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2587 }
2588 }
2589
2590 /**
2591 * Set up the gather channel based on the swizzle, for gather4.
2592 */
2593 uint32_t
2594 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2595 {
2596 ir_constant *chan = ir->lod_info.component->as_constant();
2597 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2598 switch (swiz) {
2599 case SWIZZLE_X: return 0;
2600 case SWIZZLE_Y:
2601 /* gather4 sampler is broken for green channel on RG32F --
2602 * we must ask for blue instead.
2603 */
2604 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2605 return 2;
2606 return 1;
2607 case SWIZZLE_Z: return 2;
2608 case SWIZZLE_W: return 3;
2609 default:
2610 unreachable("Not reached"); /* zero, one swizzles handled already */
2611 }
2612 }
2613
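/**
 * Apply the GL texture swizzle (including ZERO/ONE components) to the raw
 * sampler result and store the swizzled value in this->result.
 */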
2614 void
2615 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2616 {
2617 int s = key->tex.swizzles[sampler];
2618
2619 this->result = src_reg(this, ir->type);
2620 dst_reg swizzled_result(this->result);
2621
2622 if (ir->op == ir_query_levels) {
2623 /* # levels is in .w */
2624 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2625 emit(MOV(swizzled_result, orig_val));
2626 return;
2627 }
2628
2629 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2630 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2631 emit(MOV(swizzled_result, orig_val));
2632 return;
2633 }
2634
2635
2636 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2637 int swizzle[4] = {0};
2638
2639 for (int i = 0; i < 4; i++) {
2640 switch (GET_SWZ(s, i)) {
2641 case SWIZZLE_ZERO:
2642 zero_mask |= (1 << i);
2643 break;
2644 case SWIZZLE_ONE:
2645 one_mask |= (1 << i);
2646 break;
2647 default:
2648 copy_mask |= (1 << i);
2649 swizzle[i] = GET_SWZ(s, i);
2650 break;
2651 }
2652 }
2653
2654 if (copy_mask) {
2655 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2656 swizzled_result.writemask = copy_mask;
2657 emit(MOV(swizzled_result, orig_val));
2658 }
2659
2660 if (zero_mask) {
2661 swizzled_result.writemask = zero_mask;
2662 emit(MOV(swizzled_result, src_reg(0.0f)));
2663 }
2664
2665 if (one_mask) {
2666 swizzled_result.writemask = one_mask;
2667 emit(MOV(swizzled_result, src_reg(1.0f)));
2668 }
2669 }
2670
2671 void
2672 vec4_visitor::visit(ir_return *)
2673 {
2674 unreachable("not reached");
2675 }
2676
2677 void
2678 vec4_visitor::visit(ir_discard *)
2679 {
2680 unreachable("not reached");
2681 }
2682
2683 void
2684 vec4_visitor::visit(ir_if *ir)
2685 {
2686 /* Don't point the annotation at the if statement, because then it plus
2687 * the then and else blocks get printed.
2688 */
2689 this->base_ir = ir->condition;
2690
2691 if (brw->gen == 6) {
2692 emit_if_gen6(ir);
2693 } else {
2694 enum brw_predicate predicate;
2695 emit_bool_to_cond_code(ir->condition, &predicate);
2696 emit(IF(predicate));
2697 }
2698
2699 visit_instructions(&ir->then_instructions);
2700
2701 if (!ir->else_instructions.is_empty()) {
2702 this->base_ir = ir->condition;
2703 emit(BRW_OPCODE_ELSE);
2704
2705 visit_instructions(&ir->else_instructions);
2706 }
2707
2708 this->base_ir = ir->condition;
2709 emit(BRW_OPCODE_ENDIF);
2710 }
2711
2712 void
2713 vec4_visitor::visit(ir_emit_vertex *)
2714 {
2715 unreachable("not reached");
2716 }
2717
2718 void
2719 vec4_visitor::visit(ir_end_primitive *)
2720 {
2721 unreachable("not reached");
2722 }
2723
2724 void
2725 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2726 dst_reg dst, src_reg offset,
2727 src_reg src0, src_reg src1)
2728 {
2729 unsigned mlen = 0;
2730
2731 /* Set the atomic operation offset. */
2732 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2733 mlen++;
2734
2735 /* Set the atomic operation arguments. */
2736 if (src0.file != BAD_FILE) {
2737 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2738 mlen++;
2739 }
2740
2741 if (src1.file != BAD_FILE) {
2742 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2743 mlen++;
2744 }
2745
2746 /* Emit the instruction. Note that this maps to the normal SIMD8
2747 * untyped atomic message on Ivy Bridge, but that's OK because
2748 * unused channels will be masked out.
2749 */
2750 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2751 src_reg(atomic_op), src_reg(surf_index));
2752 inst->base_mrf = 0;
2753 inst->mlen = mlen;
2754 }
2755
2756 void
2757 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2758 src_reg offset)
2759 {
2760 /* Set the surface read offset. */
2761 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2762
2763 /* Emit the instruction. Note that this maps to the normal SIMD8
2764 * untyped surface read message, but that's OK because unused
2765 * channels will be masked out.
2766 */
2767 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2768 dst, src_reg(surf_index));
2769 inst->base_mrf = 0;
2770 inst->mlen = 1;
2771 }
2772
2773 void
2774 vec4_visitor::emit_ndc_computation()
2775 {
2776 /* Get the position */
2777 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2778
2779 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2780 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2781 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2782
2783 current_annotation = "NDC";
2784 dst_reg ndc_w = ndc;
2785 ndc_w.writemask = WRITEMASK_W;
2786 src_reg pos_w = pos;
2787 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2788 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2789
2790 dst_reg ndc_xyz = ndc;
2791 ndc_xyz.writemask = WRITEMASK_XYZ;
2792
2793 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2794 }
2795
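/**
 * Write the PSIZ/flags VUE slot.  On Gen4-5 this packs point size, user
 * clip flags and the negative-RHW workaround bit into a single DWord; on
 * Gen6+ it writes point size, layer and viewport index into their own
 * channels.
 */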
2796 void
2797 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2798 {
2799 if (brw->gen < 6 &&
2800 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2801 key->userclip_active || brw->has_negative_rhw_bug)) {
2802 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2803 dst_reg header1_w = header1;
2804 header1_w.writemask = WRITEMASK_W;
2805
2806 emit(MOV(header1, 0u));
2807
2808 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2809 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2810
2811 current_annotation = "Point size";
2812 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2813 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2814 }
2815
2816 if (key->userclip_active) {
2817 current_annotation = "Clipping flags";
2818 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2819 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2820
2821 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2822 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2823 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2824
2825 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2826 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2827 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2828 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2829 }
2830
2831 /* i965 clipping workaround:
2832 * 1) Test for -ve rhw
2833 * 2) If set,
2834 * set ndc = (0,0,0,0)
2835 * set ucp[6] = 1
2836 *
2837 * Later, clipping will detect ucp[6] and ensure the primitive is
2838 * clipped against all fixed planes.
2839 */
2840 if (brw->has_negative_rhw_bug) {
2841 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2842 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2843 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2844 vec4_instruction *inst;
2845 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2846 inst->predicate = BRW_PREDICATE_NORMAL;
2847 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2848 inst->predicate = BRW_PREDICATE_NORMAL;
2849 }
2850
2851 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2852 } else if (brw->gen < 6) {
2853 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2854 } else {
2855 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2856 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2857 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2858 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2859 }
2860 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2861 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2862 src_reg(output_reg[VARYING_SLOT_LAYER])));
2863 }
2864 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2865 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2866 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2867 }
2868 }
2869 }
2870
2871 void
2872 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2873 {
2874 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2875 *
2876 * "If a linked set of shaders forming the vertex stage contains no
2877 * static write to gl_ClipVertex or gl_ClipDistance, but the
2878 * application has requested clipping against user clip planes through
2879 * the API, then the coordinate written to gl_Position is used for
2880 * comparison against the user clip planes."
2881 *
2882 * This function is only called if the shader didn't write to
2883 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2884 * if the user wrote to it; otherwise we use gl_Position.
2885 */
2886 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2887 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2888 clip_vertex = VARYING_SLOT_POS;
2889 }
2890
2891 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2892 ++i) {
2893 reg.writemask = 1 << i;
2894 emit(DP4(reg,
2895 src_reg(output_reg[clip_vertex]),
2896 src_reg(this->userplane[i + offset])));
2897 }
2898 }
2899
2900 void
2901 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2902 {
2903 assert (varying < VARYING_SLOT_MAX);
2904 reg.type = output_reg[varying].type;
2905 current_annotation = output_reg_annotation[varying];
2906 /* Copy the register, saturating if necessary */
2907 vec4_instruction *inst = emit(MOV(reg,
2908 src_reg(output_reg[varying])));
2909 if ((varying == VARYING_SLOT_COL0 ||
2910 varying == VARYING_SLOT_COL1 ||
2911 varying == VARYING_SLOT_BFC0 ||
2912 varying == VARYING_SLOT_BFC1) &&
2913 key->clamp_vertex_color) {
2914 inst->saturate = true;
2915 }
2916 }
2917
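/**
 * Emit the MOV(s) that fill one VUE slot in MRF space, handling the slots
 * that need special treatment (PSIZ/flags, NDC, position, edge flag)
 * before falling back to a generic copy of the output register.
 */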
2918 void
2919 vec4_visitor::emit_urb_slot(int mrf, int varying)
2920 {
2921 struct brw_reg hw_reg = brw_message_reg(mrf);
2922 dst_reg reg = dst_reg(MRF, mrf);
2923 reg.type = BRW_REGISTER_TYPE_F;
2924
2925 switch (varying) {
2926 case VARYING_SLOT_PSIZ:
2927 /* PSIZ is always in slot 0, and is coupled with other flags. */
2928 current_annotation = "indices, point width, clip flags";
2929 emit_psiz_and_flags(hw_reg);
2930 break;
2931 case BRW_VARYING_SLOT_NDC:
2932 current_annotation = "NDC";
2933 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2934 break;
2935 case VARYING_SLOT_POS:
2936 current_annotation = "gl_Position";
2937 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2938 break;
2939 case VARYING_SLOT_EDGE:
2940 /* This is present when doing unfilled polygons. We're supposed to copy
2941 * the edge flag from the user-provided vertex array
2942 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2943 * of that attribute (starts as 1.0f). This is then used in clipping to
2944 * determine which edges should be drawn as wireframe.
2945 */
2946 current_annotation = "edge flag";
2947 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2948 glsl_type::float_type, WRITEMASK_XYZW))));
2949 break;
2950 case BRW_VARYING_SLOT_PAD:
2951 /* No need to write to this slot */
2952 break;
2953 default:
2954 emit_generic_urb_slot(reg, varying);
2955 break;
2956 }
2957 }
2958
2959 static int
2960 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2961 {
2962 if (brw->gen >= 6) {
2963 /* URB data written (does not include the message header reg) must
2964 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2965 * section 5.4.3.2.2: URB_INTERLEAVED.
2966 *
2967 * URB entries are allocated on a multiple of 1024 bits, so an
2968 * extra 128 bits written here to make the end align to 256 is
2969 * no problem.
2970 */
2971 if ((mlen % 2) != 1)
2972 mlen++;
2973 }
2974
2975 return mlen;
2976 }
2977
2978
2979 /**
2980 * Generates the VUE payload plus the necessary URB write instructions to
2981 * output it.
2982 *
2983 * The VUE layout is documented in Volume 2a.
2984 */
2985 void
2986 vec4_visitor::emit_vertex()
2987 {
2988 /* MRF 0 is reserved for the debugger, so start with message header
2989 * in MRF 1.
2990 */
2991 int base_mrf = 1;
2992 int mrf = base_mrf;
2993 /* In the process of generating our URB write message contents, we
2994 * may need to unspill a register or load from an array. Those
2995 * reads would use MRFs 14-15.
2996 */
2997 int max_usable_mrf = 13;
2998
2999 /* The following assertion verifies that max_usable_mrf causes an
3000 * even-numbered amount of URB write data, which will meet gen6's
3001 * requirements for length alignment.
3002 */
3003 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3004
3005 /* First mrf is the g0-based message header containing URB handles and
3006 * such.
3007 */
3008 emit_urb_write_header(mrf++);
3009
3010 if (brw->gen < 6) {
3011 emit_ndc_computation();
3012 }
3013
3014 /* Lower legacy ff and ClipVertex clipping to clip distances */
3015 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3016 current_annotation = "user clip distances";
3017
3018 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3019 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3020
3021 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3022 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3023 }
3024
3025 /* We may need to split this up into several URB writes, so do them in a
3026 * loop.
3027 */
3028 int slot = 0;
3029 bool complete = false;
3030 do {
3031 /* URB offset is in URB row increments, and each of our MRFs is half of
3032 * one of those, since we're doing interleaved writes.
3033 */
3034 int offset = slot / 2;
3035
3036 mrf = base_mrf + 1;
3037 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3038 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3039
3040 /* If this was max_usable_mrf, we can't fit anything more into this
3041 * URB WRITE.
3042 */
3043 if (mrf > max_usable_mrf) {
3044 slot++;
3045 break;
3046 }
3047 }
3048
3049 complete = slot >= prog_data->vue_map.num_slots;
3050 current_annotation = "URB write";
3051 vec4_instruction *inst = emit_urb_write_opcode(complete);
3052 inst->base_mrf = base_mrf;
3053 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3054 inst->offset += offset;
3055 } while(!complete);
3056 }
3057
3058
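/**
 * Compute the message source for a scratch read/write at @reg_offset,
 * scaling by 2 because scratch data is stored interleaved like vertex
 * data (and by 16 for byte offsets pre-Gen6).  For relative addressing,
 * the address math is emitted before @inst.
 */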
3059 src_reg
3060 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3061 src_reg *reladdr, int reg_offset)
3062 {
3063 /* Because we store the values to scratch interleaved like our
3064 * vertex data, we need to scale the vec4 index by 2.
3065 */
3066 int message_header_scale = 2;
3067
3068 /* Pre-gen6, the message header uses byte offsets instead of vec4
3069 * (16-byte) offset units.
3070 */
3071 if (brw->gen < 6)
3072 message_header_scale *= 16;
3073
3074 if (reladdr) {
3075 src_reg index = src_reg(this, glsl_type::int_type);
3076
3077 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3078 emit_before(inst, MUL(dst_reg(index),
3079 index, src_reg(message_header_scale)));
3080
3081 return index;
3082 } else {
3083 return src_reg(reg_offset * message_header_scale);
3084 }
3085 }
3086
3087 src_reg
3088 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3089 src_reg *reladdr, int reg_offset)
3090 {
3091 if (reladdr) {
3092 src_reg index = src_reg(this, glsl_type::int_type);
3093
3094 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3095
3096 /* Pre-gen6, the message header uses byte offsets instead of vec4
3097 * (16-byte) offset units.
3098 */
3099 if (brw->gen < 6) {
3100 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3101 }
3102
3103 return index;
3104 } else if (brw->gen >= 8) {
3105 /* Store the offset in a GRF so we can send-from-GRF. */
3106 src_reg offset = src_reg(this, glsl_type::int_type);
3107 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3108 return offset;
3109 } else {
3110 int message_header_scale = brw->gen < 6 ? 16 : 1;
3111 return src_reg(reg_offset * message_header_scale);
3112 }
3113 }
3114
3115 /**
3116 * Emits an instruction before @inst to load the value named by @orig_src
3117 * from scratch space at @base_offset to @temp.
3118 *
3119 * @base_offset is measured in 32-byte units (the size of a register).
3120 */
3121 void
3122 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3123 dst_reg temp, src_reg orig_src,
3124 int base_offset)
3125 {
3126 int reg_offset = base_offset + orig_src.reg_offset;
3127 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3128
3129 emit_before(inst, SCRATCH_READ(temp, index));
3130 }
3131
3132 /**
3133 * Emits an instruction after @inst to store the value to be written
3134 * to @orig_dst to scratch space at @base_offset, from @temp.
3135 *
3136 * @base_offset is measured in 32-byte units (the size of a register).
3137 */
3138 void
3139 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3140 {
3141 int reg_offset = base_offset + inst->dst.reg_offset;
3142 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3143
3144 /* Create a temporary register to store *inst's result in.
3145 *
3146 * We have to be careful in MOVing from our temporary result register in
3147 * the scratch write. If we swizzle from channels of the temporary that
3148 * weren't initialized, it will confuse live interval analysis, which will
3149 * make spilling fail to make progress.
3150 */
3151 src_reg temp = src_reg(this, glsl_type::vec4_type);
3152 temp.type = inst->dst.type;
3153 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3154 int swizzles[4];
3155 for (int i = 0; i < 4; i++)
3156 if (inst->dst.writemask & (1 << i))
3157 swizzles[i] = i;
3158 else
3159 swizzles[i] = first_writemask_chan;
3160 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3161 swizzles[2], swizzles[3]);
3162
3163 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3164 inst->dst.writemask));
3165 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3166 write->predicate = inst->predicate;
3167 write->ir = inst->ir;
3168 write->annotation = inst->annotation;
3169 inst->insert_after(write);
3170
3171 inst->dst.file = temp.file;
3172 inst->dst.reg = temp.reg;
3173 inst->dst.reg_offset = temp.reg_offset;
3174 inst->dst.reladdr = NULL;
3175 }
3176
3177 /**
3178 * We can't generally support array access in GRF space, because a
3179 * single instruction's destination can only span 2 contiguous
3180 * registers. So, we send all GRF arrays that get variable index
3181 * access to scratch space.
3182 */
3183 void
3184 vec4_visitor::move_grf_array_access_to_scratch()
3185 {
3186 int scratch_loc[this->virtual_grf_count];
3187
3188 for (int i = 0; i < this->virtual_grf_count; i++) {
3189 scratch_loc[i] = -1;
3190 }
3191
3192 /* First, calculate the set of virtual GRFs that need to be punted
3193 * to scratch due to having any array access on them, and where in
3194 * scratch.
3195 */
3196 foreach_in_list(vec4_instruction, inst, &instructions) {
3197 if (inst->dst.file == GRF && inst->dst.reladdr &&
3198 scratch_loc[inst->dst.reg] == -1) {
3199 scratch_loc[inst->dst.reg] = c->last_scratch;
3200 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3201 }
3202
3203 for (int i = 0 ; i < 3; i++) {
3204 src_reg *src = &inst->src[i];
3205
3206 if (src->file == GRF && src->reladdr &&
3207 scratch_loc[src->reg] == -1) {
3208 scratch_loc[src->reg] = c->last_scratch;
3209 c->last_scratch += this->virtual_grf_sizes[src->reg];
3210 }
3211 }
3212 }
3213
3214 /* Now, for anything that will be accessed through scratch, rewrite
3215 * it to load/store. Note that this is a _safe list walk, because
3216 * we may generate a new scratch_write instruction after the one
3217 * we're processing.
3218 */
3219 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3220 /* Set up the annotation tracking for new generated instructions. */
3221 base_ir = inst->ir;
3222 current_annotation = inst->annotation;
3223
3224 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3225 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3226 }
3227
3228 for (int i = 0 ; i < 3; i++) {
3229 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3230 continue;
3231
3232 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3233
3234 emit_scratch_read(inst, temp, inst->src[i],
3235 scratch_loc[inst->src[i].reg]);
3236
3237 inst->src[i].file = temp.file;
3238 inst->src[i].reg = temp.reg;
3239 inst->src[i].reg_offset = temp.reg_offset;
3240 inst->src[i].reladdr = NULL;
3241 }
3242 }
3243 }
3244
3245 /**
3246 * Emits an instruction before @inst to load the value named by @orig_src
3247 * from the pull constant buffer (surface) at @base_offset to @temp.
3248 */
3249 void
3250 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3251 dst_reg temp, src_reg orig_src,
3252 int base_offset)
3253 {
3254 int reg_offset = base_offset + orig_src.reg_offset;
3255 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3256 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3257 vec4_instruction *load;
3258
3259 if (brw->gen >= 7) {
3260 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3261 grf_offset.type = offset.type;
3262 emit_before(inst, MOV(grf_offset, offset));
3263
3264 load = new(mem_ctx) vec4_instruction(this,
3265 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3266 temp, index, src_reg(grf_offset));
3267 } else {
3268 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3269 temp, index, offset);
3270 load->base_mrf = 14;
3271 load->mlen = 1;
3272 }
3273 emit_before(inst, load);
3274 }
3275
3276 /**
3277 * Implements array access of uniforms by inserting a
3278 * PULL_CONSTANT_LOAD instruction.
3279 *
3280 * Unlike temporary GRF array access (where we don't support it due to
3281 * the difficulty of doing relative addressing on instruction
3282 * destinations), we could potentially do array access of uniforms
3283 * that were loaded in GRF space as push constants. In real-world
3284 * usage we've seen, though, the arrays being used are always larger
3285 * than we could load as push constants, so just always move all
3286 * uniform array access out to a pull constant buffer.
3287 */
3288 void
3289 vec4_visitor::move_uniform_array_access_to_pull_constants()
3290 {
3291 int pull_constant_loc[this->uniforms];
3292
3293 for (int i = 0; i < this->uniforms; i++) {
3294 pull_constant_loc[i] = -1;
3295 }
3296
3297 /* Walk through and find array access of uniforms. Put a copy of that
3298 * uniform in the pull constant buffer.
3299 *
3300 * Note that we don't move constant-indexed accesses to arrays. No
3301 * testing has been done of the performance impact of this choice.
3302 */
3303 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3304 for (int i = 0 ; i < 3; i++) {
3305 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3306 continue;
3307
3308 int uniform = inst->src[i].reg;
3309
3310 /* If this array isn't already present in the pull constant buffer,
3311 * add it.
3312 */
3313 if (pull_constant_loc[uniform] == -1) {
3314 const gl_constant_value **values =
3315 &stage_prog_data->param[uniform * 4];
3316
3317 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3318
3319 assert(uniform < uniform_array_size);
3320 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3321 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3322 = values[j];
3323 }
3324 }
3325
3326 /* Set up the annotation tracking for new generated instructions. */
3327 base_ir = inst->ir;
3328 current_annotation = inst->annotation;
3329
3330 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3331
3332 emit_pull_constant_load(inst, temp, inst->src[i],
3333 pull_constant_loc[uniform]);
3334
3335 inst->src[i].file = temp.file;
3336 inst->src[i].reg = temp.reg;
3337 inst->src[i].reg_offset = temp.reg_offset;
3338 inst->src[i].reladdr = NULL;
3339 }
3340 }
3341
3342 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3343 * no need to track them as larger-than-vec4 objects. This will be
3344 * relied on in cutting out unused uniform vectors from push
3345 * constants.
3346 */
3347 split_uniform_registers();
3348 }
3349
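/**
 * If @reg is an unsigned (UD) source with the negate modifier set,
 * materialize the negation through an explicit MOV into a temporary and
 * point @reg at that temporary instead.
 */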
3350 void
3351 vec4_visitor::resolve_ud_negate(src_reg *reg)
3352 {
3353 if (reg->type != BRW_REGISTER_TYPE_UD ||
3354 !reg->negate)
3355 return;
3356
3357 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3358 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3359 *reg = temp;
3360 }
3361
3362 vec4_visitor::vec4_visitor(struct brw_context *brw,
3363 struct brw_vec4_compile *c,
3364 struct gl_program *prog,
3365 const struct brw_vec4_prog_key *key,
3366 struct brw_vec4_prog_data *prog_data,
3367 struct gl_shader_program *shader_prog,
3368 gl_shader_stage stage,
3369 void *mem_ctx,
3370 bool debug_flag,
3371 bool no_spills,
3372 shader_time_shader_type st_base,
3373 shader_time_shader_type st_written,
3374 shader_time_shader_type st_reset)
3375 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3376 c(c),
3377 key(key),
3378 prog_data(prog_data),
3379 sanity_param_count(0),
3380 fail_msg(NULL),
3381 first_non_payload_grf(0),
3382 need_all_constants_in_pull_buffer(false),
3383 debug_flag(debug_flag),
3384 no_spills(no_spills),
3385 st_base(st_base),
3386 st_written(st_written),
3387 st_reset(st_reset)
3388 {
3389 this->mem_ctx = mem_ctx;
3390 this->failed = false;
3391
3392 this->base_ir = NULL;
3393 this->current_annotation = NULL;
3394 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3395
3396 this->variable_ht = hash_table_ctor(0,
3397 hash_table_pointer_hash,
3398 hash_table_pointer_compare);
3399
3400 this->virtual_grf_start = NULL;
3401 this->virtual_grf_end = NULL;
3402 this->virtual_grf_sizes = NULL;
3403 this->virtual_grf_count = 0;
3404 this->virtual_grf_reg_map = NULL;
3405 this->virtual_grf_reg_count = 0;
3406 this->virtual_grf_array_size = 0;
3407 this->live_intervals_valid = false;
3408
3409 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3410
3411 this->uniforms = 0;
3412
3413 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3414 * at least one. See setup_uniforms() in brw_vec4.cpp.
3415 */
3416 this->uniform_array_size = 1;
3417 if (prog_data) {
3418 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3419 }
3420
3421 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3422 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3423 }
3424
3425 vec4_visitor::~vec4_visitor()
3426 {
3427 hash_table_dtor(this->variable_ht);
3428 }
3429
3430
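/**
 * Mark the compile as failed and record the first failure message,
 * printing it to stderr when the debug flag is set.
 */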
3431 void
3432 vec4_visitor::fail(const char *format, ...)
3433 {
3434 va_list va;
3435 char *msg;
3436
3437 if (failed)
3438 return;
3439
3440 failed = true;
3441
3442 va_start(va, format);
3443 msg = ralloc_vasprintf(mem_ctx, format, va);
3444 va_end(va);
3445 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3446
3447 this->fail_msg = msg;
3448
3449 if (debug_flag) {
3450 fprintf(stderr, "%s", msg);
3451 }
3452 }
3453
3454 } /* namespace brw */