i965: Get rid of backend_instruction::sampler
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->texture_offset = 0;
49 this->target = 0;
50 this->shadow_compare = false;
51 this->ir = v->base_ir;
52 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
53 this->header_present = false;
54 this->mlen = 0;
55 this->base_mrf = 0;
56 this->offset = 0;
57 this->annotation = v->current_annotation;
58 }
59
60 vec4_instruction *
61 vec4_visitor::emit(vec4_instruction *inst)
62 {
63 this->instructions.push_tail(inst);
64
65 return inst;
66 }
67
68 vec4_instruction *
69 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
70 {
71 new_inst->ir = inst->ir;
72 new_inst->annotation = inst->annotation;
73
74 inst->insert_before(new_inst);
75
76 return inst;
77 }
78
79 vec4_instruction *
80 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
81 src_reg src0, src_reg src1, src_reg src2)
82 {
83 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
84 src0, src1, src2));
85 }
86
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
92 }
93
94 vec4_instruction *
95 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
96 {
97 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
98 }
99
100 vec4_instruction *
101 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
102 {
103 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
104 }
105
106 vec4_instruction *
107 vec4_visitor::emit(enum opcode opcode)
108 {
109 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
110 }
111
112 #define ALU1(op) \
113 vec4_instruction * \
114 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
115 { \
116 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
117 src0); \
118 }
119
120 #define ALU2(op) \
121 vec4_instruction * \
122 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
123 const src_reg &src1) \
124 { \
125 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
126 src0, src1); \
127 }
128
129 #define ALU2_ACC(op) \
130 vec4_instruction * \
131 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
132 const src_reg &src1) \
133 { \
134 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
135 BRW_OPCODE_##op, dst, src0, src1); \
136 inst->writes_accumulator = true; \
137 return inst; \
138 }
139
140 #define ALU3(op) \
141 vec4_instruction * \
142 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
143 const src_reg &src1, const src_reg &src2) \
144 { \
145 assert(brw->gen >= 6); \
146 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
147 src0, src1, src2); \
148 }
149
150 ALU1(NOT)
151 ALU1(MOV)
152 ALU1(FRC)
153 ALU1(RNDD)
154 ALU1(RNDE)
155 ALU1(RNDZ)
156 ALU1(F32TO16)
157 ALU1(F16TO32)
158 ALU2(ADD)
159 ALU2(MUL)
160 ALU2_ACC(MACH)
161 ALU2(AND)
162 ALU2(OR)
163 ALU2(XOR)
164 ALU2(DP3)
165 ALU2(DP4)
166 ALU2(DPH)
167 ALU2(SHL)
168 ALU2(SHR)
169 ALU2(ASR)
170 ALU3(LRP)
171 ALU1(BFREV)
172 ALU3(BFE)
173 ALU2(BFI1)
174 ALU3(BFI2)
175 ALU1(FBH)
176 ALU1(FBL)
177 ALU1(CBIT)
178 ALU3(MAD)
179 ALU2_ACC(ADDC)
180 ALU2_ACC(SUBB)
181 ALU2(MAC)
182
183 /** Gen4 predicated IF. */
184 vec4_instruction *
185 vec4_visitor::IF(enum brw_predicate predicate)
186 {
187 vec4_instruction *inst;
188
189 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
190 inst->predicate = predicate;
191
192 return inst;
193 }
194
195 /** Gen6 IF with embedded comparison. */
196 vec4_instruction *
197 vec4_visitor::IF(src_reg src0, src_reg src1,
198 enum brw_conditional_mod condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
221 enum brw_conditional_mod condition)
222 {
223 vec4_instruction *inst;
224
 225 /* The original Gen4 hardware does type conversion to the destination
 226 * type before comparison, producing garbage results for floating-point
 227 * comparisons.
 228 */
229 if (brw->gen == 4) {
230 dst.type = src0.type;
231 if (dst.file == HW_REG)
232 dst.fixed_hw_reg.type = dst.type;
233 }
234
235 resolve_ud_negate(&src0);
236 resolve_ud_negate(&src1);
237
238 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
239 inst->conditional_mod = condition;
240
241 return inst;
242 }
243
244 vec4_instruction *
245 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
246 {
247 vec4_instruction *inst;
248
249 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
250 dst, index);
251 inst->base_mrf = 14;
252 inst->mlen = 2;
253
254 return inst;
255 }
256
257 vec4_instruction *
258 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
259 const src_reg &index)
260 {
261 vec4_instruction *inst;
262
263 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
264 dst, src, index);
265 inst->base_mrf = 13;
266 inst->mlen = 3;
267
268 return inst;
269 }
270
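/* Emit a DP2, DP3, or DP4 according to the number of components in the
 * dot product (2, 3, or 4).
 */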
271 void
272 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
273 {
274 static enum opcode dot_opcodes[] = {
275 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
276 };
277
278 emit(dot_opcodes[elements - 2], dst, src0, src1);
279 }
280
281 src_reg
282 vec4_visitor::fix_3src_operand(src_reg src)
283 {
284 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
285 * able to use vertical stride of zero to replicate the vec4 uniform, like
286 *
287 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
288 *
289 * But you can't, since vertical stride is always four in three-source
290 * instructions. Instead, insert a MOV instruction to do the replication so
291 * that the three-source instruction can consume it.
292 */
293
294 /* The MOV is only needed if the source is a uniform or immediate. */
295 if (src.file != UNIFORM && src.file != IMM)
296 return src;
297
298 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
299 return src;
300
301 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
302 expanded.type = src.type;
303 emit(MOV(expanded, src));
304 return src_reg(expanded);
305 }
306
307 src_reg
308 vec4_visitor::fix_math_operand(src_reg src)
309 {
310 /* The gen6 math instruction ignores the source modifiers --
311 * swizzle, abs, negate, and at least some parts of the register
312 * region description.
313 *
314 * Rather than trying to enumerate all these cases, *always* expand the
315 * operand to a temp GRF for gen6.
316 *
317 * For gen7, keep the operand as-is, except if immediate, which gen7 still
318 * can't use.
319 */
320
321 if (brw->gen == 7 && src.file != IMM)
322 return src;
323
324 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
325 expanded.type = src.type;
326 emit(MOV(expanded, src));
327 return src_reg(expanded);
328 }
329
330 void
331 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
332 {
333 src = fix_math_operand(src);
334
335 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
336 /* The gen6 math instruction must be align1, so we can't do
337 * writemasks.
338 */
339 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
340
341 emit(opcode, temp_dst, src);
342
343 emit(MOV(dst, src_reg(temp_dst)));
344 } else {
345 emit(opcode, dst, src);
346 }
347 }
348
349 void
350 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
351 {
352 vec4_instruction *inst = emit(opcode, dst, src);
353 inst->base_mrf = 1;
354 inst->mlen = 1;
355 }
356
357 void
358 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
359 {
360 switch (opcode) {
361 case SHADER_OPCODE_RCP:
362 case SHADER_OPCODE_RSQ:
363 case SHADER_OPCODE_SQRT:
364 case SHADER_OPCODE_EXP2:
365 case SHADER_OPCODE_LOG2:
366 case SHADER_OPCODE_SIN:
367 case SHADER_OPCODE_COS:
368 break;
369 default:
370 unreachable("not reached: bad math opcode");
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 unreachable("not reached: unsupported binary math opcode");
424 }
425
426 if (brw->gen >= 8) {
427 emit(opcode, dst, src0, src1);
428 } else if (brw->gen >= 6) {
429 emit_math2_gen6(opcode, dst, src0, src1);
430 } else {
431 emit_math2_gen4(opcode, dst, src0, src1);
432 }
433 }
434
435 void
436 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
437 {
438 if (brw->gen < 7) {
439 unreachable("ir_unop_pack_half_2x16 should be lowered");
440 }
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
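/* As a concrete example of the packing below: packHalf2x16(vec2(1.0, -2.0))
 * is 0xc0003c00 -- 0x3c00 (1.0h) lands in the low word from the x channel,
 * and 0xc000 (-2.0h) in the high word from the y channel.
 */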
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
 491 * The upper word of each write-channel must be 0 for the
 492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7) {
516 unreachable("ir_unop_unpack_half_2x16 should be lowered");
517 }
518
519 assert(dst.type == BRW_REGISTER_TYPE_F);
520 assert(src0.type == BRW_REGISTER_TYPE_UD);
521
522 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
523 *
524 * Because this instruction does not have a 16-bit floating-point type,
525 * the source data type must be Word (W). The destination type must be
526 * F (Float).
527 *
528 * To use W as the source data type, we must adjust horizontal strides,
529 * which is only possible in align1 mode. All my [chadv] attempts at
530 * emitting align1 instructions for unpackHalf2x16 failed to pass the
531 * Piglit tests, so I gave up.
532 *
533 * I've verified that, on gen7 hardware and the simulator, it is safe to
534 * emit f16to32 in align16 mode with UD as source data type.
535 */
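/* For example, unpackHalf2x16(0xc0003c00u) yields vec2(1.0, -2.0): the low
 * word 0x3c00 becomes the x channel and the high word 0xc000 becomes y.
 */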
536
537 dst_reg tmp_dst(this, glsl_type::uvec2_type);
538 src_reg tmp_src(tmp_dst);
539
540 tmp_dst.writemask = WRITEMASK_X;
541 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
542
543 tmp_dst.writemask = WRITEMASK_Y;
544 emit(SHR(tmp_dst, src0, src_reg(16u)));
545
546 dst.writemask = WRITEMASK_XY;
547 emit(F16TO32(dst, tmp_src));
548 }
549
550 void
551 vec4_visitor::visit_instructions(const exec_list *list)
552 {
553 foreach_in_list(ir_instruction, ir, list) {
554 base_ir = ir;
555 ir->accept(this);
556 }
557 }
558
559
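/* Size of a variable of the given type, measured in vec4 slots (one
 * 128-bit register per slot).
 */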
560 static int
561 type_size(const struct glsl_type *type)
562 {
563 unsigned int i;
564 int size;
565
566 switch (type->base_type) {
567 case GLSL_TYPE_UINT:
568 case GLSL_TYPE_INT:
569 case GLSL_TYPE_FLOAT:
570 case GLSL_TYPE_BOOL:
571 if (type->is_matrix()) {
572 return type->matrix_columns;
573 } else {
 574 /* Regardless of the size of the vector, it gets a vec4. This is bad
575 * packing for things like floats, but otherwise arrays become a
576 * mess. Hopefully a later pass over the code can pack scalars
577 * down if appropriate.
578 */
579 return 1;
580 }
581 case GLSL_TYPE_ARRAY:
582 assert(type->length > 0);
583 return type_size(type->fields.array) * type->length;
584 case GLSL_TYPE_STRUCT:
585 size = 0;
586 for (i = 0; i < type->length; i++) {
587 size += type_size(type->fields.structure[i].type);
588 }
589 return size;
590 case GLSL_TYPE_SAMPLER:
591 /* Samplers take up one slot in UNIFORMS[], but they're baked in
592 * at link time.
593 */
594 return 1;
595 case GLSL_TYPE_ATOMIC_UINT:
596 return 0;
597 case GLSL_TYPE_IMAGE:
598 case GLSL_TYPE_VOID:
599 case GLSL_TYPE_ERROR:
600 case GLSL_TYPE_INTERFACE:
601 unreachable("not reached");
602 }
603
604 return 0;
605 }
606
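/* Allocate a new virtual GRF that is "size" vec4s long, growing the
 * size and register-map tracking arrays as needed, and return its index.
 */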
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = &components->f;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static float zero = 0;
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
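/* Set up one vec4 uniform for each enabled user clip plane, with its
 * channels pointing at the clip plane values selected from GL state.
 */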
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
719 }
720 ++this->uniforms;
721 }
722 }
723
724 /* Our support for builtin uniforms is even scarier than non-builtin.
725 * It sits on top of the PROG_STATE_VAR parameters that are
726 * automatically updated from GL context state.
727 */
728 void
729 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
730 {
731 const ir_state_slot *const slots = ir->state_slots;
732 assert(ir->state_slots != NULL);
733
734 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
 735 /* This state reference has already been set up by ir_to_mesa,
736 * but we'll get the same index back here. We can reference
737 * ParameterValues directly, since unlike brw_fs.cpp, we never
738 * add new state references during compile.
739 */
740 int index = _mesa_add_state_reference(this->prog->Parameters,
741 (gl_state_index *)slots[i].tokens);
742 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
743
744 assert(this->uniforms < uniform_array_size);
745 this->uniform_vector_size[this->uniforms] = 0;
746 /* Add each of the unique swizzled channels of the element.
747 * This will end up matching the size of the glsl_type of this field.
748 */
749 int last_swiz = -1;
750 for (unsigned int j = 0; j < 4; j++) {
751 int swiz = GET_SWZ(slots[i].swizzle, j);
752 last_swiz = swiz;
753
754 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
755 assert(this->uniforms < uniform_array_size);
756 if (swiz <= last_swiz)
757 this->uniform_vector_size[this->uniforms]++;
758 }
759 this->uniforms++;
760 }
761 }
762
763 dst_reg *
764 vec4_visitor::variable_storage(ir_variable *var)
765 {
766 return (dst_reg *)hash_table_find(this->variable_ht, var);
767 }
768
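/* Emit instructions that set the flag register from a boolean rvalue, and
 * report in *predicate which predicate later instructions should use to
 * test it.
 */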
769 void
770 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
771 enum brw_predicate *predicate)
772 {
773 ir_expression *expr = ir->as_expression();
774
775 *predicate = BRW_PREDICATE_NORMAL;
776
777 if (expr) {
778 src_reg op[2];
779 vec4_instruction *inst;
780
781 assert(expr->get_num_operands() <= 2);
782 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
783 expr->operands[i]->accept(this);
784 op[i] = this->result;
785
786 resolve_ud_negate(&op[i]);
787 }
788
789 switch (expr->operation) {
790 case ir_unop_logic_not:
791 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
792 inst->conditional_mod = BRW_CONDITIONAL_Z;
793 break;
794
795 case ir_binop_logic_xor:
796 inst = emit(XOR(dst_null_d(), op[0], op[1]));
797 inst->conditional_mod = BRW_CONDITIONAL_NZ;
798 break;
799
800 case ir_binop_logic_or:
801 inst = emit(OR(dst_null_d(), op[0], op[1]));
802 inst->conditional_mod = BRW_CONDITIONAL_NZ;
803 break;
804
805 case ir_binop_logic_and:
806 inst = emit(AND(dst_null_d(), op[0], op[1]));
807 inst->conditional_mod = BRW_CONDITIONAL_NZ;
808 break;
809
810 case ir_unop_f2b:
811 if (brw->gen >= 6) {
812 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
813 } else {
814 inst = emit(MOV(dst_null_f(), op[0]));
815 inst->conditional_mod = BRW_CONDITIONAL_NZ;
816 }
817 break;
818
819 case ir_unop_i2b:
820 if (brw->gen >= 6) {
821 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
822 } else {
823 inst = emit(MOV(dst_null_d(), op[0]));
824 inst->conditional_mod = BRW_CONDITIONAL_NZ;
825 }
826 break;
827
828 case ir_binop_all_equal:
829 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
830 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
831 break;
832
833 case ir_binop_any_nequal:
834 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
835 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
836 break;
837
838 case ir_unop_any:
839 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
840 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
841 break;
842
843 case ir_binop_greater:
844 case ir_binop_gequal:
845 case ir_binop_less:
846 case ir_binop_lequal:
847 case ir_binop_equal:
848 case ir_binop_nequal:
849 emit(CMP(dst_null_d(), op[0], op[1],
850 brw_conditional_for_comparison(expr->operation)));
851 break;
852
853 default:
854 unreachable("not reached");
855 }
856 return;
857 }
858
859 ir->accept(this);
860
861 resolve_ud_negate(&this->result);
862
863 if (brw->gen >= 6) {
864 vec4_instruction *inst = emit(AND(dst_null_d(),
865 this->result, src_reg(1)));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 } else {
868 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
869 inst->conditional_mod = BRW_CONDITIONAL_NZ;
870 }
871 }
872
873 /**
874 * Emit a gen6 IF statement with the comparison folded into the IF
875 * instruction.
876 */
877 void
878 vec4_visitor::emit_if_gen6(ir_if *ir)
879 {
880 ir_expression *expr = ir->condition->as_expression();
881
882 if (expr) {
883 src_reg op[2];
884 dst_reg temp;
885
886 assert(expr->get_num_operands() <= 2);
887 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
888 expr->operands[i]->accept(this);
889 op[i] = this->result;
890 }
891
892 switch (expr->operation) {
893 case ir_unop_logic_not:
894 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
895 return;
896
897 case ir_binop_logic_xor:
898 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
899 return;
900
901 case ir_binop_logic_or:
902 temp = dst_reg(this, glsl_type::bool_type);
903 emit(OR(temp, op[0], op[1]));
904 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
905 return;
906
907 case ir_binop_logic_and:
908 temp = dst_reg(this, glsl_type::bool_type);
909 emit(AND(temp, op[0], op[1]));
910 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
911 return;
912
913 case ir_unop_f2b:
914 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
915 return;
916
917 case ir_unop_i2b:
918 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
919 return;
920
921 case ir_binop_greater:
922 case ir_binop_gequal:
923 case ir_binop_less:
924 case ir_binop_lequal:
925 case ir_binop_equal:
926 case ir_binop_nequal:
927 emit(IF(op[0], op[1],
928 brw_conditional_for_comparison(expr->operation)));
929 return;
930
931 case ir_binop_all_equal:
932 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
933 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
934 return;
935
936 case ir_binop_any_nequal:
937 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
938 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
939 return;
940
941 case ir_unop_any:
942 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
943 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
944 return;
945
946 default:
947 unreachable("not reached");
948 }
949 return;
950 }
951
952 ir->condition->accept(this);
953
954 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
955 }
956
957 void
958 vec4_visitor::visit(ir_variable *ir)
959 {
960 dst_reg *reg = NULL;
961
962 if (variable_storage(ir))
963 return;
964
965 switch (ir->data.mode) {
966 case ir_var_shader_in:
967 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
968 break;
969
970 case ir_var_shader_out:
971 reg = new(mem_ctx) dst_reg(this, ir->type);
972
973 for (int i = 0; i < type_size(ir->type); i++) {
974 output_reg[ir->data.location + i] = *reg;
975 output_reg[ir->data.location + i].reg_offset = i;
976 output_reg[ir->data.location + i].type =
977 brw_type_for_base_type(ir->type->get_scalar_type());
978 output_reg_annotation[ir->data.location + i] = ir->name;
979 }
980 break;
981
982 case ir_var_auto:
983 case ir_var_temporary:
984 reg = new(mem_ctx) dst_reg(this, ir->type);
985 break;
986
987 case ir_var_uniform:
988 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
989
990 /* Thanks to the lower_ubo_reference pass, we will see only
991 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
992 * variables, so no need for them to be in variable_ht.
993 *
994 * Atomic counters take no uniform storage, no need to do
995 * anything here.
996 */
997 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
998 return;
999
1000 /* Track how big the whole uniform variable is, in case we need to put a
1001 * copy of its data into pull constants for array access.
1002 */
1003 assert(this->uniforms < uniform_array_size);
1004 this->uniform_size[this->uniforms] = type_size(ir->type);
1005
1006 if (!strncmp(ir->name, "gl_", 3)) {
1007 setup_builtin_uniform_values(ir);
1008 } else {
1009 setup_uniform_values(ir);
1010 }
1011 break;
1012
1013 case ir_var_system_value:
1014 reg = make_reg_for_system_value(ir);
1015 break;
1016
1017 default:
1018 unreachable("not reached");
1019 }
1020
1021 reg->type = brw_type_for_base_type(ir->type);
1022 hash_table_insert(this->variable_ht, reg, ir);
1023 }
1024
1025 void
1026 vec4_visitor::visit(ir_loop *ir)
1027 {
1028 /* We don't want debugging output to print the whole body of the
1029 * loop as the annotation.
1030 */
1031 this->base_ir = NULL;
1032
1033 emit(BRW_OPCODE_DO);
1034
1035 visit_instructions(&ir->body_instructions);
1036
1037 emit(BRW_OPCODE_WHILE);
1038 }
1039
1040 void
1041 vec4_visitor::visit(ir_loop_jump *ir)
1042 {
1043 switch (ir->mode) {
1044 case ir_loop_jump::jump_break:
1045 emit(BRW_OPCODE_BREAK);
1046 break;
1047 case ir_loop_jump::jump_continue:
1048 emit(BRW_OPCODE_CONTINUE);
1049 break;
1050 }
1051 }
1052
1053
1054 void
1055 vec4_visitor::visit(ir_function_signature *)
1056 {
1057 unreachable("not reached");
1058 }
1059
1060 void
1061 vec4_visitor::visit(ir_function *ir)
1062 {
1063 /* Ignore function bodies other than main() -- we shouldn't see calls to
1064 * them since they should all be inlined.
1065 */
1066 if (strcmp(ir->name, "main") == 0) {
1067 const ir_function_signature *sig;
1068 exec_list empty;
1069
1070 sig = ir->matching_signature(NULL, &empty, false);
1071
1072 assert(sig);
1073
1074 visit_instructions(&sig->body);
1075 }
1076 }
1077
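/* If this expression is expressible as a saturate of some rvalue, emit it
 * as a single saturating MOV of that rvalue and return true.
 */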
1078 bool
1079 vec4_visitor::try_emit_sat(ir_expression *ir)
1080 {
1081 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1082 if (!sat_src)
1083 return false;
1084
1085 sat_src->accept(this);
1086 src_reg src = this->result;
1087
1088 this->result = src_reg(this, ir->type);
1089 vec4_instruction *inst;
1090 inst = emit(MOV(dst_reg(this->result), src));
1091 inst->saturate = true;
1092
1093 return true;
1094 }
1095
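/* Try to fold a multiply-add expression (with the multiply in either
 * operand of the add) into a single MAD instruction; returns false if the
 * pattern doesn't match or the hardware can't do it.
 */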
1096 bool
1097 vec4_visitor::try_emit_mad(ir_expression *ir)
1098 {
1099 /* 3-src instructions were introduced in gen6. */
1100 if (brw->gen < 6)
1101 return false;
1102
1103 /* MAD can only handle floating-point data. */
1104 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1105 return false;
1106
1107 ir_rvalue *nonmul = ir->operands[1];
1108 ir_expression *mul = ir->operands[0]->as_expression();
1109
1110 if (!mul || mul->operation != ir_binop_mul) {
1111 nonmul = ir->operands[0];
1112 mul = ir->operands[1]->as_expression();
1113
1114 if (!mul || mul->operation != ir_binop_mul)
1115 return false;
1116 }
1117
1118 nonmul->accept(this);
1119 src_reg src0 = fix_3src_operand(this->result);
1120
1121 mul->operands[0]->accept(this);
1122 src_reg src1 = fix_3src_operand(this->result);
1123
1124 mul->operands[1]->accept(this);
1125 src_reg src2 = fix_3src_operand(this->result);
1126
1127 this->result = src_reg(this, ir->type);
1128 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1129
1130 return true;
1131 }
1132
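/* Try to emit b2f(comparison) directly as a CMP plus a predicated SEL of
 * 1.0f, skipping the intermediate 0/1 boolean; returns false if operand 0
 * isn't a comparison.
 */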
1133 bool
1134 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1135 {
1136 ir_expression *const cmp = ir->operands[0]->as_expression();
1137
1138 if (cmp == NULL)
1139 return false;
1140
1141 switch (cmp->operation) {
1142 case ir_binop_less:
1143 case ir_binop_greater:
1144 case ir_binop_lequal:
1145 case ir_binop_gequal:
1146 case ir_binop_equal:
1147 case ir_binop_nequal:
1148 break;
1149
1150 default:
1151 return false;
1152 }
1153
1154 cmp->operands[0]->accept(this);
1155 const src_reg cmp_src0 = this->result;
1156
1157 cmp->operands[1]->accept(this);
1158 const src_reg cmp_src1 = this->result;
1159
1160 this->result = src_reg(this, ir->type);
1161
1162 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1163 brw_conditional_for_comparison(cmp->operation)));
1164
1165 /* If the comparison is false, this->result will just happen to be zero.
1166 */
1167 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1168 this->result, src_reg(1.0f));
1169 inst->predicate = BRW_PREDICATE_NORMAL;
1170 inst->predicate_inverse = true;
1171
1172 return true;
1173 }
1174
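/* Emit a MIN/MAX operation: a single SEL with a conditional mod on Gen6+,
 * or a CMP followed by a predicated SEL on older generations.
 */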
1175 void
1176 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1177 src_reg src0, src_reg src1)
1178 {
1179 vec4_instruction *inst;
1180
1181 if (brw->gen >= 6) {
1182 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1183 inst->conditional_mod = conditionalmod;
1184 } else {
1185 emit(CMP(dst, src0, src1, conditionalmod));
1186
1187 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1188 inst->predicate = BRW_PREDICATE_NORMAL;
1189 }
1190 }
1191
1192 void
1193 vec4_visitor::emit_lrp(const dst_reg &dst,
1194 const src_reg &x, const src_reg &y, const src_reg &a)
1195 {
1196 if (brw->gen >= 6) {
1197 /* Note that the instruction's argument order is reversed from GLSL
1198 * and the IR.
1199 */
1200 emit(LRP(dst,
1201 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1202 } else {
1203 /* Earlier generations don't support three source operations, so we
1204 * need to emit x*(1-a) + y*a.
1205 */
1206 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1207 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1208 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1209 y_times_a.writemask = dst.writemask;
1210 one_minus_a.writemask = dst.writemask;
1211 x_times_one_minus_a.writemask = dst.writemask;
1212
1213 emit(MUL(y_times_a, y, a));
1214 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1215 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1216 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1217 }
1218 }
1219
1220 void
1221 vec4_visitor::visit(ir_expression *ir)
1222 {
1223 unsigned int operand;
1224 src_reg op[Elements(ir->operands)];
1225 src_reg result_src;
1226 dst_reg result_dst;
1227 vec4_instruction *inst;
1228
1229 if (try_emit_sat(ir))
1230 return;
1231
1232 if (ir->operation == ir_binop_add) {
1233 if (try_emit_mad(ir))
1234 return;
1235 }
1236
1237 if (ir->operation == ir_unop_b2f) {
1238 if (try_emit_b2f_of_compare(ir))
1239 return;
1240 }
1241
1242 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1243 this->result.file = BAD_FILE;
1244 ir->operands[operand]->accept(this);
1245 if (this->result.file == BAD_FILE) {
1246 fprintf(stderr, "Failed to get tree for expression operand:\n");
1247 ir->operands[operand]->fprint(stderr);
1248 exit(1);
1249 }
1250 op[operand] = this->result;
1251
1252 /* Matrix expression operands should have been broken down to vector
1253 * operations already.
1254 */
1255 assert(!ir->operands[operand]->type->is_matrix());
1256 }
1257
1258 int vector_elements = ir->operands[0]->type->vector_elements;
1259 if (ir->operands[1]) {
1260 vector_elements = MAX2(vector_elements,
1261 ir->operands[1]->type->vector_elements);
1262 }
1263
1264 this->result.file = BAD_FILE;
1265
1266 /* Storage for our result. Ideally for an assignment we'd be using
1267 * the actual storage for the result here, instead.
1268 */
1269 result_src = src_reg(this, ir->type);
1270 /* convenience for the emit functions below. */
1271 result_dst = dst_reg(result_src);
1272 /* If nothing special happens, this is the result. */
1273 this->result = result_src;
1274 /* Limit writes to the channels that will be used by result_src later.
1275 * This does limit this temp's use as a temporary for multi-instruction
1276 * sequences.
1277 */
1278 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1279
1280 switch (ir->operation) {
1281 case ir_unop_logic_not:
 1282 /* Note that BRW_OPCODE_NOT is not appropriate here, since it takes
 1283 * the one's complement of the whole register, not just bit 0.
1284 */
1285 emit(XOR(result_dst, op[0], src_reg(1)));
1286 break;
1287 case ir_unop_neg:
1288 op[0].negate = !op[0].negate;
1289 emit(MOV(result_dst, op[0]));
1290 break;
1291 case ir_unop_abs:
1292 op[0].abs = true;
1293 op[0].negate = false;
1294 emit(MOV(result_dst, op[0]));
1295 break;
1296
1297 case ir_unop_sign:
1298 if (ir->type->is_float()) {
1299 /* AND(val, 0x80000000) gives the sign bit.
1300 *
1301 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1302 * zero.
1303 */
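/* For example, with val == -2.5f (0xc0200000) the AND keeps 0x80000000,
 * and the predicated OR produces 0xbf800000, i.e. -1.0f.
 */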
1304 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1305
1306 op[0].type = BRW_REGISTER_TYPE_UD;
1307 result_dst.type = BRW_REGISTER_TYPE_UD;
1308 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1309
1310 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1311 inst->predicate = BRW_PREDICATE_NORMAL;
1312
1313 this->result.type = BRW_REGISTER_TYPE_F;
1314 } else {
1315 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1316 * -> non-negative val generates 0x00000000.
1317 * Predicated OR sets 1 if val is positive.
1318 */
1319 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1320
1321 emit(ASR(result_dst, op[0], src_reg(31)));
1322
1323 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1324 inst->predicate = BRW_PREDICATE_NORMAL;
1325 }
1326 break;
1327
1328 case ir_unop_rcp:
1329 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1330 break;
1331
1332 case ir_unop_exp2:
1333 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1334 break;
1335 case ir_unop_log2:
1336 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1337 break;
1338 case ir_unop_exp:
1339 case ir_unop_log:
1340 unreachable("not reached: should be handled by ir_explog_to_explog2");
1341 case ir_unop_sin:
1342 case ir_unop_sin_reduced:
1343 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1344 break;
1345 case ir_unop_cos:
1346 case ir_unop_cos_reduced:
1347 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1348 break;
1349
1350 case ir_unop_dFdx:
1351 case ir_unop_dFdy:
1352 unreachable("derivatives not valid in vertex shader");
1353
1354 case ir_unop_bitfield_reverse:
1355 emit(BFREV(result_dst, op[0]));
1356 break;
1357 case ir_unop_bit_count:
1358 emit(CBIT(result_dst, op[0]));
1359 break;
1360 case ir_unop_find_msb: {
1361 src_reg temp = src_reg(this, glsl_type::uint_type);
1362
1363 inst = emit(FBH(dst_reg(temp), op[0]));
1364 inst->dst.writemask = WRITEMASK_XYZW;
1365
1366 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1367 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1368 * subtract the result from 31 to convert the MSB count into an LSB count.
1369 */
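/* For example, for an input of 2 FBH returns 30 (counting down from bit
 * 31), and 31 - 30 = 1 is the index findMSB() expects. For an input of 0,
 * FBH returns 0xffffffff, the CMP below fails, and the result stays at -1.
 */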
1370
1371 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1372 temp.swizzle = BRW_SWIZZLE_NOOP;
1373 emit(MOV(result_dst, temp));
1374
1375 src_reg src_tmp = src_reg(result_dst);
1376 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1377
1378 src_tmp.negate = true;
1379 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1380 inst->predicate = BRW_PREDICATE_NORMAL;
1381 break;
1382 }
1383 case ir_unop_find_lsb:
1384 emit(FBL(result_dst, op[0]));
1385 break;
1386
1387 case ir_unop_noise:
1388 unreachable("not reached: should be handled by lower_noise");
1389
1390 case ir_binop_add:
1391 emit(ADD(result_dst, op[0], op[1]));
1392 break;
1393 case ir_binop_sub:
1394 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1395
1396 case ir_binop_mul:
1397 if (brw->gen < 8 && ir->type->is_integer()) {
1398 /* For integer multiplication, the MUL uses the low 16 bits of one of
1399 * the operands (src0 through SNB, src1 on IVB and later). The MACH
 1400 * accumulates the contribution of the upper 16 bits of that
1401 * operand. If we can determine that one of the args is in the low
1402 * 16 bits, though, we can just emit a single MUL.
1403 */
1404 if (ir->operands[0]->is_uint16_constant()) {
1405 if (brw->gen < 7)
1406 emit(MUL(result_dst, op[0], op[1]));
1407 else
1408 emit(MUL(result_dst, op[1], op[0]));
1409 } else if (ir->operands[1]->is_uint16_constant()) {
1410 if (brw->gen < 7)
1411 emit(MUL(result_dst, op[1], op[0]));
1412 else
1413 emit(MUL(result_dst, op[0], op[1]));
1414 } else {
1415 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1416
1417 emit(MUL(acc, op[0], op[1]));
1418 emit(MACH(dst_null_d(), op[0], op[1]));
1419 emit(MOV(result_dst, src_reg(acc)));
1420 }
1421 } else {
1422 emit(MUL(result_dst, op[0], op[1]));
1423 }
1424 break;
1425 case ir_binop_imul_high: {
1426 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1427
1428 emit(MUL(acc, op[0], op[1]));
1429 emit(MACH(result_dst, op[0], op[1]));
1430 break;
1431 }
1432 case ir_binop_div:
1433 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1434 assert(ir->type->is_integer());
1435 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1436 break;
1437 case ir_binop_carry: {
1438 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1439
1440 emit(ADDC(dst_null_ud(), op[0], op[1]));
1441 emit(MOV(result_dst, src_reg(acc)));
1442 break;
1443 }
1444 case ir_binop_borrow: {
1445 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1446
1447 emit(SUBB(dst_null_ud(), op[0], op[1]));
1448 emit(MOV(result_dst, src_reg(acc)));
1449 break;
1450 }
1451 case ir_binop_mod:
1452 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1453 assert(ir->type->is_integer());
1454 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1455 break;
1456
1457 case ir_binop_less:
1458 case ir_binop_greater:
1459 case ir_binop_lequal:
1460 case ir_binop_gequal:
1461 case ir_binop_equal:
1462 case ir_binop_nequal: {
1463 emit(CMP(result_dst, op[0], op[1],
1464 brw_conditional_for_comparison(ir->operation)));
1465 emit(AND(result_dst, result_src, src_reg(0x1)));
1466 break;
1467 }
1468
1469 case ir_binop_all_equal:
1470 /* "==" operator producing a scalar boolean. */
1471 if (ir->operands[0]->type->is_vector() ||
1472 ir->operands[1]->type->is_vector()) {
1473 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1474 emit(MOV(result_dst, src_reg(0)));
1475 inst = emit(MOV(result_dst, src_reg(1)));
1476 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1477 } else {
1478 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1479 emit(AND(result_dst, result_src, src_reg(0x1)));
1480 }
1481 break;
1482 case ir_binop_any_nequal:
1483 /* "!=" operator producing a scalar boolean. */
1484 if (ir->operands[0]->type->is_vector() ||
1485 ir->operands[1]->type->is_vector()) {
1486 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1487
1488 emit(MOV(result_dst, src_reg(0)));
1489 inst = emit(MOV(result_dst, src_reg(1)));
1490 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1491 } else {
1492 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1493 emit(AND(result_dst, result_src, src_reg(0x1)));
1494 }
1495 break;
1496
1497 case ir_unop_any:
1498 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1499 emit(MOV(result_dst, src_reg(0)));
1500
1501 inst = emit(MOV(result_dst, src_reg(1)));
1502 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1503 break;
1504
1505 case ir_binop_logic_xor:
1506 emit(XOR(result_dst, op[0], op[1]));
1507 break;
1508
1509 case ir_binop_logic_or:
1510 emit(OR(result_dst, op[0], op[1]));
1511 break;
1512
1513 case ir_binop_logic_and:
1514 emit(AND(result_dst, op[0], op[1]));
1515 break;
1516
1517 case ir_binop_dot:
1518 assert(ir->operands[0]->type->is_vector());
1519 assert(ir->operands[0]->type == ir->operands[1]->type);
1520 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1521 break;
1522
1523 case ir_unop_sqrt:
1524 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1525 break;
1526 case ir_unop_rsq:
1527 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1528 break;
1529
1530 case ir_unop_bitcast_i2f:
1531 case ir_unop_bitcast_u2f:
1532 this->result = op[0];
1533 this->result.type = BRW_REGISTER_TYPE_F;
1534 break;
1535
1536 case ir_unop_bitcast_f2i:
1537 this->result = op[0];
1538 this->result.type = BRW_REGISTER_TYPE_D;
1539 break;
1540
1541 case ir_unop_bitcast_f2u:
1542 this->result = op[0];
1543 this->result.type = BRW_REGISTER_TYPE_UD;
1544 break;
1545
1546 case ir_unop_i2f:
1547 case ir_unop_i2u:
1548 case ir_unop_u2i:
1549 case ir_unop_u2f:
1550 case ir_unop_b2f:
1551 case ir_unop_b2i:
1552 case ir_unop_f2i:
1553 case ir_unop_f2u:
1554 emit(MOV(result_dst, op[0]));
1555 break;
1556 case ir_unop_f2b:
1557 case ir_unop_i2b: {
1558 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1559 emit(AND(result_dst, result_src, src_reg(1)));
1560 break;
1561 }
1562
1563 case ir_unop_trunc:
1564 emit(RNDZ(result_dst, op[0]));
1565 break;
1566 case ir_unop_ceil:
1567 op[0].negate = !op[0].negate;
1568 inst = emit(RNDD(result_dst, op[0]));
1569 this->result.negate = true;
1570 break;
1571 case ir_unop_floor:
1572 inst = emit(RNDD(result_dst, op[0]));
1573 break;
1574 case ir_unop_fract:
1575 inst = emit(FRC(result_dst, op[0]));
1576 break;
1577 case ir_unop_round_even:
1578 emit(RNDE(result_dst, op[0]));
1579 break;
1580
1581 case ir_binop_min:
1582 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1583 break;
1584 case ir_binop_max:
1585 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1586 break;
1587
1588 case ir_binop_pow:
1589 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1590 break;
1591
1592 case ir_unop_bit_not:
1593 inst = emit(NOT(result_dst, op[0]));
1594 break;
1595 case ir_binop_bit_and:
1596 inst = emit(AND(result_dst, op[0], op[1]));
1597 break;
1598 case ir_binop_bit_xor:
1599 inst = emit(XOR(result_dst, op[0], op[1]));
1600 break;
1601 case ir_binop_bit_or:
1602 inst = emit(OR(result_dst, op[0], op[1]));
1603 break;
1604
1605 case ir_binop_lshift:
1606 inst = emit(SHL(result_dst, op[0], op[1]));
1607 break;
1608
1609 case ir_binop_rshift:
1610 if (ir->type->base_type == GLSL_TYPE_INT)
1611 inst = emit(ASR(result_dst, op[0], op[1]));
1612 else
1613 inst = emit(SHR(result_dst, op[0], op[1]));
1614 break;
1615
1616 case ir_binop_bfm:
1617 emit(BFI1(result_dst, op[0], op[1]));
1618 break;
1619
1620 case ir_binop_ubo_load: {
1621 ir_constant *uniform_block = ir->operands[0]->as_constant();
1622 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1623 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1624 src_reg offset;
1625
1626 /* Now, load the vector from that offset. */
1627 assert(ir->type->is_vector() || ir->type->is_scalar());
1628
1629 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1630 packed_consts.type = result.type;
1631 src_reg surf_index =
1632 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1633 if (const_offset_ir) {
1634 if (brw->gen >= 8) {
1635 /* Store the offset in a GRF so we can send-from-GRF. */
1636 offset = src_reg(this, glsl_type::int_type);
1637 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1638 } else {
1639 /* Immediates are fine on older generations since they'll be moved
1640 * to a (potentially fake) MRF at the generator level.
1641 */
1642 offset = src_reg(const_offset / 16);
1643 }
1644 } else {
1645 offset = src_reg(this, glsl_type::uint_type);
1646 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1647 }
1648
1649 if (brw->gen >= 7) {
1650 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1651 grf_offset.type = offset.type;
1652
1653 emit(MOV(grf_offset, offset));
1654
1655 emit(new(mem_ctx) vec4_instruction(this,
1656 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1657 dst_reg(packed_consts),
1658 surf_index,
1659 src_reg(grf_offset)));
1660 } else {
1661 vec4_instruction *pull =
1662 emit(new(mem_ctx) vec4_instruction(this,
1663 VS_OPCODE_PULL_CONSTANT_LOAD,
1664 dst_reg(packed_consts),
1665 surf_index,
1666 offset));
1667 pull->base_mrf = 14;
1668 pull->mlen = 1;
1669 }
1670
1671 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1672 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1673 const_offset % 16 / 4,
1674 const_offset % 16 / 4,
1675 const_offset % 16 / 4);
1676
1677 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1678 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1679 emit(CMP(result_dst, packed_consts, src_reg(0u),
1680 BRW_CONDITIONAL_NZ));
1681 emit(AND(result_dst, result, src_reg(0x1)));
1682 } else {
1683 emit(MOV(result_dst, packed_consts));
1684 }
1685 break;
1686 }
1687
1688 case ir_binop_vector_extract:
1689 unreachable("should have been lowered by vec_index_to_cond_assign");
1690
1691 case ir_triop_fma:
1692 op[0] = fix_3src_operand(op[0]);
1693 op[1] = fix_3src_operand(op[1]);
1694 op[2] = fix_3src_operand(op[2]);
1695 /* Note that the instruction's argument order is reversed from GLSL
1696 * and the IR.
1697 */
1698 emit(MAD(result_dst, op[2], op[1], op[0]));
1699 break;
1700
1701 case ir_triop_lrp:
1702 emit_lrp(result_dst, op[0], op[1], op[2]);
1703 break;
1704
1705 case ir_triop_csel:
1706 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1707 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1708 inst->predicate = BRW_PREDICATE_NORMAL;
1709 break;
1710
1711 case ir_triop_bfi:
1712 op[0] = fix_3src_operand(op[0]);
1713 op[1] = fix_3src_operand(op[1]);
1714 op[2] = fix_3src_operand(op[2]);
1715 emit(BFI2(result_dst, op[0], op[1], op[2]));
1716 break;
1717
1718 case ir_triop_bitfield_extract:
1719 op[0] = fix_3src_operand(op[0]);
1720 op[1] = fix_3src_operand(op[1]);
1721 op[2] = fix_3src_operand(op[2]);
1722 /* Note that the instruction's argument order is reversed from GLSL
1723 * and the IR.
1724 */
1725 emit(BFE(result_dst, op[2], op[1], op[0]));
1726 break;
1727
1728 case ir_triop_vector_insert:
1729 unreachable("should have been lowered by lower_vector_insert");
1730
1731 case ir_quadop_bitfield_insert:
1732 unreachable("not reached: should be handled by "
1733 "bitfield_insert_to_bfm_bfi\n");
1734
1735 case ir_quadop_vector:
1736 unreachable("not reached: should be handled by lower_quadop_vector");
1737
1738 case ir_unop_pack_half_2x16:
1739 emit_pack_half_2x16(result_dst, op[0]);
1740 break;
1741 case ir_unop_unpack_half_2x16:
1742 emit_unpack_half_2x16(result_dst, op[0]);
1743 break;
1744 case ir_unop_pack_snorm_2x16:
1745 case ir_unop_pack_snorm_4x8:
1746 case ir_unop_pack_unorm_2x16:
1747 case ir_unop_pack_unorm_4x8:
1748 case ir_unop_unpack_snorm_2x16:
1749 case ir_unop_unpack_snorm_4x8:
1750 case ir_unop_unpack_unorm_2x16:
1751 case ir_unop_unpack_unorm_4x8:
1752 unreachable("not reached: should be handled by lower_packing_builtins");
1753 case ir_unop_unpack_half_2x16_split_x:
1754 case ir_unop_unpack_half_2x16_split_y:
1755 case ir_binop_pack_half_2x16_split:
1756 case ir_unop_interpolate_at_centroid:
1757 case ir_binop_interpolate_at_sample:
1758 case ir_binop_interpolate_at_offset:
1759 unreachable("not reached: should not occur in vertex shader");
1760 case ir_binop_ldexp:
1761 unreachable("not reached: should be handled by ldexp_to_arith()");
1762 }
1763 }
1764
1765
1766 void
1767 vec4_visitor::visit(ir_swizzle *ir)
1768 {
1769 src_reg src;
1770 int i = 0;
1771 int swizzle[4];
1772
 1773 /* Note that this handles only swizzles in expressions, not those on the left
1774 * hand side of an assignment, which do write masking. See ir_assignment
1775 * for that.
1776 */
1777
1778 ir->val->accept(this);
1779 src = this->result;
1780 assert(src.file != BAD_FILE);
1781
1782 for (i = 0; i < ir->type->vector_elements; i++) {
1783 switch (i) {
1784 case 0:
1785 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1786 break;
1787 case 1:
1788 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1789 break;
1790 case 2:
1791 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1792 break;
1793 case 3:
1794 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1795 break;
1796 }
1797 }
1798 for (; i < 4; i++) {
1799 /* Replicate the last channel out. */
1800 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1801 }
1802
1803 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1804
1805 this->result = src;
1806 }
1807
1808 void
1809 vec4_visitor::visit(ir_dereference_variable *ir)
1810 {
1811 const struct glsl_type *type = ir->type;
1812 dst_reg *reg = variable_storage(ir->var);
1813
1814 if (!reg) {
1815 fail("Failed to find variable storage for %s\n", ir->var->name);
1816 this->result = src_reg(brw_null_reg());
1817 return;
1818 }
1819
1820 this->result = src_reg(*reg);
1821
1822 /* System values get their swizzle from the dst_reg writemask */
1823 if (ir->var->data.mode == ir_var_system_value)
1824 return;
1825
1826 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1827 this->result.swizzle = swizzle_for_size(type->vector_elements);
1828 }
1829
1830
1831 int
1832 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1833 {
1834 /* Under normal circumstances array elements are stored consecutively, so
1835 * the stride is equal to the size of the array element.
1836 */
1837 return type_size(ir->type);
1838 }
1839
1840
1841 void
1842 vec4_visitor::visit(ir_dereference_array *ir)
1843 {
1844 ir_constant *constant_index;
1845 src_reg src;
1846 int array_stride = compute_array_stride(ir);
1847
1848 constant_index = ir->array_index->constant_expression_value();
1849
1850 ir->array->accept(this);
1851 src = this->result;
1852
1853 if (constant_index) {
1854 src.reg_offset += constant_index->value.i[0] * array_stride;
1855 } else {
1856 /* Variable index array dereference. It eats the "vec4" of the
1857 * base of the array and an index that offsets the Mesa register
1858 * index.
1859 */
1860 ir->array_index->accept(this);
1861
1862 src_reg index_reg;
1863
1864 if (array_stride == 1) {
1865 index_reg = this->result;
1866 } else {
1867 index_reg = src_reg(this, glsl_type::int_type);
1868
1869 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1870 }
1871
1872 if (src.reladdr) {
1873 src_reg temp = src_reg(this, glsl_type::int_type);
1874
1875 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1876
1877 index_reg = temp;
1878 }
1879
1880 src.reladdr = ralloc(mem_ctx, src_reg);
1881 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1882 }
1883
1884 /* If the type is smaller than a vec4, replicate the last channel out. */
1885 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1886 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1887 else
1888 src.swizzle = BRW_SWIZZLE_NOOP;
1889 src.type = brw_type_for_base_type(ir->type);
1890
1891 this->result = src;
1892 }
1893
1894 void
1895 vec4_visitor::visit(ir_dereference_record *ir)
1896 {
1897 unsigned int i;
1898 const glsl_type *struct_type = ir->record->type;
1899 int offset = 0;
1900
1901 ir->record->accept(this);
1902
1903 for (i = 0; i < struct_type->length; i++) {
1904 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1905 break;
1906 offset += type_size(struct_type->fields.structure[i].type);
1907 }
1908
1909 /* If the type is smaller than a vec4, replicate the last channel out. */
1910 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1911 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1912 else
1913 this->result.swizzle = BRW_SWIZZLE_NOOP;
1914 this->result.type = brw_type_for_base_type(ir->type);
1915
1916 this->result.reg_offset += offset;
1917 }
1918
1919 /**
1920 * We want to be careful in assignment setup to hit the actual storage
1921 * instead of potentially using a temporary like we might with the
1922 * ir_dereference handler.
1923 */
1924 static dst_reg
1925 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1926 {
1927 /* The LHS must be a dereference. If the LHS is a variable indexed array
 1928 * access of a vector, it must be separated into a series of conditional moves
1929 * before reaching this point (see ir_vec_index_to_cond_assign).
1930 */
1931 assert(ir->as_dereference());
1932 ir_dereference_array *deref_array = ir->as_dereference_array();
1933 if (deref_array) {
1934 assert(!deref_array->array->type->is_vector());
1935 }
1936
1937 /* Use the rvalue deref handler for the most part. We'll ignore
1938 * swizzles in it and write swizzles using writemask, though.
1939 */
1940 ir->accept(v);
1941 return dst_reg(v->result);
1942 }
1943
1944 void
1945 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1946 const struct glsl_type *type,
1947 enum brw_predicate predicate)
1948 {
1949 if (type->base_type == GLSL_TYPE_STRUCT) {
1950 for (unsigned int i = 0; i < type->length; i++) {
1951 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1952 }
1953 return;
1954 }
1955
1956 if (type->is_array()) {
1957 for (unsigned int i = 0; i < type->length; i++) {
1958 emit_block_move(dst, src, type->fields.array, predicate);
1959 }
1960 return;
1961 }
1962
1963 if (type->is_matrix()) {
1964 const struct glsl_type *vec_type;
1965
1966 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1967 type->vector_elements, 1);
1968
1969 for (int i = 0; i < type->matrix_columns; i++) {
1970 emit_block_move(dst, src, vec_type, predicate);
1971 }
1972 return;
1973 }
1974
1975 assert(type->is_scalar() || type->is_vector());
1976
1977 dst->type = brw_type_for_base_type(type);
1978 src->type = dst->type;
1979
1980 dst->writemask = (1 << type->vector_elements) - 1;
1981
1982 src->swizzle = swizzle_for_size(type->vector_elements);
1983
1984 vec4_instruction *inst = emit(MOV(*dst, *src));
1985 inst->predicate = predicate;
1986
1987 dst->reg_offset++;
1988 src->reg_offset++;
1989 }
1990
1991
1992 /* If the RHS processing resulted in an instruction generating a
1993 * temporary value, and it would be easy to rewrite the instruction to
1994 * generate its result right into the LHS instead, do so. This ends
1995 * up reliably removing instructions where it can be tricky to do so
1996 * later without real UD chain information.
1997 */
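/* A minimal sketch of the rewrite this enables (illustrative register
 * names only): given
 *
 *    ADD tmp.xyzw, a, b
 *    MOV dst.xyzw, tmp
 *
 * we patch the ADD to write dst directly, and the caller then skips
 * emitting the MOV when this function returns true.
 */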
1998 bool
1999 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2000 dst_reg dst,
2001 src_reg src,
2002 vec4_instruction *pre_rhs_inst,
2003 vec4_instruction *last_rhs_inst)
2004 {
2005 /* This could be supported, but it would take more smarts. */
2006 if (ir->condition)
2007 return false;
2008
2009 if (pre_rhs_inst == last_rhs_inst)
2010 return false; /* No instructions generated to work with. */
2011
2012 /* Make sure the last instruction generated our source reg. */
2013 if (src.file != GRF ||
2014 src.file != last_rhs_inst->dst.file ||
2015 src.reg != last_rhs_inst->dst.reg ||
2016 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2017 src.reladdr ||
2018 src.abs ||
2019 src.negate ||
2020 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2021 return false;
2022
2023 /* Check that the last instruction fully initialized the channels
2024 * we want to use, in the order we want to use them. We could
2025 * potentially reswizzle the operands of many instructions so that
2026 * we could handle out-of-order channels, but don't yet.
2027 */
2028
2029 for (unsigned i = 0; i < 4; i++) {
2030 if (dst.writemask & (1 << i)) {
2031 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2032 return false;
2033
2034 if (BRW_GET_SWZ(src.swizzle, i) != i)
2035 return false;
2036 }
2037 }
2038
2039 /* Success! Rewrite the instruction. */
2040 last_rhs_inst->dst.file = dst.file;
2041 last_rhs_inst->dst.reg = dst.reg;
2042 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2043 last_rhs_inst->dst.reladdr = dst.reladdr;
2044 last_rhs_inst->dst.writemask &= dst.writemask;
2045
2046 return true;
2047 }
2048
2049 void
2050 vec4_visitor::visit(ir_assignment *ir)
2051 {
2052 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2053 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2054
2055 if (!ir->lhs->type->is_scalar() &&
2056 !ir->lhs->type->is_vector()) {
2057 ir->rhs->accept(this);
2058 src_reg src = this->result;
2059
2060 if (ir->condition) {
2061 emit_bool_to_cond_code(ir->condition, &predicate);
2062 }
2063
2064 /* emit_block_move doesn't account for swizzles in the source register.
2065 * This should be ok, since the source register is a structure or an
2066 * array, and those can't be swizzled. But double-check to be sure.
2067 */
2068 assert(src.swizzle ==
2069 (ir->rhs->type->is_matrix()
2070 ? swizzle_for_size(ir->rhs->type->vector_elements)
2071 : BRW_SWIZZLE_NOOP));
2072
2073 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2074 return;
2075 }
2076
2077 /* Now we're down to just a scalar/vector with writemasks. */
2078 int i;
2079
2080 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2081 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2082
2083 ir->rhs->accept(this);
2084
2085 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2086
2087 src_reg src = this->result;
2088
2089 int swizzles[4];
2090 int first_enabled_chan = 0;
2091 int src_chan = 0;
2092
2093 assert(ir->lhs->type->is_vector() ||
2094 ir->lhs->type->is_scalar());
2095 dst.writemask = ir->write_mask;
2096
2097 for (int i = 0; i < 4; i++) {
2098 if (dst.writemask & (1 << i)) {
2099 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2100 break;
2101 }
2102 }
2103
2104 /* Swizzle a small RHS vector into the channels being written.
2105 *
2106 * GLSL IR treats write_mask as dictating how many channels are
2107 * present on the RHS, while in our instructions we need to make
2108 * those channels appear in the slots of the vec4 they're written to.
2109 */
2110 for (int i = 0; i < 4; i++) {
2111 if (dst.writemask & (1 << i))
2112 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2113 else
2114 swizzles[i] = first_enabled_chan;
2115 }
2116 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2117 swizzles[2], swizzles[3]);
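/* A rough worked example (hypothetical shader snippet): for
 * "v.zw = foo" with a vec2 foo, write_mask is ZW and the RHS arrives
 * with swizzle xyyy, so the first loop picks first_enabled_chan = y
 * and the second produces swizzles[] = { y, y, x, y }; the final
 * writemasked MOV then reads foo.x into v.z and foo.y into v.w while
 * the unwritten .xy slots just replicate a valid channel.
 */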
2118
2119 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2120 return;
2121 }
2122
2123 if (ir->condition) {
2124 emit_bool_to_cond_code(ir->condition, &predicate);
2125 }
2126
2127 for (i = 0; i < type_size(ir->lhs->type); i++) {
2128 vec4_instruction *inst = emit(MOV(dst, src));
2129 inst->predicate = predicate;
2130
2131 dst.reg_offset++;
2132 src.reg_offset++;
2133 }
2134 }
2135
2136 void
2137 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2138 {
2139 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2140 foreach_in_list(ir_constant, field_value, &ir->components) {
2141 emit_constant_values(dst, field_value);
2142 }
2143 return;
2144 }
2145
2146 if (ir->type->is_array()) {
2147 for (unsigned int i = 0; i < ir->type->length; i++) {
2148 emit_constant_values(dst, ir->array_elements[i]);
2149 }
2150 return;
2151 }
2152
2153 if (ir->type->is_matrix()) {
2154 for (int i = 0; i < ir->type->matrix_columns; i++) {
2155 float *vec = &ir->value.f[i * ir->type->vector_elements];
2156
2157 for (int j = 0; j < ir->type->vector_elements; j++) {
2158 dst->writemask = 1 << j;
2159 dst->type = BRW_REGISTER_TYPE_F;
2160
2161 emit(MOV(*dst, src_reg(vec[j])));
2162 }
2163 dst->reg_offset++;
2164 }
2165 return;
2166 }
2167
2168 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2169
2170 for (int i = 0; i < ir->type->vector_elements; i++) {
2171 if (!(remaining_writemask & (1 << i)))
2172 continue;
2173
2174 dst->writemask = 1 << i;
2175 dst->type = brw_type_for_base_type(ir->type);
2176
2177 /* Find other components that match the one we're about to
2178 * write. Emits fewer instructions for things like vec4(0.5,
2179 * 1.5, 1.5, 1.5).
2180 */
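/* For that example (purely illustrative): the i = 0 pass emits
 * MOV .x, 0.5f; the i = 1 pass sees that .z and .w hold the same
 * value as .y and widens the writemask, emitting a single
 * MOV .yzw, 1.5f, so two MOVs cover all four channels.
 */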
2181 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2182 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2183 if (ir->value.b[i] == ir->value.b[j])
2184 dst->writemask |= (1 << j);
2185 } else {
2186 /* u, i, and f storage all line up, so no need for a
2187 * switch case for comparing each type.
2188 */
2189 if (ir->value.u[i] == ir->value.u[j])
2190 dst->writemask |= (1 << j);
2191 }
2192 }
2193
2194 switch (ir->type->base_type) {
2195 case GLSL_TYPE_FLOAT:
2196 emit(MOV(*dst, src_reg(ir->value.f[i])));
2197 break;
2198 case GLSL_TYPE_INT:
2199 emit(MOV(*dst, src_reg(ir->value.i[i])));
2200 break;
2201 case GLSL_TYPE_UINT:
2202 emit(MOV(*dst, src_reg(ir->value.u[i])));
2203 break;
2204 case GLSL_TYPE_BOOL:
2205 emit(MOV(*dst, src_reg(ir->value.b[i])));
2206 break;
2207 default:
2208 unreachable("Non-float/uint/int/bool constant");
2209 }
2210
2211 remaining_writemask &= ~dst->writemask;
2212 }
2213 dst->reg_offset++;
2214 }
2215
2216 void
2217 vec4_visitor::visit(ir_constant *ir)
2218 {
2219 dst_reg dst = dst_reg(this, ir->type);
2220 this->result = src_reg(dst);
2221
2222 emit_constant_values(&dst, ir);
2223 }
2224
2225 void
2226 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2227 {
2228 ir_dereference *deref = static_cast<ir_dereference *>(
2229 ir->actual_parameters.get_head());
2230 ir_variable *location = deref->variable_referenced();
2231 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2232 location->data.atomic.buffer_index);
2233
2234 /* Calculate the surface offset */
2235 src_reg offset(this, glsl_type::uint_type);
2236 ir_dereference_array *deref_array = deref->as_dereference_array();
2237 if (deref_array) {
2238 deref_array->array_index->accept(this);
2239
2240 src_reg tmp(this, glsl_type::uint_type);
2241 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2242 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2243 } else {
2244 offset = location->data.atomic.offset;
2245 }
2246
2247 /* Emit the appropriate machine instruction */
2248 const char *callee = ir->callee->function_name();
2249 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2250
2251 if (!strcmp("__intrinsic_atomic_read", callee)) {
2252 emit_untyped_surface_read(surf_index, dst, offset);
2253
2254 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2255 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2256 src_reg(), src_reg());
2257
2258 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2259 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2260 src_reg(), src_reg());
2261 }
2262 }
2263
2264 void
2265 vec4_visitor::visit(ir_call *ir)
2266 {
2267 const char *callee = ir->callee->function_name();
2268
2269 if (!strcmp("__intrinsic_atomic_read", callee) ||
2270 !strcmp("__intrinsic_atomic_increment", callee) ||
2271 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2272 visit_atomic_counter_intrinsic(ir);
2273 } else {
2274 unreachable("Unsupported intrinsic.");
2275 }
2276 }
2277
2278 src_reg
2279 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, uint32_t sampler)
2280 {
2281 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2282 inst->base_mrf = 2;
2283 inst->mlen = 1;
2284 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2285 inst->dst.writemask = WRITEMASK_XYZW;
2286
2287 inst->src[1] = src_reg(sampler);
2288
2289 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2290 int param_base = inst->base_mrf;
2291 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2292 int zero_mask = 0xf & ~coord_mask;
2293
2294 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2295 coordinate));
2296
2297 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2298 src_reg(0)));
2299
2300 emit(inst);
2301 return src_reg(inst->dst);
2302 }
2303
2304 void
2305 vec4_visitor::visit(ir_texture *ir)
2306 {
2307 uint32_t sampler =
2308 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2309
2310 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2311 * emitting anything other than setting up the constant result.
2312 */
2313 if (ir->op == ir_tg4) {
2314 ir_constant *chan = ir->lod_info.component->as_constant();
2315 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2316 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2317 dst_reg result(this, ir->type);
2318 this->result = src_reg(result);
2319 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2320 return;
2321 }
2322 }
2323
2324 /* Should be lowered by do_lower_texture_projection */
2325 assert(!ir->projector);
2326
2327 /* Should be lowered */
2328 assert(!ir->offset || !ir->offset->type->is_array());
2329
2330 /* Generate code to compute all the subexpression trees. This has to be
2331 * done before loading any values into MRFs for the sampler message since
2332 * generating these values may involve SEND messages that need the MRFs.
2333 */
2334 src_reg coordinate;
2335 if (ir->coordinate) {
2336 ir->coordinate->accept(this);
2337 coordinate = this->result;
2338 }
2339
2340 src_reg shadow_comparitor;
2341 if (ir->shadow_comparitor) {
2342 ir->shadow_comparitor->accept(this);
2343 shadow_comparitor = this->result;
2344 }
2345
2346 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2347 src_reg offset_value;
2348 if (has_nonconstant_offset) {
2349 ir->offset->accept(this);
2350 offset_value = src_reg(this->result);
2351 }
2352
2353 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2354 src_reg lod, dPdx, dPdy, sample_index, mcs;
2355 switch (ir->op) {
2356 case ir_tex:
2357 lod = src_reg(0.0f);
2358 lod_type = glsl_type::float_type;
2359 break;
2360 case ir_txf:
2361 case ir_txl:
2362 case ir_txs:
2363 ir->lod_info.lod->accept(this);
2364 lod = this->result;
2365 lod_type = ir->lod_info.lod->type;
2366 break;
2367 case ir_query_levels:
2368 lod = src_reg(0);
2369 lod_type = glsl_type::int_type;
2370 break;
2371 case ir_txf_ms:
2372 ir->lod_info.sample_index->accept(this);
2373 sample_index = this->result;
2374 sample_index_type = ir->lod_info.sample_index->type;
2375
2376 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2377 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2378 else
2379 mcs = src_reg(0u);
2380 break;
2381 case ir_txd:
2382 ir->lod_info.grad.dPdx->accept(this);
2383 dPdx = this->result;
2384
2385 ir->lod_info.grad.dPdy->accept(this);
2386 dPdy = this->result;
2387
2388 lod_type = ir->lod_info.grad.dPdx->type;
2389 break;
2390 case ir_txb:
2391 case ir_lod:
2392 case ir_tg4:
2393 break;
2394 }
2395
2396 enum opcode opcode;
2397 switch (ir->op) {
2398 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2399 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2400 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2401 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2402 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2403 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2404 case ir_tg4: opcode = has_nonconstant_offset
2405 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2406 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2407 case ir_txb:
2408 unreachable("TXB is not valid for vertex shaders.");
2409 case ir_lod:
2410 unreachable("LOD is not valid for vertex shaders.");
2411 default:
2412 unreachable("Unrecognized tex op");
2413 }
2414
2415 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2416
2417 if (ir->offset != NULL && ir->op != ir_txf)
2418 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2419
2420 /* Stuff the channel select bits in the top of the texture offset */
2421 if (ir->op == ir_tg4)
2422 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2423
2424 /* The message header is necessary for:
2425 * - Gen4 (always)
2426 * - Texel offsets
2427 * - Gather channel selection
2428 * - Sampler indices too large to fit in a 4-bit value.
2429 */
2430 inst->header_present =
2431 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2432 sampler >= 16;
2433 inst->base_mrf = 2;
2434 inst->mlen = inst->header_present + 1; /* always at least one */
2435 inst->dst = dst_reg(this, ir->type);
2436 inst->dst.writemask = WRITEMASK_XYZW;
2437 inst->shadow_compare = ir->shadow_comparitor != NULL;
2438
2439 inst->src[1] = src_reg(sampler);
2440
2441 /* MRF for the first parameter */
2442 int param_base = inst->base_mrf + inst->header_present;
2443
2444 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2445 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2446 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2447 } else {
2448 /* Load the coordinate */
2449 /* FINISHME: gl_clamp_mask and saturate */
2450 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2451 int zero_mask = 0xf & ~coord_mask;
2452
2453 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2454 coordinate));
2455
2456 if (zero_mask != 0) {
2457 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2458 src_reg(0)));
2459 }
2460 /* Load the shadow comparitor */
2461 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2462 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2463 WRITEMASK_X),
2464 shadow_comparitor));
2465 inst->mlen++;
2466 }
2467
2468 /* Load the LOD info */
2469 if (ir->op == ir_tex || ir->op == ir_txl) {
2470 int mrf, writemask;
2471 if (brw->gen >= 5) {
2472 mrf = param_base + 1;
2473 if (ir->shadow_comparitor) {
2474 writemask = WRITEMASK_Y;
2475 /* mlen already incremented */
2476 } else {
2477 writemask = WRITEMASK_X;
2478 inst->mlen++;
2479 }
2480 } else /* brw->gen == 4 */ {
2481 mrf = param_base;
2482 writemask = WRITEMASK_W;
2483 }
2484 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2485 } else if (ir->op == ir_txf) {
2486 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2487 } else if (ir->op == ir_txf_ms) {
2488 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2489 sample_index));
2490 if (brw->gen >= 7) {
2491 /* MCS data is in the first channel of `mcs`, but we need to get it into
2492 * the .y channel of the second vec4 of params, so replicate .x across
2493 * the whole vec4 and then mask off everything except .y
2494 */
2495 mcs.swizzle = BRW_SWIZZLE_XXXX;
2496 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2497 mcs));
}
2498 inst->mlen++;
2499 } else if (ir->op == ir_txd) {
2500 const glsl_type *type = lod_type;
2501
2502 if (brw->gen >= 5) {
2503 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2504 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2505 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2506 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2507 inst->mlen++;
2508
2509 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2510 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2511 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2512 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2513 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2514 inst->mlen++;
2515
2516 if (ir->shadow_comparitor) {
2517 emit(MOV(dst_reg(MRF, param_base + 2,
2518 ir->shadow_comparitor->type, WRITEMASK_Z),
2519 shadow_comparitor));
2520 }
2521 }
2522 } else /* brw->gen == 4 */ {
2523 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2524 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2525 inst->mlen += 2;
2526 }
2527 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2528 if (ir->shadow_comparitor) {
2529 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2530 shadow_comparitor));
2531 }
2532
2533 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2534 offset_value));
2535 inst->mlen++;
2536 }
2537 }
2538
2539 emit(inst);
2540
2541 /* Fix up the number of layers (Z component) for cube arrays: the
2542 * hardware returns faces * layers; the spec requires layers.
2543 */
2544 if (ir->op == ir_txs) {
2545 glsl_type const *type = ir->sampler->type;
2546 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2547 type->sampler_array) {
2548 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2549 writemask(inst->dst, WRITEMASK_Z),
2550 src_reg(inst->dst), src_reg(6));
2551 }
2552 }
2553
2554 if (brw->gen == 6 && ir->op == ir_tg4) {
2555 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2556 }
2557
2558 swizzle_result(ir, src_reg(inst->dst), sampler);
2559 }
2560
2561 /**
2562 * Apply workarounds for Gen6 gather with UINT/SINT
2563 */
2564 void
2565 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2566 {
2567 if (!wa)
2568 return;
2569
2570 int width = (wa & WA_8BIT) ? 8 : 16;
2571 dst_reg dst_f = dst;
2572 dst_f.type = BRW_REGISTER_TYPE_F;
2573
2574 /* Convert from UNORM to UINT */
2575 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2576 emit(MOV(dst, src_reg(dst_f)));
2577
2578 if (wa & WA_SIGN) {
2579 /* Reinterpret the UINT value as a signed INT value by
2580 * shifting the sign bit into place, then shifting back
2581 * preserving sign.
2582 */
2583 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2584 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
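/* Rough illustration, assuming the 8-bit case (wa & WA_8BIT): a texel
 * whose stored value is 0xff comes back from the sampler as UNORM 1.0;
 * the MUL/MOV above turn that into the integer 255, and SHL/ASR by
 * 32 - 8 = 24 sign-extend it to 0xffffffff, i.e. the SINT value -1.
 */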
2585 }
2586 }
2587
2588 /**
2589 * Set up the gather channel based on the swizzle, for gather4.
2590 */
2591 uint32_t
2592 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2593 {
2594 ir_constant *chan = ir->lod_info.component->as_constant();
2595 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2596 switch (swiz) {
2597 case SWIZZLE_X: return 0;
2598 case SWIZZLE_Y:
2599 /* gather4 sampler is broken for green channel on RG32F --
2600 * we must ask for blue instead.
2601 */
2602 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2603 return 2;
2604 return 1;
2605 case SWIZZLE_Z: return 2;
2606 case SWIZZLE_W: return 3;
2607 default:
2608 unreachable("Not reached"); /* zero, one swizzles handled already */
2609 }
2610 }
2611
2612 void
2613 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2614 {
2615 int s = key->tex.swizzles[sampler];
2616
2617 this->result = src_reg(this, ir->type);
2618 dst_reg swizzled_result(this->result);
2619
2620 if (ir->op == ir_query_levels) {
2621 /* # levels is in .w */
2622 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2623 emit(MOV(swizzled_result, orig_val));
2624 return;
2625 }
2626
2627 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2628 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2629 emit(MOV(swizzled_result, orig_val));
2630 return;
2631 }
2632
2633
2634 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2635 int swizzle[4] = {0};
2636
2637 for (int i = 0; i < 4; i++) {
2638 switch (GET_SWZ(s, i)) {
2639 case SWIZZLE_ZERO:
2640 zero_mask |= (1 << i);
2641 break;
2642 case SWIZZLE_ONE:
2643 one_mask |= (1 << i);
2644 break;
2645 default:
2646 copy_mask |= (1 << i);
2647 swizzle[i] = GET_SWZ(s, i);
2648 break;
2649 }
2650 }
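/* Rough illustration with a hypothetical swizzle of (R, B, ZERO, ONE):
 * the loop above yields copy_mask = .xy with swizzle[] = { x, z, x, x },
 * zero_mask = .z and one_mask = .w, so the three writemasked MOVs below
 * assemble the final result.
 */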
2651
2652 if (copy_mask) {
2653 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2654 swizzled_result.writemask = copy_mask;
2655 emit(MOV(swizzled_result, orig_val));
2656 }
2657
2658 if (zero_mask) {
2659 swizzled_result.writemask = zero_mask;
2660 emit(MOV(swizzled_result, src_reg(0.0f)));
2661 }
2662
2663 if (one_mask) {
2664 swizzled_result.writemask = one_mask;
2665 emit(MOV(swizzled_result, src_reg(1.0f)));
2666 }
2667 }
2668
2669 void
2670 vec4_visitor::visit(ir_return *)
2671 {
2672 unreachable("not reached");
2673 }
2674
2675 void
2676 vec4_visitor::visit(ir_discard *)
2677 {
2678 unreachable("not reached");
2679 }
2680
2681 void
2682 vec4_visitor::visit(ir_if *ir)
2683 {
2684 /* Don't point the annotation at the if statement, because then it plus
2685 * the then and else blocks get printed.
2686 */
2687 this->base_ir = ir->condition;
2688
2689 if (brw->gen == 6) {
2690 emit_if_gen6(ir);
2691 } else {
2692 enum brw_predicate predicate;
2693 emit_bool_to_cond_code(ir->condition, &predicate);
2694 emit(IF(predicate));
2695 }
2696
2697 visit_instructions(&ir->then_instructions);
2698
2699 if (!ir->else_instructions.is_empty()) {
2700 this->base_ir = ir->condition;
2701 emit(BRW_OPCODE_ELSE);
2702
2703 visit_instructions(&ir->else_instructions);
2704 }
2705
2706 this->base_ir = ir->condition;
2707 emit(BRW_OPCODE_ENDIF);
2708 }
2709
2710 void
2711 vec4_visitor::visit(ir_emit_vertex *)
2712 {
2713 unreachable("not reached");
2714 }
2715
2716 void
2717 vec4_visitor::visit(ir_end_primitive *)
2718 {
2719 unreachable("not reached");
2720 }
2721
2722 void
2723 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2724 dst_reg dst, src_reg offset,
2725 src_reg src0, src_reg src1)
2726 {
2727 unsigned mlen = 0;
2728
2729 /* Set the atomic operation offset. */
2730 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2731 mlen++;
2732
2733 /* Set the atomic operation arguments. */
2734 if (src0.file != BAD_FILE) {
2735 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2736 mlen++;
2737 }
2738
2739 if (src1.file != BAD_FILE) {
2740 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2741 mlen++;
2742 }
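/* Illustration of the resulting payload (based on the callers above):
 * an atomic increment passes BAD_FILE for both sources, so only the
 * offset lands in the message and mlen stays 1; a hypothetical
 * two-source operation such as compare/exchange would end up with
 * mlen = 3.
 */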
2743
2744 /* Emit the instruction. Note that this maps to the normal SIMD8
2745 * untyped atomic message on Ivy Bridge, but that's OK because
2746 * unused channels will be masked out.
2747 */
2748 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2749 src_reg(atomic_op), src_reg(surf_index));
2750 inst->base_mrf = 0;
2751 inst->mlen = mlen;
2752 }
2753
2754 void
2755 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2756 src_reg offset)
2757 {
2758 /* Set the surface read offset. */
2759 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2760
2761 /* Emit the instruction. Note that this maps to the normal SIMD8
2762 * untyped surface read message, but that's OK because unused
2763 * channels will be masked out.
2764 */
2765 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2766 dst, src_reg(surf_index));
2767 inst->base_mrf = 0;
2768 inst->mlen = 1;
2769 }
2770
2771 void
2772 vec4_visitor::emit_ndc_computation()
2773 {
2774 /* Get the position */
2775 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2776
2777 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2778 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2779 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2780
2781 current_annotation = "NDC";
2782 dst_reg ndc_w = ndc;
2783 ndc_w.writemask = WRITEMASK_W;
2784 src_reg pos_w = pos;
2785 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2786 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2787
2788 dst_reg ndc_xyz = ndc;
2789 ndc_xyz.writemask = WRITEMASK_XYZ;
2790
2791 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2792 }
2793
2794 void
2795 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2796 {
2797 if (brw->gen < 6 &&
2798 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2799 key->userclip_active || brw->has_negative_rhw_bug)) {
2800 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2801 dst_reg header1_w = header1;
2802 header1_w.writemask = WRITEMASK_W;
2803
2804 emit(MOV(header1, 0u));
2805
2806 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2807 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2808
2809 current_annotation = "Point size";
2810 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2811 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
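/* Rough sanity check of the packing above (hypothetical size): a point
 * size of 2.0f scaled by 1 << 11 gives 4096 = 0x1000, and masking with
 * 0x7ff << 8 leaves 0x1000, so the header field appears to hold the
 * size in 1/8-pixel units (16 here), limited to the 11 bits of the
 * field.
 */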
2812 }
2813
2814 if (key->userclip_active) {
2815 current_annotation = "Clipping flags";
2816 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2817 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2818
2819 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2820 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2821 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2822
2823 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2824 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2825 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2826 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2827 }
2828
2829 /* i965 clipping workaround:
2830 * 1) Test for -ve rhw
2831 * 2) If set,
2832 * set ndc = (0,0,0,0)
2833 * set ucp[6] = 1
2834 *
2835 * Later, clipping will detect ucp[6] and ensure the primitive is
2836 * clipped against all fixed planes.
2837 */
2838 if (brw->has_negative_rhw_bug) {
2839 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2840 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2841 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2842 vec4_instruction *inst;
2843 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2844 inst->predicate = BRW_PREDICATE_NORMAL;
2845 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2846 inst->predicate = BRW_PREDICATE_NORMAL;
2847 }
2848
2849 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2850 } else if (brw->gen < 6) {
2851 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2852 } else {
2853 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2854 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2855 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2856 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2857 }
2858 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2859 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2860 src_reg(output_reg[VARYING_SLOT_LAYER])));
2861 }
2862 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2863 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2864 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2865 }
2866 }
2867 }
2868
2869 void
2870 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2871 {
2872 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2873 *
2874 * "If a linked set of shaders forming the vertex stage contains no
2875 * static write to gl_ClipVertex or gl_ClipDistance, but the
2876 * application has requested clipping against user clip planes through
2877 * the API, then the coordinate written to gl_Position is used for
2878 * comparison against the user clip planes."
2879 *
2880 * This function is only called if the shader didn't write to
2881 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2882 * if the user wrote to it; otherwise we use gl_Position.
2883 */
2884 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2885 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2886 clip_vertex = VARYING_SLOT_POS;
2887 }
2888
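/* Illustration (hypothetical plane count): with 6 user clip planes
 * enabled, the offset = 4 call writes only .x and .y of CLIP_DIST1,
 * each a DP4 of the clip vertex against planes 4 and 5.
 */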
2889 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2890 ++i) {
2891 reg.writemask = 1 << i;
2892 emit(DP4(reg,
2893 src_reg(output_reg[clip_vertex]),
2894 src_reg(this->userplane[i + offset])));
2895 }
2896 }
2897
2898 void
2899 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2900 {
2901 assert (varying < VARYING_SLOT_MAX);
2902 reg.type = output_reg[varying].type;
2903 current_annotation = output_reg_annotation[varying];
2904 /* Copy the register, saturating if necessary */
2905 vec4_instruction *inst = emit(MOV(reg,
2906 src_reg(output_reg[varying])));
2907 if ((varying == VARYING_SLOT_COL0 ||
2908 varying == VARYING_SLOT_COL1 ||
2909 varying == VARYING_SLOT_BFC0 ||
2910 varying == VARYING_SLOT_BFC1) &&
2911 key->clamp_vertex_color) {
2912 inst->saturate = true;
2913 }
2914 }
2915
2916 void
2917 vec4_visitor::emit_urb_slot(int mrf, int varying)
2918 {
2919 struct brw_reg hw_reg = brw_message_reg(mrf);
2920 dst_reg reg = dst_reg(MRF, mrf);
2921 reg.type = BRW_REGISTER_TYPE_F;
2922
2923 switch (varying) {
2924 case VARYING_SLOT_PSIZ:
2925 /* PSIZ is always in slot 0, and is coupled with other flags. */
2926 current_annotation = "indices, point width, clip flags";
2927 emit_psiz_and_flags(hw_reg);
2928 break;
2929 case BRW_VARYING_SLOT_NDC:
2930 current_annotation = "NDC";
2931 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2932 break;
2933 case VARYING_SLOT_POS:
2934 current_annotation = "gl_Position";
2935 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2936 break;
2937 case VARYING_SLOT_EDGE:
2938 /* This is present when doing unfilled polygons. We're supposed to copy
2939 * the edge flag from the user-provided vertex array
2940 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2941 * of that attribute (starts as 1.0f). This is then used in clipping to
2942 * determine which edges should be drawn as wireframe.
2943 */
2944 current_annotation = "edge flag";
2945 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2946 glsl_type::float_type, WRITEMASK_XYZW))));
2947 break;
2948 case BRW_VARYING_SLOT_PAD:
2949 /* No need to write to this slot */
2950 break;
2951 default:
2952 emit_generic_urb_slot(reg, varying);
2953 break;
2954 }
2955 }
2956
2957 static int
2958 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2959 {
2960 if (brw->gen >= 6) {
2961 /* URB data written (does not include the message header reg) must
2962 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2963 * section 5.4.3.2.2: URB_INTERLEAVED.
2964 *
2965 * URB entries are allocated on a multiple of 1024 bits, so an
2966 * extra 128 bits written here to make the end align to 256 is
2967 * no problem.
2968 */
2969 if ((mlen % 2) != 1)
2970 mlen++;
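/* Illustration with hypothetical counts: mlen = 4 would mean 1 header
 * + 3 data registers, and 3 is not a multiple of 2, so we bump it to
 * 5 (4 data registers); mlen = 5 is already aligned and is left
 * alone.
 */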
2971 }
2972
2973 return mlen;
2974 }
2975
2976
2977 /**
2978 * Generates the VUE payload plus the necessary URB write instructions to
2979 * output it.
2980 *
2981 * The VUE layout is documented in Volume 2a.
2982 */
2983 void
2984 vec4_visitor::emit_vertex()
2985 {
2986 /* MRF 0 is reserved for the debugger, so start with message header
2987 * in MRF 1.
2988 */
2989 int base_mrf = 1;
2990 int mrf = base_mrf;
2991 /* In the process of generating our URB write message contents, we
2992 * may need to unspill a register or load from an array. Those
2993 * reads would use MRFs 14-15.
2994 */
2995 int max_usable_mrf = 13;
2996
2997 /* The following assertion verifies that max_usable_mrf causes an
2998 * even-numbered amount of URB write data, which will meet gen6's
2999 * requirements for length alignment.
3000 */
3001 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3002
3003 /* First mrf is the g0-based message header containing URB handles and
3004 * such.
3005 */
3006 emit_urb_write_header(mrf++);
3007
3008 if (brw->gen < 6) {
3009 emit_ndc_computation();
3010 }
3011
3012 /* Lower legacy ff and ClipVertex clipping to clip distances */
3013 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3014 current_annotation = "user clip distances";
3015
3016 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3017 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3018
3019 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3020 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3021 }
3022
3023 /* We may need to split this up into several URB writes, so do them in a
3024 * loop.
3025 */
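/* A rough sketch of the loop below (hypothetical slot count): with
 * base_mrf = 1 and max_usable_mrf = 13, each URB write carries the
 * header plus up to 12 slot MRFs, so a VUE map with, say, 18 slots
 * takes two writes, the second one starting at URB row offset
 * 12 / 2 = 6.
 */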
3026 int slot = 0;
3027 bool complete = false;
3028 do {
3029 /* URB offset is in URB row increments, and each of our MRFs is half of
3030 * one of those, since we're doing interleaved writes.
3031 */
3032 int offset = slot / 2;
3033
3034 mrf = base_mrf + 1;
3035 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3036 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3037
3038 /* If this was max_usable_mrf, we can't fit anything more into this
3039 * URB WRITE.
3040 */
3041 if (mrf > max_usable_mrf) {
3042 slot++;
3043 break;
3044 }
3045 }
3046
3047 complete = slot >= prog_data->vue_map.num_slots;
3048 current_annotation = "URB write";
3049 vec4_instruction *inst = emit_urb_write_opcode(complete);
3050 inst->base_mrf = base_mrf;
3051 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3052 inst->offset += offset;
3053 } while(!complete);
3054 }
3055
3056
3057 src_reg
3058 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3059 src_reg *reladdr, int reg_offset)
3060 {
3061 /* Because we store the values to scratch interleaved like our
3062 * vertex data, we need to scale the vec4 index by 2.
3063 */
3064 int message_header_scale = 2;
3065
3066 /* Pre-gen6, the message header uses byte offsets instead of vec4
3067 * (16-byte) offset units.
3068 */
3069 if (brw->gen < 6)
3070 message_header_scale *= 16;
3071
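/* Illustration (hypothetical offset): for reg_offset = 3 the constant
 * case below returns 6 on gen6+ (two interleaved vec4 rows per logical
 * register) and 96 on gen4/5, where the header wants a byte offset.
 */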
3072 if (reladdr) {
3073 src_reg index = src_reg(this, glsl_type::int_type);
3074
3075 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3076 emit_before(inst, MUL(dst_reg(index),
3077 index, src_reg(message_header_scale)));
3078
3079 return index;
3080 } else {
3081 return src_reg(reg_offset * message_header_scale);
3082 }
3083 }
3084
3085 src_reg
3086 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3087 src_reg *reladdr, int reg_offset)
3088 {
3089 if (reladdr) {
3090 src_reg index = src_reg(this, glsl_type::int_type);
3091
3092 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3093
3094 /* Pre-gen6, the message header uses byte offsets instead of vec4
3095 * (16-byte) offset units.
3096 */
3097 if (brw->gen < 6) {
3098 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3099 }
3100
3101 return index;
3102 } else if (brw->gen >= 8) {
3103 /* Store the offset in a GRF so we can send-from-GRF. */
3104 src_reg offset = src_reg(this, glsl_type::int_type);
3105 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3106 return offset;
3107 } else {
3108 int message_header_scale = brw->gen < 6 ? 16 : 1;
3109 return src_reg(reg_offset * message_header_scale);
3110 }
3111 }
3112
3113 /**
3114 * Emits an instruction before @inst to load the value named by @orig_src
3115 * from scratch space at @base_offset to @temp.
3116 *
3117 * @base_offset is measured in 32-byte units (the size of a register).
3118 */
3119 void
3120 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3121 dst_reg temp, src_reg orig_src,
3122 int base_offset)
3123 {
3124 int reg_offset = base_offset + orig_src.reg_offset;
3125 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3126
3127 emit_before(inst, SCRATCH_READ(temp, index));
3128 }
3129
3130 /**
3131 * Emits an instruction after @inst to store the value to be written
3132 * to @orig_dst to scratch space at @base_offset, from @temp.
3133 *
3134 * @base_offset is measured in 32-byte units (the size of a register).
3135 */
3136 void
3137 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3138 {
3139 int reg_offset = base_offset + inst->dst.reg_offset;
3140 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3141
3142 /* Create a temporary register to store *inst's result in.
3143 *
3144 * We have to be careful in MOVing from our temporary result register in
3145 * the scratch write. If we swizzle from channels of the temporary that
3146 * weren't initialized, it will confuse live interval analysis, which will
3147 * make spilling fail to make progress.
3148 */
3149 src_reg temp = src_reg(this, glsl_type::vec4_type);
3150 temp.type = inst->dst.type;
3151 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3152 int swizzles[4];
3153 for (int i = 0; i < 4; i++)
3154 if (inst->dst.writemask & (1 << i))
3155 swizzles[i] = i;
3156 else
3157 swizzles[i] = first_writemask_chan;
3158 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3159 swizzles[2], swizzles[3]);
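/* Illustration (hypothetical writemask): if inst writes only .xz, the
 * loop above produces swizzles[] = { x, x, z, x }, so the scratch MOV
 * never reads the uninitialized .y / .w channels of temp.
 */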
3160
3161 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3162 inst->dst.writemask));
3163 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3164 write->predicate = inst->predicate;
3165 write->ir = inst->ir;
3166 write->annotation = inst->annotation;
3167 inst->insert_after(write);
3168
3169 inst->dst.file = temp.file;
3170 inst->dst.reg = temp.reg;
3171 inst->dst.reg_offset = temp.reg_offset;
3172 inst->dst.reladdr = NULL;
3173 }
3174
3175 /**
3176 * We can't generally support array access in GRF space, because a
3177 * single instruction's destination can only span 2 contiguous
3178 * registers. So, we send all GRF arrays that get variable index
3179 * access to scratch space.
3180 */
3181 void
3182 vec4_visitor::move_grf_array_access_to_scratch()
3183 {
3184 int scratch_loc[this->virtual_grf_count];
3185
3186 for (int i = 0; i < this->virtual_grf_count; i++) {
3187 scratch_loc[i] = -1;
3188 }
3189
3190 /* First, calculate the set of virtual GRFs that need to be punted
3191 * to scratch due to having any array access on them, and where in
3192 * scratch.
3193 */
3194 foreach_in_list(vec4_instruction, inst, &instructions) {
3195 if (inst->dst.file == GRF && inst->dst.reladdr &&
3196 scratch_loc[inst->dst.reg] == -1) {
3197 scratch_loc[inst->dst.reg] = c->last_scratch;
3198 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3199 }
3200
3201 for (int i = 0 ; i < 3; i++) {
3202 src_reg *src = &inst->src[i];
3203
3204 if (src->file == GRF && src->reladdr &&
3205 scratch_loc[src->reg] == -1) {
3206 scratch_loc[src->reg] = c->last_scratch;
3207 c->last_scratch += this->virtual_grf_sizes[src->reg];
3208 }
3209 }
3210 }
3211
3212 /* Now, for anything that will be accessed through scratch, rewrite
3213 * it to load/store. Note that this is a _safe list walk, because
3214 * we may generate a new scratch_write instruction after the one
3215 * we're processing.
3216 */
3217 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3218 /* Set up the annotation tracking for new generated instructions. */
3219 base_ir = inst->ir;
3220 current_annotation = inst->annotation;
3221
3222 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3223 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3224 }
3225
3226 for (int i = 0 ; i < 3; i++) {
3227 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3228 continue;
3229
3230 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3231
3232 emit_scratch_read(inst, temp, inst->src[i],
3233 scratch_loc[inst->src[i].reg]);
3234
3235 inst->src[i].file = temp.file;
3236 inst->src[i].reg = temp.reg;
3237 inst->src[i].reg_offset = temp.reg_offset;
3238 inst->src[i].reladdr = NULL;
3239 }
3240 }
3241 }
3242
3243 /**
3244 * Emits an instruction before @inst to load the value named by @orig_src
3245 * from the pull constant buffer (surface) at @base_offset to @temp.
3246 */
3247 void
3248 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3249 dst_reg temp, src_reg orig_src,
3250 int base_offset)
3251 {
3252 int reg_offset = base_offset + orig_src.reg_offset;
3253 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3254 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3255 vec4_instruction *load;
3256
3257 if (brw->gen >= 7) {
3258 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3259 grf_offset.type = offset.type;
3260 emit_before(inst, MOV(grf_offset, offset));
3261
3262 load = new(mem_ctx) vec4_instruction(this,
3263 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3264 temp, index, src_reg(grf_offset));
3265 } else {
3266 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3267 temp, index, offset);
3268 load->base_mrf = 14;
3269 load->mlen = 1;
3270 }
3271 emit_before(inst, load);
3272 }
3273
3274 /**
3275 * Implements array access of uniforms by inserting a
3276 * PULL_CONSTANT_LOAD instruction.
3277 *
3278 * Unlike temporary GRF array access (where we don't support it due to
3279 * the difficulty of doing relative addressing on instruction
3280 * destinations), we could potentially do array access of uniforms
3281 * that were loaded in GRF space as push constants. In real-world
3282 * usage we've seen, though, the arrays being used are always larger
3283 * than we could load as push constants, so just always move all
3284 * uniform array access out to a pull constant buffer.
3285 */
3286 void
3287 vec4_visitor::move_uniform_array_access_to_pull_constants()
3288 {
3289 int pull_constant_loc[this->uniforms];
3290
3291 for (int i = 0; i < this->uniforms; i++) {
3292 pull_constant_loc[i] = -1;
3293 }
3294
3295 /* Walk through and find array access of uniforms. Put a copy of that
3296 * uniform in the pull constant buffer.
3297 *
3298 * Note that we don't move constant-indexed accesses to arrays. No
3299 * testing has been done of the performance impact of this choice.
3300 */
3301 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3302 for (int i = 0 ; i < 3; i++) {
3303 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3304 continue;
3305
3306 int uniform = inst->src[i].reg;
3307
3308 /* If this array isn't already present in the pull constant buffer,
3309 * add it.
3310 */
3311 if (pull_constant_loc[uniform] == -1) {
3312 const float **values = &stage_prog_data->param[uniform * 4];
3313
3314 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3315
3316 assert(uniform < uniform_array_size);
3317 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3318 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3319 = values[j];
3320 }
3321 }
3322
3323 /* Set up the annotation tracking for new generated instructions. */
3324 base_ir = inst->ir;
3325 current_annotation = inst->annotation;
3326
3327 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3328
3329 emit_pull_constant_load(inst, temp, inst->src[i],
3330 pull_constant_loc[uniform]);
3331
3332 inst->src[i].file = temp.file;
3333 inst->src[i].reg = temp.reg;
3334 inst->src[i].reg_offset = temp.reg_offset;
3335 inst->src[i].reladdr = NULL;
3336 }
3337 }
3338
3339 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3340 * no need to track them as larger-than-vec4 objects. This will be
3341 * relied on in cutting out unused uniform vectors from push
3342 * constants.
3343 */
3344 split_uniform_registers();
3345 }
3346
3347 void
3348 vec4_visitor::resolve_ud_negate(src_reg *reg)
3349 {
3350 if (reg->type != BRW_REGISTER_TYPE_UD ||
3351 !reg->negate)
3352 return;
3353
3354 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3355 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3356 *reg = temp;
3357 }
3358
3359 vec4_visitor::vec4_visitor(struct brw_context *brw,
3360 struct brw_vec4_compile *c,
3361 struct gl_program *prog,
3362 const struct brw_vec4_prog_key *key,
3363 struct brw_vec4_prog_data *prog_data,
3364 struct gl_shader_program *shader_prog,
3365 gl_shader_stage stage,
3366 void *mem_ctx,
3367 bool debug_flag,
3368 bool no_spills,
3369 shader_time_shader_type st_base,
3370 shader_time_shader_type st_written,
3371 shader_time_shader_type st_reset)
3372 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3373 c(c),
3374 key(key),
3375 prog_data(prog_data),
3376 sanity_param_count(0),
3377 fail_msg(NULL),
3378 first_non_payload_grf(0),
3379 need_all_constants_in_pull_buffer(false),
3380 debug_flag(debug_flag),
3381 no_spills(no_spills),
3382 st_base(st_base),
3383 st_written(st_written),
3384 st_reset(st_reset)
3385 {
3386 this->mem_ctx = mem_ctx;
3387 this->failed = false;
3388
3389 this->base_ir = NULL;
3390 this->current_annotation = NULL;
3391 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3392
3393 this->variable_ht = hash_table_ctor(0,
3394 hash_table_pointer_hash,
3395 hash_table_pointer_compare);
3396
3397 this->virtual_grf_start = NULL;
3398 this->virtual_grf_end = NULL;
3399 this->virtual_grf_sizes = NULL;
3400 this->virtual_grf_count = 0;
3401 this->virtual_grf_reg_map = NULL;
3402 this->virtual_grf_reg_count = 0;
3403 this->virtual_grf_array_size = 0;
3404 this->live_intervals_valid = false;
3405
3406 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3407
3408 this->uniforms = 0;
3409
3410 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3411 * at least one. See setup_uniforms() in brw_vec4.cpp.
3412 */
3413 this->uniform_array_size = 1;
3414 if (prog_data) {
3415 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3416 }
3417
3418 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3419 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3420 }
3421
3422 vec4_visitor::~vec4_visitor()
3423 {
3424 hash_table_dtor(this->variable_ht);
3425 }
3426
3427
3428 void
3429 vec4_visitor::fail(const char *format, ...)
3430 {
3431 va_list va;
3432 char *msg;
3433
3434 if (failed)
3435 return;
3436
3437 failed = true;
3438
3439 va_start(va, format);
3440 msg = ralloc_vasprintf(mem_ctx, format, va);
3441 va_end(va);
3442 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3443
3444 this->fail_msg = msg;
3445
3446 if (debug_flag) {
3447 fprintf(stderr, "%s", msg);
3448 }
3449 }
3450
3451 } /* namespace brw */