i965/vec4: Pass const references to instruction functions.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(uint32_t predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
221 {
222 vec4_instruction *inst;
223
224 /* Original gen4 does type conversion to the destination type
225 * before comparison, producing garbage results for floating-point
226 * comparisons.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
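/* Illustrative usage sketch (not part of the driver): because CMP only
 * guarantees the low bit of each destination channel, callers that need a
 * clean 0/1 boolean follow it with an AND, as the expression visitor below
 * does for ir_binop_less and friends:
 *
 *    emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_L));
 *    emit(AND(result_dst, result_src, src_reg(0x1)));
 *
 * Callers that only need the flag register (e.g. to predicate a later SEL)
 * can write to a null destination such as dst_null_d() instead.
 */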
242
243 vec4_instruction *
244 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
245 {
246 vec4_instruction *inst;
247
248 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
249 dst, index);
250 inst->base_mrf = 14;
251 inst->mlen = 2;
252
253 return inst;
254 }
255
256 vec4_instruction *
257 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
258 const src_reg &index)
259 {
260 vec4_instruction *inst;
261
262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
263 dst, src, index);
264 inst->base_mrf = 13;
265 inst->mlen = 3;
266
267 return inst;
268 }
269
270 void
271 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
272 {
273 static enum opcode dot_opcodes[] = {
274 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
275 };
276
277 emit(dot_opcodes[elements - 2], dst, src0, src1);
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(src_reg src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use a vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(MOV(expanded, src));
303 return src_reg(expanded);
304 }
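/* Illustrative sketch (not part of the driver): the three-source emitters in
 * this file run every operand through fix_3src_operand() first, e.g. the
 * ir_triop_bfi case below does:
 *
 *    op[0] = fix_3src_operand(op[0]);
 *    op[1] = fix_3src_operand(op[1]);
 *    op[2] = fix_3src_operand(op[2]);
 *    emit(BFI2(result_dst, op[0], op[1], op[2]));
 *
 * A uniform or immediate operand gets copied into a temporary GRF by the MOV
 * above; anything already in a GRF is passed through untouched.
 */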
305
306 src_reg
307 vec4_visitor::fix_math_operand(src_reg src)
308 {
309 /* The gen6 math instruction ignores the source modifiers --
310 * swizzle, abs, negate, and at least some parts of the register
311 * region description.
312 *
313 * Rather than trying to enumerate all these cases, *always* expand the
314 * operand to a temp GRF for gen6.
315 *
316 * For gen7, keep the operand as-is, except if immediate, which gen7 still
317 * can't use.
318 */
319
320 if (brw->gen == 7 && src.file != IMM)
321 return src;
322
323 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
324 expanded.type = src.type;
325 emit(MOV(expanded, src));
326 return src_reg(expanded);
327 }
328
329 void
330 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
331 {
332 src = fix_math_operand(src);
333
334 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
335 /* The gen6 math instruction must be align1, so we can't do
336 * writemasks.
337 */
338 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
339
340 emit(opcode, temp_dst, src);
341
342 emit(MOV(dst, src_reg(temp_dst)));
343 } else {
344 emit(opcode, dst, src);
345 }
346 }
347
348 void
349 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
350 {
351 vec4_instruction *inst = emit(opcode, dst, src);
352 inst->base_mrf = 1;
353 inst->mlen = 1;
354 }
355
356 void
357 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
358 {
359 switch (opcode) {
360 case SHADER_OPCODE_RCP:
361 case SHADER_OPCODE_RSQ:
362 case SHADER_OPCODE_SQRT:
363 case SHADER_OPCODE_EXP2:
364 case SHADER_OPCODE_LOG2:
365 case SHADER_OPCODE_SIN:
366 case SHADER_OPCODE_COS:
367 break;
368 default:
369 assert(!"not reached: bad math opcode");
370 return;
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 assert(!"not reached: unsupported binary math opcode");
424 return;
425 }
426
427 if (brw->gen >= 8) {
428 emit(opcode, dst, src0, src1);
429 } else if (brw->gen >= 6) {
430 emit_math2_gen6(opcode, dst, src0, src1);
431 } else {
432 emit_math2_gen4(opcode, dst, src0, src1);
433 }
434 }
435
436 void
437 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
438 {
439 if (brw->gen < 7)
440 assert(!"ir_unop_pack_half_2x16 should be lowered");
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
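/* Worked example (illustrative only): for src0 = vec2(1.0, 2.0), whose
 * half-float encodings are 0x3C00 and 0x4000, the sequence above produces
 *
 *    F32TO16:  tmp.x = 0x00003C00, tmp.y = 0x00004000   (upper words cleared)
 *    SHL:      dst   = 0x00004000 << 16 = 0x40000000
 *    OR:       dst   = 0x40000000 | 0x00003C00 = 0x40003C00
 *
 * which matches packHalf2x16(): the first component lands in the low 16 bits
 * of each destination channel and the second in the high 16 bits.
 */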
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7)
516 assert(!"ir_unop_unpack_half_2x16 should be lowered");
517
518 assert(dst.type == BRW_REGISTER_TYPE_F);
519 assert(src0.type == BRW_REGISTER_TYPE_UD);
520
521 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
522 *
523 * Because this instruction does not have a 16-bit floating-point type,
524 * the source data type must be Word (W). The destination type must be
525 * F (Float).
526 *
527 * To use W as the source data type, we must adjust horizontal strides,
528 * which is only possible in align1 mode. All my [chadv] attempts at
529 * emitting align1 instructions for unpackHalf2x16 failed to pass the
530 * Piglit tests, so I gave up.
531 *
532 * I've verified that, on gen7 hardware and the simulator, it is safe to
533 * emit f16to32 in align16 mode with UD as source data type.
534 */
535
536 dst_reg tmp_dst(this, glsl_type::uvec2_type);
537 src_reg tmp_src(tmp_dst);
538
539 tmp_dst.writemask = WRITEMASK_X;
540 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
541
542 tmp_dst.writemask = WRITEMASK_Y;
543 emit(SHR(tmp_dst, src0, src_reg(16u)));
544
545 dst.writemask = WRITEMASK_XY;
546 emit(F16TO32(dst, tmp_src));
547 }
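/* Worked example (illustrative only): for src0 = 0x40003C00 (the packed form
 * of vec2(1.0, 2.0) from the example above), the sequence is
 *
 *    AND:      tmp.x = 0x40003C00 & 0xffff = 0x00003C00
 *    SHR:      tmp.y = 0x40003C00 >> 16    = 0x00004000
 *    F16TO32:  dst.x = 1.0f, dst.y = 2.0f
 *
 * i.e. unpackHalf2x16() takes the low 16 bits as the first component and the
 * high 16 bits as the second.
 */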
548
549 void
550 vec4_visitor::visit_instructions(const exec_list *list)
551 {
552 foreach_list(node, list) {
553 ir_instruction *ir = (ir_instruction *)node;
554
555 base_ir = ir;
556 ir->accept(this);
557 }
558 }
559
560
561 static int
562 type_size(const struct glsl_type *type)
563 {
564 unsigned int i;
565 int size;
566
567 switch (type->base_type) {
568 case GLSL_TYPE_UINT:
569 case GLSL_TYPE_INT:
570 case GLSL_TYPE_FLOAT:
571 case GLSL_TYPE_BOOL:
572 if (type->is_matrix()) {
573 return type->matrix_columns;
574 } else {
575 /* Regardless of the size of the vector, it gets a vec4. This is bad
576 * packing for things like floats, but otherwise arrays become a
577 * mess. Hopefully a later pass over the code can pack scalars
578 * down if appropriate.
579 */
580 return 1;
581 }
582 case GLSL_TYPE_ARRAY:
583 assert(type->length > 0);
584 return type_size(type->fields.array) * type->length;
585 case GLSL_TYPE_STRUCT:
586 size = 0;
587 for (i = 0; i < type->length; i++) {
588 size += type_size(type->fields.structure[i].type);
589 }
590 return size;
591 case GLSL_TYPE_SAMPLER:
592 /* Samplers take up one slot in UNIFORMS[], but they're baked in
593 * at link time.
594 */
595 return 1;
596 case GLSL_TYPE_ATOMIC_UINT:
597 return 0;
598 case GLSL_TYPE_IMAGE:
599 case GLSL_TYPE_VOID:
600 case GLSL_TYPE_ERROR:
601 case GLSL_TYPE_INTERFACE:
602 assert(0);
603 break;
604 }
605
606 return 0;
607 }
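/* Illustrative examples of the vec4-slot counting above (not exhaustive):
 *
 *    type_size(float)                    == 1   (every scalar/vector is one vec4)
 *    type_size(vec3)                     == 1
 *    type_size(mat4)                     == 4   (one slot per column)
 *    type_size(vec2[3])                  == 3   (element size times array length)
 *    type_size(struct { vec3; float; })  == 2   (sum of the field sizes)
 */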
608
609 int
610 vec4_visitor::virtual_grf_alloc(int size)
611 {
612 if (virtual_grf_array_size <= virtual_grf_count) {
613 if (virtual_grf_array_size == 0)
614 virtual_grf_array_size = 16;
615 else
616 virtual_grf_array_size *= 2;
617 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
618 virtual_grf_array_size);
619 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
620 virtual_grf_array_size);
621 }
622 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
623 virtual_grf_reg_count += size;
624 virtual_grf_sizes[virtual_grf_count] = size;
625 return virtual_grf_count++;
626 }
627
628 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
629 {
630 init();
631
632 this->file = GRF;
633 this->reg = v->virtual_grf_alloc(type_size(type));
634
635 if (type->is_array() || type->is_record()) {
636 this->swizzle = BRW_SWIZZLE_NOOP;
637 } else {
638 this->swizzle = swizzle_for_size(type->vector_elements);
639 }
640
641 this->type = brw_type_for_base_type(type);
642 }
643
644 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
645 {
646 init();
647
648 this->file = GRF;
649 this->reg = v->virtual_grf_alloc(type_size(type));
650
651 if (type->is_array() || type->is_record()) {
652 this->writemask = WRITEMASK_XYZW;
653 } else {
654 this->writemask = (1 << type->vector_elements) - 1;
655 }
656
657 this->type = brw_type_for_base_type(type);
658 }
659
660 /* Our support for uniforms is piggy-backed on the struct
661 * gl_fragment_program, because that's where the values actually
662 * get stored, rather than in some global gl_shader_program uniform
663 * store.
664 */
665 void
666 vec4_visitor::setup_uniform_values(ir_variable *ir)
667 {
668 int namelen = strlen(ir->name);
669
670 /* The data for our (non-builtin) uniforms is stored in a series of
671 * gl_uniform_driver_storage structs for each subcomponent that
672 * glGetUniformLocation() could name. We know it's been set up in the same
673 * order we'd walk the type, so walk the list of storage and find anything
674 * with our name, or the prefix of a component that starts with our name.
675 */
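/* Illustrative example of the matching below (hypothetical uniform names):
 * for ir->name == "light", storage entries named "light", "light[2]" and
 * "light.position" are accepted, while "lighting" is rejected because the
 * character after the prefix is neither '\0', '.' nor '['.
 */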
676 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
677 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
678
679 if (strncmp(ir->name, storage->name, namelen) != 0 ||
680 (storage->name[namelen] != 0 &&
681 storage->name[namelen] != '.' &&
682 storage->name[namelen] != '[')) {
683 continue;
684 }
685
686 gl_constant_value *components = storage->storage;
687 unsigned vector_count = (MAX2(storage->array_elements, 1) *
688 storage->type->matrix_columns);
689
690 for (unsigned s = 0; s < vector_count; s++) {
691 assert(uniforms < uniform_array_size);
692 uniform_vector_size[uniforms] = storage->type->vector_elements;
693
694 int i;
695 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
696 stage_prog_data->param[uniforms * 4 + i] = &components->f;
697 components++;
698 }
699 for (; i < 4; i++) {
700 static float zero = 0;
701 stage_prog_data->param[uniforms * 4 + i] = &zero;
702 }
703
704 uniforms++;
705 }
706 }
707 }
708
709 void
710 vec4_visitor::setup_uniform_clipplane_values()
711 {
712 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
713
714 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
715 assert(this->uniforms < uniform_array_size);
716 this->uniform_vector_size[this->uniforms] = 4;
717 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
718 this->userplane[i].type = BRW_REGISTER_TYPE_F;
719 for (int j = 0; j < 4; ++j) {
720 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
721 }
722 ++this->uniforms;
723 }
724 }
725
726 /* Our support for builtin uniforms is even scarier than non-builtin.
727 * It sits on top of the PROG_STATE_VAR parameters that are
728 * automatically updated from GL context state.
729 */
730 void
731 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
732 {
733 const ir_state_slot *const slots = ir->state_slots;
734 assert(ir->state_slots != NULL);
735
736 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
737 /* This state reference has already been set up by ir_to_mesa,
738 * but we'll get the same index back here. We can reference
739 * ParameterValues directly, since unlike brw_fs.cpp, we never
740 * add new state references during compile.
741 */
742 int index = _mesa_add_state_reference(this->prog->Parameters,
743 (gl_state_index *)slots[i].tokens);
744 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
745
746 assert(this->uniforms < uniform_array_size);
747 this->uniform_vector_size[this->uniforms] = 0;
748 /* Add each of the unique swizzled channels of the element.
749 * This will end up matching the size of the glsl_type of this field.
750 */
751 int last_swiz = -1;
752 for (unsigned int j = 0; j < 4; j++) {
753 int swiz = GET_SWZ(slots[i].swizzle, j);
754 last_swiz = swiz;
755
756 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
757 assert(this->uniforms < uniform_array_size);
758 if (swiz <= last_swiz)
759 this->uniform_vector_size[this->uniforms]++;
760 }
761 this->uniforms++;
762 }
763 }
764
765 dst_reg *
766 vec4_visitor::variable_storage(ir_variable *var)
767 {
768 return (dst_reg *)hash_table_find(this->variable_ht, var);
769 }
770
771 void
772 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
773 {
774 ir_expression *expr = ir->as_expression();
775
776 *predicate = BRW_PREDICATE_NORMAL;
777
778 if (expr) {
779 src_reg op[2];
780 vec4_instruction *inst;
781
782 assert(expr->get_num_operands() <= 2);
783 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
784 expr->operands[i]->accept(this);
785 op[i] = this->result;
786
787 resolve_ud_negate(&op[i]);
788 }
789
790 switch (expr->operation) {
791 case ir_unop_logic_not:
792 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
793 inst->conditional_mod = BRW_CONDITIONAL_Z;
794 break;
795
796 case ir_binop_logic_xor:
797 inst = emit(XOR(dst_null_d(), op[0], op[1]));
798 inst->conditional_mod = BRW_CONDITIONAL_NZ;
799 break;
800
801 case ir_binop_logic_or:
802 inst = emit(OR(dst_null_d(), op[0], op[1]));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 break;
805
806 case ir_binop_logic_and:
807 inst = emit(AND(dst_null_d(), op[0], op[1]));
808 inst->conditional_mod = BRW_CONDITIONAL_NZ;
809 break;
810
811 case ir_unop_f2b:
812 if (brw->gen >= 6) {
813 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
814 } else {
815 inst = emit(MOV(dst_null_f(), op[0]));
816 inst->conditional_mod = BRW_CONDITIONAL_NZ;
817 }
818 break;
819
820 case ir_unop_i2b:
821 if (brw->gen >= 6) {
822 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
823 } else {
824 inst = emit(MOV(dst_null_d(), op[0]));
825 inst->conditional_mod = BRW_CONDITIONAL_NZ;
826 }
827 break;
828
829 case ir_binop_all_equal:
830 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
831 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
832 break;
833
834 case ir_binop_any_nequal:
835 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
836 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
837 break;
838
839 case ir_unop_any:
840 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
841 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
842 break;
843
844 case ir_binop_greater:
845 case ir_binop_gequal:
846 case ir_binop_less:
847 case ir_binop_lequal:
848 case ir_binop_equal:
849 case ir_binop_nequal:
850 emit(CMP(dst_null_d(), op[0], op[1],
851 brw_conditional_for_comparison(expr->operation)));
852 break;
853
854 default:
855 assert(!"not reached");
856 break;
857 }
858 return;
859 }
860
861 ir->accept(this);
862
863 resolve_ud_negate(&this->result);
864
865 if (brw->gen >= 6) {
866 vec4_instruction *inst = emit(AND(dst_null_d(),
867 this->result, src_reg(1)));
868 inst->conditional_mod = BRW_CONDITIONAL_NZ;
869 } else {
870 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
871 inst->conditional_mod = BRW_CONDITIONAL_NZ;
872 }
873 }
874
875 /**
876 * Emit a gen6 IF statement with the comparison folded into the IF
877 * instruction.
878 */
879 void
880 vec4_visitor::emit_if_gen6(ir_if *ir)
881 {
882 ir_expression *expr = ir->condition->as_expression();
883
884 if (expr) {
885 src_reg op[2];
886 dst_reg temp;
887
888 assert(expr->get_num_operands() <= 2);
889 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
890 expr->operands[i]->accept(this);
891 op[i] = this->result;
892 }
893
894 switch (expr->operation) {
895 case ir_unop_logic_not:
896 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
897 return;
898
899 case ir_binop_logic_xor:
900 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
901 return;
902
903 case ir_binop_logic_or:
904 temp = dst_reg(this, glsl_type::bool_type);
905 emit(OR(temp, op[0], op[1]));
906 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
907 return;
908
909 case ir_binop_logic_and:
910 temp = dst_reg(this, glsl_type::bool_type);
911 emit(AND(temp, op[0], op[1]));
912 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_f2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_unop_i2b:
920 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
921 return;
922
923 case ir_binop_greater:
924 case ir_binop_gequal:
925 case ir_binop_less:
926 case ir_binop_lequal:
927 case ir_binop_equal:
928 case ir_binop_nequal:
929 emit(IF(op[0], op[1],
930 brw_conditional_for_comparison(expr->operation)));
931 return;
932
933 case ir_binop_all_equal:
934 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
935 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
936 return;
937
938 case ir_binop_any_nequal:
939 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
940 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
941 return;
942
943 case ir_unop_any:
944 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
945 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
946 return;
947
948 default:
949 assert(!"not reached");
950 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
951 return;
952 }
953 return;
954 }
955
956 ir->condition->accept(this);
957
958 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
959 }
960
961 void
962 vec4_visitor::visit(ir_variable *ir)
963 {
964 dst_reg *reg = NULL;
965
966 if (variable_storage(ir))
967 return;
968
969 switch (ir->data.mode) {
970 case ir_var_shader_in:
971 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
972 break;
973
974 case ir_var_shader_out:
975 reg = new(mem_ctx) dst_reg(this, ir->type);
976
977 for (int i = 0; i < type_size(ir->type); i++) {
978 output_reg[ir->data.location + i] = *reg;
979 output_reg[ir->data.location + i].reg_offset = i;
980 output_reg[ir->data.location + i].type =
981 brw_type_for_base_type(ir->type->get_scalar_type());
982 output_reg_annotation[ir->data.location + i] = ir->name;
983 }
984 break;
985
986 case ir_var_auto:
987 case ir_var_temporary:
988 reg = new(mem_ctx) dst_reg(this, ir->type);
989 break;
990
991 case ir_var_uniform:
992 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
993
994 /* Thanks to the lower_ubo_reference pass, we will see only
995 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
996 * variables, so no need for them to be in variable_ht.
997 *
998 * Atomic counters take no uniform storage, no need to do
999 * anything here.
1000 */
1001 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1002 return;
1003
1004 /* Track how big the whole uniform variable is, in case we need to put a
1005 * copy of its data into pull constants for array access.
1006 */
1007 assert(this->uniforms < uniform_array_size);
1008 this->uniform_size[this->uniforms] = type_size(ir->type);
1009
1010 if (!strncmp(ir->name, "gl_", 3)) {
1011 setup_builtin_uniform_values(ir);
1012 } else {
1013 setup_uniform_values(ir);
1014 }
1015 break;
1016
1017 case ir_var_system_value:
1018 reg = make_reg_for_system_value(ir);
1019 break;
1020
1021 default:
1022 assert(!"not reached");
1023 }
1024
1025 reg->type = brw_type_for_base_type(ir->type);
1026 hash_table_insert(this->variable_ht, reg, ir);
1027 }
1028
1029 void
1030 vec4_visitor::visit(ir_loop *ir)
1031 {
1032 /* We don't want debugging output to print the whole body of the
1033 * loop as the annotation.
1034 */
1035 this->base_ir = NULL;
1036
1037 emit(BRW_OPCODE_DO);
1038
1039 visit_instructions(&ir->body_instructions);
1040
1041 emit(BRW_OPCODE_WHILE);
1042 }
1043
1044 void
1045 vec4_visitor::visit(ir_loop_jump *ir)
1046 {
1047 switch (ir->mode) {
1048 case ir_loop_jump::jump_break:
1049 emit(BRW_OPCODE_BREAK);
1050 break;
1051 case ir_loop_jump::jump_continue:
1052 emit(BRW_OPCODE_CONTINUE);
1053 break;
1054 }
1055 }
1056
1057
1058 void
1059 vec4_visitor::visit(ir_function_signature *ir)
1060 {
1061 assert(0);
1062 (void)ir;
1063 }
1064
1065 void
1066 vec4_visitor::visit(ir_function *ir)
1067 {
1068 /* Ignore function bodies other than main() -- we shouldn't see calls to
1069 * them since they should all be inlined.
1070 */
1071 if (strcmp(ir->name, "main") == 0) {
1072 const ir_function_signature *sig;
1073 exec_list empty;
1074
1075 sig = ir->matching_signature(NULL, &empty);
1076
1077 assert(sig);
1078
1079 visit_instructions(&sig->body);
1080 }
1081 }
1082
1083 bool
1084 vec4_visitor::try_emit_sat(ir_expression *ir)
1085 {
1086 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1087 if (!sat_src)
1088 return false;
1089
1090 sat_src->accept(this);
1091 src_reg src = this->result;
1092
1093 this->result = src_reg(this, ir->type);
1094 vec4_instruction *inst;
1095 inst = emit(MOV(dst_reg(this->result), src));
1096 inst->saturate = true;
1097
1098 return true;
1099 }
1100
1101 bool
1102 vec4_visitor::try_emit_mad(ir_expression *ir)
1103 {
1104 /* 3-src instructions were introduced in gen6. */
1105 if (brw->gen < 6)
1106 return false;
1107
1108 /* MAD can only handle floating-point data. */
1109 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1110 return false;
1111
1112 ir_rvalue *nonmul = ir->operands[1];
1113 ir_expression *mul = ir->operands[0]->as_expression();
1114
1115 if (!mul || mul->operation != ir_binop_mul) {
1116 nonmul = ir->operands[0];
1117 mul = ir->operands[1]->as_expression();
1118
1119 if (!mul || mul->operation != ir_binop_mul)
1120 return false;
1121 }
1122
1123 nonmul->accept(this);
1124 src_reg src0 = fix_3src_operand(this->result);
1125
1126 mul->operands[0]->accept(this);
1127 src_reg src1 = fix_3src_operand(this->result);
1128
1129 mul->operands[1]->accept(this);
1130 src_reg src2 = fix_3src_operand(this->result);
1131
1132 this->result = src_reg(this, ir->type);
1133 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1134
1135 return true;
1136 }
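/* Illustrative example (not part of the driver): for the IR expression
 * (a * b) + c, the code above picks up c as "nonmul" and a, b as the
 * multiplication operands, so it ends up emitting
 *
 *    MAD dst, c, a, b
 *
 * i.e. the hardware MAD takes the addend as its first source, which is why
 * the argument order looks reversed relative to GLSL's fma() (see the
 * ir_triop_fma case later in this file).
 */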
1137
1138 bool
1139 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1140 {
1141 ir_expression *const cmp = ir->operands[0]->as_expression();
1142
1143 if (cmp == NULL)
1144 return false;
1145
1146 switch (cmp->operation) {
1147 case ir_binop_less:
1148 case ir_binop_greater:
1149 case ir_binop_lequal:
1150 case ir_binop_gequal:
1151 case ir_binop_equal:
1152 case ir_binop_nequal:
1153 break;
1154
1155 default:
1156 return false;
1157 }
1158
1159 cmp->operands[0]->accept(this);
1160 const src_reg cmp_src0 = this->result;
1161
1162 cmp->operands[1]->accept(this);
1163 const src_reg cmp_src1 = this->result;
1164
1165 this->result = src_reg(this, ir->type);
1166
1167 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1168 brw_conditional_for_comparison(cmp->operation)));
1169
1170 /* If the comparison is false, this->result will just happen to be zero.
1171 */
1172 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1173 this->result, src_reg(1.0f));
1174 inst->predicate = BRW_PREDICATE_NORMAL;
1175 inst->predicate_inverse = true;
1176
1177 return true;
1178 }
1179
1180 void
1181 vec4_visitor::emit_bool_comparison(unsigned int op,
1182 dst_reg dst, src_reg src0, src_reg src1)
1183 {
1184 /* original gen4 does destination conversion before comparison. */
1185 if (brw->gen < 5)
1186 dst.type = src0.type;
1187
1188 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1189
1190 dst.type = BRW_REGISTER_TYPE_D;
1191 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1192 }
1193
1194 void
1195 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1196 src_reg src0, src_reg src1)
1197 {
1198 vec4_instruction *inst;
1199
1200 if (brw->gen >= 6) {
1201 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1202 inst->conditional_mod = conditionalmod;
1203 } else {
1204 emit(CMP(dst, src0, src1, conditionalmod));
1205
1206 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1207 inst->predicate = BRW_PREDICATE_NORMAL;
1208 }
1209 }
1210
1211 void
1212 vec4_visitor::emit_lrp(const dst_reg &dst,
1213 const src_reg &x, const src_reg &y, const src_reg &a)
1214 {
1215 if (brw->gen >= 6) {
1216 /* Note that the instruction's argument order is reversed from GLSL
1217 * and the IR.
1218 */
1219 emit(LRP(dst,
1220 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1221 } else {
1222 /* Earlier generations don't support three source operations, so we
1223 * need to emit x*(1-a) + y*a.
1224 */
1225 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1226 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1227 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1228 y_times_a.writemask = dst.writemask;
1229 one_minus_a.writemask = dst.writemask;
1230 x_times_one_minus_a.writemask = dst.writemask;
1231
1232 emit(MUL(y_times_a, y, a));
1233 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1234 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1235 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1236 }
1237 }
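/* Worked example (illustrative only): on the pre-gen6 path above,
 * lrp(x = 2.0, y = 10.0, a = 0.25) expands to
 *
 *    y_times_a           = 10.0 * 0.25  = 2.5
 *    one_minus_a         = -0.25 + 1.0  = 0.75
 *    x_times_one_minus_a = 2.0 * 0.75   = 1.5
 *    dst                 = 1.5 + 2.5    = 4.0
 *
 * which matches the gen6+ LRP result of x * (1 - a) + y * a.
 */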
1238
1239 void
1240 vec4_visitor::visit(ir_expression *ir)
1241 {
1242 unsigned int operand;
1243 src_reg op[Elements(ir->operands)];
1244 src_reg result_src;
1245 dst_reg result_dst;
1246 vec4_instruction *inst;
1247
1248 if (try_emit_sat(ir))
1249 return;
1250
1251 if (ir->operation == ir_binop_add) {
1252 if (try_emit_mad(ir))
1253 return;
1254 }
1255
1256 if (ir->operation == ir_unop_b2f) {
1257 if (try_emit_b2f_of_compare(ir))
1258 return;
1259 }
1260
1261 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1262 this->result.file = BAD_FILE;
1263 ir->operands[operand]->accept(this);
1264 if (this->result.file == BAD_FILE) {
1265 fprintf(stderr, "Failed to get tree for expression operand:\n");
1266 ir->operands[operand]->fprint(stderr);
1267 exit(1);
1268 }
1269 op[operand] = this->result;
1270
1271 /* Matrix expression operands should have been broken down to vector
1272 * operations already.
1273 */
1274 assert(!ir->operands[operand]->type->is_matrix());
1275 }
1276
1277 int vector_elements = ir->operands[0]->type->vector_elements;
1278 if (ir->operands[1]) {
1279 vector_elements = MAX2(vector_elements,
1280 ir->operands[1]->type->vector_elements);
1281 }
1282
1283 this->result.file = BAD_FILE;
1284
1285 /* Storage for our result. Ideally for an assignment we'd be using
1286 * the actual storage for the result here, instead.
1287 */
1288 result_src = src_reg(this, ir->type);
1289 /* convenience for the emit functions below. */
1290 result_dst = dst_reg(result_src);
1291 /* If nothing special happens, this is the result. */
1292 this->result = result_src;
1293 /* Limit writes to the channels that will be used by result_src later.
1294 * This does limit this temp's use as a temporary for multi-instruction
1295 * sequences.
1296 */
1297 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1298
1299 switch (ir->operation) {
1300 case ir_unop_logic_not:
1301 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is the
1302 * one's complement of the whole register, not just bit 0.
1303 */
1304 emit(XOR(result_dst, op[0], src_reg(1)));
1305 break;
1306 case ir_unop_neg:
1307 op[0].negate = !op[0].negate;
1308 emit(MOV(result_dst, op[0]));
1309 break;
1310 case ir_unop_abs:
1311 op[0].abs = true;
1312 op[0].negate = false;
1313 emit(MOV(result_dst, op[0]));
1314 break;
1315
1316 case ir_unop_sign:
1317 if (ir->type->is_float()) {
1318 /* AND(val, 0x80000000) gives the sign bit.
1319 *
1320 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1321 * zero.
1322 */
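/* Worked example (illustrative only):
 *    val = -2.5 (0xC0200000): AND -> 0x80000000, OR -> 0xBF800000 = -1.0
 *    val =  3.0 (0x40400000): AND -> 0x00000000, OR -> 0x3F800000 =  1.0
 *    val =  0.0:              AND -> 0x00000000, OR is skipped, result 0.0
 */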
1323 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1324
1325 op[0].type = BRW_REGISTER_TYPE_UD;
1326 result_dst.type = BRW_REGISTER_TYPE_UD;
1327 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1328
1329 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1330 inst->predicate = BRW_PREDICATE_NORMAL;
1331
1332 this->result.type = BRW_REGISTER_TYPE_F;
1333 } else {
1334 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1335 * -> non-negative val generates 0x00000000.
1336 * Predicated OR sets 1 if val is positive.
1337 */
1338 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1339
1340 emit(ASR(result_dst, op[0], src_reg(31)));
1341
1342 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1343 inst->predicate = BRW_PREDICATE_NORMAL;
1344 }
1345 break;
1346
1347 case ir_unop_rcp:
1348 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1349 break;
1350
1351 case ir_unop_exp2:
1352 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1353 break;
1354 case ir_unop_log2:
1355 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1356 break;
1357 case ir_unop_exp:
1358 case ir_unop_log:
1359 assert(!"not reached: should be handled by ir_explog_to_explog2");
1360 break;
1361 case ir_unop_sin:
1362 case ir_unop_sin_reduced:
1363 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1364 break;
1365 case ir_unop_cos:
1366 case ir_unop_cos_reduced:
1367 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1368 break;
1369
1370 case ir_unop_dFdx:
1371 case ir_unop_dFdy:
1372 assert(!"derivatives not valid in vertex shader");
1373 break;
1374
1375 case ir_unop_bitfield_reverse:
1376 emit(BFREV(result_dst, op[0]));
1377 break;
1378 case ir_unop_bit_count:
1379 emit(CBIT(result_dst, op[0]));
1380 break;
1381 case ir_unop_find_msb: {
1382 src_reg temp = src_reg(this, glsl_type::uint_type);
1383
1384 inst = emit(FBH(dst_reg(temp), op[0]));
1385 inst->dst.writemask = WRITEMASK_XYZW;
1386
1387 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1388 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1389 * subtract the result from 31 to convert the MSB count into an LSB count.
1390 */
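/* Worked example (illustrative only): for op[0] = 0x00000010, FBH reports the
 * set bit as 27 positions down from the MSB; the predicated ADD below then
 * computes 31 - 27 = 4, which is findMSB(0x10). For op[0] = 0, FBH returns
 * 0xFFFFFFFF (-1 as D), the CMP against -1 fails, and the result stays -1.
 */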
1391
1392 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1393 temp.swizzle = BRW_SWIZZLE_NOOP;
1394 emit(MOV(result_dst, temp));
1395
1396 src_reg src_tmp = src_reg(result_dst);
1397 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1398
1399 src_tmp.negate = true;
1400 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1401 inst->predicate = BRW_PREDICATE_NORMAL;
1402 break;
1403 }
1404 case ir_unop_find_lsb:
1405 emit(FBL(result_dst, op[0]));
1406 break;
1407
1408 case ir_unop_noise:
1409 assert(!"not reached: should be handled by lower_noise");
1410 break;
1411
1412 case ir_binop_add:
1413 emit(ADD(result_dst, op[0], op[1]));
1414 break;
1415 case ir_binop_sub:
1416 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1417 break;
1418
1419 case ir_binop_mul:
1420 if (brw->gen < 8 && ir->type->is_integer()) {
1421 /* For integer multiplication, the MUL uses the low 16 bits of one of
1422 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1423 * accumulates the contribution of the upper 16 bits of that
1424 * operand. If we can determine that one of the args is in the low
1425 * 16 bits, though, we can just emit a single MUL.
1426 */
1427 if (ir->operands[0]->is_uint16_constant()) {
1428 if (brw->gen < 7)
1429 emit(MUL(result_dst, op[0], op[1]));
1430 else
1431 emit(MUL(result_dst, op[1], op[0]));
1432 } else if (ir->operands[1]->is_uint16_constant()) {
1433 if (brw->gen < 7)
1434 emit(MUL(result_dst, op[1], op[0]));
1435 else
1436 emit(MUL(result_dst, op[0], op[1]));
1437 } else {
1438 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1439
1440 emit(MUL(acc, op[0], op[1]));
1441 emit(MACH(dst_null_d(), op[0], op[1]));
1442 emit(MOV(result_dst, src_reg(acc)));
1443 }
1444 } else {
1445 emit(MUL(result_dst, op[0], op[1]));
1446 }
1447 break;
1448 case ir_binop_imul_high: {
1449 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1450
1451 emit(MUL(acc, op[0], op[1]));
1452 emit(MACH(result_dst, op[0], op[1]));
1453 break;
1454 }
1455 case ir_binop_div:
1456 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1457 assert(ir->type->is_integer());
1458 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1459 break;
1460 case ir_binop_carry: {
1461 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1462
1463 emit(ADDC(dst_null_ud(), op[0], op[1]));
1464 emit(MOV(result_dst, src_reg(acc)));
1465 break;
1466 }
1467 case ir_binop_borrow: {
1468 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1469
1470 emit(SUBB(dst_null_ud(), op[0], op[1]));
1471 emit(MOV(result_dst, src_reg(acc)));
1472 break;
1473 }
1474 case ir_binop_mod:
1475 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1476 assert(ir->type->is_integer());
1477 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1478 break;
1479
1480 case ir_binop_less:
1481 case ir_binop_greater:
1482 case ir_binop_lequal:
1483 case ir_binop_gequal:
1484 case ir_binop_equal:
1485 case ir_binop_nequal: {
1486 emit(CMP(result_dst, op[0], op[1],
1487 brw_conditional_for_comparison(ir->operation)));
1488 emit(AND(result_dst, result_src, src_reg(0x1)));
1489 break;
1490 }
1491
1492 case ir_binop_all_equal:
1493 /* "==" operator producing a scalar boolean. */
1494 if (ir->operands[0]->type->is_vector() ||
1495 ir->operands[1]->type->is_vector()) {
1496 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1497 emit(MOV(result_dst, src_reg(0)));
1498 inst = emit(MOV(result_dst, src_reg(1)));
1499 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1500 } else {
1501 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1502 emit(AND(result_dst, result_src, src_reg(0x1)));
1503 }
1504 break;
1505 case ir_binop_any_nequal:
1506 /* "!=" operator producing a scalar boolean. */
1507 if (ir->operands[0]->type->is_vector() ||
1508 ir->operands[1]->type->is_vector()) {
1509 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1510
1511 emit(MOV(result_dst, src_reg(0)));
1512 inst = emit(MOV(result_dst, src_reg(1)));
1513 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1514 } else {
1515 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1516 emit(AND(result_dst, result_src, src_reg(0x1)));
1517 }
1518 break;
1519
1520 case ir_unop_any:
1521 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1522 emit(MOV(result_dst, src_reg(0)));
1523
1524 inst = emit(MOV(result_dst, src_reg(1)));
1525 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1526 break;
1527
1528 case ir_binop_logic_xor:
1529 emit(XOR(result_dst, op[0], op[1]));
1530 break;
1531
1532 case ir_binop_logic_or:
1533 emit(OR(result_dst, op[0], op[1]));
1534 break;
1535
1536 case ir_binop_logic_and:
1537 emit(AND(result_dst, op[0], op[1]));
1538 break;
1539
1540 case ir_binop_dot:
1541 assert(ir->operands[0]->type->is_vector());
1542 assert(ir->operands[0]->type == ir->operands[1]->type);
1543 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1544 break;
1545
1546 case ir_unop_sqrt:
1547 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1548 break;
1549 case ir_unop_rsq:
1550 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1551 break;
1552
1553 case ir_unop_bitcast_i2f:
1554 case ir_unop_bitcast_u2f:
1555 this->result = op[0];
1556 this->result.type = BRW_REGISTER_TYPE_F;
1557 break;
1558
1559 case ir_unop_bitcast_f2i:
1560 this->result = op[0];
1561 this->result.type = BRW_REGISTER_TYPE_D;
1562 break;
1563
1564 case ir_unop_bitcast_f2u:
1565 this->result = op[0];
1566 this->result.type = BRW_REGISTER_TYPE_UD;
1567 break;
1568
1569 case ir_unop_i2f:
1570 case ir_unop_i2u:
1571 case ir_unop_u2i:
1572 case ir_unop_u2f:
1573 case ir_unop_b2f:
1574 case ir_unop_b2i:
1575 case ir_unop_f2i:
1576 case ir_unop_f2u:
1577 emit(MOV(result_dst, op[0]));
1578 break;
1579 case ir_unop_f2b:
1580 case ir_unop_i2b: {
1581 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1582 emit(AND(result_dst, result_src, src_reg(1)));
1583 break;
1584 }
1585
1586 case ir_unop_trunc:
1587 emit(RNDZ(result_dst, op[0]));
1588 break;
1589 case ir_unop_ceil:
1590 op[0].negate = !op[0].negate;
1591 inst = emit(RNDD(result_dst, op[0]));
1592 this->result.negate = true;
1593 break;
1594 case ir_unop_floor:
1595 inst = emit(RNDD(result_dst, op[0]));
1596 break;
1597 case ir_unop_fract:
1598 inst = emit(FRC(result_dst, op[0]));
1599 break;
1600 case ir_unop_round_even:
1601 emit(RNDE(result_dst, op[0]));
1602 break;
1603
1604 case ir_binop_min:
1605 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1606 break;
1607 case ir_binop_max:
1608 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1609 break;
1610
1611 case ir_binop_pow:
1612 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1613 break;
1614
1615 case ir_unop_bit_not:
1616 inst = emit(NOT(result_dst, op[0]));
1617 break;
1618 case ir_binop_bit_and:
1619 inst = emit(AND(result_dst, op[0], op[1]));
1620 break;
1621 case ir_binop_bit_xor:
1622 inst = emit(XOR(result_dst, op[0], op[1]));
1623 break;
1624 case ir_binop_bit_or:
1625 inst = emit(OR(result_dst, op[0], op[1]));
1626 break;
1627
1628 case ir_binop_lshift:
1629 inst = emit(SHL(result_dst, op[0], op[1]));
1630 break;
1631
1632 case ir_binop_rshift:
1633 if (ir->type->base_type == GLSL_TYPE_INT)
1634 inst = emit(ASR(result_dst, op[0], op[1]));
1635 else
1636 inst = emit(SHR(result_dst, op[0], op[1]));
1637 break;
1638
1639 case ir_binop_bfm:
1640 emit(BFI1(result_dst, op[0], op[1]));
1641 break;
1642
1643 case ir_binop_ubo_load: {
1644 ir_constant *uniform_block = ir->operands[0]->as_constant();
1645 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1646 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1647 src_reg offset;
1648
1649 /* Now, load the vector from that offset. */
1650 assert(ir->type->is_vector() || ir->type->is_scalar());
1651
1652 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1653 packed_consts.type = result.type;
1654 src_reg surf_index =
1655 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1656 if (const_offset_ir) {
1657 if (brw->gen >= 8) {
1658 /* Store the offset in a GRF so we can send-from-GRF. */
1659 offset = src_reg(this, glsl_type::int_type);
1660 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1661 } else {
1662 /* Immediates are fine on older generations since they'll be moved
1663 * to a (potentially fake) MRF at the generator level.
1664 */
1665 offset = src_reg(const_offset / 16);
1666 }
1667 } else {
1668 offset = src_reg(this, glsl_type::uint_type);
1669 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1670 }
1671
1672 if (brw->gen >= 7) {
1673 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1674 grf_offset.type = offset.type;
1675
1676 emit(MOV(grf_offset, offset));
1677
1678 emit(new(mem_ctx) vec4_instruction(this,
1679 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1680 dst_reg(packed_consts),
1681 surf_index,
1682 src_reg(grf_offset)));
1683 } else {
1684 vec4_instruction *pull =
1685 emit(new(mem_ctx) vec4_instruction(this,
1686 VS_OPCODE_PULL_CONSTANT_LOAD,
1687 dst_reg(packed_consts),
1688 surf_index,
1689 offset));
1690 pull->base_mrf = 14;
1691 pull->mlen = 1;
1692 }
1693
1694 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1695 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1696 const_offset % 16 / 4,
1697 const_offset % 16 / 4,
1698 const_offset % 16 / 4);
1699
1700 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1701 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1702 emit(CMP(result_dst, packed_consts, src_reg(0u),
1703 BRW_CONDITIONAL_NZ));
1704 emit(AND(result_dst, result, src_reg(0x1)));
1705 } else {
1706 emit(MOV(result_dst, packed_consts));
1707 }
1708 break;
1709 }
1710
1711 case ir_binop_vector_extract:
1712 assert(!"should have been lowered by vec_index_to_cond_assign");
1713 break;
1714
1715 case ir_triop_fma:
1716 op[0] = fix_3src_operand(op[0]);
1717 op[1] = fix_3src_operand(op[1]);
1718 op[2] = fix_3src_operand(op[2]);
1719 /* Note that the instruction's argument order is reversed from GLSL
1720 * and the IR.
1721 */
1722 emit(MAD(result_dst, op[2], op[1], op[0]));
1723 break;
1724
1725 case ir_triop_lrp:
1726 emit_lrp(result_dst, op[0], op[1], op[2]);
1727 break;
1728
1729 case ir_triop_csel:
1730 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1731 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1732 inst->predicate = BRW_PREDICATE_NORMAL;
1733 break;
1734
1735 case ir_triop_bfi:
1736 op[0] = fix_3src_operand(op[0]);
1737 op[1] = fix_3src_operand(op[1]);
1738 op[2] = fix_3src_operand(op[2]);
1739 emit(BFI2(result_dst, op[0], op[1], op[2]));
1740 break;
1741
1742 case ir_triop_bitfield_extract:
1743 op[0] = fix_3src_operand(op[0]);
1744 op[1] = fix_3src_operand(op[1]);
1745 op[2] = fix_3src_operand(op[2]);
1746 /* Note that the instruction's argument order is reversed from GLSL
1747 * and the IR.
1748 */
1749 emit(BFE(result_dst, op[2], op[1], op[0]));
1750 break;
1751
1752 case ir_triop_vector_insert:
1753 assert(!"should have been lowered by lower_vector_insert");
1754 break;
1755
1756 case ir_quadop_bitfield_insert:
1757 assert(!"not reached: should be handled by "
1758 "bitfield_insert_to_bfm_bfi\n");
1759 break;
1760
1761 case ir_quadop_vector:
1762 assert(!"not reached: should be handled by lower_quadop_vector");
1763 break;
1764
1765 case ir_unop_pack_half_2x16:
1766 emit_pack_half_2x16(result_dst, op[0]);
1767 break;
1768 case ir_unop_unpack_half_2x16:
1769 emit_unpack_half_2x16(result_dst, op[0]);
1770 break;
1771 case ir_unop_pack_snorm_2x16:
1772 case ir_unop_pack_snorm_4x8:
1773 case ir_unop_pack_unorm_2x16:
1774 case ir_unop_pack_unorm_4x8:
1775 case ir_unop_unpack_snorm_2x16:
1776 case ir_unop_unpack_snorm_4x8:
1777 case ir_unop_unpack_unorm_2x16:
1778 case ir_unop_unpack_unorm_4x8:
1779 assert(!"not reached: should be handled by lower_packing_builtins");
1780 break;
1781 case ir_unop_unpack_half_2x16_split_x:
1782 case ir_unop_unpack_half_2x16_split_y:
1783 case ir_binop_pack_half_2x16_split:
1784 assert(!"not reached: should not occur in vertex shader");
1785 break;
1786 case ir_binop_ldexp:
1787 assert(!"not reached: should be handled by ldexp_to_arith()");
1788 break;
1789 }
1790 }
1791
1792
1793 void
1794 vec4_visitor::visit(ir_swizzle *ir)
1795 {
1796 src_reg src;
1797 int i = 0;
1798 int swizzle[4];
1799
1800 /* Note that this is only swizzles in expressions, not those on the left
1801 * hand side of an assignment, which do write masking. See ir_assignment
1802 * for that.
1803 */
1804
1805 ir->val->accept(this);
1806 src = this->result;
1807 assert(src.file != BAD_FILE);
1808
1809 for (i = 0; i < ir->type->vector_elements; i++) {
1810 switch (i) {
1811 case 0:
1812 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1813 break;
1814 case 1:
1815 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1816 break;
1817 case 2:
1818 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1819 break;
1820 case 3:
1821 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1822 break;
1823 }
1824 }
1825 for (; i < 4; i++) {
1826 /* Replicate the last channel out. */
1827 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1828 }
1829
1830 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1831
1832 this->result = src;
1833 }
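/* Illustrative example (hypothetical values): if the value being swizzled was
 * already read with swizzle .wzyx and this ir_swizzle selects .xy, the loop
 * composes them into (W, Z) and the tail loop replicates the last channel,
 * producing the equivalent of a .wzzz swizzle on the resulting src_reg.
 */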
1834
1835 void
1836 vec4_visitor::visit(ir_dereference_variable *ir)
1837 {
1838 const struct glsl_type *type = ir->type;
1839 dst_reg *reg = variable_storage(ir->var);
1840
1841 if (!reg) {
1842 fail("Failed to find variable storage for %s\n", ir->var->name);
1843 this->result = src_reg(brw_null_reg());
1844 return;
1845 }
1846
1847 this->result = src_reg(*reg);
1848
1849 /* System values get their swizzle from the dst_reg writemask */
1850 if (ir->var->data.mode == ir_var_system_value)
1851 return;
1852
1853 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1854 this->result.swizzle = swizzle_for_size(type->vector_elements);
1855 }
1856
1857
1858 int
1859 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1860 {
1861 /* Under normal circumstances array elements are stored consecutively, so
1862 * the stride is equal to the size of the array element.
1863 */
1864 return type_size(ir->type);
1865 }
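/* Illustrative examples (not exhaustive): dereferencing a float[8] or vec4[8]
 * yields a stride of 1 register per element, while mat4[2] yields a stride of
 * 4, so a variable index into the latter is multiplied by 4 in the MUL below
 * before being used as a relative address.
 */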
1866
1867
1868 void
1869 vec4_visitor::visit(ir_dereference_array *ir)
1870 {
1871 ir_constant *constant_index;
1872 src_reg src;
1873 int array_stride = compute_array_stride(ir);
1874
1875 constant_index = ir->array_index->constant_expression_value();
1876
1877 ir->array->accept(this);
1878 src = this->result;
1879
1880 if (constant_index) {
1881 src.reg_offset += constant_index->value.i[0] * array_stride;
1882 } else {
1883 /* Variable index array dereference. It eats the "vec4" of the
1884 * base of the array and an index that offsets the Mesa register
1885 * index.
1886 */
1887 ir->array_index->accept(this);
1888
1889 src_reg index_reg;
1890
1891 if (array_stride == 1) {
1892 index_reg = this->result;
1893 } else {
1894 index_reg = src_reg(this, glsl_type::int_type);
1895
1896 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1897 }
1898
1899 if (src.reladdr) {
1900 src_reg temp = src_reg(this, glsl_type::int_type);
1901
1902 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1903
1904 index_reg = temp;
1905 }
1906
1907 src.reladdr = ralloc(mem_ctx, src_reg);
1908 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1909 }
1910
1911 /* If the type is smaller than a vec4, replicate the last channel out. */
1912 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1913 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1914 else
1915 src.swizzle = BRW_SWIZZLE_NOOP;
1916 src.type = brw_type_for_base_type(ir->type);
1917
1918 this->result = src;
1919 }
1920
1921 void
1922 vec4_visitor::visit(ir_dereference_record *ir)
1923 {
1924 unsigned int i;
1925 const glsl_type *struct_type = ir->record->type;
1926 int offset = 0;
1927
1928 ir->record->accept(this);
1929
1930 for (i = 0; i < struct_type->length; i++) {
1931 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1932 break;
1933 offset += type_size(struct_type->fields.structure[i].type);
1934 }
1935
1936 /* If the type is smaller than a vec4, replicate the last channel out. */
1937 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1938 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1939 else
1940 this->result.swizzle = BRW_SWIZZLE_NOOP;
1941 this->result.type = brw_type_for_base_type(ir->type);
1942
1943 this->result.reg_offset += offset;
1944 }
1945
1946 /**
1947 * We want to be careful in assignment setup to hit the actual storage
1948 * instead of potentially using a temporary like we might with the
1949 * ir_dereference handler.
1950 */
1951 static dst_reg
1952 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1953 {
1954 /* The LHS must be a dereference. If the LHS is a variable-indexed array
1955 * access of a vector, it must be separated into a series of conditional moves
1956 * before reaching this point (see ir_vec_index_to_cond_assign).
1957 */
1958 assert(ir->as_dereference());
1959 ir_dereference_array *deref_array = ir->as_dereference_array();
1960 if (deref_array) {
1961 assert(!deref_array->array->type->is_vector());
1962 }
1963
1964 /* Use the rvalue deref handler for the most part. We'll ignore
1965 * swizzles in it and write swizzles using writemask, though.
1966 */
1967 ir->accept(v);
1968 return dst_reg(v->result);
1969 }
1970
1971 void
1972 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1973 const struct glsl_type *type, uint32_t predicate)
1974 {
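/* Sketch of the recursion (hypothetical source type): copying a
 * `struct { vec4 a; float b[2]; }` decomposes into three scalar/vector
 * MOVs, one per leaf, each advancing dst->reg_offset and src->reg_offset
 * by one register.
 */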
1975 if (type->base_type == GLSL_TYPE_STRUCT) {
1976 for (unsigned int i = 0; i < type->length; i++) {
1977 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1978 }
1979 return;
1980 }
1981
1982 if (type->is_array()) {
1983 for (unsigned int i = 0; i < type->length; i++) {
1984 emit_block_move(dst, src, type->fields.array, predicate);
1985 }
1986 return;
1987 }
1988
1989 if (type->is_matrix()) {
1990 const struct glsl_type *vec_type;
1991
1992 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1993 type->vector_elements, 1);
1994
1995 for (int i = 0; i < type->matrix_columns; i++) {
1996 emit_block_move(dst, src, vec_type, predicate);
1997 }
1998 return;
1999 }
2000
2001 assert(type->is_scalar() || type->is_vector());
2002
2003 dst->type = brw_type_for_base_type(type);
2004 src->type = dst->type;
2005
2006 dst->writemask = (1 << type->vector_elements) - 1;
2007
2008 src->swizzle = swizzle_for_size(type->vector_elements);
2009
2010 vec4_instruction *inst = emit(MOV(*dst, *src));
2011 inst->predicate = predicate;
2012
2013 dst->reg_offset++;
2014 src->reg_offset++;
2015 }
2016
2017
2018 /* If the RHS processing resulted in an instruction generating a
2019 * temporary value, and it would be easy to rewrite the instruction to
2020 * generate its result right into the LHS instead, do so. This ends
2021 * up reliably removing instructions where it can be tricky to do so
2022 * later without real UD chain information.
2023 */
2024 bool
2025 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2026 dst_reg dst,
2027 src_reg src,
2028 vec4_instruction *pre_rhs_inst,
2029 vec4_instruction *last_rhs_inst)
2030 {
2031 /* This could be supported, but it would take more smarts. */
2032 if (ir->condition)
2033 return false;
2034
2035 if (pre_rhs_inst == last_rhs_inst)
2036 return false; /* No instructions generated to work with. */
2037
2038 /* Make sure the last instruction generated our source reg. */
2039 if (src.file != GRF ||
2040 src.file != last_rhs_inst->dst.file ||
2041 src.reg != last_rhs_inst->dst.reg ||
2042 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2043 src.reladdr ||
2044 src.abs ||
2045 src.negate ||
2046 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2047 return false;
2048
2049 /* Check that the last instruction fully initialized the channels
2050 * we want to use, in the order we want to use them. We could
2051 * potentially reswizzle the operands of many instructions so that
2052 * we could handle out-of-order channels, but don't yet.
2053 */
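/* For example, if the RHS ended in an ADD into a vec2 temporary and we
 * are assigning it to v.xy, both checks pass and the ADD's destination is
 * simply rewritten to v below; if the channels were needed in a different
 * order, the BRW_GET_SWZ(src.swizzle, i) != i test keeps the extra MOV.
 */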
2054
2055 for (unsigned i = 0; i < 4; i++) {
2056 if (dst.writemask & (1 << i)) {
2057 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2058 return false;
2059
2060 if (BRW_GET_SWZ(src.swizzle, i) != i)
2061 return false;
2062 }
2063 }
2064
2065 /* Success! Rewrite the instruction. */
2066 last_rhs_inst->dst.file = dst.file;
2067 last_rhs_inst->dst.reg = dst.reg;
2068 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2069 last_rhs_inst->dst.reladdr = dst.reladdr;
2070 last_rhs_inst->dst.writemask &= dst.writemask;
2071
2072 return true;
2073 }
2074
2075 void
2076 vec4_visitor::visit(ir_assignment *ir)
2077 {
2078 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2079 uint32_t predicate = BRW_PREDICATE_NONE;
2080
2081 if (!ir->lhs->type->is_scalar() &&
2082 !ir->lhs->type->is_vector()) {
2083 ir->rhs->accept(this);
2084 src_reg src = this->result;
2085
2086 if (ir->condition) {
2087 emit_bool_to_cond_code(ir->condition, &predicate);
2088 }
2089
2090 /* emit_block_move doesn't account for swizzles in the source register.
2091 * This should be ok, since the source register is a structure or an
2092 * array, and those can't be swizzled. But double-check to be sure.
2093 */
2094 assert(src.swizzle ==
2095 (ir->rhs->type->is_matrix()
2096 ? swizzle_for_size(ir->rhs->type->vector_elements)
2097 : BRW_SWIZZLE_NOOP));
2098
2099 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2100 return;
2101 }
2102
2103 /* Now we're down to just a scalar/vector with writemasks. */
2104 int i;
2105
2106 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2107 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2108
2109 ir->rhs->accept(this);
2110
2111 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2112
2113 src_reg src = this->result;
2114
2115 int swizzles[4];
2116 int first_enabled_chan = 0;
2117 int src_chan = 0;
2118
2119 assert(ir->lhs->type->is_vector() ||
2120 ir->lhs->type->is_scalar());
2121 dst.writemask = ir->write_mask;
2122
2123 for (int i = 0; i < 4; i++) {
2124 if (dst.writemask & (1 << i)) {
2125 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2126 break;
2127 }
2128 }
2129
2130 /* Swizzle a small RHS vector into the channels being written.
2131 *
2132 * GLSL IR treats write_mask as dictating how many channels are
2133 * present on the RHS, while in our instructions we need to make
2134 * those channels appear in the slots of the vec4 they're written to.
2135 */
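/* Illustrative case: write_mask = .yw with a two-component RHS whose
 * swizzle is .xyyy yields swizzles[] = {y, x, y, y}, so the MOV below
 * reads src.x into dst.y and src.y into dst.w; the unwritten channels
 * simply replicate the first enabled one.
 */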
2136 for (int i = 0; i < 4; i++) {
2137 if (dst.writemask & (1 << i))
2138 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2139 else
2140 swizzles[i] = first_enabled_chan;
2141 }
2142 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2143 swizzles[2], swizzles[3]);
2144
2145 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2146 return;
2147 }
2148
2149 if (ir->condition) {
2150 emit_bool_to_cond_code(ir->condition, &predicate);
2151 }
2152
2153 for (i = 0; i < type_size(ir->lhs->type); i++) {
2154 vec4_instruction *inst = emit(MOV(dst, src));
2155 inst->predicate = predicate;
2156
2157 dst.reg_offset++;
2158 src.reg_offset++;
2159 }
2160 }
2161
2162 void
2163 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2164 {
2165 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2166 foreach_list(node, &ir->components) {
2167 ir_constant *field_value = (ir_constant *)node;
2168
2169 emit_constant_values(dst, field_value);
2170 }
2171 return;
2172 }
2173
2174 if (ir->type->is_array()) {
2175 for (unsigned int i = 0; i < ir->type->length; i++) {
2176 emit_constant_values(dst, ir->array_elements[i]);
2177 }
2178 return;
2179 }
2180
2181 if (ir->type->is_matrix()) {
2182 for (int i = 0; i < ir->type->matrix_columns; i++) {
2183 float *vec = &ir->value.f[i * ir->type->vector_elements];
2184
2185 for (int j = 0; j < ir->type->vector_elements; j++) {
2186 dst->writemask = 1 << j;
2187 dst->type = BRW_REGISTER_TYPE_F;
2188
2189 emit(MOV(*dst, src_reg(vec[j])));
2190 }
2191 dst->reg_offset++;
2192 }
2193 return;
2194 }
2195
2196 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2197
2198 for (int i = 0; i < ir->type->vector_elements; i++) {
2199 if (!(remaining_writemask & (1 << i)))
2200 continue;
2201
2202 dst->writemask = 1 << i;
2203 dst->type = brw_type_for_base_type(ir->type);
2204
2205 /* Find other components that match the one we're about to
2206 * write. Emits fewer instructions for things like vec4(0.5,
2207 * 1.5, 1.5, 1.5).
2208 */
2209 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2210 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2211 if (ir->value.b[i] == ir->value.b[j])
2212 dst->writemask |= (1 << j);
2213 } else {
2214 /* u, i, and f storage all line up, so no need for a
2215 * switch case for comparing each type.
2216 */
2217 if (ir->value.u[i] == ir->value.u[j])
2218 dst->writemask |= (1 << j);
2219 }
2220 }
2221
2222 switch (ir->type->base_type) {
2223 case GLSL_TYPE_FLOAT:
2224 emit(MOV(*dst, src_reg(ir->value.f[i])));
2225 break;
2226 case GLSL_TYPE_INT:
2227 emit(MOV(*dst, src_reg(ir->value.i[i])));
2228 break;
2229 case GLSL_TYPE_UINT:
2230 emit(MOV(*dst, src_reg(ir->value.u[i])));
2231 break;
2232 case GLSL_TYPE_BOOL:
2233 emit(MOV(*dst, src_reg(ir->value.b[i])));
2234 break;
2235 default:
2236 assert(!"Non-float/uint/int/bool constant");
2237 break;
2238 }
2239
2240 remaining_writemask &= ~dst->writemask;
2241 }
2242 dst->reg_offset++;
2243 }
2244
2245 void
2246 vec4_visitor::visit(ir_constant *ir)
2247 {
2248 dst_reg dst = dst_reg(this, ir->type);
2249 this->result = src_reg(dst);
2250
2251 emit_constant_values(&dst, ir);
2252 }
2253
2254 void
2255 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2256 {
2257 ir_dereference *deref = static_cast<ir_dereference *>(
2258 ir->actual_parameters.get_head());
2259 ir_variable *location = deref->variable_referenced();
2260 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2261 location->data.atomic.buffer_index);
2262
2263 /* Calculate the surface offset */
2264 src_reg offset(this, glsl_type::uint_type);
2265 ir_dereference_array *deref_array = deref->as_dereference_array();
2266 if (deref_array) {
2267 deref_array->array_index->accept(this);
2268
2269 src_reg tmp(this, glsl_type::uint_type);
2270 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2271 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2272 } else {
2273 offset = location->data.atomic.offset;
2274 }
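/* For a hypothetical `atomicCounterIncrement(counters[i])`, the offset
 * register now holds i * ATOMIC_COUNTER_SIZE plus the declared offset of
 * the array within its buffer, and surf_index selects that buffer's
 * binding table entry.
 */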
2275
2276 /* Emit the appropriate machine instruction */
2277 const char *callee = ir->callee->function_name();
2278 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2279
2280 if (!strcmp("__intrinsic_atomic_read", callee)) {
2281 emit_untyped_surface_read(surf_index, dst, offset);
2282
2283 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2284 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2285 src_reg(), src_reg());
2286
2287 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2288 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2289 src_reg(), src_reg());
2290 }
2291 }
2292
2293 void
2294 vec4_visitor::visit(ir_call *ir)
2295 {
2296 const char *callee = ir->callee->function_name();
2297
2298 if (!strcmp("__intrinsic_atomic_read", callee) ||
2299 !strcmp("__intrinsic_atomic_increment", callee) ||
2300 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2301 visit_atomic_counter_intrinsic(ir);
2302 } else {
2303 assert(!"Unsupported intrinsic.");
2304 }
2305 }
2306
2307 src_reg
2308 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2309 {
2310 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2311 inst->base_mrf = 2;
2312 inst->mlen = 1;
2313 inst->sampler = sampler;
2314 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2315 inst->dst.writemask = WRITEMASK_XYZW;
2316
2317 /* The parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2318 int param_base = inst->base_mrf;
2319 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2320 int zero_mask = 0xf & ~coord_mask;
2321
2322 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2323 coordinate));
2324
2325 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2326 src_reg(0)));
2327
2328 emit(inst);
2329 return src_reg(inst->dst);
2330 }
2331
2332 void
2333 vec4_visitor::visit(ir_texture *ir)
2334 {
2335 int sampler =
2336 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2337
2338 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2339 * emitting anything other than setting up the constant result.
2340 */
2341 if (ir->op == ir_tg4) {
2342 ir_constant *chan = ir->lod_info.component->as_constant();
2343 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2344 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2345 dst_reg result(this, ir->type);
2346 this->result = src_reg(result);
2347 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2348 return;
2349 }
2350 }
2351
2352 /* Should be lowered by do_lower_texture_projection */
2353 assert(!ir->projector);
2354
2355 /* Should be lowered */
2356 assert(!ir->offset || !ir->offset->type->is_array());
2357
2358 /* Generate code to compute all the subexpression trees. This has to be
2359 * done before loading any values into MRFs for the sampler message since
2360 * generating these values may involve SEND messages that need the MRFs.
2361 */
2362 src_reg coordinate;
2363 if (ir->coordinate) {
2364 ir->coordinate->accept(this);
2365 coordinate = this->result;
2366 }
2367
2368 src_reg shadow_comparitor;
2369 if (ir->shadow_comparitor) {
2370 ir->shadow_comparitor->accept(this);
2371 shadow_comparitor = this->result;
2372 }
2373
2374 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2375 src_reg offset_value;
2376 if (has_nonconstant_offset) {
2377 ir->offset->accept(this);
2378 offset_value = src_reg(this->result);
2379 }
2380
2381 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2382 src_reg lod, dPdx, dPdy, sample_index, mcs;
2383 switch (ir->op) {
2384 case ir_tex:
2385 lod = src_reg(0.0f);
2386 lod_type = glsl_type::float_type;
2387 break;
2388 case ir_txf:
2389 case ir_txl:
2390 case ir_txs:
2391 ir->lod_info.lod->accept(this);
2392 lod = this->result;
2393 lod_type = ir->lod_info.lod->type;
2394 break;
2395 case ir_query_levels:
2396 lod = src_reg(0);
2397 lod_type = glsl_type::int_type;
2398 break;
2399 case ir_txf_ms:
2400 ir->lod_info.sample_index->accept(this);
2401 sample_index = this->result;
2402 sample_index_type = ir->lod_info.sample_index->type;
2403
2404 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2405 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2406 else
2407 mcs = src_reg(0u);
2408 break;
2409 case ir_txd:
2410 ir->lod_info.grad.dPdx->accept(this);
2411 dPdx = this->result;
2412
2413 ir->lod_info.grad.dPdy->accept(this);
2414 dPdy = this->result;
2415
2416 lod_type = ir->lod_info.grad.dPdx->type;
2417 break;
2418 case ir_txb:
2419 case ir_lod:
2420 case ir_tg4:
2421 break;
2422 }
2423
2424 vec4_instruction *inst = NULL;
2425 switch (ir->op) {
2426 case ir_tex:
2427 case ir_txl:
2428 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2429 break;
2430 case ir_txd:
2431 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2432 break;
2433 case ir_txf:
2434 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2435 break;
2436 case ir_txf_ms:
2437 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2438 break;
2439 case ir_txs:
2440 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2441 break;
2442 case ir_tg4:
2443 if (has_nonconstant_offset)
2444 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2445 else
2446 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2447 break;
2448 case ir_query_levels:
2449 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2450 break;
2451 case ir_txb:
2452 assert(!"TXB is not valid for vertex shaders.");
2453 break;
2454 case ir_lod:
2455 assert(!"LOD is not valid for vertex shaders.");
2456 break;
2457 default:
2458 assert(!"Unrecognized tex op");
2459 }
2460
2461 if (ir->offset != NULL && ir->op != ir_txf)
2462 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2463
2464 /* Stuff the channel select bits in the top of the texture offset */
2465 if (ir->op == ir_tg4)
2466 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2467
2468 /* The message header is necessary for:
2469 * - Gen4 (always)
2470 * - Texel offsets
2471 * - Gather channel selection
2472 * - Sampler indices too large to fit in a 4-bit value.
2473 */
2474 inst->header_present =
2475 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2476 sampler >= 16;
2477 inst->base_mrf = 2;
2478 inst->mlen = inst->header_present + 1; /* always at least one */
2479 inst->sampler = sampler;
2480 inst->dst = dst_reg(this, ir->type);
2481 inst->dst.writemask = WRITEMASK_XYZW;
2482 inst->shadow_compare = ir->shadow_comparitor != NULL;
2483
2484 /* MRF for the first parameter */
2485 int param_base = inst->base_mrf + inst->header_present;
2486
2487 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2488 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2489 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2490 } else {
2491 /* Load the coordinate */
2492 /* FINISHME: gl_clamp_mask and saturate */
2493 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2494 int zero_mask = 0xf & ~coord_mask;
2495
2496 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2497 coordinate));
2498
2499 if (zero_mask != 0) {
2500 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2501 src_reg(0)));
2502 }
2503 /* Load the shadow comparitor */
2504 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2505 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2506 WRITEMASK_X),
2507 shadow_comparitor));
2508 inst->mlen++;
2509 }
2510
2511 /* Load the LOD info */
2512 if (ir->op == ir_tex || ir->op == ir_txl) {
2513 int mrf, writemask;
2514 if (brw->gen >= 5) {
2515 mrf = param_base + 1;
2516 if (ir->shadow_comparitor) {
2517 writemask = WRITEMASK_Y;
2518 /* mlen already incremented */
2519 } else {
2520 writemask = WRITEMASK_X;
2521 inst->mlen++;
2522 }
2523 } else /* brw->gen == 4 */ {
2524 mrf = param_base;
2525 writemask = WRITEMASK_W;
2526 }
2527 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2528 } else if (ir->op == ir_txf) {
2529 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2530 } else if (ir->op == ir_txf_ms) {
2531 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2532 sample_index));
2533 if (brw->gen >= 7)
2534 /* MCS data is in the first channel of `mcs`, but we need to get it into
2535 * the .y channel of the second vec4 of params, so replicate .x across
2536 * the whole vec4 and then mask off everything except .y
2537 */
2538 mcs.swizzle = BRW_SWIZZLE_XXXX;
2539 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2540 mcs));
2541 inst->mlen++;
2542 } else if (ir->op == ir_txd) {
2543 const glsl_type *type = lod_type;
2544
2545 if (brw->gen >= 5) {
2546 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2547 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2548 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2549 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2550 inst->mlen++;
2551
2552 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2553 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2554 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2555 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2556 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2557 inst->mlen++;
2558
2559 if (ir->shadow_comparitor) {
2560 emit(MOV(dst_reg(MRF, param_base + 2,
2561 ir->shadow_comparitor->type, WRITEMASK_Z),
2562 shadow_comparitor));
2563 }
2564 }
2565 } else /* brw->gen == 4 */ {
2566 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2567 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2568 inst->mlen += 2;
2569 }
2570 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2571 if (ir->shadow_comparitor) {
2572 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2573 shadow_comparitor));
2574 }
2575
2576 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2577 offset_value));
2578 inst->mlen++;
2579 }
2580 }
2581
2582 emit(inst);
2583
2584 /* Fix up the number of layers (.z) for cube arrays: the hardware returns
2585 * faces * layers, but the spec requires just layers.
2586 */
2587 if (ir->op == ir_txs) {
2588 glsl_type const *type = ir->sampler->type;
2589 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2590 type->sampler_array) {
2591 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2592 writemask(inst->dst, WRITEMASK_Z),
2593 src_reg(inst->dst), src_reg(6));
2594 }
2595 }
2596
2597 if (brw->gen == 6 && ir->op == ir_tg4) {
2598 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2599 }
2600
2601 swizzle_result(ir, src_reg(inst->dst), sampler);
2602 }
2603
2604 /**
2605 * Apply workarounds for Gen6 gather with UINT/SINT
2606 */
2607 void
2608 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2609 {
2610 if (!wa)
2611 return;
2612
2613 int width = (wa & WA_8BIT) ? 8 : 16;
2614 dst_reg dst_f = dst;
2615 dst_f.type = BRW_REGISTER_TYPE_F;
2616
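/* Illustrative math, assuming WA_8BIT | WA_SIGN: the sampler returned the
 * stored byte v as the UNORM value v / 255.0f; the MUL/MOV below recover
 * the integer v, and the SHL/ASR pair then sign-extends it from 8 bits.
 */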
2617 /* Convert from UNORM to UINT */
2618 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2619 emit(MOV(dst, src_reg(dst_f)));
2620
2621 if (wa & WA_SIGN) {
2622 /* Reinterpret the UINT value as a signed INT value by
2623 * shifting the sign bit into place, then shifting back
2624 * preserving sign.
2625 */
2626 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2627 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2628 }
2629 }
2630
2631 /**
2632 * Set up the gather channel based on the swizzle, for gather4.
2633 */
2634 uint32_t
2635 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2636 {
2637 ir_constant *chan = ir->lod_info.component->as_constant();
2638 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2639 switch (swiz) {
2640 case SWIZZLE_X: return 0;
2641 case SWIZZLE_Y:
2642 /* gather4 sampler is broken for green channel on RG32F --
2643 * we must ask for blue instead.
2644 */
2645 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2646 return 2;
2647 return 1;
2648 case SWIZZLE_Z: return 2;
2649 case SWIZZLE_W: return 3;
2650 default:
2651 assert(!"Not reached"); /* zero, one swizzles handled already */
2652 return 0;
2653 }
2654 }
2655
2656 void
2657 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2658 {
2659 int s = key->tex.swizzles[sampler];
2660
2661 this->result = src_reg(this, ir->type);
2662 dst_reg swizzled_result(this->result);
2663
2664 if (ir->op == ir_query_levels) {
2665 /* # levels is in .w */
2666 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2667 emit(MOV(swizzled_result, orig_val));
2668 return;
2669 }
2670
2671 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2672 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2673 emit(MOV(swizzled_result, orig_val));
2674 return;
2675 }
2676
2677
2678 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2679 int swizzle[4] = {0};
2680
2681 for (int i = 0; i < 4; i++) {
2682 switch (GET_SWZ(s, i)) {
2683 case SWIZZLE_ZERO:
2684 zero_mask |= (1 << i);
2685 break;
2686 case SWIZZLE_ONE:
2687 one_mask |= (1 << i);
2688 break;
2689 default:
2690 copy_mask |= (1 << i);
2691 swizzle[i] = GET_SWZ(s, i);
2692 break;
2693 }
2694 }
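/* For example, a texture swizzle of (R, G, B, ONE) gives copy_mask = .xyz
 * and one_mask = .w, so we emit one MOV that copies the first three
 * channels and a second MOV that writes 1.0f into .w.
 */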
2695
2696 if (copy_mask) {
2697 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2698 swizzled_result.writemask = copy_mask;
2699 emit(MOV(swizzled_result, orig_val));
2700 }
2701
2702 if (zero_mask) {
2703 swizzled_result.writemask = zero_mask;
2704 emit(MOV(swizzled_result, src_reg(0.0f)));
2705 }
2706
2707 if (one_mask) {
2708 swizzled_result.writemask = one_mask;
2709 emit(MOV(swizzled_result, src_reg(1.0f)));
2710 }
2711 }
2712
2713 void
2714 vec4_visitor::visit(ir_return *)
2715 {
2716 assert(!"not reached");
2717 }
2718
2719 void
2720 vec4_visitor::visit(ir_discard *)
2721 {
2722 assert(!"not reached");
2723 }
2724
2725 void
2726 vec4_visitor::visit(ir_if *ir)
2727 {
2728 /* Don't point the annotation at the if statement itself, because then the
2729 * whole statement, including the then and else blocks, gets printed.
2730 */
2731 this->base_ir = ir->condition;
2732
2733 if (brw->gen == 6) {
2734 emit_if_gen6(ir);
2735 } else {
2736 uint32_t predicate;
2737 emit_bool_to_cond_code(ir->condition, &predicate);
2738 emit(IF(predicate));
2739 }
2740
2741 visit_instructions(&ir->then_instructions);
2742
2743 if (!ir->else_instructions.is_empty()) {
2744 this->base_ir = ir->condition;
2745 emit(BRW_OPCODE_ELSE);
2746
2747 visit_instructions(&ir->else_instructions);
2748 }
2749
2750 this->base_ir = ir->condition;
2751 emit(BRW_OPCODE_ENDIF);
2752 }
2753
2754 void
2755 vec4_visitor::visit(ir_emit_vertex *)
2756 {
2757 assert(!"not reached");
2758 }
2759
2760 void
2761 vec4_visitor::visit(ir_end_primitive *)
2762 {
2763 assert(!"not reached");
2764 }
2765
2766 void
2767 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2768 dst_reg dst, src_reg offset,
2769 src_reg src0, src_reg src1)
2770 {
2771 unsigned mlen = 0;
2772
2773 /* Set the atomic operation offset. */
2774 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2775 mlen++;
2776
2777 /* Set the atomic operation arguments. */
2778 if (src0.file != BAD_FILE) {
2779 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2780 mlen++;
2781 }
2782
2783 if (src1.file != BAD_FILE) {
2784 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2785 mlen++;
2786 }
2787
2788 /* Emit the instruction. Note that this maps to the normal SIMD8
2789 * untyped atomic message on Ivy Bridge, but that's OK because
2790 * unused channels will be masked out.
2791 */
2792 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2793 src_reg(atomic_op), src_reg(surf_index));
2794 inst->base_mrf = 0;
2795 inst->mlen = mlen;
2796 }
2797
2798 void
2799 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2800 src_reg offset)
2801 {
2802 /* Set the surface read offset. */
2803 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2804
2805 /* Emit the instruction. Note that this maps to the normal SIMD8
2806 * untyped surface read message, but that's OK because unused
2807 * channels will be masked out.
2808 */
2809 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2810 dst, src_reg(surf_index));
2811 inst->base_mrf = 0;
2812 inst->mlen = 1;
2813 }
2814
2815 void
2816 vec4_visitor::emit_ndc_computation()
2817 {
2818 /* Get the position */
2819 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2820
2821 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2822 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2823 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2824
2825 current_annotation = "NDC";
2826 dst_reg ndc_w = ndc;
2827 ndc_w.writemask = WRITEMASK_W;
2828 src_reg pos_w = pos;
2829 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2830 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2831
2832 dst_reg ndc_xyz = ndc;
2833 ndc_xyz.writemask = WRITEMASK_XYZ;
2834
2835 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2836 }
2837
2838 void
2839 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2840 {
2841 if (brw->gen < 6 &&
2842 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2843 key->userclip_active || brw->has_negative_rhw_bug)) {
2844 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2845 dst_reg header1_w = header1;
2846 header1_w.writemask = WRITEMASK_W;
2847
2848 emit(MOV(header1, 0u));
2849
2850 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2851 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2852
2853 current_annotation = "Point size";
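/* Illustrative arithmetic: a hypothetical gl_PointSize of 4.0f is scaled
 * to 4.0 * 2048 = 0x2000, and the AND below keeps bits 8..18, i.e. the
 * point width as a fixed-point value with three fractional bits.
 */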
2854 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2855 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2856 }
2857
2858 if (key->userclip_active) {
2859 current_annotation = "Clipping flags";
2860 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2861 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2862
2863 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2864 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2865 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2866
2867 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2868 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2869 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2870 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2871 }
2872
2873 /* i965 clipping workaround:
2874 * 1) Test for -ve rhw
2875 * 2) If set,
2876 * set ndc = (0,0,0,0)
2877 * set ucp[6] = 1
2878 *
2879 * Later, clipping will detect ucp[6] and ensure the primitive is
2880 * clipped against all fixed planes.
2881 */
2882 if (brw->has_negative_rhw_bug) {
2883 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2884 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2885 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2886 vec4_instruction *inst;
2887 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2888 inst->predicate = BRW_PREDICATE_NORMAL;
2889 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2890 inst->predicate = BRW_PREDICATE_NORMAL;
2891 }
2892
2893 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2894 } else if (brw->gen < 6) {
2895 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2896 } else {
2897 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2898 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2899 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2900 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2901 }
2902 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2903 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2904 src_reg(output_reg[VARYING_SLOT_LAYER])));
2905 }
2906 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2907 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2908 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2909 }
2910 }
2911 }
2912
2913 void
2914 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2915 {
2916 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2917 *
2918 * "If a linked set of shaders forming the vertex stage contains no
2919 * static write to gl_ClipVertex or gl_ClipDistance, but the
2920 * application has requested clipping against user clip planes through
2921 * the API, then the coordinate written to gl_Position is used for
2922 * comparison against the user clip planes."
2923 *
2924 * This function is only called if the shader didn't write to
2925 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2926 * if the user wrote to it; otherwise we use gl_Position.
2927 */
2928 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2929 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2930 clip_vertex = VARYING_SLOT_POS;
2931 }
2932
2933 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2934 ++i) {
2935 reg.writemask = 1 << i;
2936 emit(DP4(reg,
2937 src_reg(output_reg[clip_vertex]),
2938 src_reg(this->userplane[i + offset])));
2939 }
2940 }
2941
2942 void
2943 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2944 {
2945 assert (varying < VARYING_SLOT_MAX);
2946 reg.type = output_reg[varying].type;
2947 current_annotation = output_reg_annotation[varying];
2948 /* Copy the register, saturating if necessary */
2949 vec4_instruction *inst = emit(MOV(reg,
2950 src_reg(output_reg[varying])));
2951 if ((varying == VARYING_SLOT_COL0 ||
2952 varying == VARYING_SLOT_COL1 ||
2953 varying == VARYING_SLOT_BFC0 ||
2954 varying == VARYING_SLOT_BFC1) &&
2955 key->clamp_vertex_color) {
2956 inst->saturate = true;
2957 }
2958 }
2959
2960 void
2961 vec4_visitor::emit_urb_slot(int mrf, int varying)
2962 {
2963 struct brw_reg hw_reg = brw_message_reg(mrf);
2964 dst_reg reg = dst_reg(MRF, mrf);
2965 reg.type = BRW_REGISTER_TYPE_F;
2966
2967 switch (varying) {
2968 case VARYING_SLOT_PSIZ:
2969 /* PSIZ is always in slot 0, and is coupled with other flags. */
2970 current_annotation = "indices, point width, clip flags";
2971 emit_psiz_and_flags(hw_reg);
2972 break;
2973 case BRW_VARYING_SLOT_NDC:
2974 current_annotation = "NDC";
2975 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2976 break;
2977 case VARYING_SLOT_POS:
2978 current_annotation = "gl_Position";
2979 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2980 break;
2981 case VARYING_SLOT_EDGE:
2982 /* This is present when doing unfilled polygons. We're supposed to copy
2983 * the edge flag from the user-provided vertex array
2984 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2985 * of that attribute (starts as 1.0f). This is then used in clipping to
2986 * determine which edges should be drawn as wireframe.
2987 */
2988 current_annotation = "edge flag";
2989 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2990 glsl_type::float_type, WRITEMASK_XYZW))));
2991 break;
2992 case BRW_VARYING_SLOT_PAD:
2993 /* No need to write to this slot */
2994 break;
2995 default:
2996 emit_generic_urb_slot(reg, varying);
2997 break;
2998 }
2999 }
3000
3001 static int
3002 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3003 {
3004 if (brw->gen >= 6) {
3005 /* URB data written (does not include the message header reg) must
3006 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3007 * section 5.4.3.2.2: URB_INTERLEAVED.
3008 *
3009 * URB entries are allocated on a multiple of 1024 bits, so an
3010 * extra 128 bits written here to make the end align to 256 is
3011 * no problem.
3012 */
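/* Example: a header plus five data registers arrives as mlen = 6; bumping
 * it to 7 pads the written data to six registers, a multiple of two.
 */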
3013 if ((mlen % 2) != 1)
3014 mlen++;
3015 }
3016
3017 return mlen;
3018 }
3019
3020
3021 /**
3022 * Generates the VUE payload plus the necessary URB write instructions to
3023 * output it.
3024 *
3025 * The VUE layout is documented in Volume 2a.
3026 */
3027 void
3028 vec4_visitor::emit_vertex()
3029 {
3030 /* MRF 0 is reserved for the debugger, so start with message header
3031 * in MRF 1.
3032 */
3033 int base_mrf = 1;
3034 int mrf = base_mrf;
3035 /* In the process of generating our URB write message contents, we
3036 * may need to unspill a register or load from an array. Those
3037 * reads would use MRFs 14-15.
3038 */
3039 int max_usable_mrf = 13;
3040
3041 /* The following assertion verifies that max_usable_mrf causes an
3042 * even-numbered amount of URB write data, which will meet gen6's
3043 * requirements for length alignment.
3044 */
3045 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3046
3047 /* First mrf is the g0-based message header containing URB handles and
3048 * such.
3049 */
3050 emit_urb_write_header(mrf++);
3051
3052 if (brw->gen < 6) {
3053 emit_ndc_computation();
3054 }
3055
3056 /* Lower legacy ff and ClipVertex clipping to clip distances */
3057 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3058 current_annotation = "user clip distances";
3059
3060 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3061 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3062
3063 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3064 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3065 }
3066
3067 /* We may need to split this up into several URB writes, so do them in a
3068 * loop.
3069 */
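/* Sketch of the split, assuming a hypothetical VUE map with 20 slots:
 * the first write covers slots 0..11 in MRFs 2..13 at URB offset 0, the
 * second covers slots 12..19 at URB row offset 12 / 2 = 6.
 */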
3070 int slot = 0;
3071 bool complete = false;
3072 do {
3073 /* URB offset is in URB row increments, and each of our MRFs is half of
3074 * one of those, since we're doing interleaved writes.
3075 */
3076 int offset = slot / 2;
3077
3078 mrf = base_mrf + 1;
3079 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3080 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3081
3082 /* If this was max_usable_mrf, we can't fit anything more into this
3083 * URB WRITE.
3084 */
3085 if (mrf > max_usable_mrf) {
3086 slot++;
3087 break;
3088 }
3089 }
3090
3091 complete = slot >= prog_data->vue_map.num_slots;
3092 current_annotation = "URB write";
3093 vec4_instruction *inst = emit_urb_write_opcode(complete);
3094 inst->base_mrf = base_mrf;
3095 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3096 inst->offset += offset;
3097 } while(!complete);
3098 }
3099
3100
3101 src_reg
3102 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3103 src_reg *reladdr, int reg_offset)
3104 {
3105 /* Because we store the values to scratch interleaved like our
3106 * vertex data, we need to scale the vec4 index by 2.
3107 */
3108 int message_header_scale = 2;
3109
3110 /* Pre-gen6, the message header uses byte offsets instead of vec4
3111 * (16-byte) offset units.
3112 */
3113 if (brw->gen < 6)
3114 message_header_scale *= 16;
3115
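/* For example, a reg_offset of 3 yields an offset of 3 * 2 = 6 on gen6+,
 * or 3 * 2 * 16 = 96 bytes on older generations where the header takes
 * byte offsets.
 */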
3116 if (reladdr) {
3117 src_reg index = src_reg(this, glsl_type::int_type);
3118
3119 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3120 emit_before(inst, MUL(dst_reg(index),
3121 index, src_reg(message_header_scale)));
3122
3123 return index;
3124 } else {
3125 return src_reg(reg_offset * message_header_scale);
3126 }
3127 }
3128
3129 src_reg
3130 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3131 src_reg *reladdr, int reg_offset)
3132 {
3133 if (reladdr) {
3134 src_reg index = src_reg(this, glsl_type::int_type);
3135
3136 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3137
3138 /* Pre-gen6, the message header uses byte offsets instead of vec4
3139 * (16-byte) offset units.
3140 */
3141 if (brw->gen < 6) {
3142 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3143 }
3144
3145 return index;
3146 } else if (brw->gen >= 8) {
3147 /* Store the offset in a GRF so we can send-from-GRF. */
3148 src_reg offset = src_reg(this, glsl_type::int_type);
3149 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3150 return offset;
3151 } else {
3152 int message_header_scale = brw->gen < 6 ? 16 : 1;
3153 return src_reg(reg_offset * message_header_scale);
3154 }
3155 }
3156
3157 /**
3158 * Emits an instruction before @inst to load the value named by @orig_src
3159 * from scratch space at @base_offset to @temp.
3160 *
3161 * @base_offset is measured in 32-byte units (the size of a register).
3162 */
3163 void
3164 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3165 dst_reg temp, src_reg orig_src,
3166 int base_offset)
3167 {
3168 int reg_offset = base_offset + orig_src.reg_offset;
3169 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3170
3171 emit_before(inst, SCRATCH_READ(temp, index));
3172 }
3173
3174 /**
3175 * Emits an instruction after @inst to store the value to be written
3176 * to @orig_dst to scratch space at @base_offset, from @temp.
3177 *
3178 * @base_offset is measured in 32-byte units (the size of a register).
3179 */
3180 void
3181 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3182 {
3183 int reg_offset = base_offset + inst->dst.reg_offset;
3184 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3185
3186 /* Create a temporary register to store *inst's result in.
3187 *
3188 * We have to be careful in MOVing from our temporary result register in
3189 * the scratch write. If we swizzle from channels of the temporary that
3190 * weren't initialized, it will confuse live interval analysis, which will
3191 * make spilling fail to make progress.
3192 */
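/* Illustrative case: for a dst writemask of .xz, first_writemask_chan is
 * x and the swizzle built below is .xxzx, so the scratch write never
 * reads the uninitialized .y/.w channels of the temporary.
 */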
3193 src_reg temp = src_reg(this, glsl_type::vec4_type);
3194 temp.type = inst->dst.type;
3195 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3196 int swizzles[4];
3197 for (int i = 0; i < 4; i++)
3198 if (inst->dst.writemask & (1 << i))
3199 swizzles[i] = i;
3200 else
3201 swizzles[i] = first_writemask_chan;
3202 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3203 swizzles[2], swizzles[3]);
3204
3205 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3206 inst->dst.writemask));
3207 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3208 write->predicate = inst->predicate;
3209 write->ir = inst->ir;
3210 write->annotation = inst->annotation;
3211 inst->insert_after(write);
3212
3213 inst->dst.file = temp.file;
3214 inst->dst.reg = temp.reg;
3215 inst->dst.reg_offset = temp.reg_offset;
3216 inst->dst.reladdr = NULL;
3217 }
3218
3219 /**
3220 * We can't generally support array access in GRF space, because a
3221 * single instruction's destination can only span 2 contiguous
3222 * registers. So, we send all GRF arrays that get variable-index
3223 * access to scratch space.
3224 */
3225 void
3226 vec4_visitor::move_grf_array_access_to_scratch()
3227 {
3228 int scratch_loc[this->virtual_grf_count];
3229
3230 for (int i = 0; i < this->virtual_grf_count; i++) {
3231 scratch_loc[i] = -1;
3232 }
3233
3234 /* First, calculate the set of virtual GRFs that need to be punted
3235 * to scratch due to having any array access on them, and where in
3236 * scratch.
3237 */
3238 foreach_list(node, &this->instructions) {
3239 vec4_instruction *inst = (vec4_instruction *)node;
3240
3241 if (inst->dst.file == GRF && inst->dst.reladdr &&
3242 scratch_loc[inst->dst.reg] == -1) {
3243 scratch_loc[inst->dst.reg] = c->last_scratch;
3244 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3245 }
3246
3247 for (int i = 0 ; i < 3; i++) {
3248 src_reg *src = &inst->src[i];
3249
3250 if (src->file == GRF && src->reladdr &&
3251 scratch_loc[src->reg] == -1) {
3252 scratch_loc[src->reg] = c->last_scratch;
3253 c->last_scratch += this->virtual_grf_sizes[src->reg];
3254 }
3255 }
3256 }
3257
3258 /* Now, for anything that will be accessed through scratch, rewrite
3259 * it to load/store. Note that this is a _safe list walk, because
3260 * we may generate a new scratch_write instruction after the one
3261 * we're processing.
3262 */
3263 foreach_list_safe(node, &this->instructions) {
3264 vec4_instruction *inst = (vec4_instruction *)node;
3265
3266 /* Set up the annotation tracking for new generated instructions. */
3267 base_ir = inst->ir;
3268 current_annotation = inst->annotation;
3269
3270 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3271 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3272 }
3273
3274 for (int i = 0 ; i < 3; i++) {
3275 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3276 continue;
3277
3278 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3279
3280 emit_scratch_read(inst, temp, inst->src[i],
3281 scratch_loc[inst->src[i].reg]);
3282
3283 inst->src[i].file = temp.file;
3284 inst->src[i].reg = temp.reg;
3285 inst->src[i].reg_offset = temp.reg_offset;
3286 inst->src[i].reladdr = NULL;
3287 }
3288 }
3289 }
3290
3291 /**
3292 * Emits an instruction before @inst to load the value named by @orig_src
3293 * from the pull constant buffer (surface) at @base_offset to @temp.
3294 */
3295 void
3296 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3297 dst_reg temp, src_reg orig_src,
3298 int base_offset)
3299 {
3300 int reg_offset = base_offset + orig_src.reg_offset;
3301 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3302 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3303 vec4_instruction *load;
3304
3305 if (brw->gen >= 7) {
3306 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3307 grf_offset.type = offset.type;
3308 emit_before(inst, MOV(grf_offset, offset));
3309
3310 load = new(mem_ctx) vec4_instruction(this,
3311 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3312 temp, index, src_reg(grf_offset));
3313 } else {
3314 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3315 temp, index, offset);
3316 load->base_mrf = 14;
3317 load->mlen = 1;
3318 }
3319 emit_before(inst, load);
3320 }
3321
3322 /**
3323 * Implements array access of uniforms by inserting a
3324 * PULL_CONSTANT_LOAD instruction.
3325 *
3326 * Unlike temporary GRF array access (which we don't support, due to
3327 * the difficulty of doing relative addressing on instruction
3328 * destinations), we could potentially do array access of uniforms
3329 * that were loaded in GRF space as push constants. In real-world
3330 * usage we've seen, though, the arrays being used are always larger
3331 * than we could load as push constants, so just always move all
3332 * uniform array access out to a pull constant buffer.
3333 */
3334 void
3335 vec4_visitor::move_uniform_array_access_to_pull_constants()
3336 {
3337 int pull_constant_loc[this->uniforms];
3338
3339 for (int i = 0; i < this->uniforms; i++) {
3340 pull_constant_loc[i] = -1;
3341 }
3342
3343 /* Walk through and find array access of uniforms. Put a copy of that
3344 * uniform in the pull constant buffer.
3345 *
3346 * Note that we don't move constant-indexed accesses to arrays. No
3347 * testing has been done of the performance impact of this choice.
3348 */
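/* For a hypothetical `uniform vec4 arr[8]` accessed as arr[i], every
 * constant of the array is copied into pull_param and the reladdr source
 * is rewritten below into a temporary filled by emit_pull_constant_load().
 */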
3349 foreach_list_safe(node, &this->instructions) {
3350 vec4_instruction *inst = (vec4_instruction *)node;
3351
3352 for (int i = 0 ; i < 3; i++) {
3353 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3354 continue;
3355
3356 int uniform = inst->src[i].reg;
3357
3358 /* If this array isn't already present in the pull constant buffer,
3359 * add it.
3360 */
3361 if (pull_constant_loc[uniform] == -1) {
3362 const float **values = &stage_prog_data->param[uniform * 4];
3363
3364 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3365
3366 assert(uniform < uniform_array_size);
3367 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3368 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3369 = values[j];
3370 }
3371 }
3372
3373 /* Set up the annotation tracking for new generated instructions. */
3374 base_ir = inst->ir;
3375 current_annotation = inst->annotation;
3376
3377 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3378
3379 emit_pull_constant_load(inst, temp, inst->src[i],
3380 pull_constant_loc[uniform]);
3381
3382 inst->src[i].file = temp.file;
3383 inst->src[i].reg = temp.reg;
3384 inst->src[i].reg_offset = temp.reg_offset;
3385 inst->src[i].reladdr = NULL;
3386 }
3387 }
3388
3389 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3390 * no need to track them as larger-than-vec4 objects. This will be
3391 * relied on in cutting out unused uniform vectors from push
3392 * constants.
3393 */
3394 split_uniform_registers();
3395 }
3396
3397 void
3398 vec4_visitor::resolve_ud_negate(src_reg *reg)
3399 {
3400 if (reg->type != BRW_REGISTER_TYPE_UD ||
3401 !reg->negate)
3402 return;
3403
3404 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3405 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3406 *reg = temp;
3407 }
3408
3409 vec4_visitor::vec4_visitor(struct brw_context *brw,
3410 struct brw_vec4_compile *c,
3411 struct gl_program *prog,
3412 const struct brw_vec4_prog_key *key,
3413 struct brw_vec4_prog_data *prog_data,
3414 struct gl_shader_program *shader_prog,
3415 gl_shader_stage stage,
3416 void *mem_ctx,
3417 bool debug_flag,
3418 bool no_spills,
3419 shader_time_shader_type st_base,
3420 shader_time_shader_type st_written,
3421 shader_time_shader_type st_reset)
3422 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3423 c(c),
3424 key(key),
3425 prog_data(prog_data),
3426 sanity_param_count(0),
3427 fail_msg(NULL),
3428 first_non_payload_grf(0),
3429 need_all_constants_in_pull_buffer(false),
3430 debug_flag(debug_flag),
3431 no_spills(no_spills),
3432 st_base(st_base),
3433 st_written(st_written),
3434 st_reset(st_reset)
3435 {
3436 this->mem_ctx = mem_ctx;
3437 this->failed = false;
3438
3439 this->base_ir = NULL;
3440 this->current_annotation = NULL;
3441 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3442
3443 this->variable_ht = hash_table_ctor(0,
3444 hash_table_pointer_hash,
3445 hash_table_pointer_compare);
3446
3447 this->virtual_grf_start = NULL;
3448 this->virtual_grf_end = NULL;
3449 this->virtual_grf_sizes = NULL;
3450 this->virtual_grf_count = 0;
3451 this->virtual_grf_reg_map = NULL;
3452 this->virtual_grf_reg_count = 0;
3453 this->virtual_grf_array_size = 0;
3454 this->live_intervals_valid = false;
3455
3456 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3457
3458 this->uniforms = 0;
3459
3460 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3461 * at least one. See setup_uniforms() in brw_vec4.cpp.
3462 */
3463 this->uniform_array_size = 1;
3464 if (prog_data) {
3465 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3466 }
3467
3468 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3469 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3470 }
3471
3472 vec4_visitor::~vec4_visitor()
3473 {
3474 hash_table_dtor(this->variable_ht);
3475 }
3476
3477
3478 void
3479 vec4_visitor::fail(const char *format, ...)
3480 {
3481 va_list va;
3482 char *msg;
3483
3484 if (failed)
3485 return;
3486
3487 failed = true;
3488
3489 va_start(va, format);
3490 msg = ralloc_vasprintf(mem_ctx, format, va);
3491 va_end(va);
3492 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3493
3494 this->fail_msg = msg;
3495
3496 if (debug_flag) {
3497 fprintf(stderr, "%s", msg);
3498 }
3499 }
3500
3501 } /* namespace brw */