i965: Use unreachable() instead of unconditional assert().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(uint32_t predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
221 {
222 vec4_instruction *inst;
223
224 /* original gen4 does type conversion to the destination type
225 * before comparison, producing garbage results for floating
226 * point comparisons.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
242
243 vec4_instruction *
244 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
245 {
246 vec4_instruction *inst;
247
248 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
249 dst, index);
250 inst->base_mrf = 14;
251 inst->mlen = 2;
252
253 return inst;
254 }
255
256 vec4_instruction *
257 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
258 const src_reg &index)
259 {
260 vec4_instruction *inst;
261
262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
263 dst, src, index);
264 inst->base_mrf = 13;
265 inst->mlen = 3;
266
267 return inst;
268 }
269
270 void
271 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
272 {
273 static enum opcode dot_opcodes[] = {
274 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
275 };
276
277 emit(dot_opcodes[elements - 2], dst, src0, src1);
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(src_reg src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(MOV(expanded, src));
303 return src_reg(expanded);
304 }
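
/* Illustrative sketch (illustration only; register names are made up): a MAD
 * whose first source is a fully swizzled vec4 uniform, e.g.
 *
 *    mad dst, u0.xyzw, g1, g2
 *
 * cannot consume the uniform directly, so fix_3src_operand() first emits
 *
 *    mov tmpN, u0.xyzw
 *    mad dst, tmpN, g1, g2
 *
 * where tmpN is a fresh GRF temporary. A uniform with a single-value swizzle
 * (e.g. u0.xxxx) is returned unchanged by the early-out above.
 */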
305
306 src_reg
307 vec4_visitor::fix_math_operand(src_reg src)
308 {
309 /* The gen6 math instruction ignores the source modifiers --
310 * swizzle, abs, negate, and at least some parts of the register
311 * region description.
312 *
313 * Rather than trying to enumerate all these cases, *always* expand the
314 * operand to a temp GRF for gen6.
315 *
316 * For gen7, keep the operand as-is, except if immediate, which gen7 still
317 * can't use.
318 */
319
320 if (brw->gen == 7 && src.file != IMM)
321 return src;
322
323 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
324 expanded.type = src.type;
325 emit(MOV(expanded, src));
326 return src_reg(expanded);
327 }
328
329 void
330 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
331 {
332 src = fix_math_operand(src);
333
334 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
335 /* The gen6 math instruction must be align1, so we can't do
336 * writemasks.
337 */
338 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
339
340 emit(opcode, temp_dst, src);
341
342 emit(MOV(dst, src_reg(temp_dst)));
343 } else {
344 emit(opcode, dst, src);
345 }
346 }
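
/* For instance (illustration only): on gen6, "math rcp dst.x, src" cannot be
 * emitted as-is because the math instruction must run in align1 mode, where
 * the vec4 writemask cannot be expressed, so the code above computes into a
 * full vec4 temporary and then applies the writemask with "mov dst.x, tmp".
 */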
347
348 void
349 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
350 {
351 vec4_instruction *inst = emit(opcode, dst, src);
352 inst->base_mrf = 1;
353 inst->mlen = 1;
354 }
355
356 void
357 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
358 {
359 switch (opcode) {
360 case SHADER_OPCODE_RCP:
361 case SHADER_OPCODE_RSQ:
362 case SHADER_OPCODE_SQRT:
363 case SHADER_OPCODE_EXP2:
364 case SHADER_OPCODE_LOG2:
365 case SHADER_OPCODE_SIN:
366 case SHADER_OPCODE_COS:
367 break;
368 default:
369 unreachable("not reached: bad math opcode");
370 }
371
372 if (brw->gen >= 8) {
373 emit(opcode, dst, src);
374 } else if (brw->gen >= 6) {
375 emit_math1_gen6(opcode, dst, src);
376 } else {
377 emit_math1_gen4(opcode, dst, src);
378 }
379 }
380
381 void
382 vec4_visitor::emit_math2_gen6(enum opcode opcode,
383 dst_reg dst, src_reg src0, src_reg src1)
384 {
385 src0 = fix_math_operand(src0);
386 src1 = fix_math_operand(src1);
387
388 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
389 /* The gen6 math instruction must be align1, so we can't do
390 * writemasks.
391 */
392 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
393 temp_dst.type = dst.type;
394
395 emit(opcode, temp_dst, src0, src1);
396
397 emit(MOV(dst, src_reg(temp_dst)));
398 } else {
399 emit(opcode, dst, src0, src1);
400 }
401 }
402
403 void
404 vec4_visitor::emit_math2_gen4(enum opcode opcode,
405 dst_reg dst, src_reg src0, src_reg src1)
406 {
407 vec4_instruction *inst = emit(opcode, dst, src0, src1);
408 inst->base_mrf = 1;
409 inst->mlen = 2;
410 }
411
412 void
413 vec4_visitor::emit_math(enum opcode opcode,
414 dst_reg dst, src_reg src0, src_reg src1)
415 {
416 switch (opcode) {
417 case SHADER_OPCODE_POW:
418 case SHADER_OPCODE_INT_QUOTIENT:
419 case SHADER_OPCODE_INT_REMAINDER:
420 break;
421 default:
422 unreachable("not reached: unsupported binary math opcode");
423 }
424
425 if (brw->gen >= 8) {
426 emit(opcode, dst, src0, src1);
427 } else if (brw->gen >= 6) {
428 emit_math2_gen6(opcode, dst, src0, src1);
429 } else {
430 emit_math2_gen4(opcode, dst, src0, src1);
431 }
432 }
433
434 void
435 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
436 {
437 if (brw->gen < 7) {
438 unreachable("ir_unop_pack_half_2x16 should be lowered");
439 }
440
441 assert(dst.type == BRW_REGISTER_TYPE_UD);
442 assert(src0.type == BRW_REGISTER_TYPE_F);
443
444 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
445 *
446 * Because this instruction does not have a 16-bit floating-point type,
447 * the destination data type must be Word (W).
448 *
449 * The destination must be DWord-aligned and specify a horizontal stride
450 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
451 * each destination channel and the upper word is not modified.
452 *
453 * The above restriction implies that the f32to16 instruction must use
454 * align1 mode, because only in align1 mode is it possible to specify
455 * horizontal stride. We choose here to defy the hardware docs and emit
456 * align16 instructions.
457 *
458 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
459 * instructions. I was partially successful in that the code passed all
460 * tests. However, the code was dubiously correct and fragile, and the
461 * tests were not harsh enough to probe that frailty. Not trusting the
462 * code, I chose instead to remain in align16 mode in defiance of the hw
463 * docs).
464 *
465 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
466 * simulator, emitting a f32to16 in align16 mode with UD as destination
467 * data type is safe. The behavior differs from that specified in the PRM
468 * in that the upper word of each destination channel is cleared to 0.
469 */
470
471 dst_reg tmp_dst(this, glsl_type::uvec2_type);
472 src_reg tmp_src(tmp_dst);
473
474 #if 0
475 /* Verify the undocumented behavior on which the following instructions
476 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
477 * then the result of the bit-or instruction below will be incorrect.
478 *
479 * You should inspect the disasm output in order to verify that the MOV is
480 * not optimized away.
481 */
482 emit(MOV(tmp_dst, src_reg(0x12345678u)));
483 #endif
484
485 /* Give tmp the form below, where "." means untouched.
486 *
487 * w z y x w z y x
488 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
489 *
490 * That the upper word of each write-channel be 0 is required for the
491 * following bit-shift and bit-or instructions to work. Note that this
492 * relies on the undocumented hardware behavior mentioned above.
493 */
494 tmp_dst.writemask = WRITEMASK_XY;
495 emit(F32TO16(tmp_dst, src0));
496
497 /* Give the write-channels of dst the form:
498 * 0xhhhh0000
499 */
500 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
501 emit(SHL(dst, tmp_src, src_reg(16u)));
502
503 /* Finally, give the write-channels of dst the form of packHalf2x16's
504 * output:
505 * 0xhhhhllll
506 */
507 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
508 emit(OR(dst, src_reg(dst), tmp_src));
509 }
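
/* Worked example (illustration only): packing vec2(1.0, 1.0), where
 * half(1.0) == 0x3c00, the sequence above produces
 *
 *    f32to16  tmp.xy, src          tmp.x = tmp.y = 0x00003c00
 *    shl      dst, tmp.yyyy, 16    dst channels  = 0x3c000000
 *    or       dst, dst, tmp.xxxx   dst channels  = 0x3c003c00
 *
 * matching packHalf2x16(vec2(1.0, 1.0)) == 0x3c003c00u.
 */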
510
511 void
512 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
513 {
514 if (brw->gen < 7) {
515 unreachable("ir_unop_unpack_half_2x16 should be lowered");
516 }
517
518 assert(dst.type == BRW_REGISTER_TYPE_F);
519 assert(src0.type == BRW_REGISTER_TYPE_UD);
520
521 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
522 *
523 * Because this instruction does not have a 16-bit floating-point type,
524 * the source data type must be Word (W). The destination type must be
525 * F (Float).
526 *
527 * To use W as the source data type, we must adjust horizontal strides,
528 * which is only possible in align1 mode. All my [chadv] attempts at
529 * emitting align1 instructions for unpackHalf2x16 failed to pass the
530 * Piglit tests, so I gave up.
531 *
532 * I've verified that, on gen7 hardware and the simulator, it is safe to
533 * emit f16to32 in align16 mode with UD as source data type.
534 */
535
536 dst_reg tmp_dst(this, glsl_type::uvec2_type);
537 src_reg tmp_src(tmp_dst);
538
539 tmp_dst.writemask = WRITEMASK_X;
540 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
541
542 tmp_dst.writemask = WRITEMASK_Y;
543 emit(SHR(tmp_dst, src0, src_reg(16u)));
544
545 dst.writemask = WRITEMASK_XY;
546 emit(F16TO32(dst, tmp_src));
547 }
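
/* Worked example (illustration only): unpacking 0x40003c00, i.e. half(1.0)
 * in the low word and half(2.0) in the high word:
 *
 *    and      tmp.x, src, 0xffff   tmp.x  = 0x00003c00
 *    shr      tmp.y, src, 16       tmp.y  = 0x00004000
 *    f16to32  dst.xy, tmp          dst.xy = (1.0, 2.0)
 *
 * matching unpackHalf2x16(0x40003c00u) == vec2(1.0, 2.0).
 */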
548
549 void
550 vec4_visitor::visit_instructions(const exec_list *list)
551 {
552 foreach_in_list(ir_instruction, ir, list) {
553 base_ir = ir;
554 ir->accept(this);
555 }
556 }
557
558
559 static int
560 type_size(const struct glsl_type *type)
561 {
562 unsigned int i;
563 int size;
564
565 switch (type->base_type) {
566 case GLSL_TYPE_UINT:
567 case GLSL_TYPE_INT:
568 case GLSL_TYPE_FLOAT:
569 case GLSL_TYPE_BOOL:
570 if (type->is_matrix()) {
571 return type->matrix_columns;
572 } else {
573 /* Regardless of size of vector, it gets a vec4. This is bad
574 * packing for things like floats, but otherwise arrays become a
575 * mess. Hopefully a later pass over the code can pack scalars
576 * down if appropriate.
577 */
578 return 1;
579 }
580 case GLSL_TYPE_ARRAY:
581 assert(type->length > 0);
582 return type_size(type->fields.array) * type->length;
583 case GLSL_TYPE_STRUCT:
584 size = 0;
585 for (i = 0; i < type->length; i++) {
586 size += type_size(type->fields.structure[i].type);
587 }
588 return size;
589 case GLSL_TYPE_SAMPLER:
590 /* Samplers take up one slot in UNIFORMS[], but they're baked in
591 * at link time.
592 */
593 return 1;
594 case GLSL_TYPE_ATOMIC_UINT:
595 return 0;
596 case GLSL_TYPE_IMAGE:
597 case GLSL_TYPE_VOID:
598 case GLSL_TYPE_ERROR:
599 case GLSL_TYPE_INTERFACE:
600 unreachable("not reached");
601 }
602
603 return 0;
604 }
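
/* Example of the slot counting above (illustration only; the struct is made
 * up):
 *
 *    struct S { vec3 v; float f[2]; mat3 m; };
 *
 * takes 1 (vec3 padded to a vec4) + 2 (one vec4 per array element)
 * + 3 (one vec4 per matrix column) = 6 vec4 uniform slots.
 */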
605
606 int
607 vec4_visitor::virtual_grf_alloc(int size)
608 {
609 if (virtual_grf_array_size <= virtual_grf_count) {
610 if (virtual_grf_array_size == 0)
611 virtual_grf_array_size = 16;
612 else
613 virtual_grf_array_size *= 2;
614 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
615 virtual_grf_array_size);
616 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
617 virtual_grf_array_size);
618 }
619 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
620 virtual_grf_reg_count += size;
621 virtual_grf_sizes[virtual_grf_count] = size;
622 return virtual_grf_count++;
623 }
624
625 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
626 {
627 init();
628
629 this->file = GRF;
630 this->reg = v->virtual_grf_alloc(type_size(type));
631
632 if (type->is_array() || type->is_record()) {
633 this->swizzle = BRW_SWIZZLE_NOOP;
634 } else {
635 this->swizzle = swizzle_for_size(type->vector_elements);
636 }
637
638 this->type = brw_type_for_base_type(type);
639 }
640
641 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
642 {
643 init();
644
645 this->file = GRF;
646 this->reg = v->virtual_grf_alloc(type_size(type));
647
648 if (type->is_array() || type->is_record()) {
649 this->writemask = WRITEMASK_XYZW;
650 } else {
651 this->writemask = (1 << type->vector_elements) - 1;
652 }
653
654 this->type = brw_type_for_base_type(type);
655 }
656
657 /* Our support for uniforms is piggy-backed on the struct
658 * gl_fragment_program, because that's where the values actually
659 * get stored, rather than in some global gl_shader_program uniform
660 * store.
661 */
662 void
663 vec4_visitor::setup_uniform_values(ir_variable *ir)
664 {
665 int namelen = strlen(ir->name);
666
667 /* The data for our (non-builtin) uniforms is stored in a series of
668 * gl_uniform_driver_storage structs for each subcomponent that
669 * glGetUniformLocation() could name. We know it's been set up in the same
670 * order we'd walk the type, so walk the list of storage and find anything
671 * with our name, or the prefix of a component that starts with our name.
672 */
673 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
674 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
675
676 if (strncmp(ir->name, storage->name, namelen) != 0 ||
677 (storage->name[namelen] != 0 &&
678 storage->name[namelen] != '.' &&
679 storage->name[namelen] != '[')) {
680 continue;
681 }
682
683 gl_constant_value *components = storage->storage;
684 unsigned vector_count = (MAX2(storage->array_elements, 1) *
685 storage->type->matrix_columns);
686
687 for (unsigned s = 0; s < vector_count; s++) {
688 assert(uniforms < uniform_array_size);
689 uniform_vector_size[uniforms] = storage->type->vector_elements;
690
691 int i;
692 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
693 stage_prog_data->param[uniforms * 4 + i] = &components->f;
694 components++;
695 }
696 for (; i < 4; i++) {
697 static float zero = 0;
698 stage_prog_data->param[uniforms * 4 + i] = &zero;
699 }
700
701 uniforms++;
702 }
703 }
704 }
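
/* Example (illustration only): a user uniform declared "uniform mat2 foo[3];"
 * appears as one gl_uniform_storage entry with array_elements = 3 and
 * matrix_columns = 2, so the loop above registers 3 * 2 = 6 uniform vectors,
 * each with uniform_vector_size = 2 and the unused third and fourth params
 * pointing at the shared zero constant.
 */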
705
706 void
707 vec4_visitor::setup_uniform_clipplane_values()
708 {
709 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
710
711 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
712 assert(this->uniforms < uniform_array_size);
713 this->uniform_vector_size[this->uniforms] = 4;
714 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
715 this->userplane[i].type = BRW_REGISTER_TYPE_F;
716 for (int j = 0; j < 4; ++j) {
717 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
718 }
719 ++this->uniforms;
720 }
721 }
722
723 /* Our support for builtin uniforms is even scarier than non-builtin.
724 * It sits on top of the PROG_STATE_VAR parameters that are
725 * automatically updated from GL context state.
726 */
727 void
728 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
729 {
730 const ir_state_slot *const slots = ir->state_slots;
731 assert(ir->state_slots != NULL);
732
733 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
734 /* This state reference has already been setup by ir_to_mesa,
735 * but we'll get the same index back here. We can reference
736 * ParameterValues directly, since unlike brw_fs.cpp, we never
737 * add new state references during compile.
738 */
739 int index = _mesa_add_state_reference(this->prog->Parameters,
740 (gl_state_index *)slots[i].tokens);
741 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
742
743 assert(this->uniforms < uniform_array_size);
744 this->uniform_vector_size[this->uniforms] = 0;
745 /* Add each of the unique swizzled channels of the element.
746 * This will end up matching the size of the glsl_type of this field.
747 */
748 int last_swiz = -1;
749 for (unsigned int j = 0; j < 4; j++) {
750 int swiz = GET_SWZ(slots[i].swizzle, j);
751 last_swiz = swiz;
752
753 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
754 assert(this->uniforms < uniform_array_size);
755 if (swiz <= last_swiz)
756 this->uniform_vector_size[this->uniforms]++;
757 }
758 this->uniforms++;
759 }
760 }
761
762 dst_reg *
763 vec4_visitor::variable_storage(ir_variable *var)
764 {
765 return (dst_reg *)hash_table_find(this->variable_ht, var);
766 }
767
768 void
769 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
770 {
771 ir_expression *expr = ir->as_expression();
772
773 *predicate = BRW_PREDICATE_NORMAL;
774
775 if (expr) {
776 src_reg op[2];
777 vec4_instruction *inst;
778
779 assert(expr->get_num_operands() <= 2);
780 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
781 expr->operands[i]->accept(this);
782 op[i] = this->result;
783
784 resolve_ud_negate(&op[i]);
785 }
786
787 switch (expr->operation) {
788 case ir_unop_logic_not:
789 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
790 inst->conditional_mod = BRW_CONDITIONAL_Z;
791 break;
792
793 case ir_binop_logic_xor:
794 inst = emit(XOR(dst_null_d(), op[0], op[1]));
795 inst->conditional_mod = BRW_CONDITIONAL_NZ;
796 break;
797
798 case ir_binop_logic_or:
799 inst = emit(OR(dst_null_d(), op[0], op[1]));
800 inst->conditional_mod = BRW_CONDITIONAL_NZ;
801 break;
802
803 case ir_binop_logic_and:
804 inst = emit(AND(dst_null_d(), op[0], op[1]));
805 inst->conditional_mod = BRW_CONDITIONAL_NZ;
806 break;
807
808 case ir_unop_f2b:
809 if (brw->gen >= 6) {
810 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
811 } else {
812 inst = emit(MOV(dst_null_f(), op[0]));
813 inst->conditional_mod = BRW_CONDITIONAL_NZ;
814 }
815 break;
816
817 case ir_unop_i2b:
818 if (brw->gen >= 6) {
819 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
820 } else {
821 inst = emit(MOV(dst_null_d(), op[0]));
822 inst->conditional_mod = BRW_CONDITIONAL_NZ;
823 }
824 break;
825
826 case ir_binop_all_equal:
827 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
828 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
829 break;
830
831 case ir_binop_any_nequal:
832 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
833 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
834 break;
835
836 case ir_unop_any:
837 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
838 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
839 break;
840
841 case ir_binop_greater:
842 case ir_binop_gequal:
843 case ir_binop_less:
844 case ir_binop_lequal:
845 case ir_binop_equal:
846 case ir_binop_nequal:
847 emit(CMP(dst_null_d(), op[0], op[1],
848 brw_conditional_for_comparison(expr->operation)));
849 break;
850
851 default:
852 unreachable("not reached");
853 }
854 return;
855 }
856
857 ir->accept(this);
858
859 resolve_ud_negate(&this->result);
860
861 if (brw->gen >= 6) {
862 vec4_instruction *inst = emit(AND(dst_null_d(),
863 this->result, src_reg(1)));
864 inst->conditional_mod = BRW_CONDITIONAL_NZ;
865 } else {
866 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
867 inst->conditional_mod = BRW_CONDITIONAL_NZ;
868 }
869 }
870
871 /**
872 * Emit a gen6 IF statement with the comparison folded into the IF
873 * instruction.
874 */
875 void
876 vec4_visitor::emit_if_gen6(ir_if *ir)
877 {
878 ir_expression *expr = ir->condition->as_expression();
879
880 if (expr) {
881 src_reg op[2];
882 dst_reg temp;
883
884 assert(expr->get_num_operands() <= 2);
885 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
886 expr->operands[i]->accept(this);
887 op[i] = this->result;
888 }
889
890 switch (expr->operation) {
891 case ir_unop_logic_not:
892 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
893 return;
894
895 case ir_binop_logic_xor:
896 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
897 return;
898
899 case ir_binop_logic_or:
900 temp = dst_reg(this, glsl_type::bool_type);
901 emit(OR(temp, op[0], op[1]));
902 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
903 return;
904
905 case ir_binop_logic_and:
906 temp = dst_reg(this, glsl_type::bool_type);
907 emit(AND(temp, op[0], op[1]));
908 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
909 return;
910
911 case ir_unop_f2b:
912 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
913 return;
914
915 case ir_unop_i2b:
916 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
917 return;
918
919 case ir_binop_greater:
920 case ir_binop_gequal:
921 case ir_binop_less:
922 case ir_binop_lequal:
923 case ir_binop_equal:
924 case ir_binop_nequal:
925 emit(IF(op[0], op[1],
926 brw_conditional_for_comparison(expr->operation)));
927 return;
928
929 case ir_binop_all_equal:
930 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
931 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
932 return;
933
934 case ir_binop_any_nequal:
935 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
936 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
937 return;
938
939 case ir_unop_any:
940 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
941 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
942 return;
943
944 default:
945 unreachable("not reached");
946 }
947 return;
948 }
949
950 ir->condition->accept(this);
951
952 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
953 }
954
955 void
956 vec4_visitor::visit(ir_variable *ir)
957 {
958 dst_reg *reg = NULL;
959
960 if (variable_storage(ir))
961 return;
962
963 switch (ir->data.mode) {
964 case ir_var_shader_in:
965 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
966 break;
967
968 case ir_var_shader_out:
969 reg = new(mem_ctx) dst_reg(this, ir->type);
970
971 for (int i = 0; i < type_size(ir->type); i++) {
972 output_reg[ir->data.location + i] = *reg;
973 output_reg[ir->data.location + i].reg_offset = i;
974 output_reg[ir->data.location + i].type =
975 brw_type_for_base_type(ir->type->get_scalar_type());
976 output_reg_annotation[ir->data.location + i] = ir->name;
977 }
978 break;
979
980 case ir_var_auto:
981 case ir_var_temporary:
982 reg = new(mem_ctx) dst_reg(this, ir->type);
983 break;
984
985 case ir_var_uniform:
986 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
987
988 /* Thanks to the lower_ubo_reference pass, we will see only
989 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
990 * variables, so no need for them to be in variable_ht.
991 *
992 * Atomic counters take no uniform storage, no need to do
993 * anything here.
994 */
995 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
996 return;
997
998 /* Track how big the whole uniform variable is, in case we need to put a
999 * copy of its data into pull constants for array access.
1000 */
1001 assert(this->uniforms < uniform_array_size);
1002 this->uniform_size[this->uniforms] = type_size(ir->type);
1003
1004 if (!strncmp(ir->name, "gl_", 3)) {
1005 setup_builtin_uniform_values(ir);
1006 } else {
1007 setup_uniform_values(ir);
1008 }
1009 break;
1010
1011 case ir_var_system_value:
1012 reg = make_reg_for_system_value(ir);
1013 break;
1014
1015 default:
1016 unreachable("not reached");
1017 }
1018
1019 reg->type = brw_type_for_base_type(ir->type);
1020 hash_table_insert(this->variable_ht, reg, ir);
1021 }
1022
1023 void
1024 vec4_visitor::visit(ir_loop *ir)
1025 {
1026 /* We don't want debugging output to print the whole body of the
1027 * loop as the annotation.
1028 */
1029 this->base_ir = NULL;
1030
1031 emit(BRW_OPCODE_DO);
1032
1033 visit_instructions(&ir->body_instructions);
1034
1035 emit(BRW_OPCODE_WHILE);
1036 }
1037
1038 void
1039 vec4_visitor::visit(ir_loop_jump *ir)
1040 {
1041 switch (ir->mode) {
1042 case ir_loop_jump::jump_break:
1043 emit(BRW_OPCODE_BREAK);
1044 break;
1045 case ir_loop_jump::jump_continue:
1046 emit(BRW_OPCODE_CONTINUE);
1047 break;
1048 }
1049 }
1050
1051
1052 void
1053 vec4_visitor::visit(ir_function_signature *)
1054 {
1055 unreachable("not reached");
1056 }
1057
1058 void
1059 vec4_visitor::visit(ir_function *ir)
1060 {
1061 /* Ignore function bodies other than main() -- we shouldn't see calls to
1062 * them since they should all be inlined.
1063 */
1064 if (strcmp(ir->name, "main") == 0) {
1065 const ir_function_signature *sig;
1066 exec_list empty;
1067
1068 sig = ir->matching_signature(NULL, &empty);
1069
1070 assert(sig);
1071
1072 visit_instructions(&sig->body);
1073 }
1074 }
1075
1076 bool
1077 vec4_visitor::try_emit_sat(ir_expression *ir)
1078 {
1079 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1080 if (!sat_src)
1081 return false;
1082
1083 sat_src->accept(this);
1084 src_reg src = this->result;
1085
1086 this->result = src_reg(this, ir->type);
1087 vec4_instruction *inst;
1088 inst = emit(MOV(dst_reg(this->result), src));
1089 inst->saturate = true;
1090
1091 return true;
1092 }
1093
1094 bool
1095 vec4_visitor::try_emit_mad(ir_expression *ir)
1096 {
1097 /* 3-src instructions were introduced in gen6. */
1098 if (brw->gen < 6)
1099 return false;
1100
1101 /* MAD can only handle floating-point data. */
1102 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1103 return false;
1104
1105 ir_rvalue *nonmul = ir->operands[1];
1106 ir_expression *mul = ir->operands[0]->as_expression();
1107
1108 if (!mul || mul->operation != ir_binop_mul) {
1109 nonmul = ir->operands[0];
1110 mul = ir->operands[1]->as_expression();
1111
1112 if (!mul || mul->operation != ir_binop_mul)
1113 return false;
1114 }
1115
1116 nonmul->accept(this);
1117 src_reg src0 = fix_3src_operand(this->result);
1118
1119 mul->operands[0]->accept(this);
1120 src_reg src1 = fix_3src_operand(this->result);
1121
1122 mul->operands[1]->accept(this);
1123 src_reg src2 = fix_3src_operand(this->result);
1124
1125 this->result = src_reg(this, ir->type);
1126 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1127
1128 return true;
1129 }
1130
1131 bool
1132 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1133 {
1134 ir_expression *const cmp = ir->operands[0]->as_expression();
1135
1136 if (cmp == NULL)
1137 return false;
1138
1139 switch (cmp->operation) {
1140 case ir_binop_less:
1141 case ir_binop_greater:
1142 case ir_binop_lequal:
1143 case ir_binop_gequal:
1144 case ir_binop_equal:
1145 case ir_binop_nequal:
1146 break;
1147
1148 default:
1149 return false;
1150 }
1151
1152 cmp->operands[0]->accept(this);
1153 const src_reg cmp_src0 = this->result;
1154
1155 cmp->operands[1]->accept(this);
1156 const src_reg cmp_src1 = this->result;
1157
1158 this->result = src_reg(this, ir->type);
1159
1160 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1161 brw_conditional_for_comparison(cmp->operation)));
1162
1163 /* If the comparison is false, this->result will just happen to be zero.
1164 */
1165 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1166 this->result, src_reg(1.0f));
1167 inst->predicate = BRW_PREDICATE_NORMAL;
1168 inst->predicate_inverse = true;
1169
1170 return true;
1171 }
1172
1173 void
1174 vec4_visitor::emit_bool_comparison(unsigned int op,
1175 dst_reg dst, src_reg src0, src_reg src1)
1176 {
1177 /* original gen4 does destination conversion before comparison. */
1178 if (brw->gen < 5)
1179 dst.type = src0.type;
1180
1181 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1182
1183 dst.type = BRW_REGISTER_TYPE_D;
1184 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1185 }
1186
1187 void
1188 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1189 src_reg src0, src_reg src1)
1190 {
1191 vec4_instruction *inst;
1192
1193 if (brw->gen >= 6) {
1194 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1195 inst->conditional_mod = conditionalmod;
1196 } else {
1197 emit(CMP(dst, src0, src1, conditionalmod));
1198
1199 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1200 inst->predicate = BRW_PREDICATE_NORMAL;
1201 }
1202 }
1203
1204 void
1205 vec4_visitor::emit_lrp(const dst_reg &dst,
1206 const src_reg &x, const src_reg &y, const src_reg &a)
1207 {
1208 if (brw->gen >= 6) {
1209 /* Note that the instruction's argument order is reversed from GLSL
1210 * and the IR.
1211 */
1212 emit(LRP(dst,
1213 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1214 } else {
1215 /* Earlier generations don't support three source operations, so we
1216 * need to emit x*(1-a) + y*a.
1217 */
1218 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1219 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1220 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1221 y_times_a.writemask = dst.writemask;
1222 one_minus_a.writemask = dst.writemask;
1223 x_times_one_minus_a.writemask = dst.writemask;
1224
1225 emit(MUL(y_times_a, y, a));
1226 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1227 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1228 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1229 }
1230 }
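
/* Sanity check of the pre-gen6 expansion above (illustration only): for
 * x = 2.0, y = 10.0, a = 0.25 the emitted sequence computes
 *
 *    y_times_a           = 10.0 * 0.25  = 2.5
 *    one_minus_a         = -0.25 + 1.0  = 0.75
 *    x_times_one_minus_a = 2.0 * 0.75   = 1.5
 *    dst                 = 1.5 + 2.5    = 4.0
 *
 * which equals mix(2.0, 10.0, 0.25), the value the gen6+ LRP instruction
 * computes directly.
 */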
1231
1232 void
1233 vec4_visitor::visit(ir_expression *ir)
1234 {
1235 unsigned int operand;
1236 src_reg op[Elements(ir->operands)];
1237 src_reg result_src;
1238 dst_reg result_dst;
1239 vec4_instruction *inst;
1240
1241 if (try_emit_sat(ir))
1242 return;
1243
1244 if (ir->operation == ir_binop_add) {
1245 if (try_emit_mad(ir))
1246 return;
1247 }
1248
1249 if (ir->operation == ir_unop_b2f) {
1250 if (try_emit_b2f_of_compare(ir))
1251 return;
1252 }
1253
1254 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1255 this->result.file = BAD_FILE;
1256 ir->operands[operand]->accept(this);
1257 if (this->result.file == BAD_FILE) {
1258 fprintf(stderr, "Failed to get tree for expression operand:\n");
1259 ir->operands[operand]->fprint(stderr);
1260 exit(1);
1261 }
1262 op[operand] = this->result;
1263
1264 /* Matrix expression operands should have been broken down to vector
1265 * operations already.
1266 */
1267 assert(!ir->operands[operand]->type->is_matrix());
1268 }
1269
1270 int vector_elements = ir->operands[0]->type->vector_elements;
1271 if (ir->operands[1]) {
1272 vector_elements = MAX2(vector_elements,
1273 ir->operands[1]->type->vector_elements);
1274 }
1275
1276 this->result.file = BAD_FILE;
1277
1278 /* Storage for our result. Ideally for an assignment we'd be using
1279 * the actual storage for the result here, instead.
1280 */
1281 result_src = src_reg(this, ir->type);
1282 /* convenience for the emit functions below. */
1283 result_dst = dst_reg(result_src);
1284 /* If nothing special happens, this is the result. */
1285 this->result = result_src;
1286 /* Limit writes to the channels that will be used by result_src later.
1287 * This does limit this temp's use as a temporary for multi-instruction
1288 * sequences.
1289 */
1290 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1291
1292 switch (ir->operation) {
1293 case ir_unop_logic_not:
1294 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1295 * the one's complement of the whole register, not just bit 0.
1296 */
1297 emit(XOR(result_dst, op[0], src_reg(1)));
1298 break;
1299 case ir_unop_neg:
1300 op[0].negate = !op[0].negate;
1301 emit(MOV(result_dst, op[0]));
1302 break;
1303 case ir_unop_abs:
1304 op[0].abs = true;
1305 op[0].negate = false;
1306 emit(MOV(result_dst, op[0]));
1307 break;
1308
1309 case ir_unop_sign:
1310 if (ir->type->is_float()) {
1311 /* AND(val, 0x80000000) gives the sign bit.
1312 *
1313 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1314 * zero.
1315 */
1316 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1317
1318 op[0].type = BRW_REGISTER_TYPE_UD;
1319 result_dst.type = BRW_REGISTER_TYPE_UD;
1320 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1321
1322 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1323 inst->predicate = BRW_PREDICATE_NORMAL;
1324
1325 this->result.type = BRW_REGISTER_TYPE_F;
1326 } else {
1327 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1328 * -> non-negative val generates 0x00000000.
1329 * Predicated OR sets 1 if val is positive.
1330 */
1331 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1332
1333 emit(ASR(result_dst, op[0], src_reg(31)));
1334
1335 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1336 inst->predicate = BRW_PREDICATE_NORMAL;
1337 }
1338 break;
1339
1340 case ir_unop_rcp:
1341 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1342 break;
1343
1344 case ir_unop_exp2:
1345 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1346 break;
1347 case ir_unop_log2:
1348 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1349 break;
1350 case ir_unop_exp:
1351 case ir_unop_log:
1352 unreachable("not reached: should be handled by ir_explog_to_explog2");
1353 case ir_unop_sin:
1354 case ir_unop_sin_reduced:
1355 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1356 break;
1357 case ir_unop_cos:
1358 case ir_unop_cos_reduced:
1359 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1360 break;
1361
1362 case ir_unop_dFdx:
1363 case ir_unop_dFdy:
1364 unreachable("derivatives not valid in vertex shader");
1365
1366 case ir_unop_bitfield_reverse:
1367 emit(BFREV(result_dst, op[0]));
1368 break;
1369 case ir_unop_bit_count:
1370 emit(CBIT(result_dst, op[0]));
1371 break;
1372 case ir_unop_find_msb: {
1373 src_reg temp = src_reg(this, glsl_type::uint_type);
1374
1375 inst = emit(FBH(dst_reg(temp), op[0]));
1376 inst->dst.writemask = WRITEMASK_XYZW;
1377
1378 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1379 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1380 * subtract the result from 31 to convert the MSB count into an LSB count.
1381 */
1382
1383 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1384 temp.swizzle = BRW_SWIZZLE_NOOP;
1385 emit(MOV(result_dst, temp));
1386
1387 src_reg src_tmp = src_reg(result_dst);
1388 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1389
1390 src_tmp.negate = true;
1391 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1392 inst->predicate = BRW_PREDICATE_NORMAL;
1393 break;
1394 }
1395 case ir_unop_find_lsb:
1396 emit(FBL(result_dst, op[0]));
1397 break;
1398
1399 case ir_unop_noise:
1400 unreachable("not reached: should be handled by lower_noise");
1401
1402 case ir_binop_add:
1403 emit(ADD(result_dst, op[0], op[1]));
1404 break;
1405 case ir_binop_sub:
1406 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1407
1408 case ir_binop_mul:
1409 if (brw->gen < 8 && ir->type->is_integer()) {
1410 /* For integer multiplication, the MUL uses the low 16 bits of one of
1411 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1412 * accumulates in the contribution of the upper 16 bits of that
1413 * operand. If we can determine that one of the args is in the low
1414 * 16 bits, though, we can just emit a single MUL.
1415 */
1416 if (ir->operands[0]->is_uint16_constant()) {
1417 if (brw->gen < 7)
1418 emit(MUL(result_dst, op[0], op[1]));
1419 else
1420 emit(MUL(result_dst, op[1], op[0]));
1421 } else if (ir->operands[1]->is_uint16_constant()) {
1422 if (brw->gen < 7)
1423 emit(MUL(result_dst, op[1], op[0]));
1424 else
1425 emit(MUL(result_dst, op[0], op[1]));
1426 } else {
1427 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1428
1429 emit(MUL(acc, op[0], op[1]));
1430 emit(MACH(dst_null_d(), op[0], op[1]));
1431 emit(MOV(result_dst, src_reg(acc)));
1432 }
1433 } else {
1434 emit(MUL(result_dst, op[0], op[1]));
1435 }
1436 break;
1437 case ir_binop_imul_high: {
1438 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1439
1440 emit(MUL(acc, op[0], op[1]));
1441 emit(MACH(result_dst, op[0], op[1]));
1442 break;
1443 }
1444 case ir_binop_div:
1445 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1446 assert(ir->type->is_integer());
1447 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1448 break;
1449 case ir_binop_carry: {
1450 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1451
1452 emit(ADDC(dst_null_ud(), op[0], op[1]));
1453 emit(MOV(result_dst, src_reg(acc)));
1454 break;
1455 }
1456 case ir_binop_borrow: {
1457 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1458
1459 emit(SUBB(dst_null_ud(), op[0], op[1]));
1460 emit(MOV(result_dst, src_reg(acc)));
1461 break;
1462 }
1463 case ir_binop_mod:
1464 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1465 assert(ir->type->is_integer());
1466 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1467 break;
1468
1469 case ir_binop_less:
1470 case ir_binop_greater:
1471 case ir_binop_lequal:
1472 case ir_binop_gequal:
1473 case ir_binop_equal:
1474 case ir_binop_nequal: {
1475 emit(CMP(result_dst, op[0], op[1],
1476 brw_conditional_for_comparison(ir->operation)));
1477 emit(AND(result_dst, result_src, src_reg(0x1)));
1478 break;
1479 }
1480
1481 case ir_binop_all_equal:
1482 /* "==" operator producing a scalar boolean. */
1483 if (ir->operands[0]->type->is_vector() ||
1484 ir->operands[1]->type->is_vector()) {
1485 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1486 emit(MOV(result_dst, src_reg(0)));
1487 inst = emit(MOV(result_dst, src_reg(1)));
1488 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1489 } else {
1490 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1491 emit(AND(result_dst, result_src, src_reg(0x1)));
1492 }
1493 break;
1494 case ir_binop_any_nequal:
1495 /* "!=" operator producing a scalar boolean. */
1496 if (ir->operands[0]->type->is_vector() ||
1497 ir->operands[1]->type->is_vector()) {
1498 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1499
1500 emit(MOV(result_dst, src_reg(0)));
1501 inst = emit(MOV(result_dst, src_reg(1)));
1502 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1503 } else {
1504 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1505 emit(AND(result_dst, result_src, src_reg(0x1)));
1506 }
1507 break;
1508
1509 case ir_unop_any:
1510 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1511 emit(MOV(result_dst, src_reg(0)));
1512
1513 inst = emit(MOV(result_dst, src_reg(1)));
1514 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1515 break;
1516
1517 case ir_binop_logic_xor:
1518 emit(XOR(result_dst, op[0], op[1]));
1519 break;
1520
1521 case ir_binop_logic_or:
1522 emit(OR(result_dst, op[0], op[1]));
1523 break;
1524
1525 case ir_binop_logic_and:
1526 emit(AND(result_dst, op[0], op[1]));
1527 break;
1528
1529 case ir_binop_dot:
1530 assert(ir->operands[0]->type->is_vector());
1531 assert(ir->operands[0]->type == ir->operands[1]->type);
1532 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1533 break;
1534
1535 case ir_unop_sqrt:
1536 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1537 break;
1538 case ir_unop_rsq:
1539 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1540 break;
1541
1542 case ir_unop_bitcast_i2f:
1543 case ir_unop_bitcast_u2f:
1544 this->result = op[0];
1545 this->result.type = BRW_REGISTER_TYPE_F;
1546 break;
1547
1548 case ir_unop_bitcast_f2i:
1549 this->result = op[0];
1550 this->result.type = BRW_REGISTER_TYPE_D;
1551 break;
1552
1553 case ir_unop_bitcast_f2u:
1554 this->result = op[0];
1555 this->result.type = BRW_REGISTER_TYPE_UD;
1556 break;
1557
1558 case ir_unop_i2f:
1559 case ir_unop_i2u:
1560 case ir_unop_u2i:
1561 case ir_unop_u2f:
1562 case ir_unop_b2f:
1563 case ir_unop_b2i:
1564 case ir_unop_f2i:
1565 case ir_unop_f2u:
1566 emit(MOV(result_dst, op[0]));
1567 break;
1568 case ir_unop_f2b:
1569 case ir_unop_i2b: {
1570 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1571 emit(AND(result_dst, result_src, src_reg(1)));
1572 break;
1573 }
1574
1575 case ir_unop_trunc:
1576 emit(RNDZ(result_dst, op[0]));
1577 break;
1578 case ir_unop_ceil:
1579 op[0].negate = !op[0].negate;
1580 inst = emit(RNDD(result_dst, op[0]));
1581 this->result.negate = true;
1582 break;
1583 case ir_unop_floor:
1584 inst = emit(RNDD(result_dst, op[0]));
1585 break;
1586 case ir_unop_fract:
1587 inst = emit(FRC(result_dst, op[0]));
1588 break;
1589 case ir_unop_round_even:
1590 emit(RNDE(result_dst, op[0]));
1591 break;
1592
1593 case ir_binop_min:
1594 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1595 break;
1596 case ir_binop_max:
1597 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1598 break;
1599
1600 case ir_binop_pow:
1601 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1602 break;
1603
1604 case ir_unop_bit_not:
1605 inst = emit(NOT(result_dst, op[0]));
1606 break;
1607 case ir_binop_bit_and:
1608 inst = emit(AND(result_dst, op[0], op[1]));
1609 break;
1610 case ir_binop_bit_xor:
1611 inst = emit(XOR(result_dst, op[0], op[1]));
1612 break;
1613 case ir_binop_bit_or:
1614 inst = emit(OR(result_dst, op[0], op[1]));
1615 break;
1616
1617 case ir_binop_lshift:
1618 inst = emit(SHL(result_dst, op[0], op[1]));
1619 break;
1620
1621 case ir_binop_rshift:
1622 if (ir->type->base_type == GLSL_TYPE_INT)
1623 inst = emit(ASR(result_dst, op[0], op[1]));
1624 else
1625 inst = emit(SHR(result_dst, op[0], op[1]));
1626 break;
1627
1628 case ir_binop_bfm:
1629 emit(BFI1(result_dst, op[0], op[1]));
1630 break;
1631
1632 case ir_binop_ubo_load: {
1633 ir_constant *uniform_block = ir->operands[0]->as_constant();
1634 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1635 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1636 src_reg offset;
1637
1638 /* Now, load the vector from that offset. */
1639 assert(ir->type->is_vector() || ir->type->is_scalar());
1640
1641 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1642 packed_consts.type = result.type;
1643 src_reg surf_index =
1644 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1645 if (const_offset_ir) {
1646 if (brw->gen >= 8) {
1647 /* Store the offset in a GRF so we can send-from-GRF. */
1648 offset = src_reg(this, glsl_type::int_type);
1649 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1650 } else {
1651 /* Immediates are fine on older generations since they'll be moved
1652 * to a (potentially fake) MRF at the generator level.
1653 */
1654 offset = src_reg(const_offset / 16);
1655 }
1656 } else {
1657 offset = src_reg(this, glsl_type::uint_type);
1658 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1659 }
1660
1661 if (brw->gen >= 7) {
1662 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1663 grf_offset.type = offset.type;
1664
1665 emit(MOV(grf_offset, offset));
1666
1667 emit(new(mem_ctx) vec4_instruction(this,
1668 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1669 dst_reg(packed_consts),
1670 surf_index,
1671 src_reg(grf_offset)));
1672 } else {
1673 vec4_instruction *pull =
1674 emit(new(mem_ctx) vec4_instruction(this,
1675 VS_OPCODE_PULL_CONSTANT_LOAD,
1676 dst_reg(packed_consts),
1677 surf_index,
1678 offset));
1679 pull->base_mrf = 14;
1680 pull->mlen = 1;
1681 }
1682
1683 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1684 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1685 const_offset % 16 / 4,
1686 const_offset % 16 / 4,
1687 const_offset % 16 / 4);
1688
1689 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1690 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1691 emit(CMP(result_dst, packed_consts, src_reg(0u),
1692 BRW_CONDITIONAL_NZ));
1693 emit(AND(result_dst, result, src_reg(0x1)));
1694 } else {
1695 emit(MOV(result_dst, packed_consts));
1696 }
1697 break;
1698 }
1699
1700 case ir_binop_vector_extract:
1701 unreachable("should have been lowered by vec_index_to_cond_assign");
1702
1703 case ir_triop_fma:
1704 op[0] = fix_3src_operand(op[0]);
1705 op[1] = fix_3src_operand(op[1]);
1706 op[2] = fix_3src_operand(op[2]);
1707 /* Note that the instruction's argument order is reversed from GLSL
1708 * and the IR.
1709 */
1710 emit(MAD(result_dst, op[2], op[1], op[0]));
1711 break;
1712
1713 case ir_triop_lrp:
1714 emit_lrp(result_dst, op[0], op[1], op[2]);
1715 break;
1716
1717 case ir_triop_csel:
1718 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1719 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1720 inst->predicate = BRW_PREDICATE_NORMAL;
1721 break;
1722
1723 case ir_triop_bfi:
1724 op[0] = fix_3src_operand(op[0]);
1725 op[1] = fix_3src_operand(op[1]);
1726 op[2] = fix_3src_operand(op[2]);
1727 emit(BFI2(result_dst, op[0], op[1], op[2]));
1728 break;
1729
1730 case ir_triop_bitfield_extract:
1731 op[0] = fix_3src_operand(op[0]);
1732 op[1] = fix_3src_operand(op[1]);
1733 op[2] = fix_3src_operand(op[2]);
1734 /* Note that the instruction's argument order is reversed from GLSL
1735 * and the IR.
1736 */
1737 emit(BFE(result_dst, op[2], op[1], op[0]));
1738 break;
1739
1740 case ir_triop_vector_insert:
1741 unreachable("should have been lowered by lower_vector_insert");
1742
1743 case ir_quadop_bitfield_insert:
1744 unreachable("not reached: should be handled by "
1745 "bitfield_insert_to_bfm_bfi\n");
1746
1747 case ir_quadop_vector:
1748 unreachable("not reached: should be handled by lower_quadop_vector");
1749
1750 case ir_unop_pack_half_2x16:
1751 emit_pack_half_2x16(result_dst, op[0]);
1752 break;
1753 case ir_unop_unpack_half_2x16:
1754 emit_unpack_half_2x16(result_dst, op[0]);
1755 break;
1756 case ir_unop_pack_snorm_2x16:
1757 case ir_unop_pack_snorm_4x8:
1758 case ir_unop_pack_unorm_2x16:
1759 case ir_unop_pack_unorm_4x8:
1760 case ir_unop_unpack_snorm_2x16:
1761 case ir_unop_unpack_snorm_4x8:
1762 case ir_unop_unpack_unorm_2x16:
1763 case ir_unop_unpack_unorm_4x8:
1764 unreachable("not reached: should be handled by lower_packing_builtins");
1765 case ir_unop_unpack_half_2x16_split_x:
1766 case ir_unop_unpack_half_2x16_split_y:
1767 case ir_binop_pack_half_2x16_split:
1768 unreachable("not reached: should not occur in vertex shader");
1769 case ir_binop_ldexp:
1770 unreachable("not reached: should be handled by ldexp_to_arith()");
1771 }
1772 }
1773
1774
1775 void
1776 vec4_visitor::visit(ir_swizzle *ir)
1777 {
1778 src_reg src;
1779 int i = 0;
1780 int swizzle[4];
1781
1782 /* Note that this is only swizzles in expressions, not those on the left
1783 * hand side of an assignment, which do write masking. See ir_assignment
1784 * for that.
1785 */
1786
1787 ir->val->accept(this);
1788 src = this->result;
1789 assert(src.file != BAD_FILE);
1790
1791 for (i = 0; i < ir->type->vector_elements; i++) {
1792 switch (i) {
1793 case 0:
1794 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1795 break;
1796 case 1:
1797 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1798 break;
1799 case 2:
1800 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1801 break;
1802 case 3:
1803 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1804 break;
1805 }
1806 }
1807 for (; i < 4; i++) {
1808 /* Replicate the last channel out. */
1809 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1810 }
1811
1812 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1813
1814 this->result = src;
1815 }
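
/* Example (illustration only): for "v.zy" where v resolved with the identity
 * swizzle, the loop above picks channels (z, y) and then replicates the last
 * channel, so the result carries swizzle (z, y, y, y).
 */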
1816
1817 void
1818 vec4_visitor::visit(ir_dereference_variable *ir)
1819 {
1820 const struct glsl_type *type = ir->type;
1821 dst_reg *reg = variable_storage(ir->var);
1822
1823 if (!reg) {
1824 fail("Failed to find variable storage for %s\n", ir->var->name);
1825 this->result = src_reg(brw_null_reg());
1826 return;
1827 }
1828
1829 this->result = src_reg(*reg);
1830
1831 /* System values get their swizzle from the dst_reg writemask */
1832 if (ir->var->data.mode == ir_var_system_value)
1833 return;
1834
1835 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1836 this->result.swizzle = swizzle_for_size(type->vector_elements);
1837 }
1838
1839
1840 int
1841 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1842 {
1843 /* Under normal circumstances array elements are stored consecutively, so
1844 * the stride is equal to the size of the array element.
1845 */
1846 return type_size(ir->type);
1847 }
1848
1849
1850 void
1851 vec4_visitor::visit(ir_dereference_array *ir)
1852 {
1853 ir_constant *constant_index;
1854 src_reg src;
1855 int array_stride = compute_array_stride(ir);
1856
1857 constant_index = ir->array_index->constant_expression_value();
1858
1859 ir->array->accept(this);
1860 src = this->result;
1861
1862 if (constant_index) {
1863 src.reg_offset += constant_index->value.i[0] * array_stride;
1864 } else {
1865 /* Variable index array dereference. It eats the "vec4" of the
1866 * base of the array and an index that offsets the Mesa register
1867 * index.
1868 */
1869 ir->array_index->accept(this);
1870
1871 src_reg index_reg;
1872
1873 if (array_stride == 1) {
1874 index_reg = this->result;
1875 } else {
1876 index_reg = src_reg(this, glsl_type::int_type);
1877
1878 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1879 }
1880
1881 if (src.reladdr) {
1882 src_reg temp = src_reg(this, glsl_type::int_type);
1883
1884 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1885
1886 index_reg = temp;
1887 }
1888
1889 src.reladdr = ralloc(mem_ctx, src_reg);
1890 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1891 }
1892
1893 /* If the type is smaller than a vec4, replicate the last channel out. */
1894 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1895 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1896 else
1897 src.swizzle = BRW_SWIZZLE_NOOP;
1898 src.type = brw_type_for_base_type(ir->type);
1899
1900 this->result = src;
1901 }
1902
1903 void
1904 vec4_visitor::visit(ir_dereference_record *ir)
1905 {
1906 unsigned int i;
1907 const glsl_type *struct_type = ir->record->type;
1908 int offset = 0;
1909
1910 ir->record->accept(this);
1911
1912 for (i = 0; i < struct_type->length; i++) {
1913 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1914 break;
1915 offset += type_size(struct_type->fields.structure[i].type);
1916 }
1917
1918 /* If the type is smaller than a vec4, replicate the last channel out. */
1919 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1920 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1921 else
1922 this->result.swizzle = BRW_SWIZZLE_NOOP;
1923 this->result.type = brw_type_for_base_type(ir->type);
1924
1925 this->result.reg_offset += offset;
1926 }
1927
1928 /**
1929 * We want to be careful in assignment setup to hit the actual storage
1930 * instead of potentially using a temporary like we might with the
1931 * ir_dereference handler.
1932 */
1933 static dst_reg
1934 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1935 {
1936 /* The LHS must be a dereference. If the LHS is a variable indexed array
1937     * access of a vector, it must be separated into a series of conditional moves
1938 * before reaching this point (see ir_vec_index_to_cond_assign).
1939 */
1940 assert(ir->as_dereference());
1941 ir_dereference_array *deref_array = ir->as_dereference_array();
1942 if (deref_array) {
1943 assert(!deref_array->array->type->is_vector());
1944 }
1945
1946 /* Use the rvalue deref handler for the most part. We'll ignore
1947 * swizzles in it and write swizzles using writemask, though.
1948 */
1949 ir->accept(v);
1950 return dst_reg(v->result);
1951 }
1952
1953 void
1954 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1955 const struct glsl_type *type, uint32_t predicate)
1956 {
1957 if (type->base_type == GLSL_TYPE_STRUCT) {
1958 for (unsigned int i = 0; i < type->length; i++) {
1959 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1960 }
1961 return;
1962 }
1963
1964 if (type->is_array()) {
1965 for (unsigned int i = 0; i < type->length; i++) {
1966 emit_block_move(dst, src, type->fields.array, predicate);
1967 }
1968 return;
1969 }
1970
1971 if (type->is_matrix()) {
1972 const struct glsl_type *vec_type;
1973
1974 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1975 type->vector_elements, 1);
1976
1977 for (int i = 0; i < type->matrix_columns; i++) {
1978 emit_block_move(dst, src, vec_type, predicate);
1979 }
1980 return;
1981 }
1982
1983 assert(type->is_scalar() || type->is_vector());
1984
1985 dst->type = brw_type_for_base_type(type);
1986 src->type = dst->type;
1987
1988 dst->writemask = (1 << type->vector_elements) - 1;
1989
1990 src->swizzle = swizzle_for_size(type->vector_elements);
1991
1992 vec4_instruction *inst = emit(MOV(*dst, *src));
1993 inst->predicate = predicate;
1994
1995 dst->reg_offset++;
1996 src->reg_offset++;
1997 }
1998
1999
2000 /* If the RHS processing resulted in an instruction generating a
2001 * temporary value, and it would be easy to rewrite the instruction to
2002 * generate its result right into the LHS instead, do so. This ends
2003 * up reliably removing instructions where it can be tricky to do so
2004 * later without real UD chain information.
2005 */
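/* For example, for an assignment like `v = a + b;` (names illustrative) the
 * ADD that computed the RHS into a temporary GRF is retargeted to write `v`
 * directly, and visit(ir_assignment) then skips emitting the copy MOV.
 */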
2006 bool
2007 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2008 dst_reg dst,
2009 src_reg src,
2010 vec4_instruction *pre_rhs_inst,
2011 vec4_instruction *last_rhs_inst)
2012 {
2013 /* This could be supported, but it would take more smarts. */
2014 if (ir->condition)
2015 return false;
2016
2017 if (pre_rhs_inst == last_rhs_inst)
2018 return false; /* No instructions generated to work with. */
2019
2020 /* Make sure the last instruction generated our source reg. */
2021 if (src.file != GRF ||
2022 src.file != last_rhs_inst->dst.file ||
2023 src.reg != last_rhs_inst->dst.reg ||
2024 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2025 src.reladdr ||
2026 src.abs ||
2027 src.negate ||
2028 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2029 return false;
2030
2031    /* Check that the last instruction fully initialized the channels
2032 * we want to use, in the order we want to use them. We could
2033 * potentially reswizzle the operands of many instructions so that
2034 * we could handle out of order channels, but don't yet.
2035 */
2036
2037 for (unsigned i = 0; i < 4; i++) {
2038 if (dst.writemask & (1 << i)) {
2039 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2040 return false;
2041
2042 if (BRW_GET_SWZ(src.swizzle, i) != i)
2043 return false;
2044 }
2045 }
2046
2047 /* Success! Rewrite the instruction. */
2048 last_rhs_inst->dst.file = dst.file;
2049 last_rhs_inst->dst.reg = dst.reg;
2050 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2051 last_rhs_inst->dst.reladdr = dst.reladdr;
2052 last_rhs_inst->dst.writemask &= dst.writemask;
2053
2054 return true;
2055 }
2056
2057 void
2058 vec4_visitor::visit(ir_assignment *ir)
2059 {
2060 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2061 uint32_t predicate = BRW_PREDICATE_NONE;
2062
2063 if (!ir->lhs->type->is_scalar() &&
2064 !ir->lhs->type->is_vector()) {
2065 ir->rhs->accept(this);
2066 src_reg src = this->result;
2067
2068 if (ir->condition) {
2069 emit_bool_to_cond_code(ir->condition, &predicate);
2070 }
2071
2072 /* emit_block_move doesn't account for swizzles in the source register.
2073 * This should be ok, since the source register is a structure or an
2074 * array, and those can't be swizzled. But double-check to be sure.
2075 */
2076 assert(src.swizzle ==
2077 (ir->rhs->type->is_matrix()
2078 ? swizzle_for_size(ir->rhs->type->vector_elements)
2079 : BRW_SWIZZLE_NOOP));
2080
2081 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2082 return;
2083 }
2084
2085 /* Now we're down to just a scalar/vector with writemasks. */
2086 int i;
2087
2088 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2089 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2090
2091 ir->rhs->accept(this);
2092
2093 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2094
2095 src_reg src = this->result;
2096
2097 int swizzles[4];
2098 int first_enabled_chan = 0;
2099 int src_chan = 0;
2100
2101 assert(ir->lhs->type->is_vector() ||
2102 ir->lhs->type->is_scalar());
2103 dst.writemask = ir->write_mask;
2104
2105 for (int i = 0; i < 4; i++) {
2106 if (dst.writemask & (1 << i)) {
2107 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2108 break;
2109 }
2110 }
2111
2112 /* Swizzle a small RHS vector into the channels being written.
2113 *
2114 * glsl ir treats write_mask as dictating how many channels are
2115 * present on the RHS while in our instructions we need to make
2116 * those channels appear in the slots of the vec4 they're written to.
2117 */
2118 for (int i = 0; i < 4; i++) {
2119 if (dst.writemask & (1 << i))
2120 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2121 else
2122 swizzles[i] = first_enabled_chan;
2123 }
2124 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2125 swizzles[2], swizzles[3]);
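   /* Worked example (illustrative): writing a vec2 RHS into the .zw channels
    * of a vec4 LHS starts from the RHS swizzle .xyyy and yields .yyxy here,
    * so .z reads the RHS .x, .w reads the RHS .y, and the unwritten channels
    * simply repeat a live channel.
    */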
2126
2127 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2128 return;
2129 }
2130
2131 if (ir->condition) {
2132 emit_bool_to_cond_code(ir->condition, &predicate);
2133 }
2134
2135 for (i = 0; i < type_size(ir->lhs->type); i++) {
2136 vec4_instruction *inst = emit(MOV(dst, src));
2137 inst->predicate = predicate;
2138
2139 dst.reg_offset++;
2140 src.reg_offset++;
2141 }
2142 }
2143
2144 void
2145 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2146 {
2147 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2148 foreach_in_list(ir_constant, field_value, &ir->components) {
2149 emit_constant_values(dst, field_value);
2150 }
2151 return;
2152 }
2153
2154 if (ir->type->is_array()) {
2155 for (unsigned int i = 0; i < ir->type->length; i++) {
2156 emit_constant_values(dst, ir->array_elements[i]);
2157 }
2158 return;
2159 }
2160
2161 if (ir->type->is_matrix()) {
2162 for (int i = 0; i < ir->type->matrix_columns; i++) {
2163 float *vec = &ir->value.f[i * ir->type->vector_elements];
2164
2165 for (int j = 0; j < ir->type->vector_elements; j++) {
2166 dst->writemask = 1 << j;
2167 dst->type = BRW_REGISTER_TYPE_F;
2168
2169 emit(MOV(*dst, src_reg(vec[j])));
2170 }
2171 dst->reg_offset++;
2172 }
2173 return;
2174 }
2175
2176 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2177
2178 for (int i = 0; i < ir->type->vector_elements; i++) {
2179 if (!(remaining_writemask & (1 << i)))
2180 continue;
2181
2182 dst->writemask = 1 << i;
2183 dst->type = brw_type_for_base_type(ir->type);
2184
2185 /* Find other components that match the one we're about to
2186 * write. Emits fewer instructions for things like vec4(0.5,
2187 * 1.5, 1.5, 1.5).
2188 */
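      /* For that example this emits two MOVs in total instead of four: one
       * with writemask .x for 0.5 and one with writemask .yzw for 1.5.
       */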
2189 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2190 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2191 if (ir->value.b[i] == ir->value.b[j])
2192 dst->writemask |= (1 << j);
2193 } else {
2194 /* u, i, and f storage all line up, so no need for a
2195 * switch case for comparing each type.
2196 */
2197 if (ir->value.u[i] == ir->value.u[j])
2198 dst->writemask |= (1 << j);
2199 }
2200 }
2201
2202 switch (ir->type->base_type) {
2203 case GLSL_TYPE_FLOAT:
2204 emit(MOV(*dst, src_reg(ir->value.f[i])));
2205 break;
2206 case GLSL_TYPE_INT:
2207 emit(MOV(*dst, src_reg(ir->value.i[i])));
2208 break;
2209 case GLSL_TYPE_UINT:
2210 emit(MOV(*dst, src_reg(ir->value.u[i])));
2211 break;
2212 case GLSL_TYPE_BOOL:
2213 emit(MOV(*dst, src_reg(ir->value.b[i])));
2214 break;
2215 default:
2216 unreachable("Non-float/uint/int/bool constant");
2217 }
2218
2219 remaining_writemask &= ~dst->writemask;
2220 }
2221 dst->reg_offset++;
2222 }
2223
2224 void
2225 vec4_visitor::visit(ir_constant *ir)
2226 {
2227 dst_reg dst = dst_reg(this, ir->type);
2228 this->result = src_reg(dst);
2229
2230 emit_constant_values(&dst, ir);
2231 }
2232
2233 void
2234 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2235 {
2236 ir_dereference *deref = static_cast<ir_dereference *>(
2237 ir->actual_parameters.get_head());
2238 ir_variable *location = deref->variable_referenced();
2239 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2240 location->data.atomic.buffer_index);
2241
2242 /* Calculate the surface offset */
2243 src_reg offset(this, glsl_type::uint_type);
2244 ir_dereference_array *deref_array = deref->as_dereference_array();
2245 if (deref_array) {
2246 deref_array->array_index->accept(this);
2247
2248 src_reg tmp(this, glsl_type::uint_type);
2249 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2250 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2251 } else {
2252 offset = location->data.atomic.offset;
2253 }
2254
2255 /* Emit the appropriate machine instruction */
2256 const char *callee = ir->callee->function_name();
2257 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2258
2259 if (!strcmp("__intrinsic_atomic_read", callee)) {
2260 emit_untyped_surface_read(surf_index, dst, offset);
2261
2262 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2263 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2264 src_reg(), src_reg());
2265
2266 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2267 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2268 src_reg(), src_reg());
2269 }
2270 }
2271
2272 void
2273 vec4_visitor::visit(ir_call *ir)
2274 {
2275 const char *callee = ir->callee->function_name();
2276
2277 if (!strcmp("__intrinsic_atomic_read", callee) ||
2278 !strcmp("__intrinsic_atomic_increment", callee) ||
2279 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2280 visit_atomic_counter_intrinsic(ir);
2281 } else {
2282 unreachable("Unsupported intrinsic.");
2283 }
2284 }
2285
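/**
 * Fetch the multisample control surface (MCS) data for the given coordinate
 * with a TXF_MCS message.  The result is later packed into the .y channel of
 * the second parameter vec4 of the compressed-multisample texel fetch payload
 * (see the ir_txf_ms handling in visit(ir_texture)).
 */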
2286 src_reg
2287 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2288 {
2289 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2290 inst->base_mrf = 2;
2291 inst->mlen = 1;
2292 inst->sampler = sampler;
2293 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2294 inst->dst.writemask = WRITEMASK_XYZW;
2295
2296 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2297 int param_base = inst->base_mrf;
2298 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2299 int zero_mask = 0xf & ~coord_mask;
2300
2301 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2302 coordinate));
2303
2304 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2305 src_reg(0)));
2306
2307 emit(inst);
2308 return src_reg(inst->dst);
2309 }
2310
2311 void
2312 vec4_visitor::visit(ir_texture *ir)
2313 {
2314 int sampler =
2315 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2316
2317 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2318 * emitting anything other than setting up the constant result.
2319 */
2320 if (ir->op == ir_tg4) {
2321 ir_constant *chan = ir->lod_info.component->as_constant();
2322 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2323 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2324 dst_reg result(this, ir->type);
2325 this->result = src_reg(result);
2326 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2327 return;
2328 }
2329 }
2330
2331 /* Should be lowered by do_lower_texture_projection */
2332 assert(!ir->projector);
2333
2334 /* Should be lowered */
2335 assert(!ir->offset || !ir->offset->type->is_array());
2336
2337 /* Generate code to compute all the subexpression trees. This has to be
2338 * done before loading any values into MRFs for the sampler message since
2339 * generating these values may involve SEND messages that need the MRFs.
2340 */
2341 src_reg coordinate;
2342 if (ir->coordinate) {
2343 ir->coordinate->accept(this);
2344 coordinate = this->result;
2345 }
2346
2347 src_reg shadow_comparitor;
2348 if (ir->shadow_comparitor) {
2349 ir->shadow_comparitor->accept(this);
2350 shadow_comparitor = this->result;
2351 }
2352
2353 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2354 src_reg offset_value;
2355 if (has_nonconstant_offset) {
2356 ir->offset->accept(this);
2357 offset_value = src_reg(this->result);
2358 }
2359
2360 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2361 src_reg lod, dPdx, dPdy, sample_index, mcs;
2362 switch (ir->op) {
2363 case ir_tex:
2364 lod = src_reg(0.0f);
2365 lod_type = glsl_type::float_type;
2366 break;
2367 case ir_txf:
2368 case ir_txl:
2369 case ir_txs:
2370 ir->lod_info.lod->accept(this);
2371 lod = this->result;
2372 lod_type = ir->lod_info.lod->type;
2373 break;
2374 case ir_query_levels:
2375 lod = src_reg(0);
2376 lod_type = glsl_type::int_type;
2377 break;
2378 case ir_txf_ms:
2379 ir->lod_info.sample_index->accept(this);
2380 sample_index = this->result;
2381 sample_index_type = ir->lod_info.sample_index->type;
2382
2383 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2384 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2385 else
2386 mcs = src_reg(0u);
2387 break;
2388 case ir_txd:
2389 ir->lod_info.grad.dPdx->accept(this);
2390 dPdx = this->result;
2391
2392 ir->lod_info.grad.dPdy->accept(this);
2393 dPdy = this->result;
2394
2395 lod_type = ir->lod_info.grad.dPdx->type;
2396 break;
2397 case ir_txb:
2398 case ir_lod:
2399 case ir_tg4:
2400 break;
2401 }
2402
2403 vec4_instruction *inst = NULL;
2404 switch (ir->op) {
2405 case ir_tex:
2406 case ir_txl:
2407 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2408 break;
2409 case ir_txd:
2410 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2411 break;
2412 case ir_txf:
2413 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2414 break;
2415 case ir_txf_ms:
2416 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2417 break;
2418 case ir_txs:
2419 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2420 break;
2421 case ir_tg4:
2422 if (has_nonconstant_offset)
2423 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2424 else
2425 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2426 break;
2427 case ir_query_levels:
2428 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2429 break;
2430 case ir_txb:
2431 unreachable("TXB is not valid for vertex shaders.");
2432 case ir_lod:
2433 unreachable("LOD is not valid for vertex shaders.");
2434 default:
2435 unreachable("Unrecognized tex op");
2436 }
2437
2438 if (ir->offset != NULL && ir->op != ir_txf)
2439 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2440
2441 /* Stuff the channel select bits in the top of the texture offset */
2442 if (ir->op == ir_tg4)
2443 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2444
2445 /* The message header is necessary for:
2446 * - Gen4 (always)
2447 * - Texel offsets
2448 * - Gather channel selection
2449 * - Sampler indices too large to fit in a 4-bit value.
2450 */
2451 inst->header_present =
2452 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2453 sampler >= 16;
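   /* For example, on gen5+ a plain texture() call with no texel offset and a
    * sampler index below 16 can be sent without a header.
    */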
2454 inst->base_mrf = 2;
2455 inst->mlen = inst->header_present + 1; /* always at least one */
2456 inst->sampler = sampler;
2457 inst->dst = dst_reg(this, ir->type);
2458 inst->dst.writemask = WRITEMASK_XYZW;
2459 inst->shadow_compare = ir->shadow_comparitor != NULL;
2460
2461 /* MRF for the first parameter */
2462 int param_base = inst->base_mrf + inst->header_present;
2463
2464 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2465 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2466 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2467 } else {
2468 /* Load the coordinate */
2469 /* FINISHME: gl_clamp_mask and saturate */
2470 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2471 int zero_mask = 0xf & ~coord_mask;
2472
2473 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2474 coordinate));
2475
2476 if (zero_mask != 0) {
2477 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2478 src_reg(0)));
2479 }
2480 /* Load the shadow comparitor */
2481 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2482 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2483 WRITEMASK_X),
2484 shadow_comparitor));
2485 inst->mlen++;
2486 }
2487
2488 /* Load the LOD info */
2489 if (ir->op == ir_tex || ir->op == ir_txl) {
2490 int mrf, writemask;
2491 if (brw->gen >= 5) {
2492 mrf = param_base + 1;
2493 if (ir->shadow_comparitor) {
2494 writemask = WRITEMASK_Y;
2495 /* mlen already incremented */
2496 } else {
2497 writemask = WRITEMASK_X;
2498 inst->mlen++;
2499 }
2500 } else /* brw->gen == 4 */ {
2501 mrf = param_base;
2502 writemask = WRITEMASK_W;
2503 }
2504 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2505 } else if (ir->op == ir_txf) {
2506 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2507 } else if (ir->op == ir_txf_ms) {
2508 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2509 sample_index));
2510 if (brw->gen >= 7)
2511 /* MCS data is in the first channel of `mcs`, but we need to get it into
2512 * the .y channel of the second vec4 of params, so replicate .x across
2513 * the whole vec4 and then mask off everything except .y
2514 */
2515 mcs.swizzle = BRW_SWIZZLE_XXXX;
2516 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2517 mcs));
2518 inst->mlen++;
2519 } else if (ir->op == ir_txd) {
2520 const glsl_type *type = lod_type;
2521
2522 if (brw->gen >= 5) {
2523 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2524 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2525 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2526 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2527 inst->mlen++;
2528
2529 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2530 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2531 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2532 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2533 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2534 inst->mlen++;
2535
2536 if (ir->shadow_comparitor) {
2537 emit(MOV(dst_reg(MRF, param_base + 2,
2538 ir->shadow_comparitor->type, WRITEMASK_Z),
2539 shadow_comparitor));
2540 }
2541 }
2542 } else /* brw->gen == 4 */ {
2543 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2544 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2545 inst->mlen += 2;
2546 }
2547 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2548 if (ir->shadow_comparitor) {
2549 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2550 shadow_comparitor));
2551 }
2552
2553 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2554 offset_value));
2555 inst->mlen++;
2556 }
2557 }
2558
2559 emit(inst);
2560
2561 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2562 * spec requires layers.
2563 */
2564 if (ir->op == ir_txs) {
2565 glsl_type const *type = ir->sampler->type;
2566 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2567 type->sampler_array) {
2568 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2569 writemask(inst->dst, WRITEMASK_Z),
2570 src_reg(inst->dst), src_reg(6));
2571 }
2572 }
2573
2574 if (brw->gen == 6 && ir->op == ir_tg4) {
2575 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2576 }
2577
2578 swizzle_result(ir, src_reg(inst->dst), sampler);
2579 }
2580
2581 /**
2582 * Apply workarounds for Gen6 gather with UINT/SINT
2583 */
2584 void
2585 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2586 {
2587 if (!wa)
2588 return;
2589
2590 int width = (wa & WA_8BIT) ? 8 : 16;
2591 dst_reg dst_f = dst;
2592 dst_f.type = BRW_REGISTER_TYPE_F;
2593
2594 /* Convert from UNORM to UINT */
2595 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2596 emit(MOV(dst, src_reg(dst_f)));
2597
2598 if (wa & WA_SIGN) {
2599 /* Reinterpret the UINT value as a signed INT value by
2600 * shifting the sign bit into place, then shifting back
2601 * preserving sign.
2602 */
2603 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2604 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
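      /* For example, for an 8-bit format (width == 8) both shifts are by 24,
       * replicating bit 7 of the value across bits 8..31.
       */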
2605 }
2606 }
2607
2608 /**
2609 * Set up the gather channel based on the swizzle, for gather4.
2610 */
2611 uint32_t
2612 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2613 {
2614 ir_constant *chan = ir->lod_info.component->as_constant();
2615 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2616 switch (swiz) {
2617 case SWIZZLE_X: return 0;
2618 case SWIZZLE_Y:
2619 /* gather4 sampler is broken for green channel on RG32F --
2620 * we must ask for blue instead.
2621 */
2622 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2623 return 2;
2624 return 1;
2625 case SWIZZLE_Z: return 2;
2626 case SWIZZLE_W: return 3;
2627 default:
2628 unreachable("Not reached"); /* zero, one swizzles handled already */
2629 }
2630 }
2631
2632 void
2633 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2634 {
2635 int s = key->tex.swizzles[sampler];
2636
2637 this->result = src_reg(this, ir->type);
2638 dst_reg swizzled_result(this->result);
2639
2640 if (ir->op == ir_query_levels) {
2641 /* # levels is in .w */
2642 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2643 emit(MOV(swizzled_result, orig_val));
2644 return;
2645 }
2646
2647 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2648 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2649 emit(MOV(swizzled_result, orig_val));
2650 return;
2651 }
2652
2653
2654 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2655 int swizzle[4] = {0};
2656
2657 for (int i = 0; i < 4; i++) {
2658 switch (GET_SWZ(s, i)) {
2659 case SWIZZLE_ZERO:
2660 zero_mask |= (1 << i);
2661 break;
2662 case SWIZZLE_ONE:
2663 one_mask |= (1 << i);
2664 break;
2665 default:
2666 copy_mask |= (1 << i);
2667 swizzle[i] = GET_SWZ(s, i);
2668 break;
2669 }
2670 }
2671
2672 if (copy_mask) {
2673 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2674 swizzled_result.writemask = copy_mask;
2675 emit(MOV(swizzled_result, orig_val));
2676 }
2677
2678 if (zero_mask) {
2679 swizzled_result.writemask = zero_mask;
2680 emit(MOV(swizzled_result, src_reg(0.0f)));
2681 }
2682
2683 if (one_mask) {
2684 swizzled_result.writemask = one_mask;
2685 emit(MOV(swizzled_result, src_reg(1.0f)));
2686 }
2687 }
2688
2689 void
2690 vec4_visitor::visit(ir_return *)
2691 {
2692 unreachable("not reached");
2693 }
2694
2695 void
2696 vec4_visitor::visit(ir_discard *)
2697 {
2698 unreachable("not reached");
2699 }
2700
2701 void
2702 vec4_visitor::visit(ir_if *ir)
2703 {
2704 /* Don't point the annotation at the if statement, because then it plus
2705 * the then and else blocks get printed.
2706 */
2707 this->base_ir = ir->condition;
2708
2709 if (brw->gen == 6) {
2710 emit_if_gen6(ir);
2711 } else {
2712 uint32_t predicate;
2713 emit_bool_to_cond_code(ir->condition, &predicate);
2714 emit(IF(predicate));
2715 }
2716
2717 visit_instructions(&ir->then_instructions);
2718
2719 if (!ir->else_instructions.is_empty()) {
2720 this->base_ir = ir->condition;
2721 emit(BRW_OPCODE_ELSE);
2722
2723 visit_instructions(&ir->else_instructions);
2724 }
2725
2726 this->base_ir = ir->condition;
2727 emit(BRW_OPCODE_ENDIF);
2728 }
2729
2730 void
2731 vec4_visitor::visit(ir_emit_vertex *)
2732 {
2733 unreachable("not reached");
2734 }
2735
2736 void
2737 vec4_visitor::visit(ir_end_primitive *)
2738 {
2739 unreachable("not reached");
2740 }
2741
2742 void
2743 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2744 dst_reg dst, src_reg offset,
2745 src_reg src0, src_reg src1)
2746 {
2747 unsigned mlen = 0;
2748
2749 /* Set the atomic operation offset. */
2750 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2751 mlen++;
2752
2753 /* Set the atomic operation arguments. */
2754 if (src0.file != BAD_FILE) {
2755 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2756 mlen++;
2757 }
2758
2759 if (src1.file != BAD_FILE) {
2760 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2761 mlen++;
2762 }
2763
2764 /* Emit the instruction. Note that this maps to the normal SIMD8
2765 * untyped atomic message on Ivy Bridge, but that's OK because
2766 * unused channels will be masked out.
2767 */
2768 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2769 src_reg(atomic_op), src_reg(surf_index));
2770 inst->base_mrf = 0;
2771 inst->mlen = mlen;
2772 }
2773
2774 void
2775 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2776 src_reg offset)
2777 {
2778 /* Set the surface read offset. */
2779 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2780
2781 /* Emit the instruction. Note that this maps to the normal SIMD8
2782 * untyped surface read message, but that's OK because unused
2783 * channels will be masked out.
2784 */
2785 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2786 dst, src_reg(surf_index));
2787 inst->base_mrf = 0;
2788 inst->mlen = 1;
2789 }
2790
2791 void
2792 vec4_visitor::emit_ndc_computation()
2793 {
2794 /* Get the position */
2795 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2796
2797 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2798 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2799 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2800
2801 current_annotation = "NDC";
2802 dst_reg ndc_w = ndc;
2803 ndc_w.writemask = WRITEMASK_W;
2804 src_reg pos_w = pos;
2805 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2806 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2807
2808 dst_reg ndc_xyz = ndc;
2809 ndc_xyz.writemask = WRITEMASK_XYZ;
2810
2811 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2812 }
2813
2814 void
2815 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2816 {
2817 if (brw->gen < 6 &&
2818 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2819 key->userclip_active || brw->has_negative_rhw_bug)) {
2820 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2821 dst_reg header1_w = header1;
2822 header1_w.writemask = WRITEMASK_W;
2823
2824 emit(MOV(header1, 0u));
2825
2826 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2827 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2828
2829 current_annotation = "Point size";
2830 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2831 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
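         /* The net effect is the point size expressed in 1/8ths (an 11-bit
          * 8.3 fixed-point value) placed in bits 8..18 of the header dword.
          */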
2832 }
2833
2834 if (key->userclip_active) {
2835 current_annotation = "Clipping flags";
2836 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2837 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2838
2839 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2840 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2841 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2842
2843 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2844 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2845 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2846 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2847 }
2848
2849 /* i965 clipping workaround:
2850 * 1) Test for -ve rhw
2851 * 2) If set,
2852 * set ndc = (0,0,0,0)
2853 * set ucp[6] = 1
2854 *
2855 * Later, clipping will detect ucp[6] and ensure the primitive is
2856 * clipped against all fixed planes.
2857 */
2858 if (brw->has_negative_rhw_bug) {
2859 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2860 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2861 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2862 vec4_instruction *inst;
2863 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2864 inst->predicate = BRW_PREDICATE_NORMAL;
2865 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2866 inst->predicate = BRW_PREDICATE_NORMAL;
2867 }
2868
2869 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2870 } else if (brw->gen < 6) {
2871 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2872 } else {
2873 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2874 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2875 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2876 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2877 }
2878 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2879 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2880 src_reg(output_reg[VARYING_SLOT_LAYER])));
2881 }
2882 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2883 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2884 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2885 }
2886 }
2887 }
2888
2889 void
2890 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2891 {
2892 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2893 *
2894 * "If a linked set of shaders forming the vertex stage contains no
2895 * static write to gl_ClipVertex or gl_ClipDistance, but the
2896 * application has requested clipping against user clip planes through
2897 * the API, then the coordinate written to gl_Position is used for
2898 * comparison against the user clip planes."
2899 *
2900 * This function is only called if the shader didn't write to
2901 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2902 * if the user wrote to it; otherwise we use gl_Position.
2903 */
2904 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2905 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2906 clip_vertex = VARYING_SLOT_POS;
2907 }
2908
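   /* Each DP4 below computes one clip distance as dot(clip_vertex, plane) and
    * writes it to a single component of the destination register.
    */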
2909 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2910 ++i) {
2911 reg.writemask = 1 << i;
2912 emit(DP4(reg,
2913 src_reg(output_reg[clip_vertex]),
2914 src_reg(this->userplane[i + offset])));
2915 }
2916 }
2917
2918 void
2919 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2920 {
2921 assert (varying < VARYING_SLOT_MAX);
2922 reg.type = output_reg[varying].type;
2923 current_annotation = output_reg_annotation[varying];
2924 /* Copy the register, saturating if necessary */
2925 vec4_instruction *inst = emit(MOV(reg,
2926 src_reg(output_reg[varying])));
2927 if ((varying == VARYING_SLOT_COL0 ||
2928 varying == VARYING_SLOT_COL1 ||
2929 varying == VARYING_SLOT_BFC0 ||
2930 varying == VARYING_SLOT_BFC1) &&
2931 key->clamp_vertex_color) {
2932 inst->saturate = true;
2933 }
2934 }
2935
2936 void
2937 vec4_visitor::emit_urb_slot(int mrf, int varying)
2938 {
2939 struct brw_reg hw_reg = brw_message_reg(mrf);
2940 dst_reg reg = dst_reg(MRF, mrf);
2941 reg.type = BRW_REGISTER_TYPE_F;
2942
2943 switch (varying) {
2944 case VARYING_SLOT_PSIZ:
2945 /* PSIZ is always in slot 0, and is coupled with other flags. */
2946 current_annotation = "indices, point width, clip flags";
2947 emit_psiz_and_flags(hw_reg);
2948 break;
2949 case BRW_VARYING_SLOT_NDC:
2950 current_annotation = "NDC";
2951 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2952 break;
2953 case VARYING_SLOT_POS:
2954 current_annotation = "gl_Position";
2955 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2956 break;
2957 case VARYING_SLOT_EDGE:
2958 /* This is present when doing unfilled polygons. We're supposed to copy
2959 * the edge flag from the user-provided vertex array
2960 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2961 * of that attribute (starts as 1.0f). This is then used in clipping to
2962 * determine which edges should be drawn as wireframe.
2963 */
2964 current_annotation = "edge flag";
2965 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2966 glsl_type::float_type, WRITEMASK_XYZW))));
2967 break;
2968 case BRW_VARYING_SLOT_PAD:
2969 /* No need to write to this slot */
2970 break;
2971 default:
2972 emit_generic_urb_slot(reg, varying);
2973 break;
2974 }
2975 }
2976
2977 static int
2978 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2979 {
2980 if (brw->gen >= 6) {
2981 /* URB data written (does not include the message header reg) must
2982 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2983 * section 5.4.3.2.2: URB_INTERLEAVED.
2984 *
2985 * URB entries are allocated on a multiple of 1024 bits, so an
2986 * extra 128 bits written here to make the end align to 256 is
2987 * no problem.
2988 */
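      /* For example, a header plus 9 slot registers arrives here as
       * mlen == 10; that would write an odd 9 data registers, so mlen is
       * bumped to 11 and an even 10 data registers go out.
       */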
2989 if ((mlen % 2) != 1)
2990 mlen++;
2991 }
2992
2993 return mlen;
2994 }
2995
2996
2997 /**
2998 * Generates the VUE payload plus the necessary URB write instructions to
2999 * output it.
3000 *
3001 * The VUE layout is documented in Volume 2a.
3002 */
3003 void
3004 vec4_visitor::emit_vertex()
3005 {
3006 /* MRF 0 is reserved for the debugger, so start with message header
3007 * in MRF 1.
3008 */
3009 int base_mrf = 1;
3010 int mrf = base_mrf;
3011 /* In the process of generating our URB write message contents, we
3012 * may need to unspill a register or load from an array. Those
3013 * reads would use MRFs 14-15.
3014 */
3015 int max_usable_mrf = 13;
3016
3017 /* The following assertion verifies that max_usable_mrf causes an
3018 * even-numbered amount of URB write data, which will meet gen6's
3019 * requirements for length alignment.
3020 */
3021 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3022
3023 /* First mrf is the g0-based message header containing URB handles and
3024 * such.
3025 */
3026 emit_urb_write_header(mrf++);
3027
3028 if (brw->gen < 6) {
3029 emit_ndc_computation();
3030 }
3031
3032 /* Lower legacy ff and ClipVertex clipping to clip distances */
3033 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3034 current_annotation = "user clip distances";
3035
3036 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3037 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3038
3039 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3040 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3041 }
3042
3043 /* We may need to split this up into several URB writes, so do them in a
3044 * loop.
3045 */
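   /* With base_mrf == 1 and max_usable_mrf == 13, each write carries at most
    * 12 slot registers (MRFs 2..13) after the header; shaders with more VUE
    * slots issue additional writes at increasing URB offsets.
    */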
3046 int slot = 0;
3047 bool complete = false;
3048 do {
3049 /* URB offset is in URB row increments, and each of our MRFs is half of
3050 * one of those, since we're doing interleaved writes.
3051 */
3052 int offset = slot / 2;
3053
3054 mrf = base_mrf + 1;
3055 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3056 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3057
3058 /* If this was max_usable_mrf, we can't fit anything more into this
3059 * URB WRITE.
3060 */
3061 if (mrf > max_usable_mrf) {
3062 slot++;
3063 break;
3064 }
3065 }
3066
3067 complete = slot >= prog_data->vue_map.num_slots;
3068 current_annotation = "URB write";
3069 vec4_instruction *inst = emit_urb_write_opcode(complete);
3070 inst->base_mrf = base_mrf;
3071 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3072 inst->offset += offset;
3073 } while(!complete);
3074 }
3075
3076
3077 src_reg
3078 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3079 src_reg *reladdr, int reg_offset)
3080 {
3081 /* Because we store the values to scratch interleaved like our
3082 * vertex data, we need to scale the vec4 index by 2.
3083 */
3084 int message_header_scale = 2;
3085
3086 /* Pre-gen6, the message header uses byte offsets instead of vec4
3087 * (16-byte) offset units.
3088 */
3089 if (brw->gen < 6)
3090 message_header_scale *= 16;
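   /* For example, the vec4 at reg_offset 3 becomes message offset 6 on gen6+,
    * or byte offset 96 on gen4-5.
    */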
3091
3092 if (reladdr) {
3093 src_reg index = src_reg(this, glsl_type::int_type);
3094
3095 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3096 emit_before(inst, MUL(dst_reg(index),
3097 index, src_reg(message_header_scale)));
3098
3099 return index;
3100 } else {
3101 return src_reg(reg_offset * message_header_scale);
3102 }
3103 }
3104
3105 src_reg
3106 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3107 src_reg *reladdr, int reg_offset)
3108 {
3109 if (reladdr) {
3110 src_reg index = src_reg(this, glsl_type::int_type);
3111
3112 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3113
3114 /* Pre-gen6, the message header uses byte offsets instead of vec4
3115 * (16-byte) offset units.
3116 */
3117 if (brw->gen < 6) {
3118 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3119 }
3120
3121 return index;
3122 } else if (brw->gen >= 8) {
3123 /* Store the offset in a GRF so we can send-from-GRF. */
3124 src_reg offset = src_reg(this, glsl_type::int_type);
3125 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3126 return offset;
3127 } else {
3128 int message_header_scale = brw->gen < 6 ? 16 : 1;
3129 return src_reg(reg_offset * message_header_scale);
3130 }
3131 }
3132
3133 /**
3134 * Emits an instruction before @inst to load the value named by @orig_src
3135 * from scratch space at @base_offset to @temp.
3136 *
3137 * @base_offset is measured in 32-byte units (the size of a register).
3138 */
3139 void
3140 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3141 dst_reg temp, src_reg orig_src,
3142 int base_offset)
3143 {
3144 int reg_offset = base_offset + orig_src.reg_offset;
3145 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3146
3147 emit_before(inst, SCRATCH_READ(temp, index));
3148 }
3149
3150 /**
3151 * Emits an instruction after @inst to store the value to be written
3152 * to @orig_dst to scratch space at @base_offset, from @temp.
3153 *
3154 * @base_offset is measured in 32-byte units (the size of a register).
3155 */
3156 void
3157 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3158 {
3159 int reg_offset = base_offset + inst->dst.reg_offset;
3160 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3161
3162 /* Create a temporary register to store *inst's result in.
3163 *
3164 * We have to be careful in MOVing from our temporary result register in
3165 * the scratch write. If we swizzle from channels of the temporary that
3166 * weren't initialized, it will confuse live interval analysis, which will
3167 * make spilling fail to make progress.
3168 */
3169 src_reg temp = src_reg(this, glsl_type::vec4_type);
3170 temp.type = inst->dst.type;
3171 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3172 int swizzles[4];
3173 for (int i = 0; i < 4; i++)
3174 if (inst->dst.writemask & (1 << i))
3175 swizzles[i] = i;
3176 else
3177 swizzles[i] = first_writemask_chan;
3178 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3179 swizzles[2], swizzles[3]);
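   /* For example, a destination writemask of .xz yields temp.swizzle == .xxzx,
    * so the scratch write never reads the uninitialized .y/.w channels.
    */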
3180
3181 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3182 inst->dst.writemask));
3183 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3184 write->predicate = inst->predicate;
3185 write->ir = inst->ir;
3186 write->annotation = inst->annotation;
3187 inst->insert_after(write);
3188
3189 inst->dst.file = temp.file;
3190 inst->dst.reg = temp.reg;
3191 inst->dst.reg_offset = temp.reg_offset;
3192 inst->dst.reladdr = NULL;
3193 }
3194
3195 /**
3196 * We can't generally support array access in GRF space, because a
3197 * single instruction's destination can only span 2 contiguous
3198 * registers. So, we send all GRF arrays that get variable index
3199 * access to scratch space.
3200 */
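/* For example (illustrative GLSL), a local `vec4 a[4]` indexed by a dynamic
 * loop counter is relocated to scratch, and each of its accesses below is
 * rewritten into a scratch read or write.
 */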
3201 void
3202 vec4_visitor::move_grf_array_access_to_scratch()
3203 {
3204 int scratch_loc[this->virtual_grf_count];
3205
3206 for (int i = 0; i < this->virtual_grf_count; i++) {
3207 scratch_loc[i] = -1;
3208 }
3209
3210 /* First, calculate the set of virtual GRFs that need to be punted
3211 * to scratch due to having any array access on them, and where in
3212 * scratch.
3213 */
3214 foreach_in_list(vec4_instruction, inst, &instructions) {
3215 if (inst->dst.file == GRF && inst->dst.reladdr &&
3216 scratch_loc[inst->dst.reg] == -1) {
3217 scratch_loc[inst->dst.reg] = c->last_scratch;
3218 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3219 }
3220
3221 for (int i = 0 ; i < 3; i++) {
3222 src_reg *src = &inst->src[i];
3223
3224 if (src->file == GRF && src->reladdr &&
3225 scratch_loc[src->reg] == -1) {
3226 scratch_loc[src->reg] = c->last_scratch;
3227 c->last_scratch += this->virtual_grf_sizes[src->reg];
3228 }
3229 }
3230 }
3231
3232 /* Now, for anything that will be accessed through scratch, rewrite
3233 * it to load/store. Note that this is a _safe list walk, because
3234 * we may generate a new scratch_write instruction after the one
3235 * we're processing.
3236 */
3237 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3238 /* Set up the annotation tracking for new generated instructions. */
3239 base_ir = inst->ir;
3240 current_annotation = inst->annotation;
3241
3242 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3243 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3244 }
3245
3246 for (int i = 0 ; i < 3; i++) {
3247 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3248 continue;
3249
3250 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3251
3252 emit_scratch_read(inst, temp, inst->src[i],
3253 scratch_loc[inst->src[i].reg]);
3254
3255 inst->src[i].file = temp.file;
3256 inst->src[i].reg = temp.reg;
3257 inst->src[i].reg_offset = temp.reg_offset;
3258 inst->src[i].reladdr = NULL;
3259 }
3260 }
3261 }
3262
3263 /**
3264 * Emits an instruction before @inst to load the value named by @orig_src
3265 * from the pull constant buffer (surface) at @base_offset to @temp.
3266 */
3267 void
3268 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3269 dst_reg temp, src_reg orig_src,
3270 int base_offset)
3271 {
3272 int reg_offset = base_offset + orig_src.reg_offset;
3273 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3274 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3275 vec4_instruction *load;
3276
3277 if (brw->gen >= 7) {
3278 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3279 grf_offset.type = offset.type;
3280 emit_before(inst, MOV(grf_offset, offset));
3281
3282 load = new(mem_ctx) vec4_instruction(this,
3283 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3284 temp, index, src_reg(grf_offset));
3285 } else {
3286 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3287 temp, index, offset);
3288 load->base_mrf = 14;
3289 load->mlen = 1;
3290 }
3291 emit_before(inst, load);
3292 }
3293
3294 /**
3295 * Implements array access of uniforms by inserting a
3296 * PULL_CONSTANT_LOAD instruction.
3297 *
3298  * Unlike temporary GRF array access (which we don't support, due to
3299 * the difficulty of doing relative addressing on instruction
3300 * destinations), we could potentially do array access of uniforms
3301 * that were loaded in GRF space as push constants. In real-world
3302 * usage we've seen, though, the arrays being used are always larger
3303 * than we could load as push constants, so just always move all
3304 * uniform array access out to a pull constant buffer.
3305 */
3306 void
3307 vec4_visitor::move_uniform_array_access_to_pull_constants()
3308 {
3309 int pull_constant_loc[this->uniforms];
3310
3311 for (int i = 0; i < this->uniforms; i++) {
3312 pull_constant_loc[i] = -1;
3313 }
3314
3315 /* Walk through and find array access of uniforms. Put a copy of that
3316 * uniform in the pull constant buffer.
3317 *
3318 * Note that we don't move constant-indexed accesses to arrays. No
3319 * testing has been done of the performance impact of this choice.
3320 */
3321 foreach_in_list_safe(vec4_instruction, inst, &instructions) {
3322 for (int i = 0 ; i < 3; i++) {
3323 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3324 continue;
3325
3326 int uniform = inst->src[i].reg;
3327
3328 /* If this array isn't already present in the pull constant buffer,
3329 * add it.
3330 */
3331 if (pull_constant_loc[uniform] == -1) {
3332 const float **values = &stage_prog_data->param[uniform * 4];
3333
3334 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3335
3336 assert(uniform < uniform_array_size);
3337 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3338 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3339 = values[j];
3340 }
3341 }
3342
3343 /* Set up the annotation tracking for new generated instructions. */
3344 base_ir = inst->ir;
3345 current_annotation = inst->annotation;
3346
3347 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3348
3349 emit_pull_constant_load(inst, temp, inst->src[i],
3350 pull_constant_loc[uniform]);
3351
3352 inst->src[i].file = temp.file;
3353 inst->src[i].reg = temp.reg;
3354 inst->src[i].reg_offset = temp.reg_offset;
3355 inst->src[i].reladdr = NULL;
3356 }
3357 }
3358
3359 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3360 * no need to track them as larger-than-vec4 objects. This will be
3361 * relied on in cutting out unused uniform vectors from push
3362 * constants.
3363 */
3364 split_uniform_registers();
3365 }
3366
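/**
 * If an unsigned (UD) source has the negate modifier set, materialize the
 * negated value through an explicit MOV into a temporary and use the
 * temporary instead of relying on the source modifier.
 */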
3367 void
3368 vec4_visitor::resolve_ud_negate(src_reg *reg)
3369 {
3370 if (reg->type != BRW_REGISTER_TYPE_UD ||
3371 !reg->negate)
3372 return;
3373
3374 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3375 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3376 *reg = temp;
3377 }
3378
3379 vec4_visitor::vec4_visitor(struct brw_context *brw,
3380 struct brw_vec4_compile *c,
3381 struct gl_program *prog,
3382 const struct brw_vec4_prog_key *key,
3383 struct brw_vec4_prog_data *prog_data,
3384 struct gl_shader_program *shader_prog,
3385 gl_shader_stage stage,
3386 void *mem_ctx,
3387 bool debug_flag,
3388 bool no_spills,
3389 shader_time_shader_type st_base,
3390 shader_time_shader_type st_written,
3391 shader_time_shader_type st_reset)
3392 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3393 c(c),
3394 key(key),
3395 prog_data(prog_data),
3396 sanity_param_count(0),
3397 fail_msg(NULL),
3398 first_non_payload_grf(0),
3399 need_all_constants_in_pull_buffer(false),
3400 debug_flag(debug_flag),
3401 no_spills(no_spills),
3402 st_base(st_base),
3403 st_written(st_written),
3404 st_reset(st_reset)
3405 {
3406 this->mem_ctx = mem_ctx;
3407 this->failed = false;
3408
3409 this->base_ir = NULL;
3410 this->current_annotation = NULL;
3411 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3412
3413 this->variable_ht = hash_table_ctor(0,
3414 hash_table_pointer_hash,
3415 hash_table_pointer_compare);
3416
3417 this->virtual_grf_start = NULL;
3418 this->virtual_grf_end = NULL;
3419 this->virtual_grf_sizes = NULL;
3420 this->virtual_grf_count = 0;
3421 this->virtual_grf_reg_map = NULL;
3422 this->virtual_grf_reg_count = 0;
3423 this->virtual_grf_array_size = 0;
3424 this->live_intervals_valid = false;
3425
3426 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3427
3428 this->uniforms = 0;
3429
3430 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3431 * at least one. See setup_uniforms() in brw_vec4.cpp.
3432 */
3433 this->uniform_array_size = 1;
3434 if (prog_data) {
3435 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3436 }
3437
3438 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3439 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3440 }
3441
3442 vec4_visitor::~vec4_visitor()
3443 {
3444 hash_table_dtor(this->variable_ht);
3445 }
3446
3447
3448 void
3449 vec4_visitor::fail(const char *format, ...)
3450 {
3451 va_list va;
3452 char *msg;
3453
3454 if (failed)
3455 return;
3456
3457 failed = true;
3458
3459 va_start(va, format);
3460 msg = ralloc_vasprintf(mem_ctx, format, va);
3461 va_end(va);
3462 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3463
3464 this->fail_msg = msg;
3465
3466 if (debug_flag) {
3467 fprintf(stderr, "%s", msg);
3468 }
3469 }
3470
3471 } /* namespace brw */