i965: Use typed foreach_in_list instead of foreach_list.
[mesa.git] src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "program/sampler.h"
28 }
29
30 namespace brw {
31
32 vec4_instruction::vec4_instruction(vec4_visitor *v,
33 enum opcode opcode, const dst_reg &dst,
34 const src_reg &src0, const src_reg &src1,
35 const src_reg &src2)
36 {
37 this->opcode = opcode;
38 this->dst = dst;
39 this->src[0] = src0;
40 this->src[1] = src1;
41 this->src[2] = src2;
42 this->saturate = false;
43 this->force_writemask_all = false;
44 this->no_dd_clear = false;
45 this->no_dd_check = false;
46 this->writes_accumulator = false;
47 this->conditional_mod = BRW_CONDITIONAL_NONE;
48 this->sampler = 0;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
71 {
72 new_inst->ir = inst->ir;
73 new_inst->annotation = inst->annotation;
74
75 inst->insert_before(new_inst);
76
77 return inst;
78 }
79
80 vec4_instruction *
81 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
82 src_reg src0, src_reg src1, src_reg src2)
83 {
84 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
85 src0, src1, src2));
86 }
87
88
89 vec4_instruction *
90 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
91 {
92 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
93 }
94
95 vec4_instruction *
96 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
97 {
98 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
99 }
100
101 vec4_instruction *
102 vec4_visitor::emit(enum opcode opcode, dst_reg dst)
103 {
104 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
105 }
106
107 vec4_instruction *
108 vec4_visitor::emit(enum opcode opcode)
109 {
110 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
111 }
112
113 #define ALU1(op) \
114 vec4_instruction * \
115 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
116 { \
117 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
118 src0); \
119 }
120
121 #define ALU2(op) \
122 vec4_instruction * \
123 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
124 const src_reg &src1) \
125 { \
126 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
127 src0, src1); \
128 }
129
130 #define ALU2_ACC(op) \
131 vec4_instruction * \
132 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
133 const src_reg &src1) \
134 { \
135 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
136 BRW_OPCODE_##op, dst, src0, src1); \
137 inst->writes_accumulator = true; \
138 return inst; \
139 }
140
141 #define ALU3(op) \
142 vec4_instruction * \
143 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
144 const src_reg &src1, const src_reg &src2) \
145 { \
146 assert(brw->gen >= 6); \
147 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
148 src0, src1, src2); \
149 }
150
151 ALU1(NOT)
152 ALU1(MOV)
153 ALU1(FRC)
154 ALU1(RNDD)
155 ALU1(RNDE)
156 ALU1(RNDZ)
157 ALU1(F32TO16)
158 ALU1(F16TO32)
159 ALU2(ADD)
160 ALU2(MUL)
161 ALU2_ACC(MACH)
162 ALU2(AND)
163 ALU2(OR)
164 ALU2(XOR)
165 ALU2(DP3)
166 ALU2(DP4)
167 ALU2(DPH)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172 ALU1(BFREV)
173 ALU3(BFE)
174 ALU2(BFI1)
175 ALU3(BFI2)
176 ALU1(FBH)
177 ALU1(FBL)
178 ALU1(CBIT)
179 ALU3(MAD)
180 ALU2_ACC(ADDC)
181 ALU2_ACC(SUBB)
182 ALU2(MAC)
183
184 /** Gen4 predicated IF. */
185 vec4_instruction *
186 vec4_visitor::IF(uint32_t predicate)
187 {
188 vec4_instruction *inst;
189
190 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
191 inst->predicate = predicate;
192
193 return inst;
194 }
195
196 /** Gen6 IF with embedded comparison. */
197 vec4_instruction *
198 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
199 {
200 assert(brw->gen == 6);
201
202 vec4_instruction *inst;
203
204 resolve_ud_negate(&src0);
205 resolve_ud_negate(&src1);
206
207 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
208 src0, src1);
209 inst->conditional_mod = condition;
210
211 return inst;
212 }
213
214 /**
215 * CMP: Sets the low bit of the destination channels with the result
216 * of the comparison, while the upper bits are undefined, and updates
217 * the flag register with the packed 16 bits of the result.
218 */
219 vec4_instruction *
220 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
221 {
222 vec4_instruction *inst;
223
224 /* original gen4 does type conversion to the destination type
225 * before comparison, producing garbage results for floating
226 * point comparisons.
227 */
228 if (brw->gen == 4) {
229 dst.type = src0.type;
230 if (dst.file == HW_REG)
231 dst.fixed_hw_reg.type = dst.type;
232 }
233
234 resolve_ud_negate(&src0);
235 resolve_ud_negate(&src1);
236
237 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
238 inst->conditional_mod = condition;
239
240 return inst;
241 }
242
243 vec4_instruction *
244 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
245 {
246 vec4_instruction *inst;
247
248 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
249 dst, index);
250 inst->base_mrf = 14;
251 inst->mlen = 2;
252
253 return inst;
254 }
255
256 vec4_instruction *
257 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
258 const src_reg &index)
259 {
260 vec4_instruction *inst;
261
262 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
263 dst, src, index);
264 inst->base_mrf = 13;
265 inst->mlen = 3;
266
267 return inst;
268 }
269
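/** Emit a dot product as DP2, DP3 or DP4 according to the number of components. */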
270 void
271 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
272 {
273 static enum opcode dot_opcodes[] = {
274 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
275 };
276
277 emit(dot_opcodes[elements - 2], dst, src0, src1);
278 }
279
280 src_reg
281 vec4_visitor::fix_3src_operand(src_reg src)
282 {
283 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
284 * able to use vertical stride of zero to replicate the vec4 uniform, like
285 *
286 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
287 *
288 * But you can't, since vertical stride is always four in three-source
289 * instructions. Instead, insert a MOV instruction to do the replication so
290 * that the three-source instruction can consume it.
291 */
292
293 /* The MOV is only needed if the source is a uniform or immediate. */
294 if (src.file != UNIFORM && src.file != IMM)
295 return src;
296
297 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
298 return src;
299
300 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
301 expanded.type = src.type;
302 emit(MOV(expanded, src));
303 return src_reg(expanded);
304 }
305
306 src_reg
307 vec4_visitor::fix_math_operand(src_reg src)
308 {
309 /* The gen6 math instruction ignores the source modifiers --
310 * swizzle, abs, negate, and at least some parts of the register
311 * region description.
312 *
313 * Rather than trying to enumerate all these cases, *always* expand the
314 * operand to a temp GRF for gen6.
315 *
316 * For gen7, keep the operand as-is, except if immediate, which gen7 still
317 * can't use.
318 */
319
320 if (brw->gen == 7 && src.file != IMM)
321 return src;
322
323 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
324 expanded.type = src.type;
325 emit(MOV(expanded, src));
326 return src_reg(expanded);
327 }
328
329 void
330 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
331 {
332 src = fix_math_operand(src);
333
334 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
335 /* The gen6 math instruction must be align1, so we can't do
336 * writemasks.
337 */
338 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
339
340 emit(opcode, temp_dst, src);
341
342 emit(MOV(dst, src_reg(temp_dst)));
343 } else {
344 emit(opcode, dst, src);
345 }
346 }
347
348 void
349 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
350 {
351 vec4_instruction *inst = emit(opcode, dst, src);
352 inst->base_mrf = 1;
353 inst->mlen = 1;
354 }
355
356 void
357 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
358 {
359 switch (opcode) {
360 case SHADER_OPCODE_RCP:
361 case SHADER_OPCODE_RSQ:
362 case SHADER_OPCODE_SQRT:
363 case SHADER_OPCODE_EXP2:
364 case SHADER_OPCODE_LOG2:
365 case SHADER_OPCODE_SIN:
366 case SHADER_OPCODE_COS:
367 break;
368 default:
369 assert(!"not reached: bad math opcode");
370 return;
371 }
372
373 if (brw->gen >= 8) {
374 emit(opcode, dst, src);
375 } else if (brw->gen >= 6) {
376 emit_math1_gen6(opcode, dst, src);
377 } else {
378 emit_math1_gen4(opcode, dst, src);
379 }
380 }
381
382 void
383 vec4_visitor::emit_math2_gen6(enum opcode opcode,
384 dst_reg dst, src_reg src0, src_reg src1)
385 {
386 src0 = fix_math_operand(src0);
387 src1 = fix_math_operand(src1);
388
389 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
390 /* The gen6 math instruction must be align1, so we can't do
391 * writemasks.
392 */
393 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
394 temp_dst.type = dst.type;
395
396 emit(opcode, temp_dst, src0, src1);
397
398 emit(MOV(dst, src_reg(temp_dst)));
399 } else {
400 emit(opcode, dst, src0, src1);
401 }
402 }
403
404 void
405 vec4_visitor::emit_math2_gen4(enum opcode opcode,
406 dst_reg dst, src_reg src0, src_reg src1)
407 {
408 vec4_instruction *inst = emit(opcode, dst, src0, src1);
409 inst->base_mrf = 1;
410 inst->mlen = 2;
411 }
412
413 void
414 vec4_visitor::emit_math(enum opcode opcode,
415 dst_reg dst, src_reg src0, src_reg src1)
416 {
417 switch (opcode) {
418 case SHADER_OPCODE_POW:
419 case SHADER_OPCODE_INT_QUOTIENT:
420 case SHADER_OPCODE_INT_REMAINDER:
421 break;
422 default:
423 assert(!"not reached: unsupported binary math opcode");
424 return;
425 }
426
427 if (brw->gen >= 8) {
428 emit(opcode, dst, src0, src1);
429 } else if (brw->gen >= 6) {
430 emit_math2_gen6(opcode, dst, src0, src1);
431 } else {
432 emit_math2_gen4(opcode, dst, src0, src1);
433 }
434 }
435
436 void
437 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
438 {
439 if (brw->gen < 7)
440 assert(!"ir_unop_pack_half_2x16 should be lowered");
441
442 assert(dst.type == BRW_REGISTER_TYPE_UD);
443 assert(src0.type == BRW_REGISTER_TYPE_F);
444
445 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
446 *
447 * Because this instruction does not have a 16-bit floating-point type,
448 * the destination data type must be Word (W).
449 *
450 * The destination must be DWord-aligned and specify a horizontal stride
451 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
452 * each destination channel and the upper word is not modified.
453 *
454 * The above restriction implies that the f32to16 instruction must use
455 * align1 mode, because only in align1 mode is it possible to specify
456 * horizontal stride. We choose here to defy the hardware docs and emit
457 * align16 instructions.
458 *
459 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
460 * instructions. I was partially successful in that the code passed all
461 * tests. However, the code was dubiously correct and fragile, and the
462 * tests were not harsh enough to probe that frailty. Not trusting the
463 * code, I chose instead to remain in align16 mode in defiance of the hw
464 * docs).
465 *
466 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
467 * simulator, emitting a f32to16 in align16 mode with UD as destination
468 * data type is safe. The behavior differs from that specified in the PRM
469 * in that the upper word of each destination channel is cleared to 0.
470 */
471
472 dst_reg tmp_dst(this, glsl_type::uvec2_type);
473 src_reg tmp_src(tmp_dst);
474
475 #if 0
476 /* Verify the undocumented behavior on which the following instructions
477 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
478 * then the result of the bit-or instruction below will be incorrect.
479 *
480 * You should inspect the disasm output in order to verify that the MOV is
481 * not optimized away.
482 */
483 emit(MOV(tmp_dst, src_reg(0x12345678u)));
484 #endif
485
486 /* Give tmp the form below, where "." means untouched.
487 *
488 * w z y x w z y x
489 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
490 *
491 * That the upper word of each write-channel be 0 is required for the
492 * following bit-shift and bit-or instructions to work. Note that this
493 * relies on the undocumented hardware behavior mentioned above.
494 */
495 tmp_dst.writemask = WRITEMASK_XY;
496 emit(F32TO16(tmp_dst, src0));
497
498 /* Give the write-channels of dst the form:
499 * 0xhhhh0000
500 */
501 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
502 emit(SHL(dst, tmp_src, src_reg(16u)));
503
504 /* Finally, give the write-channels of dst the form of packHalf2x16's
505 * output:
506 * 0xhhhhllll
507 */
508 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
509 emit(OR(dst, src_reg(dst), tmp_src));
510 }
511
512 void
513 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
514 {
515 if (brw->gen < 7)
516 assert(!"ir_unop_unpack_half_2x16 should be lowered");
517
518 assert(dst.type == BRW_REGISTER_TYPE_F);
519 assert(src0.type == BRW_REGISTER_TYPE_UD);
520
521 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
522 *
523 * Because this instruction does not have a 16-bit floating-point type,
524 * the source data type must be Word (W). The destination type must be
525 * F (Float).
526 *
527 * To use W as the source data type, we must adjust horizontal strides,
528 * which is only possible in align1 mode. All my [chadv] attempts at
529 * emitting align1 instructions for unpackHalf2x16 failed to pass the
530 * Piglit tests, so I gave up.
531 *
532 * I've verified that, on gen7 hardware and the simulator, it is safe to
533 * emit f16to32 in align16 mode with UD as source data type.
534 */
535
536 dst_reg tmp_dst(this, glsl_type::uvec2_type);
537 src_reg tmp_src(tmp_dst);
538
539 tmp_dst.writemask = WRITEMASK_X;
540 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
541
542 tmp_dst.writemask = WRITEMASK_Y;
543 emit(SHR(tmp_dst, src0, src_reg(16u)));
544
545 dst.writemask = WRITEMASK_XY;
546 emit(F16TO32(dst, tmp_src));
547 }
548
549 void
550 vec4_visitor::visit_instructions(const exec_list *list)
551 {
552 foreach_in_list(ir_instruction, ir, list) {
553 base_ir = ir;
554 ir->accept(this);
555 }
556 }
557
558
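/**
 * Returns the number of vec4 registers a value of the given GLSL type
 * occupies in this backend: one per matrix column, one per (possibly
 * padded) scalar or vector, summed over array and struct members.
 */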
559 static int
560 type_size(const struct glsl_type *type)
561 {
562 unsigned int i;
563 int size;
564
565 switch (type->base_type) {
566 case GLSL_TYPE_UINT:
567 case GLSL_TYPE_INT:
568 case GLSL_TYPE_FLOAT:
569 case GLSL_TYPE_BOOL:
570 if (type->is_matrix()) {
571 return type->matrix_columns;
572 } else {
573 /* Regardless of size of vector, it gets a vec4. This is bad
574 * packing for things like floats, but otherwise arrays become a
575 * mess. Hopefully a later pass over the code can pack scalars
576 * down if appropriate.
577 */
578 return 1;
579 }
580 case GLSL_TYPE_ARRAY:
581 assert(type->length > 0);
582 return type_size(type->fields.array) * type->length;
583 case GLSL_TYPE_STRUCT:
584 size = 0;
585 for (i = 0; i < type->length; i++) {
586 size += type_size(type->fields.structure[i].type);
587 }
588 return size;
589 case GLSL_TYPE_SAMPLER:
590 /* Samplers take up one slot in UNIFORMS[], but they're baked in
591 * at link time.
592 */
593 return 1;
594 case GLSL_TYPE_ATOMIC_UINT:
595 return 0;
596 case GLSL_TYPE_IMAGE:
597 case GLSL_TYPE_VOID:
598 case GLSL_TYPE_ERROR:
599 case GLSL_TYPE_INTERFACE:
600 assert(0);
601 break;
602 }
603
604 return 0;
605 }
606
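/**
 * Allocate a virtual GRF of the given size in vec4 registers, growing the
 * size and register-map tracking arrays as needed, and return its index.
 */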
607 int
608 vec4_visitor::virtual_grf_alloc(int size)
609 {
610 if (virtual_grf_array_size <= virtual_grf_count) {
611 if (virtual_grf_array_size == 0)
612 virtual_grf_array_size = 16;
613 else
614 virtual_grf_array_size *= 2;
615 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
616 virtual_grf_array_size);
617 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
618 virtual_grf_array_size);
619 }
620 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
621 virtual_grf_reg_count += size;
622 virtual_grf_sizes[virtual_grf_count] = size;
623 return virtual_grf_count++;
624 }
625
626 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
627 {
628 init();
629
630 this->file = GRF;
631 this->reg = v->virtual_grf_alloc(type_size(type));
632
633 if (type->is_array() || type->is_record()) {
634 this->swizzle = BRW_SWIZZLE_NOOP;
635 } else {
636 this->swizzle = swizzle_for_size(type->vector_elements);
637 }
638
639 this->type = brw_type_for_base_type(type);
640 }
641
642 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
643 {
644 init();
645
646 this->file = GRF;
647 this->reg = v->virtual_grf_alloc(type_size(type));
648
649 if (type->is_array() || type->is_record()) {
650 this->writemask = WRITEMASK_XYZW;
651 } else {
652 this->writemask = (1 << type->vector_elements) - 1;
653 }
654
655 this->type = brw_type_for_base_type(type);
656 }
657
658 /* Our support for uniforms is piggy-backed on the struct
659 * gl_fragment_program, because that's where the values actually
660 * get stored, rather than in some global gl_shader_program uniform
661 * store.
662 */
663 void
664 vec4_visitor::setup_uniform_values(ir_variable *ir)
665 {
666 int namelen = strlen(ir->name);
667
668 /* The data for our (non-builtin) uniforms is stored in a series of
669 * gl_uniform_driver_storage structs for each subcomponent that
670 * glGetUniformLocation() could name. We know it's been set up in the same
671 * order we'd walk the type, so walk the list of storage and find anything
672 * with our name, or the prefix of a component that starts with our name.
673 */
674 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
675 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
676
677 if (strncmp(ir->name, storage->name, namelen) != 0 ||
678 (storage->name[namelen] != 0 &&
679 storage->name[namelen] != '.' &&
680 storage->name[namelen] != '[')) {
681 continue;
682 }
683
684 gl_constant_value *components = storage->storage;
685 unsigned vector_count = (MAX2(storage->array_elements, 1) *
686 storage->type->matrix_columns);
687
688 for (unsigned s = 0; s < vector_count; s++) {
689 assert(uniforms < uniform_array_size);
690 uniform_vector_size[uniforms] = storage->type->vector_elements;
691
692 int i;
693 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
694 stage_prog_data->param[uniforms * 4 + i] = &components->f;
695 components++;
696 }
697 for (; i < 4; i++) {
698 static float zero = 0;
699 stage_prog_data->param[uniforms * 4 + i] = &zero;
700 }
701
702 uniforms++;
703 }
704 }
705 }
706
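/**
 * Set up one vec4 uniform slot for each user clip plane requested by the
 * compile key, pointing the parameters at the current clip plane values.
 */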
707 void
708 vec4_visitor::setup_uniform_clipplane_values()
709 {
710 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
711
712 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
713 assert(this->uniforms < uniform_array_size);
714 this->uniform_vector_size[this->uniforms] = 4;
715 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
716 this->userplane[i].type = BRW_REGISTER_TYPE_F;
717 for (int j = 0; j < 4; ++j) {
718 stage_prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
719 }
720 ++this->uniforms;
721 }
722 }
723
724 /* Our support for builtin uniforms is even scarier than non-builtin.
725 * It sits on top of the PROG_STATE_VAR parameters that are
726 * automatically updated from GL context state.
727 */
728 void
729 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
730 {
731 const ir_state_slot *const slots = ir->state_slots;
732 assert(ir->state_slots != NULL);
733
734 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
735 /* This state reference has already been set up by ir_to_mesa,
736 * but we'll get the same index back here. We can reference
737 * ParameterValues directly, since unlike brw_fs.cpp, we never
738 * add new state references during compile.
739 */
740 int index = _mesa_add_state_reference(this->prog->Parameters,
741 (gl_state_index *)slots[i].tokens);
742 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
743
744 assert(this->uniforms < uniform_array_size);
745 this->uniform_vector_size[this->uniforms] = 0;
746 /* Add each of the unique swizzled channels of the element.
747 * This will end up matching the size of the glsl_type of this field.
748 */
749 int last_swiz = -1;
750 for (unsigned int j = 0; j < 4; j++) {
751 int swiz = GET_SWZ(slots[i].swizzle, j);
752 last_swiz = swiz;
753
754 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
755 assert(this->uniforms < uniform_array_size);
756 if (swiz <= last_swiz)
757 this->uniform_vector_size[this->uniforms]++;
758 }
759 this->uniforms++;
760 }
761 }
762
763 dst_reg *
764 vec4_visitor::variable_storage(ir_variable *var)
765 {
766 return (dst_reg *)hash_table_find(this->variable_ht, var);
767 }
768
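/**
 * Emit instructions that evaluate a boolean rvalue and set the flag register,
 * returning the predicate the caller should use in *predicate.
 */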
769 void
770 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
771 {
772 ir_expression *expr = ir->as_expression();
773
774 *predicate = BRW_PREDICATE_NORMAL;
775
776 if (expr) {
777 src_reg op[2];
778 vec4_instruction *inst;
779
780 assert(expr->get_num_operands() <= 2);
781 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
782 expr->operands[i]->accept(this);
783 op[i] = this->result;
784
785 resolve_ud_negate(&op[i]);
786 }
787
788 switch (expr->operation) {
789 case ir_unop_logic_not:
790 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
791 inst->conditional_mod = BRW_CONDITIONAL_Z;
792 break;
793
794 case ir_binop_logic_xor:
795 inst = emit(XOR(dst_null_d(), op[0], op[1]));
796 inst->conditional_mod = BRW_CONDITIONAL_NZ;
797 break;
798
799 case ir_binop_logic_or:
800 inst = emit(OR(dst_null_d(), op[0], op[1]));
801 inst->conditional_mod = BRW_CONDITIONAL_NZ;
802 break;
803
804 case ir_binop_logic_and:
805 inst = emit(AND(dst_null_d(), op[0], op[1]));
806 inst->conditional_mod = BRW_CONDITIONAL_NZ;
807 break;
808
809 case ir_unop_f2b:
810 if (brw->gen >= 6) {
811 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
812 } else {
813 inst = emit(MOV(dst_null_f(), op[0]));
814 inst->conditional_mod = BRW_CONDITIONAL_NZ;
815 }
816 break;
817
818 case ir_unop_i2b:
819 if (brw->gen >= 6) {
820 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
821 } else {
822 inst = emit(MOV(dst_null_d(), op[0]));
823 inst->conditional_mod = BRW_CONDITIONAL_NZ;
824 }
825 break;
826
827 case ir_binop_all_equal:
828 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
829 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
830 break;
831
832 case ir_binop_any_nequal:
833 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
834 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
835 break;
836
837 case ir_unop_any:
838 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
839 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
840 break;
841
842 case ir_binop_greater:
843 case ir_binop_gequal:
844 case ir_binop_less:
845 case ir_binop_lequal:
846 case ir_binop_equal:
847 case ir_binop_nequal:
848 emit(CMP(dst_null_d(), op[0], op[1],
849 brw_conditional_for_comparison(expr->operation)));
850 break;
851
852 default:
853 assert(!"not reached");
854 break;
855 }
856 return;
857 }
858
859 ir->accept(this);
860
861 resolve_ud_negate(&this->result);
862
863 if (brw->gen >= 6) {
864 vec4_instruction *inst = emit(AND(dst_null_d(),
865 this->result, src_reg(1)));
866 inst->conditional_mod = BRW_CONDITIONAL_NZ;
867 } else {
868 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
869 inst->conditional_mod = BRW_CONDITIONAL_NZ;
870 }
871 }
872
873 /**
874 * Emit a gen6 IF statement with the comparison folded into the IF
875 * instruction.
876 */
877 void
878 vec4_visitor::emit_if_gen6(ir_if *ir)
879 {
880 ir_expression *expr = ir->condition->as_expression();
881
882 if (expr) {
883 src_reg op[2];
884 dst_reg temp;
885
886 assert(expr->get_num_operands() <= 2);
887 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
888 expr->operands[i]->accept(this);
889 op[i] = this->result;
890 }
891
892 switch (expr->operation) {
893 case ir_unop_logic_not:
894 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
895 return;
896
897 case ir_binop_logic_xor:
898 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
899 return;
900
901 case ir_binop_logic_or:
902 temp = dst_reg(this, glsl_type::bool_type);
903 emit(OR(temp, op[0], op[1]));
904 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
905 return;
906
907 case ir_binop_logic_and:
908 temp = dst_reg(this, glsl_type::bool_type);
909 emit(AND(temp, op[0], op[1]));
910 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
911 return;
912
913 case ir_unop_f2b:
914 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
915 return;
916
917 case ir_unop_i2b:
918 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
919 return;
920
921 case ir_binop_greater:
922 case ir_binop_gequal:
923 case ir_binop_less:
924 case ir_binop_lequal:
925 case ir_binop_equal:
926 case ir_binop_nequal:
927 emit(IF(op[0], op[1],
928 brw_conditional_for_comparison(expr->operation)));
929 return;
930
931 case ir_binop_all_equal:
932 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
933 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
934 return;
935
936 case ir_binop_any_nequal:
937 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
938 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
939 return;
940
941 case ir_unop_any:
942 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
943 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
944 return;
945
946 default:
947 assert(!"not reached");
948 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
949 return;
950 }
951 return;
952 }
953
954 ir->condition->accept(this);
955
956 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
957 }
958
959 void
960 vec4_visitor::visit(ir_variable *ir)
961 {
962 dst_reg *reg = NULL;
963
964 if (variable_storage(ir))
965 return;
966
967 switch (ir->data.mode) {
968 case ir_var_shader_in:
969 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
970 break;
971
972 case ir_var_shader_out:
973 reg = new(mem_ctx) dst_reg(this, ir->type);
974
975 for (int i = 0; i < type_size(ir->type); i++) {
976 output_reg[ir->data.location + i] = *reg;
977 output_reg[ir->data.location + i].reg_offset = i;
978 output_reg[ir->data.location + i].type =
979 brw_type_for_base_type(ir->type->get_scalar_type());
980 output_reg_annotation[ir->data.location + i] = ir->name;
981 }
982 break;
983
984 case ir_var_auto:
985 case ir_var_temporary:
986 reg = new(mem_ctx) dst_reg(this, ir->type);
987 break;
988
989 case ir_var_uniform:
990 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
991
992 /* Thanks to the lower_ubo_reference pass, we will see only
993 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
994 * variables, so no need for them to be in variable_ht.
995 *
996 * Atomic counters take no uniform storage, no need to do
997 * anything here.
998 */
999 if (ir->is_in_uniform_block() || ir->type->contains_atomic())
1000 return;
1001
1002 /* Track how big the whole uniform variable is, in case we need to put a
1003 * copy of its data into pull constants for array access.
1004 */
1005 assert(this->uniforms < uniform_array_size);
1006 this->uniform_size[this->uniforms] = type_size(ir->type);
1007
1008 if (!strncmp(ir->name, "gl_", 3)) {
1009 setup_builtin_uniform_values(ir);
1010 } else {
1011 setup_uniform_values(ir);
1012 }
1013 break;
1014
1015 case ir_var_system_value:
1016 reg = make_reg_for_system_value(ir);
1017 break;
1018
1019 default:
1020 assert(!"not reached");
1021 }
1022
1023 reg->type = brw_type_for_base_type(ir->type);
1024 hash_table_insert(this->variable_ht, reg, ir);
1025 }
1026
1027 void
1028 vec4_visitor::visit(ir_loop *ir)
1029 {
1030 /* We don't want debugging output to print the whole body of the
1031 * loop as the annotation.
1032 */
1033 this->base_ir = NULL;
1034
1035 emit(BRW_OPCODE_DO);
1036
1037 visit_instructions(&ir->body_instructions);
1038
1039 emit(BRW_OPCODE_WHILE);
1040 }
1041
1042 void
1043 vec4_visitor::visit(ir_loop_jump *ir)
1044 {
1045 switch (ir->mode) {
1046 case ir_loop_jump::jump_break:
1047 emit(BRW_OPCODE_BREAK);
1048 break;
1049 case ir_loop_jump::jump_continue:
1050 emit(BRW_OPCODE_CONTINUE);
1051 break;
1052 }
1053 }
1054
1055
1056 void
1057 vec4_visitor::visit(ir_function_signature *ir)
1058 {
1059 assert(0);
1060 (void)ir;
1061 }
1062
1063 void
1064 vec4_visitor::visit(ir_function *ir)
1065 {
1066 /* Ignore function bodies other than main() -- we shouldn't see calls to
1067 * them since they should all be inlined.
1068 */
1069 if (strcmp(ir->name, "main") == 0) {
1070 const ir_function_signature *sig;
1071 exec_list empty;
1072
1073 sig = ir->matching_signature(NULL, &empty);
1074
1075 assert(sig);
1076
1077 visit_instructions(&sig->body);
1078 }
1079 }
1080
1081 bool
1082 vec4_visitor::try_emit_sat(ir_expression *ir)
1083 {
1084 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1085 if (!sat_src)
1086 return false;
1087
1088 sat_src->accept(this);
1089 src_reg src = this->result;
1090
1091 this->result = src_reg(this, ir->type);
1092 vec4_instruction *inst;
1093 inst = emit(MOV(dst_reg(this->result), src));
1094 inst->saturate = true;
1095
1096 return true;
1097 }
1098
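/**
 * Try to emit a floating-point add of a multiply as a single MAD.  Returns
 * false if the hardware can't do it (gen < 6) or the expression doesn't match.
 */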
1099 bool
1100 vec4_visitor::try_emit_mad(ir_expression *ir)
1101 {
1102 /* 3-src instructions were introduced in gen6. */
1103 if (brw->gen < 6)
1104 return false;
1105
1106 /* MAD can only handle floating-point data. */
1107 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1108 return false;
1109
1110 ir_rvalue *nonmul = ir->operands[1];
1111 ir_expression *mul = ir->operands[0]->as_expression();
1112
1113 if (!mul || mul->operation != ir_binop_mul) {
1114 nonmul = ir->operands[0];
1115 mul = ir->operands[1]->as_expression();
1116
1117 if (!mul || mul->operation != ir_binop_mul)
1118 return false;
1119 }
1120
1121 nonmul->accept(this);
1122 src_reg src0 = fix_3src_operand(this->result);
1123
1124 mul->operands[0]->accept(this);
1125 src_reg src1 = fix_3src_operand(this->result);
1126
1127 mul->operands[1]->accept(this);
1128 src_reg src2 = fix_3src_operand(this->result);
1129
1130 this->result = src_reg(this, ir->type);
1131 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1132
1133 return true;
1134 }
1135
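/**
 * Try to emit b2f of a comparison directly as a CMP followed by a predicated
 * SEL of 1.0f, avoiding a separate 0/1 boolean temporary.
 */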
1136 bool
1137 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1138 {
1139 ir_expression *const cmp = ir->operands[0]->as_expression();
1140
1141 if (cmp == NULL)
1142 return false;
1143
1144 switch (cmp->operation) {
1145 case ir_binop_less:
1146 case ir_binop_greater:
1147 case ir_binop_lequal:
1148 case ir_binop_gequal:
1149 case ir_binop_equal:
1150 case ir_binop_nequal:
1151 break;
1152
1153 default:
1154 return false;
1155 }
1156
1157 cmp->operands[0]->accept(this);
1158 const src_reg cmp_src0 = this->result;
1159
1160 cmp->operands[1]->accept(this);
1161 const src_reg cmp_src1 = this->result;
1162
1163 this->result = src_reg(this, ir->type);
1164
1165 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1166 brw_conditional_for_comparison(cmp->operation)));
1167
1168 /* If the comparison is false, this->result will just happen to be zero.
1169 */
1170 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1171 this->result, src_reg(1.0f));
1172 inst->predicate = BRW_PREDICATE_NORMAL;
1173 inst->predicate_inverse = true;
1174
1175 return true;
1176 }
1177
1178 void
1179 vec4_visitor::emit_bool_comparison(unsigned int op,
1180 dst_reg dst, src_reg src0, src_reg src1)
1181 {
1182 /* original gen4 does destination conversion before comparison. */
1183 if (brw->gen < 5)
1184 dst.type = src0.type;
1185
1186 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1187
1188 dst.type = BRW_REGISTER_TYPE_D;
1189 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1190 }
1191
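/**
 * Emit a min/max operation: a single SEL with a conditional modifier on gen6
 * and later, or a CMP followed by a predicated SEL on older hardware.
 */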
1192 void
1193 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1194 src_reg src0, src_reg src1)
1195 {
1196 vec4_instruction *inst;
1197
1198 if (brw->gen >= 6) {
1199 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1200 inst->conditional_mod = conditionalmod;
1201 } else {
1202 emit(CMP(dst, src0, src1, conditionalmod));
1203
1204 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1205 inst->predicate = BRW_PREDICATE_NORMAL;
1206 }
1207 }
1208
1209 void
1210 vec4_visitor::emit_lrp(const dst_reg &dst,
1211 const src_reg &x, const src_reg &y, const src_reg &a)
1212 {
1213 if (brw->gen >= 6) {
1214 /* Note that the instruction's argument order is reversed from GLSL
1215 * and the IR.
1216 */
1217 emit(LRP(dst,
1218 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1219 } else {
1220 /* Earlier generations don't support three source operations, so we
1221 * need to emit x*(1-a) + y*a.
1222 */
1223 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1224 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1225 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1226 y_times_a.writemask = dst.writemask;
1227 one_minus_a.writemask = dst.writemask;
1228 x_times_one_minus_a.writemask = dst.writemask;
1229
1230 emit(MUL(y_times_a, y, a));
1231 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1232 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1233 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1234 }
1235 }
1236
1237 void
1238 vec4_visitor::visit(ir_expression *ir)
1239 {
1240 unsigned int operand;
1241 src_reg op[Elements(ir->operands)];
1242 src_reg result_src;
1243 dst_reg result_dst;
1244 vec4_instruction *inst;
1245
1246 if (try_emit_sat(ir))
1247 return;
1248
1249 if (ir->operation == ir_binop_add) {
1250 if (try_emit_mad(ir))
1251 return;
1252 }
1253
1254 if (ir->operation == ir_unop_b2f) {
1255 if (try_emit_b2f_of_compare(ir))
1256 return;
1257 }
1258
1259 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1260 this->result.file = BAD_FILE;
1261 ir->operands[operand]->accept(this);
1262 if (this->result.file == BAD_FILE) {
1263 fprintf(stderr, "Failed to get tree for expression operand:\n");
1264 ir->operands[operand]->fprint(stderr);
1265 exit(1);
1266 }
1267 op[operand] = this->result;
1268
1269 /* Matrix expression operands should have been broken down to vector
1270 * operations already.
1271 */
1272 assert(!ir->operands[operand]->type->is_matrix());
1273 }
1274
1275 int vector_elements = ir->operands[0]->type->vector_elements;
1276 if (ir->operands[1]) {
1277 vector_elements = MAX2(vector_elements,
1278 ir->operands[1]->type->vector_elements);
1279 }
1280
1281 this->result.file = BAD_FILE;
1282
1283 /* Storage for our result. Ideally for an assignment we'd be using
1284 * the actual storage for the result here, instead.
1285 */
1286 result_src = src_reg(this, ir->type);
1287 /* convenience for the emit functions below. */
1288 result_dst = dst_reg(result_src);
1289 /* If nothing special happens, this is the result. */
1290 this->result = result_src;
1291 /* Limit writes to the channels that will be used by result_src later.
1292 * This does limit this temp's use as a temporary for multi-instruction
1293 * sequences.
1294 */
1295 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1296
1297 switch (ir->operation) {
1298 case ir_unop_logic_not:
1299 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1300 * the ones complement of the whole register, not just bit 0.
1301 */
1302 emit(XOR(result_dst, op[0], src_reg(1)));
1303 break;
1304 case ir_unop_neg:
1305 op[0].negate = !op[0].negate;
1306 emit(MOV(result_dst, op[0]));
1307 break;
1308 case ir_unop_abs:
1309 op[0].abs = true;
1310 op[0].negate = false;
1311 emit(MOV(result_dst, op[0]));
1312 break;
1313
1314 case ir_unop_sign:
1315 if (ir->type->is_float()) {
1316 /* AND(val, 0x80000000) gives the sign bit.
1317 *
1318 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1319 * zero.
1320 */
1321 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1322
1323 op[0].type = BRW_REGISTER_TYPE_UD;
1324 result_dst.type = BRW_REGISTER_TYPE_UD;
1325 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1326
1327 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1328 inst->predicate = BRW_PREDICATE_NORMAL;
1329
1330 this->result.type = BRW_REGISTER_TYPE_F;
1331 } else {
1332 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1333 * -> non-negative val generates 0x00000000.
1334 * Predicated OR sets 1 if val is positive.
1335 */
1336 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1337
1338 emit(ASR(result_dst, op[0], src_reg(31)));
1339
1340 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1341 inst->predicate = BRW_PREDICATE_NORMAL;
1342 }
1343 break;
1344
1345 case ir_unop_rcp:
1346 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1347 break;
1348
1349 case ir_unop_exp2:
1350 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1351 break;
1352 case ir_unop_log2:
1353 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1354 break;
1355 case ir_unop_exp:
1356 case ir_unop_log:
1357 assert(!"not reached: should be handled by ir_explog_to_explog2");
1358 break;
1359 case ir_unop_sin:
1360 case ir_unop_sin_reduced:
1361 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1362 break;
1363 case ir_unop_cos:
1364 case ir_unop_cos_reduced:
1365 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1366 break;
1367
1368 case ir_unop_dFdx:
1369 case ir_unop_dFdy:
1370 assert(!"derivatives not valid in vertex shader");
1371 break;
1372
1373 case ir_unop_bitfield_reverse:
1374 emit(BFREV(result_dst, op[0]));
1375 break;
1376 case ir_unop_bit_count:
1377 emit(CBIT(result_dst, op[0]));
1378 break;
1379 case ir_unop_find_msb: {
1380 src_reg temp = src_reg(this, glsl_type::uint_type);
1381
1382 inst = emit(FBH(dst_reg(temp), op[0]));
1383 inst->dst.writemask = WRITEMASK_XYZW;
1384
1385 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1386 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1387 * subtract the result from 31 to convert the MSB count into an LSB count.
1388 */
1389
1390 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1391 temp.swizzle = BRW_SWIZZLE_NOOP;
1392 emit(MOV(result_dst, temp));
1393
1394 src_reg src_tmp = src_reg(result_dst);
1395 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1396
1397 src_tmp.negate = true;
1398 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1399 inst->predicate = BRW_PREDICATE_NORMAL;
1400 break;
1401 }
1402 case ir_unop_find_lsb:
1403 emit(FBL(result_dst, op[0]));
1404 break;
1405
1406 case ir_unop_noise:
1407 assert(!"not reached: should be handled by lower_noise");
1408 break;
1409
1410 case ir_binop_add:
1411 emit(ADD(result_dst, op[0], op[1]));
1412 break;
1413 case ir_binop_sub:
1414 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1415 break;
1416
1417 case ir_binop_mul:
1418 if (brw->gen < 8 && ir->type->is_integer()) {
1419 /* For integer multiplication, the MUL uses the low 16 bits of one of
1420 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1421 * accumulates in the contribution of the upper 16 bits of that
1422 * operand. If we can determine that one of the args is in the low
1423 * 16 bits, though, we can just emit a single MUL.
1424 */
1425 if (ir->operands[0]->is_uint16_constant()) {
1426 if (brw->gen < 7)
1427 emit(MUL(result_dst, op[0], op[1]));
1428 else
1429 emit(MUL(result_dst, op[1], op[0]));
1430 } else if (ir->operands[1]->is_uint16_constant()) {
1431 if (brw->gen < 7)
1432 emit(MUL(result_dst, op[1], op[0]));
1433 else
1434 emit(MUL(result_dst, op[0], op[1]));
1435 } else {
1436 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1437
1438 emit(MUL(acc, op[0], op[1]));
1439 emit(MACH(dst_null_d(), op[0], op[1]));
1440 emit(MOV(result_dst, src_reg(acc)));
1441 }
1442 } else {
1443 emit(MUL(result_dst, op[0], op[1]));
1444 }
1445 break;
1446 case ir_binop_imul_high: {
1447 struct brw_reg acc = retype(brw_acc_reg(), result_dst.type);
1448
1449 emit(MUL(acc, op[0], op[1]));
1450 emit(MACH(result_dst, op[0], op[1]));
1451 break;
1452 }
1453 case ir_binop_div:
1454 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1455 assert(ir->type->is_integer());
1456 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1457 break;
1458 case ir_binop_carry: {
1459 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1460
1461 emit(ADDC(dst_null_ud(), op[0], op[1]));
1462 emit(MOV(result_dst, src_reg(acc)));
1463 break;
1464 }
1465 case ir_binop_borrow: {
1466 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);
1467
1468 emit(SUBB(dst_null_ud(), op[0], op[1]));
1469 emit(MOV(result_dst, src_reg(acc)));
1470 break;
1471 }
1472 case ir_binop_mod:
1473 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1474 assert(ir->type->is_integer());
1475 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1476 break;
1477
1478 case ir_binop_less:
1479 case ir_binop_greater:
1480 case ir_binop_lequal:
1481 case ir_binop_gequal:
1482 case ir_binop_equal:
1483 case ir_binop_nequal: {
1484 emit(CMP(result_dst, op[0], op[1],
1485 brw_conditional_for_comparison(ir->operation)));
1486 emit(AND(result_dst, result_src, src_reg(0x1)));
1487 break;
1488 }
1489
1490 case ir_binop_all_equal:
1491 /* "==" operator producing a scalar boolean. */
1492 if (ir->operands[0]->type->is_vector() ||
1493 ir->operands[1]->type->is_vector()) {
1494 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1495 emit(MOV(result_dst, src_reg(0)));
1496 inst = emit(MOV(result_dst, src_reg(1)));
1497 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1498 } else {
1499 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1500 emit(AND(result_dst, result_src, src_reg(0x1)));
1501 }
1502 break;
1503 case ir_binop_any_nequal:
1504 /* "!=" operator producing a scalar boolean. */
1505 if (ir->operands[0]->type->is_vector() ||
1506 ir->operands[1]->type->is_vector()) {
1507 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1508
1509 emit(MOV(result_dst, src_reg(0)));
1510 inst = emit(MOV(result_dst, src_reg(1)));
1511 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1512 } else {
1513 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1514 emit(AND(result_dst, result_src, src_reg(0x1)));
1515 }
1516 break;
1517
1518 case ir_unop_any:
1519 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1520 emit(MOV(result_dst, src_reg(0)));
1521
1522 inst = emit(MOV(result_dst, src_reg(1)));
1523 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1524 break;
1525
1526 case ir_binop_logic_xor:
1527 emit(XOR(result_dst, op[0], op[1]));
1528 break;
1529
1530 case ir_binop_logic_or:
1531 emit(OR(result_dst, op[0], op[1]));
1532 break;
1533
1534 case ir_binop_logic_and:
1535 emit(AND(result_dst, op[0], op[1]));
1536 break;
1537
1538 case ir_binop_dot:
1539 assert(ir->operands[0]->type->is_vector());
1540 assert(ir->operands[0]->type == ir->operands[1]->type);
1541 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1542 break;
1543
1544 case ir_unop_sqrt:
1545 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1546 break;
1547 case ir_unop_rsq:
1548 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1549 break;
1550
1551 case ir_unop_bitcast_i2f:
1552 case ir_unop_bitcast_u2f:
1553 this->result = op[0];
1554 this->result.type = BRW_REGISTER_TYPE_F;
1555 break;
1556
1557 case ir_unop_bitcast_f2i:
1558 this->result = op[0];
1559 this->result.type = BRW_REGISTER_TYPE_D;
1560 break;
1561
1562 case ir_unop_bitcast_f2u:
1563 this->result = op[0];
1564 this->result.type = BRW_REGISTER_TYPE_UD;
1565 break;
1566
1567 case ir_unop_i2f:
1568 case ir_unop_i2u:
1569 case ir_unop_u2i:
1570 case ir_unop_u2f:
1571 case ir_unop_b2f:
1572 case ir_unop_b2i:
1573 case ir_unop_f2i:
1574 case ir_unop_f2u:
1575 emit(MOV(result_dst, op[0]));
1576 break;
1577 case ir_unop_f2b:
1578 case ir_unop_i2b: {
1579 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1580 emit(AND(result_dst, result_src, src_reg(1)));
1581 break;
1582 }
1583
1584 case ir_unop_trunc:
1585 emit(RNDZ(result_dst, op[0]));
1586 break;
1587 case ir_unop_ceil:
1588 op[0].negate = !op[0].negate;
1589 inst = emit(RNDD(result_dst, op[0]));
1590 this->result.negate = true;
1591 break;
1592 case ir_unop_floor:
1593 inst = emit(RNDD(result_dst, op[0]));
1594 break;
1595 case ir_unop_fract:
1596 inst = emit(FRC(result_dst, op[0]));
1597 break;
1598 case ir_unop_round_even:
1599 emit(RNDE(result_dst, op[0]));
1600 break;
1601
1602 case ir_binop_min:
1603 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1604 break;
1605 case ir_binop_max:
1606 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1607 break;
1608
1609 case ir_binop_pow:
1610 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1611 break;
1612
1613 case ir_unop_bit_not:
1614 inst = emit(NOT(result_dst, op[0]));
1615 break;
1616 case ir_binop_bit_and:
1617 inst = emit(AND(result_dst, op[0], op[1]));
1618 break;
1619 case ir_binop_bit_xor:
1620 inst = emit(XOR(result_dst, op[0], op[1]));
1621 break;
1622 case ir_binop_bit_or:
1623 inst = emit(OR(result_dst, op[0], op[1]));
1624 break;
1625
1626 case ir_binop_lshift:
1627 inst = emit(SHL(result_dst, op[0], op[1]));
1628 break;
1629
1630 case ir_binop_rshift:
1631 if (ir->type->base_type == GLSL_TYPE_INT)
1632 inst = emit(ASR(result_dst, op[0], op[1]));
1633 else
1634 inst = emit(SHR(result_dst, op[0], op[1]));
1635 break;
1636
1637 case ir_binop_bfm:
1638 emit(BFI1(result_dst, op[0], op[1]));
1639 break;
1640
1641 case ir_binop_ubo_load: {
1642 ir_constant *uniform_block = ir->operands[0]->as_constant();
1643 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1644 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1645 src_reg offset;
1646
1647 /* Now, load the vector from that offset. */
1648 assert(ir->type->is_vector() || ir->type->is_scalar());
1649
1650 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1651 packed_consts.type = result.type;
1652 src_reg surf_index =
1653 src_reg(prog_data->base.binding_table.ubo_start + uniform_block->value.u[0]);
1654 if (const_offset_ir) {
1655 if (brw->gen >= 8) {
1656 /* Store the offset in a GRF so we can send-from-GRF. */
1657 offset = src_reg(this, glsl_type::int_type);
1658 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1659 } else {
1660 /* Immediates are fine on older generations since they'll be moved
1661 * to a (potentially fake) MRF at the generator level.
1662 */
1663 offset = src_reg(const_offset / 16);
1664 }
1665 } else {
1666 offset = src_reg(this, glsl_type::uint_type);
1667 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1668 }
1669
1670 if (brw->gen >= 7) {
1671 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1672 grf_offset.type = offset.type;
1673
1674 emit(MOV(grf_offset, offset));
1675
1676 emit(new(mem_ctx) vec4_instruction(this,
1677 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1678 dst_reg(packed_consts),
1679 surf_index,
1680 src_reg(grf_offset)));
1681 } else {
1682 vec4_instruction *pull =
1683 emit(new(mem_ctx) vec4_instruction(this,
1684 VS_OPCODE_PULL_CONSTANT_LOAD,
1685 dst_reg(packed_consts),
1686 surf_index,
1687 offset));
1688 pull->base_mrf = 14;
1689 pull->mlen = 1;
1690 }
1691
1692 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1693 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1694 const_offset % 16 / 4,
1695 const_offset % 16 / 4,
1696 const_offset % 16 / 4);
1697
1698 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1699 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1700 emit(CMP(result_dst, packed_consts, src_reg(0u),
1701 BRW_CONDITIONAL_NZ));
1702 emit(AND(result_dst, result, src_reg(0x1)));
1703 } else {
1704 emit(MOV(result_dst, packed_consts));
1705 }
1706 break;
1707 }
1708
1709 case ir_binop_vector_extract:
1710 assert(!"should have been lowered by vec_index_to_cond_assign");
1711 break;
1712
1713 case ir_triop_fma:
1714 op[0] = fix_3src_operand(op[0]);
1715 op[1] = fix_3src_operand(op[1]);
1716 op[2] = fix_3src_operand(op[2]);
1717 /* Note that the instruction's argument order is reversed from GLSL
1718 * and the IR.
1719 */
1720 emit(MAD(result_dst, op[2], op[1], op[0]));
1721 break;
1722
1723 case ir_triop_lrp:
1724 emit_lrp(result_dst, op[0], op[1], op[2]);
1725 break;
1726
1727 case ir_triop_csel:
1728 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1729 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1730 inst->predicate = BRW_PREDICATE_NORMAL;
1731 break;
1732
1733 case ir_triop_bfi:
1734 op[0] = fix_3src_operand(op[0]);
1735 op[1] = fix_3src_operand(op[1]);
1736 op[2] = fix_3src_operand(op[2]);
1737 emit(BFI2(result_dst, op[0], op[1], op[2]));
1738 break;
1739
1740 case ir_triop_bitfield_extract:
1741 op[0] = fix_3src_operand(op[0]);
1742 op[1] = fix_3src_operand(op[1]);
1743 op[2] = fix_3src_operand(op[2]);
1744 /* Note that the instruction's argument order is reversed from GLSL
1745 * and the IR.
1746 */
1747 emit(BFE(result_dst, op[2], op[1], op[0]));
1748 break;
1749
1750 case ir_triop_vector_insert:
1751 assert(!"should have been lowered by lower_vector_insert");
1752 break;
1753
1754 case ir_quadop_bitfield_insert:
1755 assert(!"not reached: should be handled by "
1756 "bitfield_insert_to_bfm_bfi\n");
1757 break;
1758
1759 case ir_quadop_vector:
1760 assert(!"not reached: should be handled by lower_quadop_vector");
1761 break;
1762
1763 case ir_unop_pack_half_2x16:
1764 emit_pack_half_2x16(result_dst, op[0]);
1765 break;
1766 case ir_unop_unpack_half_2x16:
1767 emit_unpack_half_2x16(result_dst, op[0]);
1768 break;
1769 case ir_unop_pack_snorm_2x16:
1770 case ir_unop_pack_snorm_4x8:
1771 case ir_unop_pack_unorm_2x16:
1772 case ir_unop_pack_unorm_4x8:
1773 case ir_unop_unpack_snorm_2x16:
1774 case ir_unop_unpack_snorm_4x8:
1775 case ir_unop_unpack_unorm_2x16:
1776 case ir_unop_unpack_unorm_4x8:
1777 assert(!"not reached: should be handled by lower_packing_builtins");
1778 break;
1779 case ir_unop_unpack_half_2x16_split_x:
1780 case ir_unop_unpack_half_2x16_split_y:
1781 case ir_binop_pack_half_2x16_split:
1782 assert(!"not reached: should not occur in vertex shader");
1783 break;
1784 case ir_binop_ldexp:
1785 assert(!"not reached: should be handled by ldexp_to_arith()");
1786 break;
1787 }
1788 }
1789
1790
1791 void
1792 vec4_visitor::visit(ir_swizzle *ir)
1793 {
1794 src_reg src;
1795 int i = 0;
1796 int swizzle[4];
1797
1798 /* Note that this is only swizzles in expressions, not those on the left
1799 * hand side of an assignment, which do write masking. See ir_assignment
1800 * for that.
1801 */
1802
1803 ir->val->accept(this);
1804 src = this->result;
1805 assert(src.file != BAD_FILE);
1806
1807 for (i = 0; i < ir->type->vector_elements; i++) {
1808 switch (i) {
1809 case 0:
1810 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1811 break;
1812 case 1:
1813 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1814 break;
1815 case 2:
1816 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1817 break;
1818 case 3:
1819 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1820 break;
1821 }
1822 }
1823 for (; i < 4; i++) {
1824 /* Replicate the last channel out. */
1825 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1826 }
1827
1828 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1829
1830 this->result = src;
1831 }
1832
1833 void
1834 vec4_visitor::visit(ir_dereference_variable *ir)
1835 {
1836 const struct glsl_type *type = ir->type;
1837 dst_reg *reg = variable_storage(ir->var);
1838
1839 if (!reg) {
1840 fail("Failed to find variable storage for %s\n", ir->var->name);
1841 this->result = src_reg(brw_null_reg());
1842 return;
1843 }
1844
1845 this->result = src_reg(*reg);
1846
1847 /* System values get their swizzle from the dst_reg writemask */
1848 if (ir->var->data.mode == ir_var_system_value)
1849 return;
1850
1851 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1852 this->result.swizzle = swizzle_for_size(type->vector_elements);
1853 }
1854
1855
1856 int
1857 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1858 {
1859 /* Under normal circumstances array elements are stored consecutively, so
1860 * the stride is equal to the size of the array element.
1861 */
1862 return type_size(ir->type);
1863 }
1864
1865
1866 void
1867 vec4_visitor::visit(ir_dereference_array *ir)
1868 {
1869 ir_constant *constant_index;
1870 src_reg src;
1871 int array_stride = compute_array_stride(ir);
1872
1873 constant_index = ir->array_index->constant_expression_value();
1874
1875 ir->array->accept(this);
1876 src = this->result;
1877
1878 if (constant_index) {
1879 src.reg_offset += constant_index->value.i[0] * array_stride;
1880 } else {
1881 /* Variable index array dereference. It eats the "vec4" of the
1882 * base of the array and an index that offsets the Mesa register
1883 * index.
1884 */
1885 ir->array_index->accept(this);
1886
1887 src_reg index_reg;
1888
1889 if (array_stride == 1) {
1890 index_reg = this->result;
1891 } else {
1892 index_reg = src_reg(this, glsl_type::int_type);
1893
1894 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1895 }
1896
1897 if (src.reladdr) {
1898 src_reg temp = src_reg(this, glsl_type::int_type);
1899
1900 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1901
1902 index_reg = temp;
1903 }
1904
1905 src.reladdr = ralloc(mem_ctx, src_reg);
1906 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1907 }
1908
1909 /* If the type is smaller than a vec4, replicate the last channel out. */
1910 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1911 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1912 else
1913 src.swizzle = BRW_SWIZZLE_NOOP;
1914 src.type = brw_type_for_base_type(ir->type);
1915
1916 this->result = src;
1917 }
1918
1919 void
1920 vec4_visitor::visit(ir_dereference_record *ir)
1921 {
1922 unsigned int i;
1923 const glsl_type *struct_type = ir->record->type;
1924 int offset = 0;
1925
1926 ir->record->accept(this);
1927
1928 for (i = 0; i < struct_type->length; i++) {
1929 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1930 break;
1931 offset += type_size(struct_type->fields.structure[i].type);
1932 }
1933
1934 /* If the type is smaller than a vec4, replicate the last channel out. */
1935 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1936 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1937 else
1938 this->result.swizzle = BRW_SWIZZLE_NOOP;
1939 this->result.type = brw_type_for_base_type(ir->type);
1940
1941 this->result.reg_offset += offset;
1942 }
1943
1944 /**
1945 * We want to be careful in assignment setup to hit the actual storage
1946 * instead of potentially using a temporary like we might with the
1947 * ir_dereference handler.
1948 */
1949 static dst_reg
1950 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1951 {
1952 /* The LHS must be a dereference. If the LHS is a variable-indexed array
1953 * access of a vector, it must be separated into a series of conditional
1954 * moves before reaching this point (see ir_vec_index_to_cond_assign).
1955 */
1956 assert(ir->as_dereference());
1957 ir_dereference_array *deref_array = ir->as_dereference_array();
1958 if (deref_array) {
1959 assert(!deref_array->array->type->is_vector());
1960 }
1961
1962 /* Use the rvalue deref handler for the most part. We'll ignore
1963 * swizzles in it and write swizzles using writemask, though.
1964 */
1965 ir->accept(v);
1966 return dst_reg(v->result);
1967 }
1968
1969 void
1970 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1971 const struct glsl_type *type, uint32_t predicate)
1972 {
1973 if (type->base_type == GLSL_TYPE_STRUCT) {
1974 for (unsigned int i = 0; i < type->length; i++) {
1975 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1976 }
1977 return;
1978 }
1979
1980 if (type->is_array()) {
1981 for (unsigned int i = 0; i < type->length; i++) {
1982 emit_block_move(dst, src, type->fields.array, predicate);
1983 }
1984 return;
1985 }
1986
1987 if (type->is_matrix()) {
1988 const struct glsl_type *vec_type;
1989
1990 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1991 type->vector_elements, 1);
1992
1993 for (int i = 0; i < type->matrix_columns; i++) {
1994 emit_block_move(dst, src, vec_type, predicate);
1995 }
1996 return;
1997 }
1998
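/* Base case: a scalar or vector is moved as a single (possibly partial)
 * vec4 below.
 */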
1999 assert(type->is_scalar() || type->is_vector());
2000
2001 dst->type = brw_type_for_base_type(type);
2002 src->type = dst->type;
2003
2004 dst->writemask = (1 << type->vector_elements) - 1;
2005
2006 src->swizzle = swizzle_for_size(type->vector_elements);
2007
2008 vec4_instruction *inst = emit(MOV(*dst, *src));
2009 inst->predicate = predicate;
2010
2011 dst->reg_offset++;
2012 src->reg_offset++;
2013 }
2014
2015
2016 /* If the RHS processing resulted in an instruction generating a
2017 * temporary value, and it would be easy to rewrite the instruction to
2018 * generate its result right into the LHS instead, do so. This ends
2019 * up reliably removing instructions where it can be tricky to do so
2020 * later without real UD chain information.
2021 */
2022 bool
2023 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2024 dst_reg dst,
2025 src_reg src,
2026 vec4_instruction *pre_rhs_inst,
2027 vec4_instruction *last_rhs_inst)
2028 {
2029 /* This could be supported, but it would take more smarts. */
2030 if (ir->condition)
2031 return false;
2032
2033 if (pre_rhs_inst == last_rhs_inst)
2034 return false; /* No instructions generated to work with. */
2035
2036 /* Make sure the last instruction generated our source reg. */
2037 if (src.file != GRF ||
2038 src.file != last_rhs_inst->dst.file ||
2039 src.reg != last_rhs_inst->dst.reg ||
2040 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2041 src.reladdr ||
2042 src.abs ||
2043 src.negate ||
2044 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2045 return false;
2046
2047 /* Check that the last instruction fully initialized the channels
2048 * we want to use, in the order we want to use them. We could
2049 * potentially reswizzle the operands of many instructions so that
2050 * we could handle out of order channels, but don't yet.
2051 */
2052
2053 for (unsigned i = 0; i < 4; i++) {
2054 if (dst.writemask & (1 << i)) {
2055 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2056 return false;
2057
2058 if (BRW_GET_SWZ(src.swizzle, i) != i)
2059 return false;
2060 }
2061 }
2062
2063 /* Success! Rewrite the instruction. */
2064 last_rhs_inst->dst.file = dst.file;
2065 last_rhs_inst->dst.reg = dst.reg;
2066 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2067 last_rhs_inst->dst.reladdr = dst.reladdr;
2068 last_rhs_inst->dst.writemask &= dst.writemask;
2069
2070 return true;
2071 }
2072
2073 void
2074 vec4_visitor::visit(ir_assignment *ir)
2075 {
2076 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2077 uint32_t predicate = BRW_PREDICATE_NONE;
2078
2079 if (!ir->lhs->type->is_scalar() &&
2080 !ir->lhs->type->is_vector()) {
2081 ir->rhs->accept(this);
2082 src_reg src = this->result;
2083
2084 if (ir->condition) {
2085 emit_bool_to_cond_code(ir->condition, &predicate);
2086 }
2087
2088 /* emit_block_move doesn't account for swizzles in the source register.
2089 * This should be ok, since the source register is a structure or an
2090 * array, and those can't be swizzled. But double-check to be sure.
2091 */
2092 assert(src.swizzle ==
2093 (ir->rhs->type->is_matrix()
2094 ? swizzle_for_size(ir->rhs->type->vector_elements)
2095 : BRW_SWIZZLE_NOOP));
2096
2097 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2098 return;
2099 }
2100
2101 /* Now we're down to just a scalar/vector with writemasks. */
2102 int i;
2103
2104 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2105 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2106
2107 ir->rhs->accept(this);
2108
2109 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2110
2111 src_reg src = this->result;
2112
2113 int swizzles[4];
2114 int first_enabled_chan = 0;
2115 int src_chan = 0;
2116
2117 assert(ir->lhs->type->is_vector() ||
2118 ir->lhs->type->is_scalar());
2119 dst.writemask = ir->write_mask;
2120
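/* Find the swizzle channel that feeds the first enabled writemask channel;
 * unwritten channels below reuse it so we never reference an RHS channel
 * that was never populated.
 */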
2121 for (int i = 0; i < 4; i++) {
2122 if (dst.writemask & (1 << i)) {
2123 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2124 break;
2125 }
2126 }
2127
2128 /* Swizzle a small RHS vector into the channels being written.
2129 *
2130 * glsl ir treats write_mask as dictating how many channels are
2131 * present on the RHS, while in our instructions we need to make
2132 * those channels appear in the slots of the vec4 they're written to.
2133 */
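/* For example, `lhs.zw = rhs` with a vec2 RHS (swizzle .xyyy) yields the
 * swizzle .yyxy, so the z and w slots read the RHS's first and second
 * components.
 */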
2134 for (int i = 0; i < 4; i++) {
2135 if (dst.writemask & (1 << i))
2136 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2137 else
2138 swizzles[i] = first_enabled_chan;
2139 }
2140 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2141 swizzles[2], swizzles[3]);
2142
2143 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2144 return;
2145 }
2146
2147 if (ir->condition) {
2148 emit_bool_to_cond_code(ir->condition, &predicate);
2149 }
2150
2151 for (i = 0; i < type_size(ir->lhs->type); i++) {
2152 vec4_instruction *inst = emit(MOV(dst, src));
2153 inst->predicate = predicate;
2154
2155 dst.reg_offset++;
2156 src.reg_offset++;
2157 }
2158 }
2159
2160 void
2161 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2162 {
2163 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2164 foreach_in_list(ir_constant, field_value, &ir->components) {
2165 emit_constant_values(dst, field_value);
2166 }
2167 return;
2168 }
2169
2170 if (ir->type->is_array()) {
2171 for (unsigned int i = 0; i < ir->type->length; i++) {
2172 emit_constant_values(dst, ir->array_elements[i]);
2173 }
2174 return;
2175 }
2176
2177 if (ir->type->is_matrix()) {
2178 for (int i = 0; i < ir->type->matrix_columns; i++) {
2179 float *vec = &ir->value.f[i * ir->type->vector_elements];
2180
2181 for (int j = 0; j < ir->type->vector_elements; j++) {
2182 dst->writemask = 1 << j;
2183 dst->type = BRW_REGISTER_TYPE_F;
2184
2185 emit(MOV(*dst, src_reg(vec[j])));
2186 }
2187 dst->reg_offset++;
2188 }
2189 return;
2190 }
2191
2192 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2193
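/* remaining_writemask tracks which channels still need a MOV; channels
 * coalesced into an earlier write are cleared from it below.
 */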
2194 for (int i = 0; i < ir->type->vector_elements; i++) {
2195 if (!(remaining_writemask & (1 << i)))
2196 continue;
2197
2198 dst->writemask = 1 << i;
2199 dst->type = brw_type_for_base_type(ir->type);
2200
2201 /* Find other components that match the one we're about to
2202 * write. Emits fewer instructions for things like vec4(0.5,
2203 * 1.5, 1.5, 1.5).
2204 */
2205 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2206 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2207 if (ir->value.b[i] == ir->value.b[j])
2208 dst->writemask |= (1 << j);
2209 } else {
2210 /* u, i, and f storage all line up, so no need for a
2211 * switch case for comparing each type.
2212 */
2213 if (ir->value.u[i] == ir->value.u[j])
2214 dst->writemask |= (1 << j);
2215 }
2216 }
2217
2218 switch (ir->type->base_type) {
2219 case GLSL_TYPE_FLOAT:
2220 emit(MOV(*dst, src_reg(ir->value.f[i])));
2221 break;
2222 case GLSL_TYPE_INT:
2223 emit(MOV(*dst, src_reg(ir->value.i[i])));
2224 break;
2225 case GLSL_TYPE_UINT:
2226 emit(MOV(*dst, src_reg(ir->value.u[i])));
2227 break;
2228 case GLSL_TYPE_BOOL:
2229 emit(MOV(*dst, src_reg(ir->value.b[i])));
2230 break;
2231 default:
2232 assert(!"Non-float/uint/int/bool constant");
2233 break;
2234 }
2235
2236 remaining_writemask &= ~dst->writemask;
2237 }
2238 dst->reg_offset++;
2239 }
2240
2241 void
2242 vec4_visitor::visit(ir_constant *ir)
2243 {
2244 dst_reg dst = dst_reg(this, ir->type);
2245 this->result = src_reg(dst);
2246
2247 emit_constant_values(&dst, ir);
2248 }
2249
2250 void
2251 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2252 {
2253 ir_dereference *deref = static_cast<ir_dereference *>(
2254 ir->actual_parameters.get_head());
2255 ir_variable *location = deref->variable_referenced();
2256 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2257 location->data.atomic.buffer_index);
2258
2259 /* Calculate the surface offset */
2260 src_reg offset(this, glsl_type::uint_type);
2261 ir_dereference_array *deref_array = deref->as_dereference_array();
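/* For an array of counters the offset is array_index * ATOMIC_COUNTER_SIZE
 * plus the counter's declared offset within the buffer; a plain counter
 * just uses its declared offset.
 */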
2262 if (deref_array) {
2263 deref_array->array_index->accept(this);
2264
2265 src_reg tmp(this, glsl_type::uint_type);
2266 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2267 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2268 } else {
2269 offset = location->data.atomic.offset;
2270 }
2271
2272 /* Emit the appropriate machine instruction */
2273 const char *callee = ir->callee->function_name();
2274 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2275
2276 if (!strcmp("__intrinsic_atomic_read", callee)) {
2277 emit_untyped_surface_read(surf_index, dst, offset);
2278
2279 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2280 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2281 src_reg(), src_reg());
2282
2283 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2284 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2285 src_reg(), src_reg());
2286 }
2287 }
2288
2289 void
2290 vec4_visitor::visit(ir_call *ir)
2291 {
2292 const char *callee = ir->callee->function_name();
2293
2294 if (!strcmp("__intrinsic_atomic_read", callee) ||
2295 !strcmp("__intrinsic_atomic_increment", callee) ||
2296 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2297 visit_atomic_counter_intrinsic(ir);
2298 } else {
2299 assert(!"Unsupported intrinsic.");
2300 }
2301 }
2302
2303 src_reg
2304 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, int sampler)
2305 {
2306 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2307 inst->base_mrf = 2;
2308 inst->mlen = 1;
2309 inst->sampler = sampler;
2310 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2311 inst->dst.writemask = WRITEMASK_XYZW;
2312
2313 /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
2314 int param_base = inst->base_mrf;
2315 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2316 int zero_mask = 0xf & ~coord_mask;
2317
2318 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2319 coordinate));
2320
2321 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2322 src_reg(0)));
2323
2324 emit(inst);
2325 return src_reg(inst->dst);
2326 }
2327
2328 void
2329 vec4_visitor::visit(ir_texture *ir)
2330 {
2331 int sampler =
2332 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2333
2334 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2335 * emitting anything other than setting up the constant result.
2336 */
2337 if (ir->op == ir_tg4) {
2338 ir_constant *chan = ir->lod_info.component->as_constant();
2339 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2340 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2341 dst_reg result(this, ir->type);
2342 this->result = src_reg(result);
2343 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2344 return;
2345 }
2346 }
2347
2348 /* Should be lowered by do_lower_texture_projection */
2349 assert(!ir->projector);
2350
2351 /* Should be lowered */
2352 assert(!ir->offset || !ir->offset->type->is_array());
2353
2354 /* Generate code to compute all the subexpression trees. This has to be
2355 * done before loading any values into MRFs for the sampler message since
2356 * generating these values may involve SEND messages that need the MRFs.
2357 */
2358 src_reg coordinate;
2359 if (ir->coordinate) {
2360 ir->coordinate->accept(this);
2361 coordinate = this->result;
2362 }
2363
2364 src_reg shadow_comparitor;
2365 if (ir->shadow_comparitor) {
2366 ir->shadow_comparitor->accept(this);
2367 shadow_comparitor = this->result;
2368 }
2369
2370 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2371 src_reg offset_value;
2372 if (has_nonconstant_offset) {
2373 ir->offset->accept(this);
2374 offset_value = src_reg(this->result);
2375 }
2376
2377 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2378 src_reg lod, dPdx, dPdy, sample_index, mcs;
2379 switch (ir->op) {
2380 case ir_tex:
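/* Vertex shaders have no implicit derivatives, so an ordinary texture()
 * samples at LOD 0; the opcode switch below maps ir_tex to the TXL message.
 */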
2381 lod = src_reg(0.0f);
2382 lod_type = glsl_type::float_type;
2383 break;
2384 case ir_txf:
2385 case ir_txl:
2386 case ir_txs:
2387 ir->lod_info.lod->accept(this);
2388 lod = this->result;
2389 lod_type = ir->lod_info.lod->type;
2390 break;
2391 case ir_query_levels:
2392 lod = src_reg(0);
2393 lod_type = glsl_type::int_type;
2394 break;
2395 case ir_txf_ms:
2396 ir->lod_info.sample_index->accept(this);
2397 sample_index = this->result;
2398 sample_index_type = ir->lod_info.sample_index->type;
2399
2400 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2401 mcs = emit_mcs_fetch(ir, coordinate, sampler);
2402 else
2403 mcs = src_reg(0u);
2404 break;
2405 case ir_txd:
2406 ir->lod_info.grad.dPdx->accept(this);
2407 dPdx = this->result;
2408
2409 ir->lod_info.grad.dPdy->accept(this);
2410 dPdy = this->result;
2411
2412 lod_type = ir->lod_info.grad.dPdx->type;
2413 break;
2414 case ir_txb:
2415 case ir_lod:
2416 case ir_tg4:
2417 break;
2418 }
2419
2420 vec4_instruction *inst = NULL;
2421 switch (ir->op) {
2422 case ir_tex:
2423 case ir_txl:
2424 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2425 break;
2426 case ir_txd:
2427 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2428 break;
2429 case ir_txf:
2430 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2431 break;
2432 case ir_txf_ms:
2433 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_CMS);
2434 break;
2435 case ir_txs:
2436 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2437 break;
2438 case ir_tg4:
2439 if (has_nonconstant_offset)
2440 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4_OFFSET);
2441 else
2442 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TG4);
2443 break;
2444 case ir_query_levels:
2445 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2446 break;
2447 case ir_txb:
2448 assert(!"TXB is not valid for vertex shaders.");
2449 break;
2450 case ir_lod:
2451 assert(!"LOD is not valid for vertex shaders.");
2452 break;
2453 default:
2454 assert(!"Unrecognized tex op");
2455 }
2456
2457 if (ir->offset != NULL && ir->op != ir_txf)
2458 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
2459
2460 /* Stuff the channel select bits in the top of the texture offset */
2461 if (ir->op == ir_tg4)
2462 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2463
2464 /* The message header is necessary for:
2465 * - Gen4 (always)
2466 * - Texel offsets
2467 * - Gather channel selection
2468 * - Sampler indices too large to fit in a 4-bit value.
2469 */
2470 inst->header_present =
2471 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2472 sampler >= 16;
2473 inst->base_mrf = 2;
2474 inst->mlen = inst->header_present + 1; /* always at least one */
2475 inst->sampler = sampler;
2476 inst->dst = dst_reg(this, ir->type);
2477 inst->dst.writemask = WRITEMASK_XYZW;
2478 inst->shadow_compare = ir->shadow_comparitor != NULL;
2479
2480 /* MRF for the first parameter */
2481 int param_base = inst->base_mrf + inst->header_present;
2482
2483 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2484 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2485 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2486 } else {
2487 /* Load the coordinate */
2488 /* FINISHME: gl_clamp_mask and saturate */
2489 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2490 int zero_mask = 0xf & ~coord_mask;
2491
2492 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2493 coordinate));
2494
2495 if (zero_mask != 0) {
2496 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2497 src_reg(0)));
2498 }
2499 /* Load the shadow comparitor */
2500 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2501 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2502 WRITEMASK_X),
2503 shadow_comparitor));
2504 inst->mlen++;
2505 }
2506
2507 /* Load the LOD info */
2508 if (ir->op == ir_tex || ir->op == ir_txl) {
2509 int mrf, writemask;
2510 if (brw->gen >= 5) {
2511 mrf = param_base + 1;
2512 if (ir->shadow_comparitor) {
2513 writemask = WRITEMASK_Y;
2514 /* mlen already incremented */
2515 } else {
2516 writemask = WRITEMASK_X;
2517 inst->mlen++;
2518 }
2519 } else /* brw->gen == 4 */ {
2520 mrf = param_base;
2521 writemask = WRITEMASK_W;
2522 }
2523 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2524 } else if (ir->op == ir_txf) {
2525 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2526 } else if (ir->op == ir_txf_ms) {
2527 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2528 sample_index));
2529 if (brw->gen >= 7)
2530 /* MCS data is in the first channel of `mcs`; replicate .x across the whole
2531 * vec4 so the MOV below can mask it into the .y channel of the second
2532 * param vec4. Only this swizzle is gen7-specific: the MOV itself is not
2533 * under the if above, and on older gens `mcs` is simply 0u. */
2534 mcs.swizzle = BRW_SWIZZLE_XXXX;
2535 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2536 mcs));
2537 inst->mlen++;
2538 } else if (ir->op == ir_txd) {
2539 const glsl_type *type = lod_type;
2540
2541 if (brw->gen >= 5) {
2542 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2543 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2544 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2545 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2546 inst->mlen++;
2547
2548 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2549 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2550 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2551 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2552 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2553 inst->mlen++;
2554
2555 if (ir->shadow_comparitor) {
2556 emit(MOV(dst_reg(MRF, param_base + 2,
2557 ir->shadow_comparitor->type, WRITEMASK_Z),
2558 shadow_comparitor));
2559 }
2560 }
2561 } else /* brw->gen == 4 */ {
2562 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2563 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2564 inst->mlen += 2;
2565 }
2566 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2567 if (ir->shadow_comparitor) {
2568 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2569 shadow_comparitor));
2570 }
2571
2572 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2573 offset_value));
2574 inst->mlen++;
2575 }
2576 }
2577
2578 emit(inst);
2579
2580 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2581 * spec requires layers.
2582 */
2583 if (ir->op == ir_txs) {
2584 glsl_type const *type = ir->sampler->type;
2585 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2586 type->sampler_array) {
2587 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2588 writemask(inst->dst, WRITEMASK_Z),
2589 src_reg(inst->dst), src_reg(6));
2590 }
2591 }
2592
2593 if (brw->gen == 6 && ir->op == ir_tg4) {
2594 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2595 }
2596
2597 swizzle_result(ir, src_reg(inst->dst), sampler);
2598 }
2599
2600 /**
2601 * Apply workarounds for Gen6 gather with UINT/SINT
2602 */
2603 void
2604 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2605 {
2606 if (!wa)
2607 return;
2608
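/* The workaround bits encode the channel width (8 or 16 bits) and whether
 * the fetched value needs to be sign-extended afterwards.
 */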
2609 int width = (wa & WA_8BIT) ? 8 : 16;
2610 dst_reg dst_f = dst;
2611 dst_f.type = BRW_REGISTER_TYPE_F;
2612
2613 /* Convert from UNORM to UINT */
2614 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2615 emit(MOV(dst, src_reg(dst_f)));
2616
2617 if (wa & WA_SIGN) {
2618 /* Reinterpret the UINT value as a signed INT value by
2619 * shifting the sign bit into place, then shifting back
2620 * preserving sign.
2621 */
2622 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2623 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2624 }
2625 }
2626
2627 /**
2628 * Set up the gather channel based on the swizzle, for gather4.
2629 */
2630 uint32_t
2631 vec4_visitor::gather_channel(ir_texture *ir, int sampler)
2632 {
2633 ir_constant *chan = ir->lod_info.component->as_constant();
2634 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2635 switch (swiz) {
2636 case SWIZZLE_X: return 0;
2637 case SWIZZLE_Y:
2638 /* gather4 sampler is broken for green channel on RG32F --
2639 * we must ask for blue instead.
2640 */
2641 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2642 return 2;
2643 return 1;
2644 case SWIZZLE_Z: return 2;
2645 case SWIZZLE_W: return 3;
2646 default:
2647 assert(!"Not reached"); /* zero, one swizzles handled already */
2648 return 0;
2649 }
2650 }
2651
2652 void
2653 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2654 {
2655 int s = key->tex.swizzles[sampler];
2656
2657 this->result = src_reg(this, ir->type);
2658 dst_reg swizzled_result(this->result);
2659
2660 if (ir->op == ir_query_levels) {
2661 /* # levels is in .w */
2662 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2663 emit(MOV(swizzled_result, orig_val));
2664 return;
2665 }
2666
2667 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2668 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2669 emit(MOV(swizzled_result, orig_val));
2670 return;
2671 }
2672
2673
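/* Split the API swizzle into channels copied from the texture result and
 * channels forced to constant 0.0 or 1.0, then emit at most one MOV for
 * each group below.
 */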
2674 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2675 int swizzle[4] = {0};
2676
2677 for (int i = 0; i < 4; i++) {
2678 switch (GET_SWZ(s, i)) {
2679 case SWIZZLE_ZERO:
2680 zero_mask |= (1 << i);
2681 break;
2682 case SWIZZLE_ONE:
2683 one_mask |= (1 << i);
2684 break;
2685 default:
2686 copy_mask |= (1 << i);
2687 swizzle[i] = GET_SWZ(s, i);
2688 break;
2689 }
2690 }
2691
2692 if (copy_mask) {
2693 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2694 swizzled_result.writemask = copy_mask;
2695 emit(MOV(swizzled_result, orig_val));
2696 }
2697
2698 if (zero_mask) {
2699 swizzled_result.writemask = zero_mask;
2700 emit(MOV(swizzled_result, src_reg(0.0f)));
2701 }
2702
2703 if (one_mask) {
2704 swizzled_result.writemask = one_mask;
2705 emit(MOV(swizzled_result, src_reg(1.0f)));
2706 }
2707 }
2708
2709 void
2710 vec4_visitor::visit(ir_return *)
2711 {
2712 assert(!"not reached");
2713 }
2714
2715 void
2716 vec4_visitor::visit(ir_discard *)
2717 {
2718 assert(!"not reached");
2719 }
2720
2721 void
2722 vec4_visitor::visit(ir_if *ir)
2723 {
2724 /* Don't point the annotation at the if statement, because then it plus
2725 * the then and else blocks get printed.
2726 */
2727 this->base_ir = ir->condition;
2728
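/* Gen6's IF instruction can evaluate a comparison directly, so
 * emit_if_gen6() folds the condition into the IF instead of materializing
 * it into a predicate first.
 */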
2729 if (brw->gen == 6) {
2730 emit_if_gen6(ir);
2731 } else {
2732 uint32_t predicate;
2733 emit_bool_to_cond_code(ir->condition, &predicate);
2734 emit(IF(predicate));
2735 }
2736
2737 visit_instructions(&ir->then_instructions);
2738
2739 if (!ir->else_instructions.is_empty()) {
2740 this->base_ir = ir->condition;
2741 emit(BRW_OPCODE_ELSE);
2742
2743 visit_instructions(&ir->else_instructions);
2744 }
2745
2746 this->base_ir = ir->condition;
2747 emit(BRW_OPCODE_ENDIF);
2748 }
2749
2750 void
2751 vec4_visitor::visit(ir_emit_vertex *)
2752 {
2753 assert(!"not reached");
2754 }
2755
2756 void
2757 vec4_visitor::visit(ir_end_primitive *)
2758 {
2759 assert(!"not reached");
2760 }
2761
2762 void
2763 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2764 dst_reg dst, src_reg offset,
2765 src_reg src0, src_reg src1)
2766 {
2767 unsigned mlen = 0;
2768
2769 /* Set the atomic operation offset. */
2770 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2771 mlen++;
2772
2773 /* Set the atomic operation arguments. */
2774 if (src0.file != BAD_FILE) {
2775 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2776 mlen++;
2777 }
2778
2779 if (src1.file != BAD_FILE) {
2780 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2781 mlen++;
2782 }
2783
2784 /* Emit the instruction. Note that this maps to the normal SIMD8
2785 * untyped atomic message on Ivy Bridge, but that's OK because
2786 * unused channels will be masked out.
2787 */
2788 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2789 src_reg(atomic_op), src_reg(surf_index));
2790 inst->base_mrf = 0;
2791 inst->mlen = mlen;
2792 }
2793
2794 void
2795 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2796 src_reg offset)
2797 {
2798 /* Set the surface read offset. */
2799 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2800
2801 /* Emit the instruction. Note that this maps to the normal SIMD8
2802 * untyped surface read message, but that's OK because unused
2803 * channels will be masked out.
2804 */
2805 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2806 dst, src_reg(surf_index));
2807 inst->base_mrf = 0;
2808 inst->mlen = 1;
2809 }
2810
2811 void
2812 vec4_visitor::emit_ndc_computation()
2813 {
2814 /* Get the position */
2815 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2816
2817 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2818 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2819 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2820
2821 current_annotation = "NDC";
2822 dst_reg ndc_w = ndc;
2823 ndc_w.writemask = WRITEMASK_W;
2824 src_reg pos_w = pos;
2825 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2826 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2827
2828 dst_reg ndc_xyz = ndc;
2829 ndc_xyz.writemask = WRITEMASK_XYZ;
2830
2831 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2832 }
2833
2834 void
2835 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2836 {
2837 if (brw->gen < 6 &&
2838 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2839 key->userclip_active || brw->has_negative_rhw_bug)) {
2840 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2841 dst_reg header1_w = header1;
2842 header1_w.writemask = WRITEMASK_W;
2843
2844 emit(MOV(header1, 0u));
2845
2846 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2847 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2848
2849 current_annotation = "Point size";
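/* Scale the float point size into a fixed-point value (the MUL result is
 * converted to an integer when written to the UD destination) and mask it
 * into bits 8..18 of the header word, where the point-width field lives.
 */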
2850 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2851 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2852 }
2853
2854 if (key->userclip_active) {
2855 current_annotation = "Clipping flags";
2856 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2857 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2858
2859 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2860 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2861 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2862
2863 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2864 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2865 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2866 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2867 }
2868
2869 /* i965 clipping workaround:
2870 * 1) Test for -ve rhw
2871 * 2) If set,
2872 * set ndc = (0,0,0,0)
2873 * set ucp[6] = 1
2874 *
2875 * Later, clipping will detect ucp[6] and ensure the primitive is
2876 * clipped against all fixed planes.
2877 */
2878 if (brw->has_negative_rhw_bug) {
2879 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2880 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2881 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2882 vec4_instruction *inst;
2883 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2884 inst->predicate = BRW_PREDICATE_NORMAL;
2885 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2886 inst->predicate = BRW_PREDICATE_NORMAL;
2887 }
2888
2889 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2890 } else if (brw->gen < 6) {
2891 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2892 } else {
2893 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2894 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2895 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2896 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2897 }
2898 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2899 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2900 src_reg(output_reg[VARYING_SLOT_LAYER])));
2901 }
2902 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2903 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Z), BRW_REGISTER_TYPE_D),
2904 src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2905 }
2906 }
2907 }
2908
2909 void
2910 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2911 {
2912 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2913 *
2914 * "If a linked set of shaders forming the vertex stage contains no
2915 * static write to gl_ClipVertex or gl_ClipDistance, but the
2916 * application has requested clipping against user clip planes through
2917 * the API, then the coordinate written to gl_Position is used for
2918 * comparison against the user clip planes."
2919 *
2920 * This function is only called if the shader didn't write to
2921 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2922 * if the user wrote to it; otherwise we use gl_Position.
2923 */
2924 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2925 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2926 clip_vertex = VARYING_SLOT_POS;
2927 }
2928
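/* Each enabled channel gets DP4(clip_vertex, userplane[i + offset]),
 * producing the clip distance against user plane i + offset.
 */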
2929 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2930 ++i) {
2931 reg.writemask = 1 << i;
2932 emit(DP4(reg,
2933 src_reg(output_reg[clip_vertex]),
2934 src_reg(this->userplane[i + offset])));
2935 }
2936 }
2937
2938 void
2939 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2940 {
2941 assert (varying < VARYING_SLOT_MAX);
2942 reg.type = output_reg[varying].type;
2943 current_annotation = output_reg_annotation[varying];
2944 /* Copy the register, saturating if necessary */
2945 vec4_instruction *inst = emit(MOV(reg,
2946 src_reg(output_reg[varying])));
2947 if ((varying == VARYING_SLOT_COL0 ||
2948 varying == VARYING_SLOT_COL1 ||
2949 varying == VARYING_SLOT_BFC0 ||
2950 varying == VARYING_SLOT_BFC1) &&
2951 key->clamp_vertex_color) {
2952 inst->saturate = true;
2953 }
2954 }
2955
2956 void
2957 vec4_visitor::emit_urb_slot(int mrf, int varying)
2958 {
2959 struct brw_reg hw_reg = brw_message_reg(mrf);
2960 dst_reg reg = dst_reg(MRF, mrf);
2961 reg.type = BRW_REGISTER_TYPE_F;
2962
2963 switch (varying) {
2964 case VARYING_SLOT_PSIZ:
2965 /* PSIZ is always in slot 0, and is coupled with other flags. */
2966 current_annotation = "indices, point width, clip flags";
2967 emit_psiz_and_flags(hw_reg);
2968 break;
2969 case BRW_VARYING_SLOT_NDC:
2970 current_annotation = "NDC";
2971 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2972 break;
2973 case VARYING_SLOT_POS:
2974 current_annotation = "gl_Position";
2975 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2976 break;
2977 case VARYING_SLOT_EDGE:
2978 /* This is present when doing unfilled polygons. We're supposed to copy
2979 * the edge flag from the user-provided vertex array
2980 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2981 * of that attribute (starts as 1.0f). This is then used in clipping to
2982 * determine which edges should be drawn as wireframe.
2983 */
2984 current_annotation = "edge flag";
2985 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2986 glsl_type::float_type, WRITEMASK_XYZW))));
2987 break;
2988 case BRW_VARYING_SLOT_PAD:
2989 /* No need to write to this slot */
2990 break;
2991 default:
2992 emit_generic_urb_slot(reg, varying);
2993 break;
2994 }
2995 }
2996
2997 static int
2998 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2999 {
3000 if (brw->gen >= 6) {
3001 /* URB data written (does not include the message header reg) must
3002 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3003 * section 5.4.3.2.2: URB_INTERLEAVED.
3004 *
3005 * URB entries are allocated on a multiple of 1024 bits, so an
3006 * extra 128 bits written here to make the end align to 256 is
3007 * no problem.
3008 */
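/* mlen counts the header register too, so forcing mlen odd keeps the data
 * portion even.
 */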
3009 if ((mlen % 2) != 1)
3010 mlen++;
3011 }
3012
3013 return mlen;
3014 }
3015
3016
3017 /**
3018 * Generates the VUE payload plus the necessary URB write instructions to
3019 * output it.
3020 *
3021 * The VUE layout is documented in Volume 2a.
3022 */
3023 void
3024 vec4_visitor::emit_vertex()
3025 {
3026 /* MRF 0 is reserved for the debugger, so start with message header
3027 * in MRF 1.
3028 */
3029 int base_mrf = 1;
3030 int mrf = base_mrf;
3031 /* In the process of generating our URB write message contents, we
3032 * may need to unspill a register or load from an array. Those
3033 * reads would use MRFs 14-15.
3034 */
3035 int max_usable_mrf = 13;
3036
3037 /* The following assertion verifies that max_usable_mrf causes an
3038 * even number of URB write data registers, which will meet gen6's
3039 * requirements for length alignment.
3040 */
3041 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3042
3043 /* First mrf is the g0-based message header containing URB handles and
3044 * such.
3045 */
3046 emit_urb_write_header(mrf++);
3047
3048 if (brw->gen < 6) {
3049 emit_ndc_computation();
3050 }
3051
3052 /* Lower legacy ff and ClipVertex clipping to clip distances */
3053 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3054 current_annotation = "user clip distances";
3055
3056 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3057 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3058
3059 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3060 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3061 }
3062
3063 /* We may need to split this up into several URB writes, so do them in a
3064 * loop.
3065 */
3066 int slot = 0;
3067 bool complete = false;
3068 do {
3069 /* URB offset is in URB row increments, and each of our MRFs is half of
3070 * one of those, since we're doing interleaved writes.
3071 */
3072 int offset = slot / 2;
3073
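/* The message header stays in base_mrf; payload slots for this write start
 * in the following MRF.
 */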
3074 mrf = base_mrf + 1;
3075 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3076 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
3077
3078 /* If this was max_usable_mrf, we can't fit anything more into this
3079 * URB WRITE.
3080 */
3081 if (mrf > max_usable_mrf) {
3082 slot++;
3083 break;
3084 }
3085 }
3086
3087 complete = slot >= prog_data->vue_map.num_slots;
3088 current_annotation = "URB write";
3089 vec4_instruction *inst = emit_urb_write_opcode(complete);
3090 inst->base_mrf = base_mrf;
3091 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3092 inst->offset += offset;
3093 } while (!complete);
3094 }
3095
3096
3097 src_reg
3098 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
3099 src_reg *reladdr, int reg_offset)
3100 {
3101 /* Because we store the values to scratch interleaved like our
3102 * vertex data, we need to scale the vec4 index by 2.
3103 */
3104 int message_header_scale = 2;
3105
3106 /* Pre-gen6, the message header uses byte offsets instead of vec4
3107 * (16-byte) offset units.
3108 */
3109 if (brw->gen < 6)
3110 message_header_scale *= 16;
3111
3112 if (reladdr) {
3113 src_reg index = src_reg(this, glsl_type::int_type);
3114
3115 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3116 emit_before(inst, MUL(dst_reg(index),
3117 index, src_reg(message_header_scale)));
3118
3119 return index;
3120 } else {
3121 return src_reg(reg_offset * message_header_scale);
3122 }
3123 }
3124
3125 src_reg
3126 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
3127 src_reg *reladdr, int reg_offset)
3128 {
3129 if (reladdr) {
3130 src_reg index = src_reg(this, glsl_type::int_type);
3131
3132 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
3133
3134 /* Pre-gen6, the message header uses byte offsets instead of vec4
3135 * (16-byte) offset units.
3136 */
3137 if (brw->gen < 6) {
3138 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
3139 }
3140
3141 return index;
3142 } else if (brw->gen >= 8) {
3143 /* Store the offset in a GRF so we can send-from-GRF. */
3144 src_reg offset = src_reg(this, glsl_type::int_type);
3145 emit_before(inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3146 return offset;
3147 } else {
3148 int message_header_scale = brw->gen < 6 ? 16 : 1;
3149 return src_reg(reg_offset * message_header_scale);
3150 }
3151 }
3152
3153 /**
3154 * Emits an instruction before @inst to load the value named by @orig_src
3155 * from scratch space at @base_offset to @temp.
3156 *
3157 * @base_offset is measured in 32-byte units (the size of a register).
3158 */
3159 void
3160 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
3161 dst_reg temp, src_reg orig_src,
3162 int base_offset)
3163 {
3164 int reg_offset = base_offset + orig_src.reg_offset;
3165 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
3166
3167 emit_before(inst, SCRATCH_READ(temp, index));
3168 }
3169
3170 /**
3171 * Emits an instruction after @inst to store the value to be written
3172 * to @orig_dst to scratch space at @base_offset, from @temp.
3173 *
3174 * @base_offset is measured in 32-byte units (the size of a register).
3175 */
3176 void
3177 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
3178 {
3179 int reg_offset = base_offset + inst->dst.reg_offset;
3180 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
3181
3182 /* Create a temporary register to store *inst's result in.
3183 *
3184 * We have to be careful in MOVing from our temporary result register in
3185 * the scratch write. If we swizzle from channels of the temporary that
3186 * weren't initialized, it will confuse live interval analysis, which will
3187 * make spilling fail to make progress.
3188 */
3189 src_reg temp = src_reg(this, glsl_type::vec4_type);
3190 temp.type = inst->dst.type;
3191 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3192 int swizzles[4];
3193 for (int i = 0; i < 4; i++)
3194 if (inst->dst.writemask & (1 << i))
3195 swizzles[i] = i;
3196 else
3197 swizzles[i] = first_writemask_chan;
3198 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3199 swizzles[2], swizzles[3]);
3200
3201 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3202 inst->dst.writemask));
3203 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3204 write->predicate = inst->predicate;
3205 write->ir = inst->ir;
3206 write->annotation = inst->annotation;
3207 inst->insert_after(write);
3208
3209 inst->dst.file = temp.file;
3210 inst->dst.reg = temp.reg;
3211 inst->dst.reg_offset = temp.reg_offset;
3212 inst->dst.reladdr = NULL;
3213 }
3214
3215 /**
3216 * We can't generally support array access in GRF space, because a
3217 * single instruction's destination can only span 2 contiguous
3218 * registers. So, we send all GRF arrays that get variable index
3219 * access to scratch space.
3220 */
3221 void
3222 vec4_visitor::move_grf_array_access_to_scratch()
3223 {
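/* scratch_loc[i] is the scratch offset (in registers) assigned to virtual
 * GRF i, or -1 if it stays in the GRF file.
 */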
3224 int scratch_loc[this->virtual_grf_count];
3225
3226 for (int i = 0; i < this->virtual_grf_count; i++) {
3227 scratch_loc[i] = -1;
3228 }
3229
3230 /* First, calculate the set of virtual GRFs that need to be punted
3231 * to scratch due to having any array access on them, and where in
3232 * scratch.
3233 */
3234 foreach_in_list(vec4_instruction, inst, &instructions) {
3235 if (inst->dst.file == GRF && inst->dst.reladdr &&
3236 scratch_loc[inst->dst.reg] == -1) {
3237 scratch_loc[inst->dst.reg] = c->last_scratch;
3238 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3239 }
3240
3241 for (int i = 0 ; i < 3; i++) {
3242 src_reg *src = &inst->src[i];
3243
3244 if (src->file == GRF && src->reladdr &&
3245 scratch_loc[src->reg] == -1) {
3246 scratch_loc[src->reg] = c->last_scratch;
3247 c->last_scratch += this->virtual_grf_sizes[src->reg];
3248 }
3249 }
3250 }
3251
3252 /* Now, for anything that will be accessed through scratch, rewrite
3253 * it to load/store. Note that this is a _safe list walk, because
3254 * we may generate a new scratch_write instruction after the one
3255 * we're processing.
3256 */
3257 foreach_list_safe(node, &this->instructions) {
3258 vec4_instruction *inst = (vec4_instruction *)node;
3259
3260 /* Set up the annotation tracking for new generated instructions. */
3261 base_ir = inst->ir;
3262 current_annotation = inst->annotation;
3263
3264 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3265 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3266 }
3267
3268 for (int i = 0 ; i < 3; i++) {
3269 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3270 continue;
3271
3272 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3273
3274 emit_scratch_read(inst, temp, inst->src[i],
3275 scratch_loc[inst->src[i].reg]);
3276
3277 inst->src[i].file = temp.file;
3278 inst->src[i].reg = temp.reg;
3279 inst->src[i].reg_offset = temp.reg_offset;
3280 inst->src[i].reladdr = NULL;
3281 }
3282 }
3283 }
3284
3285 /**
3286 * Emits an instruction before @inst to load the value named by @orig_src
3287 * from the pull constant buffer (surface) at @base_offset to @temp.
3288 */
3289 void
3290 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3291 dst_reg temp, src_reg orig_src,
3292 int base_offset)
3293 {
3294 int reg_offset = base_offset + orig_src.reg_offset;
3295 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3296 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3297 vec4_instruction *load;
3298
3299 if (brw->gen >= 7) {
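/* The Gen7+ pull constant load is a send-from-GRF message, so the offset
 * has to be staged in a GRF rather than going out in an MRF payload.
 */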
3300 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3301 grf_offset.type = offset.type;
3302 emit_before(inst, MOV(grf_offset, offset));
3303
3304 load = new(mem_ctx) vec4_instruction(this,
3305 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3306 temp, index, src_reg(grf_offset));
3307 } else {
3308 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3309 temp, index, offset);
3310 load->base_mrf = 14;
3311 load->mlen = 1;
3312 }
3313 emit_before(inst, load);
3314 }
3315
3316 /**
3317 * Implements array access of uniforms by inserting a
3318 * PULL_CONSTANT_LOAD instruction.
3319 *
3320 * Unlike temporary GRF array access (where we don't support it due to
3321 * the difficulty of doing relative addressing on instruction
3322 * destinations), we could potentially do array access of uniforms
3323 * that were loaded in GRF space as push constants. In real-world
3324 * usage we've seen, though, the arrays being used are always larger
3325 * than we could load as push constants, so just always move all
3326 * uniform array access out to a pull constant buffer.
3327 */
3328 void
3329 vec4_visitor::move_uniform_array_access_to_pull_constants()
3330 {
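/* pull_constant_loc[i] is the pull-constant slot assigned to uniform i, or
 * -1 if that uniform hasn't been copied to the pull buffer yet.
 */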
3331 int pull_constant_loc[this->uniforms];
3332
3333 for (int i = 0; i < this->uniforms; i++) {
3334 pull_constant_loc[i] = -1;
3335 }
3336
3337 /* Walk through and find array access of uniforms. Put a copy of that
3338 * uniform in the pull constant buffer.
3339 *
3340 * Note that we don't move constant-indexed accesses to arrays. No
3341 * testing has been done of the performance impact of this choice.
3342 */
3343 foreach_list_safe(node, &this->instructions) {
3344 vec4_instruction *inst = (vec4_instruction *)node;
3345
3346 for (int i = 0 ; i < 3; i++) {
3347 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3348 continue;
3349
3350 int uniform = inst->src[i].reg;
3351
3352 /* If this array isn't already present in the pull constant buffer,
3353 * add it.
3354 */
3355 if (pull_constant_loc[uniform] == -1) {
3356 const float **values = &stage_prog_data->param[uniform * 4];
3357
3358 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3359
3360 assert(uniform < uniform_array_size);
3361 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3362 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3363 = values[j];
3364 }
3365 }
3366
3367 /* Set up the annotation tracking for new generated instructions. */
3368 base_ir = inst->ir;
3369 current_annotation = inst->annotation;
3370
3371 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3372
3373 emit_pull_constant_load(inst, temp, inst->src[i],
3374 pull_constant_loc[uniform]);
3375
3376 inst->src[i].file = temp.file;
3377 inst->src[i].reg = temp.reg;
3378 inst->src[i].reg_offset = temp.reg_offset;
3379 inst->src[i].reladdr = NULL;
3380 }
3381 }
3382
3383 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3384 * no need to track them as larger-than-vec4 objects. This will be
3385 * relied on in cutting out unused uniform vectors from push
3386 * constants.
3387 */
3388 split_uniform_registers();
3389 }
3390
3391 void
3392 vec4_visitor::resolve_ud_negate(src_reg *reg)
3393 {
3394 if (reg->type != BRW_REGISTER_TYPE_UD ||
3395 !reg->negate)
3396 return;
3397
3398 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3399 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3400 *reg = temp;
3401 }
3402
3403 vec4_visitor::vec4_visitor(struct brw_context *brw,
3404 struct brw_vec4_compile *c,
3405 struct gl_program *prog,
3406 const struct brw_vec4_prog_key *key,
3407 struct brw_vec4_prog_data *prog_data,
3408 struct gl_shader_program *shader_prog,
3409 gl_shader_stage stage,
3410 void *mem_ctx,
3411 bool debug_flag,
3412 bool no_spills,
3413 shader_time_shader_type st_base,
3414 shader_time_shader_type st_written,
3415 shader_time_shader_type st_reset)
3416 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3417 c(c),
3418 key(key),
3419 prog_data(prog_data),
3420 sanity_param_count(0),
3421 fail_msg(NULL),
3422 first_non_payload_grf(0),
3423 need_all_constants_in_pull_buffer(false),
3424 debug_flag(debug_flag),
3425 no_spills(no_spills),
3426 st_base(st_base),
3427 st_written(st_written),
3428 st_reset(st_reset)
3429 {
3430 this->mem_ctx = mem_ctx;
3431 this->failed = false;
3432
3433 this->base_ir = NULL;
3434 this->current_annotation = NULL;
3435 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3436
3437 this->variable_ht = hash_table_ctor(0,
3438 hash_table_pointer_hash,
3439 hash_table_pointer_compare);
3440
3441 this->virtual_grf_start = NULL;
3442 this->virtual_grf_end = NULL;
3443 this->virtual_grf_sizes = NULL;
3444 this->virtual_grf_count = 0;
3445 this->virtual_grf_reg_map = NULL;
3446 this->virtual_grf_reg_count = 0;
3447 this->virtual_grf_array_size = 0;
3448 this->live_intervals_valid = false;
3449
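/* Gen7+ has no real MRF registers; MRF numbers are emulated with high GRFs
 * (starting at GEN7_MRF_HACK_START), so those GRFs aren't available to the
 * allocator.
 */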
3450 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3451
3452 this->uniforms = 0;
3453
3454 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3455 * at least one. See setup_uniforms() in brw_vec4.cpp.
3456 */
3457 this->uniform_array_size = 1;
3458 if (prog_data) {
3459 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3460 }
3461
3462 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3463 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3464 }
3465
3466 vec4_visitor::~vec4_visitor()
3467 {
3468 hash_table_dtor(this->variable_ht);
3469 }
3470
3471
3472 void
3473 vec4_visitor::fail(const char *format, ...)
3474 {
3475 va_list va;
3476 char *msg;
3477
3478 if (failed)
3479 return;
3480
3481 failed = true;
3482
3483 va_start(va, format);
3484 msg = ralloc_vasprintf(mem_ctx, format, va);
3485 va_end(va);
3486 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3487
3488 this->fail_msg = msg;
3489
3490 if (debug_flag) {
3491 fprintf(stderr, "%s", msg);
3492 }
3493 }
3494
3495 } /* namespace brw */