i965: Add cases for ir_binop_vector_extract that assert.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 #define ALU3(op) \
111 vec4_instruction * \
112 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1, src_reg src2)\
113 { \
114 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
115 src0, src1, src2); \
116 }
117
118 ALU1(NOT)
119 ALU1(MOV)
120 ALU1(FRC)
121 ALU1(RNDD)
122 ALU1(RNDE)
123 ALU1(RNDZ)
124 ALU1(F32TO16)
125 ALU1(F16TO32)
126 ALU2(ADD)
127 ALU2(MUL)
128 ALU2(MACH)
129 ALU2(AND)
130 ALU2(OR)
131 ALU2(XOR)
132 ALU2(DP3)
133 ALU2(DP4)
134 ALU2(DPH)
135 ALU2(SHL)
136 ALU2(SHR)
137 ALU2(ASR)
138 ALU3(LRP)
139 ALU1(BFREV)
140 ALU3(BFE)
141 ALU2(BFI1)
142 ALU3(BFI2)
143 ALU1(FBH)
144 ALU1(FBL)
145 ALU1(CBIT)
146
147 /** Gen4 predicated IF. */
148 vec4_instruction *
149 vec4_visitor::IF(uint32_t predicate)
150 {
151 vec4_instruction *inst;
152
153 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
154 inst->predicate = predicate;
155
156 return inst;
157 }
158
159 /** Gen6+ IF with embedded comparison. */
160 vec4_instruction *
161 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
162 {
163 assert(intel->gen >= 6);
164
165 vec4_instruction *inst;
166
167 resolve_ud_negate(&src0);
168 resolve_ud_negate(&src1);
169
170 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
171 src0, src1);
172 inst->conditional_mod = condition;
173
174 return inst;
175 }
176
177 /**
178 * CMP: Sets the low bit of the destination channels with the result
179 * of the comparison, while the upper bits are undefined, and updates
180 * the flag register with the packed 16 bits of the result.
181 */
182 vec4_instruction *
183 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
184 {
185 vec4_instruction *inst;
186
187 /* original gen4 does type conversion to the destination type
188 * before comparison, producing garbage results for floating
189 * point comparisons.
190 */
191 if (intel->gen == 4) {
192 dst.type = src0.type;
193 if (dst.file == HW_REG)
194 dst.fixed_hw_reg.type = dst.type;
195 }
196
197 resolve_ud_negate(&src0);
198 resolve_ud_negate(&src1);
199
200 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
201 inst->conditional_mod = condition;
202
203 return inst;
204 }
205
206 vec4_instruction *
207 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
208 {
209 vec4_instruction *inst;
210
211 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
212 dst, index);
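   /* Scratch reads go out as a send message; the two-register payload is
    * assembled starting at MRF 14.
    */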
213 inst->base_mrf = 14;
214 inst->mlen = 2;
215
216 return inst;
217 }
218
219 vec4_instruction *
220 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
221 {
222 vec4_instruction *inst;
223
224 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
225 dst, src, index);
226 inst->base_mrf = 13;
227 inst->mlen = 3;
228
229 return inst;
230 }
231
232 void
233 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
234 {
235 static enum opcode dot_opcodes[] = {
236 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
237 };
238
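   /* Callers pass 2, 3, or 4 elements; index the table above to pick
    * DP2, DP3, or DP4 accordingly.
    */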
239 emit(dot_opcodes[elements - 2], dst, src0, src1);
240 }
241
242 src_reg
243 vec4_visitor::fix_3src_operand(src_reg src)
244 {
245 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
246 * able to use vertical stride of zero to replicate the vec4 uniform, like
247 *
248 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
249 *
250 * But you can't, since vertical stride is always four in three-source
251 * instructions. Instead, insert a MOV instruction to do the replication so
252 * that the three-source instruction can consume it.
253 */
254
255 /* The MOV is only needed if the source is a uniform or immediate. */
256 if (src.file != UNIFORM && src.file != IMM)
257 return src;
258
259 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
260 expanded.type = src.type;
261 emit(MOV(expanded, src));
262 return src_reg(expanded);
263 }
264
265 src_reg
266 vec4_visitor::fix_math_operand(src_reg src)
267 {
268 /* The gen6 math instruction ignores the source modifiers --
269 * swizzle, abs, negate, and at least some parts of the register
270 * region description.
271 *
272 * Rather than trying to enumerate all these cases, *always* expand the
273 * operand to a temp GRF for gen6.
274 *
275 * For gen7, keep the operand as-is, except if immediate, which gen7 still
276 * can't use.
277 */
278
279 if (intel->gen == 7 && src.file != IMM)
280 return src;
281
282 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
283 expanded.type = src.type;
284 emit(MOV(expanded, src));
285 return src_reg(expanded);
286 }
287
288 void
289 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
290 {
291 src = fix_math_operand(src);
292
293 if (dst.writemask != WRITEMASK_XYZW) {
294 /* The gen6 math instruction must be align1, so we can't do
295 * writemasks.
296 */
297 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
298
299 emit(opcode, temp_dst, src);
300
301 emit(MOV(dst, src_reg(temp_dst)));
302 } else {
303 emit(opcode, dst, src);
304 }
305 }
306
307 void
308 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
309 {
310 vec4_instruction *inst = emit(opcode, dst, src);
311 inst->base_mrf = 1;
312 inst->mlen = 1;
313 }
314
315 void
316 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
317 {
318 switch (opcode) {
319 case SHADER_OPCODE_RCP:
320 case SHADER_OPCODE_RSQ:
321 case SHADER_OPCODE_SQRT:
322 case SHADER_OPCODE_EXP2:
323 case SHADER_OPCODE_LOG2:
324 case SHADER_OPCODE_SIN:
325 case SHADER_OPCODE_COS:
326 break;
327 default:
328 assert(!"not reached: bad math opcode");
329 return;
330 }
331
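   /* Gen6+ has a dedicated math instruction; Gen4/5 issue math as a message,
    * which is why the gen4 helper sets up an MRF payload.
    */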
332 if (intel->gen >= 6) {
333 return emit_math1_gen6(opcode, dst, src);
334 } else {
335 return emit_math1_gen4(opcode, dst, src);
336 }
337 }
338
339 void
340 vec4_visitor::emit_math2_gen6(enum opcode opcode,
341 dst_reg dst, src_reg src0, src_reg src1)
342 {
343 src0 = fix_math_operand(src0);
344 src1 = fix_math_operand(src1);
345
346 if (dst.writemask != WRITEMASK_XYZW) {
347 /* The gen6 math instruction must be align1, so we can't do
348 * writemasks.
349 */
350 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
351 temp_dst.type = dst.type;
352
353 emit(opcode, temp_dst, src0, src1);
354
355 emit(MOV(dst, src_reg(temp_dst)));
356 } else {
357 emit(opcode, dst, src0, src1);
358 }
359 }
360
361 void
362 vec4_visitor::emit_math2_gen4(enum opcode opcode,
363 dst_reg dst, src_reg src0, src_reg src1)
364 {
365 vec4_instruction *inst = emit(opcode, dst, src0, src1);
366 inst->base_mrf = 1;
367 inst->mlen = 2;
368 }
369
370 void
371 vec4_visitor::emit_math(enum opcode opcode,
372 dst_reg dst, src_reg src0, src_reg src1)
373 {
374 switch (opcode) {
375 case SHADER_OPCODE_POW:
376 case SHADER_OPCODE_INT_QUOTIENT:
377 case SHADER_OPCODE_INT_REMAINDER:
378 break;
379 default:
380 assert(!"not reached: unsupported binary math opcode");
381 return;
382 }
383
384 if (intel->gen >= 6) {
385 return emit_math2_gen6(opcode, dst, src0, src1);
386 } else {
387 return emit_math2_gen4(opcode, dst, src0, src1);
388 }
389 }
390
391 void
392 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
393 {
394 if (intel->gen < 7)
395 assert(!"ir_unop_pack_half_2x16 should be lowered");
396
397 assert(dst.type == BRW_REGISTER_TYPE_UD);
398 assert(src0.type == BRW_REGISTER_TYPE_F);
399
400 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
401 *
402 * Because this instruction does not have a 16-bit floating-point type,
403 * the destination data type must be Word (W).
404 *
405 * The destination must be DWord-aligned and specify a horizontal stride
406 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
407 * each destination channel and the upper word is not modified.
408 *
409 * The above restriction implies that the f32to16 instruction must use
410 * align1 mode, because only in align1 mode is it possible to specify
411 * horizontal stride. We choose here to defy the hardware docs and emit
412 * align16 instructions.
413 *
414 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
415 * instructions. I was partially successful in that the code passed all
416 * tests. However, the code was dubiously correct and fragile, and the
417 * tests were not harsh enough to probe that frailty. Not trusting the
418 * code, I chose instead to remain in align16 mode in defiance of the hw
419 * docs).
420 *
421 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
422 * simulator, emitting a f32to16 in align16 mode with UD as destination
423 * data type is safe. The behavior differs from that specified in the PRM
424 * in that the upper word of each destination channel is cleared to 0.
425 */
426
427 dst_reg tmp_dst(this, glsl_type::uvec2_type);
428 src_reg tmp_src(tmp_dst);
429
430 #if 0
431 /* Verify the undocumented behavior on which the following instructions
432 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
433 * then the result of the bit-or instruction below will be incorrect.
434 *
435 * You should inspect the disasm output in order to verify that the MOV is
436 * not optimized away.
437 */
438 emit(MOV(tmp_dst, src_reg(0x12345678u)));
439 #endif
440
441 /* Give tmp the form below, where "." means untouched.
442 *
443 * w z y x w z y x
444 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
445 *
446 * The upper word of each write-channel must be 0 for the
447 * following bit-shift and bit-or instructions to work. Note that this
448 * relies on the undocumented hardware behavior mentioned above.
449 */
450 tmp_dst.writemask = WRITEMASK_XY;
451 emit(F32TO16(tmp_dst, src0));
452
453 /* Give the write-channels of dst the form:
454 * 0xhhhh0000
455 */
456 tmp_src.swizzle = SWIZZLE_Y;
457 emit(SHL(dst, tmp_src, src_reg(16u)));
458
459 /* Finally, give the write-channels of dst the form of packHalf2x16's
460 * output:
461 * 0xhhhhllll
462 */
463 tmp_src.swizzle = SWIZZLE_X;
464 emit(OR(dst, src_reg(dst), tmp_src));
465 }
466
467 void
468 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
469 {
470 if (intel->gen < 7)
471 assert(!"ir_unop_unpack_half_2x16 should be lowered");
472
473 assert(dst.type == BRW_REGISTER_TYPE_F);
474 assert(src0.type == BRW_REGISTER_TYPE_UD);
475
476 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
477 *
478 * Because this instruction does not have a 16-bit floating-point type,
479 * the source data type must be Word (W). The destination type must be
480 * F (Float).
481 *
482 * To use W as the source data type, we must adjust horizontal strides,
483 * which is only possible in align1 mode. All my [chadv] attempts at
484 * emitting align1 instructions for unpackHalf2x16 failed to pass the
485 * Piglit tests, so I gave up.
486 *
487 * I've verified that, on gen7 hardware and the simulator, it is safe to
488 * emit f16to32 in align16 mode with UD as source data type.
489 */
490
491 dst_reg tmp_dst(this, glsl_type::uvec2_type);
492 src_reg tmp_src(tmp_dst);
493
494 tmp_dst.writemask = WRITEMASK_X;
495 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
496
497 tmp_dst.writemask = WRITEMASK_Y;
498 emit(SHR(tmp_dst, src0, src_reg(16u)));
499
500 dst.writemask = WRITEMASK_XY;
501 emit(F16TO32(dst, tmp_src));
502 }
503
504 void
505 vec4_visitor::visit_instructions(const exec_list *list)
506 {
507 foreach_list(node, list) {
508 ir_instruction *ir = (ir_instruction *)node;
509
510 base_ir = ir;
511 ir->accept(this);
512 }
513 }
514
515
516 static int
517 type_size(const struct glsl_type *type)
518 {
519 unsigned int i;
520 int size;
521
522 switch (type->base_type) {
523 case GLSL_TYPE_UINT:
524 case GLSL_TYPE_INT:
525 case GLSL_TYPE_FLOAT:
526 case GLSL_TYPE_BOOL:
527 if (type->is_matrix()) {
528 return type->matrix_columns;
529 } else {
530 /* Regardless of the size of the vector, it gets a vec4. This is bad
531 * packing for things like floats, but otherwise arrays become a
532 * mess. Hopefully a later pass over the code can pack scalars
533 * down if appropriate.
534 */
535 return 1;
536 }
537 case GLSL_TYPE_ARRAY:
538 assert(type->length > 0);
539 return type_size(type->fields.array) * type->length;
540 case GLSL_TYPE_STRUCT:
541 size = 0;
542 for (i = 0; i < type->length; i++) {
543 size += type_size(type->fields.structure[i].type);
544 }
545 return size;
546 case GLSL_TYPE_SAMPLER:
547 /* Samplers take up one slot in UNIFORMS[], but they're baked in
548 * at link time.
549 */
550 return 1;
551 case GLSL_TYPE_VOID:
552 case GLSL_TYPE_ERROR:
553 case GLSL_TYPE_INTERFACE:
554 assert(0);
555 break;
556 }
557
558 return 0;
559 }
560
561 int
562 vec4_visitor::virtual_grf_alloc(int size)
563 {
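   /* Grow both bookkeeping arrays geometrically (16, 32, 64, ...) whenever
    * we run out of room.
    */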
564 if (virtual_grf_array_size <= virtual_grf_count) {
565 if (virtual_grf_array_size == 0)
566 virtual_grf_array_size = 16;
567 else
568 virtual_grf_array_size *= 2;
569 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
570 virtual_grf_array_size);
571 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
572 virtual_grf_array_size);
573 }
574 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
575 virtual_grf_reg_count += size;
576 virtual_grf_sizes[virtual_grf_count] = size;
577 return virtual_grf_count++;
578 }
579
580 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
581 {
582 init();
583
584 this->file = GRF;
585 this->reg = v->virtual_grf_alloc(type_size(type));
586
587 if (type->is_array() || type->is_record()) {
588 this->swizzle = BRW_SWIZZLE_NOOP;
589 } else {
590 this->swizzle = swizzle_for_size(type->vector_elements);
591 }
592
593 this->type = brw_type_for_base_type(type);
594 }
595
596 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
597 {
598 init();
599
600 this->file = GRF;
601 this->reg = v->virtual_grf_alloc(type_size(type));
602
603 if (type->is_array() || type->is_record()) {
604 this->writemask = WRITEMASK_XYZW;
605 } else {
606 this->writemask = (1 << type->vector_elements) - 1;
607 }
608
609 this->type = brw_type_for_base_type(type);
610 }
611
612 /* Our support for uniforms is piggy-backed on the struct
613 * gl_fragment_program, because that's where the values actually
614 * get stored, rather than in some global gl_shader_program uniform
615 * store.
616 */
617 void
618 vec4_visitor::setup_uniform_values(ir_variable *ir)
619 {
620 int namelen = strlen(ir->name);
621
622 /* The data for our (non-builtin) uniforms is stored in a series of
623 * gl_uniform_driver_storage structs for each subcomponent that
624 * glGetUniformLocation() could name. We know it's been set up in the same
625 * order we'd walk the type, so walk the list of storage and find anything
626 * with our name, or the prefix of a component that starts with our name.
627 */
628 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
629 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
630
631 if (strncmp(ir->name, storage->name, namelen) != 0 ||
632 (storage->name[namelen] != 0 &&
633 storage->name[namelen] != '.' &&
634 storage->name[namelen] != '[')) {
635 continue;
636 }
637
638 gl_constant_value *components = storage->storage;
639 unsigned vector_count = (MAX2(storage->array_elements, 1) *
640 storage->type->matrix_columns);
641
642 for (unsigned s = 0; s < vector_count; s++) {
643 uniform_vector_size[uniforms] = storage->type->vector_elements;
644
645 int i;
646 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
647 prog_data->param[uniforms * 4 + i] = &components->f;
648 components++;
649 }
650 for (; i < 4; i++) {
651 static float zero = 0;
652 prog_data->param[uniforms * 4 + i] = &zero;
653 }
654
655 uniforms++;
656 }
657 }
658 }
659
660 void
661 vec4_visitor::setup_uniform_clipplane_values()
662 {
663 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
664
665 if (intel->gen < 6) {
666 /* Pre-Gen6, we compact clip planes. For example, if the user
667 * enables just clip planes 0, 1, and 3, we will enable clip planes
668 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
669 * plane 2. This simplifies the implementation of the Gen6 clip
670 * thread.
671 */
672 int compacted_clipplane_index = 0;
673 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
674 if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
675 continue;
676
677 this->uniform_vector_size[this->uniforms] = 4;
678 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
679 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
680 for (int j = 0; j < 4; ++j) {
681 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
682 }
683 ++compacted_clipplane_index;
684 ++this->uniforms;
685 }
686 } else {
687 /* In Gen6 and later, we don't compact clip planes, because this
688 * simplifies the implementation of gl_ClipDistance.
689 */
690 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
691 this->uniform_vector_size[this->uniforms] = 4;
692 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
693 this->userplane[i].type = BRW_REGISTER_TYPE_F;
694 for (int j = 0; j < 4; ++j) {
695 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
696 }
697 ++this->uniforms;
698 }
699 }
700 }
701
702 /* Our support for builtin uniforms is even scarier than non-builtin.
703 * It sits on top of the PROG_STATE_VAR parameters that are
704 * automatically updated from GL context state.
705 */
706 void
707 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
708 {
709 const ir_state_slot *const slots = ir->state_slots;
710 assert(ir->state_slots != NULL);
711
712 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
713 /* This state reference has already been setup by ir_to_mesa,
714 * but we'll get the same index back here. We can reference
715 * ParameterValues directly, since unlike brw_fs.cpp, we never
716 * add new state references during compile.
717 */
718 int index = _mesa_add_state_reference(this->prog->Parameters,
719 (gl_state_index *)slots[i].tokens);
720 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
721
722 this->uniform_vector_size[this->uniforms] = 0;
723 /* Add each of the unique swizzled channels of the element.
724 * This will end up matching the size of the glsl_type of this field.
725 */
726 int last_swiz = -1;
727 for (unsigned int j = 0; j < 4; j++) {
728 int swiz = GET_SWZ(slots[i].swizzle, j);
729 last_swiz = swiz;
730
731 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
732 if (swiz <= last_swiz)
733 this->uniform_vector_size[this->uniforms]++;
734 }
735 this->uniforms++;
736 }
737 }
738
739 dst_reg *
740 vec4_visitor::variable_storage(ir_variable *var)
741 {
742 return (dst_reg *)hash_table_find(this->variable_ht, var);
743 }
744
745 void
746 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
747 {
748 ir_expression *expr = ir->as_expression();
749
750 *predicate = BRW_PREDICATE_NORMAL;
751
752 if (expr) {
753 src_reg op[2];
754 vec4_instruction *inst;
755
756 assert(expr->get_num_operands() <= 2);
757 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
758 expr->operands[i]->accept(this);
759 op[i] = this->result;
760
761 resolve_ud_negate(&op[i]);
762 }
763
764 switch (expr->operation) {
765 case ir_unop_logic_not:
766 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
767 inst->conditional_mod = BRW_CONDITIONAL_Z;
768 break;
769
770 case ir_binop_logic_xor:
771 inst = emit(XOR(dst_null_d(), op[0], op[1]));
772 inst->conditional_mod = BRW_CONDITIONAL_NZ;
773 break;
774
775 case ir_binop_logic_or:
776 inst = emit(OR(dst_null_d(), op[0], op[1]));
777 inst->conditional_mod = BRW_CONDITIONAL_NZ;
778 break;
779
780 case ir_binop_logic_and:
781 inst = emit(AND(dst_null_d(), op[0], op[1]));
782 inst->conditional_mod = BRW_CONDITIONAL_NZ;
783 break;
784
785 case ir_unop_f2b:
786 if (intel->gen >= 6) {
787 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
788 } else {
789 inst = emit(MOV(dst_null_f(), op[0]));
790 inst->conditional_mod = BRW_CONDITIONAL_NZ;
791 }
792 break;
793
794 case ir_unop_i2b:
795 if (intel->gen >= 6) {
796 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
797 } else {
798 inst = emit(MOV(dst_null_d(), op[0]));
799 inst->conditional_mod = BRW_CONDITIONAL_NZ;
800 }
801 break;
802
803 case ir_binop_all_equal:
804 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
805 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
806 break;
807
808 case ir_binop_any_nequal:
809 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
810 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
811 break;
812
813 case ir_unop_any:
814 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
815 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
816 break;
817
818 case ir_binop_greater:
819 case ir_binop_gequal:
820 case ir_binop_less:
821 case ir_binop_lequal:
822 case ir_binop_equal:
823 case ir_binop_nequal:
824 emit(CMP(dst_null_d(), op[0], op[1],
825 brw_conditional_for_comparison(expr->operation)));
826 break;
827
828 default:
829 assert(!"not reached");
830 break;
831 }
832 return;
833 }
834
835 ir->accept(this);
836
837 resolve_ud_negate(&this->result);
838
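   /* Not one of the expression forms handled above: evaluate it to a boolean
    * (stored as 0 or 1 in this backend) and set the flag from the result --
    * gen6+ tests bit 0 via the AND, gen4/5 test the whole value for nonzero.
    */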
839 if (intel->gen >= 6) {
840 vec4_instruction *inst = emit(AND(dst_null_d(),
841 this->result, src_reg(1)));
842 inst->conditional_mod = BRW_CONDITIONAL_NZ;
843 } else {
844 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
845 inst->conditional_mod = BRW_CONDITIONAL_NZ;
846 }
847 }
848
849 /**
850 * Emit a gen6 IF statement with the comparison folded into the IF
851 * instruction.
852 */
853 void
854 vec4_visitor::emit_if_gen6(ir_if *ir)
855 {
856 ir_expression *expr = ir->condition->as_expression();
857
858 if (expr) {
859 src_reg op[2];
860 dst_reg temp;
861
862 assert(expr->get_num_operands() <= 2);
863 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
864 expr->operands[i]->accept(this);
865 op[i] = this->result;
866 }
867
868 switch (expr->operation) {
869 case ir_unop_logic_not:
870 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
871 return;
872
873 case ir_binop_logic_xor:
874 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
875 return;
876
877 case ir_binop_logic_or:
878 temp = dst_reg(this, glsl_type::bool_type);
879 emit(OR(temp, op[0], op[1]));
880 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
881 return;
882
883 case ir_binop_logic_and:
884 temp = dst_reg(this, glsl_type::bool_type);
885 emit(AND(temp, op[0], op[1]));
886 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
887 return;
888
889 case ir_unop_f2b:
890 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
891 return;
892
893 case ir_unop_i2b:
894 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 return;
896
897 case ir_binop_greater:
898 case ir_binop_gequal:
899 case ir_binop_less:
900 case ir_binop_lequal:
901 case ir_binop_equal:
902 case ir_binop_nequal:
903 emit(IF(op[0], op[1],
904 brw_conditional_for_comparison(expr->operation)));
905 return;
906
907 case ir_binop_all_equal:
908 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
909 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
910 return;
911
912 case ir_binop_any_nequal:
913 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
914 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
915 return;
916
917 case ir_unop_any:
918 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
919 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
920 return;
921
922 default:
923 assert(!"not reached");
924 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
925 return;
926 }
927 return;
928 }
929
930 ir->condition->accept(this);
931
932 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
933 }
934
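/** Return a copy of \p r with its writemask replaced by \p mask. */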
935 static dst_reg
936 with_writemask(dst_reg const & r, int mask)
937 {
938 dst_reg result = r;
939 result.writemask = mask;
940 return result;
941 }
942
943 void
944 vec4_vs_visitor::emit_prolog()
945 {
946 dst_reg sign_recovery_shift;
947 dst_reg normalize_factor;
948 dst_reg es3_normalize_factor;
949
950 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
951 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
952 uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
953 dst_reg reg(ATTR, i);
954 dst_reg reg_d = reg;
955 reg_d.type = BRW_REGISTER_TYPE_D;
956 dst_reg reg_ud = reg;
957 reg_ud.type = BRW_REGISTER_TYPE_UD;
958
959 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
960 * come in as floating point conversions of the integer values.
961 */
962 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
963 dst_reg dst = reg;
964 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
965 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
966 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
967 }
968
969 /* Do sign recovery for 2101010 formats if required. */
970 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
971 if (sign_recovery_shift.file == BAD_FILE) {
972 /* shift constant: <22,22,22,30> */
973 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
974 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
975 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
976 }
977
978 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
979 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
980 }
981
982 /* Apply BGRA swizzle if required. */
983 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
984 src_reg temp = src_reg(reg);
985 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
986 emit(MOV(reg, temp));
987 }
988
989 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
990 /* ES 3.0 has different rules for converting signed normalized
991 * fixed-point numbers than desktop GL.
992 */
993 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
994 /* According to equation 2.2 of the ES 3.0 specification,
995 * signed normalization conversion is done by:
996 *
997 * f = c / (2^(b-1)-1)
998 */
999 if (es3_normalize_factor.file == BAD_FILE) {
1000 /* mul constant: 1 / (2^(b-1) - 1) */
1001 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
1002 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
1003 src_reg(1.0f / ((1<<9) - 1))));
1004 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
1005 src_reg(1.0f / ((1<<1) - 1))));
1006 }
1007
1008 dst_reg dst = reg;
1009 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1010 emit(MOV(dst, src_reg(reg_d)));
1011 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
1012 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
1013 } else {
1014 /* The following equations are from the OpenGL 3.2 specification:
1015 *
1016 * 2.1 unsigned normalization
1017 * f = c/(2^n-1)
1018 *
1019 * 2.2 signed normalization
1020 * f = (2c+1)/(2^n-1)
1021 *
1022 * Both of these share a common divisor, which is represented by
1023 * "normalize_factor" in the code below.
1024 */
1025 if (normalize_factor.file == BAD_FILE) {
1026 /* 1 / (2^b - 1) for b=<10,10,10,2> */
1027 normalize_factor = dst_reg(this, glsl_type::vec4_type);
1028 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1029 src_reg(1.0f / ((1<<10) - 1))));
1030 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1031 src_reg(1.0f / ((1<<2) - 1))));
1032 }
1033
1034 dst_reg dst = reg;
1035 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1036 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1037
1038 /* For signed normalization, we want the numerator to be 2c+1. */
1039 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1040 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1041 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1042 }
1043
1044 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1045 }
1046 }
1047
1048 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1049 dst_reg dst = reg;
1050 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1051 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1052 }
1053 }
1054 }
1055 }
1056
1057
1058 dst_reg *
1059 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1060 {
1061 /* VertexID is stored by the VF as the last vertex element, but
1062 * we don't represent it with a flag in inputs_read, so we call
1063 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1064 */
1065 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1066 vs_prog_data->uses_vertexid = true;
1067
1068 switch (ir->location) {
1069 case SYSTEM_VALUE_VERTEX_ID:
1070 reg->writemask = WRITEMASK_X;
1071 break;
1072 case SYSTEM_VALUE_INSTANCE_ID:
1073 reg->writemask = WRITEMASK_Y;
1074 break;
1075 default:
1076 assert(!"not reached");
1077 break;
1078 }
1079
1080 return reg;
1081 }
1082
1083
1084 void
1085 vec4_visitor::visit(ir_variable *ir)
1086 {
1087 dst_reg *reg = NULL;
1088
1089 if (variable_storage(ir))
1090 return;
1091
1092 switch (ir->mode) {
1093 case ir_var_shader_in:
1094 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1095 break;
1096
1097 case ir_var_shader_out:
1098 reg = new(mem_ctx) dst_reg(this, ir->type);
1099
1100 for (int i = 0; i < type_size(ir->type); i++) {
1101 output_reg[ir->location + i] = *reg;
1102 output_reg[ir->location + i].reg_offset = i;
1103 output_reg[ir->location + i].type =
1104 brw_type_for_base_type(ir->type->get_scalar_type());
1105 output_reg_annotation[ir->location + i] = ir->name;
1106 }
1107 break;
1108
1109 case ir_var_auto:
1110 case ir_var_temporary:
1111 reg = new(mem_ctx) dst_reg(this, ir->type);
1112 break;
1113
1114 case ir_var_uniform:
1115 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1116
1117 /* Thanks to the lower_ubo_reference pass, we will see only
1118 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1119 * variables, so no need for them to be in variable_ht.
1120 */
1121 if (ir->is_in_uniform_block())
1122 return;
1123
1124 /* Track how big the whole uniform variable is, in case we need to put a
1125 * copy of its data into pull constants for array access.
1126 */
1127 this->uniform_size[this->uniforms] = type_size(ir->type);
1128
1129 if (!strncmp(ir->name, "gl_", 3)) {
1130 setup_builtin_uniform_values(ir);
1131 } else {
1132 setup_uniform_values(ir);
1133 }
1134 break;
1135
1136 case ir_var_system_value:
1137 reg = make_reg_for_system_value(ir);
1138 break;
1139
1140 default:
1141 assert(!"not reached");
1142 }
1143
1144 reg->type = brw_type_for_base_type(ir->type);
1145 hash_table_insert(this->variable_ht, reg, ir);
1146 }
1147
1148 void
1149 vec4_visitor::visit(ir_loop *ir)
1150 {
1151 dst_reg counter;
1152
1153 /* We don't want debugging output to print the whole body of the
1154 * loop as the annotation.
1155 */
1156 this->base_ir = NULL;
1157
1158 if (ir->counter != NULL) {
1159 this->base_ir = ir->counter;
1160 ir->counter->accept(this);
1161 counter = *(variable_storage(ir->counter));
1162
1163 if (ir->from != NULL) {
1164 this->base_ir = ir->from;
1165 ir->from->accept(this);
1166
1167 emit(MOV(counter, this->result));
1168 }
1169 }
1170
1171 emit(BRW_OPCODE_DO);
1172
1173 if (ir->to) {
1174 this->base_ir = ir->to;
1175 ir->to->accept(this);
1176
1177 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1178 brw_conditional_for_comparison(ir->cmp)));
1179
1180 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1181 inst->predicate = BRW_PREDICATE_NORMAL;
1182 }
1183
1184 visit_instructions(&ir->body_instructions);
1185
1186
1187 if (ir->increment) {
1188 this->base_ir = ir->increment;
1189 ir->increment->accept(this);
1190 emit(ADD(counter, src_reg(counter), this->result));
1191 }
1192
1193 emit(BRW_OPCODE_WHILE);
1194 }
1195
1196 void
1197 vec4_visitor::visit(ir_loop_jump *ir)
1198 {
1199 switch (ir->mode) {
1200 case ir_loop_jump::jump_break:
1201 emit(BRW_OPCODE_BREAK);
1202 break;
1203 case ir_loop_jump::jump_continue:
1204 emit(BRW_OPCODE_CONTINUE);
1205 break;
1206 }
1207 }
1208
1209
1210 void
1211 vec4_visitor::visit(ir_function_signature *ir)
1212 {
1213 assert(0);
1214 (void)ir;
1215 }
1216
1217 void
1218 vec4_visitor::visit(ir_function *ir)
1219 {
1220 /* Ignore function bodies other than main() -- we shouldn't see calls to
1221 * them since they should all be inlined.
1222 */
1223 if (strcmp(ir->name, "main") == 0) {
1224 const ir_function_signature *sig;
1225 exec_list empty;
1226
1227 sig = ir->matching_signature(&empty);
1228
1229 assert(sig);
1230
1231 visit_instructions(&sig->body);
1232 }
1233 }
1234
1235 bool
1236 vec4_visitor::try_emit_sat(ir_expression *ir)
1237 {
1238 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1239 if (!sat_src)
1240 return false;
1241
1242 sat_src->accept(this);
1243 src_reg src = this->result;
1244
1245 this->result = src_reg(this, ir->type);
1246 vec4_instruction *inst;
1247 inst = emit(MOV(dst_reg(this->result), src));
1248 inst->saturate = true;
1249
1250 return true;
1251 }
1252
1253 void
1254 vec4_visitor::emit_bool_comparison(unsigned int op,
1255 dst_reg dst, src_reg src0, src_reg src1)
1256 {
1257 /* original gen4 does destination conversion before comparison. */
1258 if (intel->gen < 5)
1259 dst.type = src0.type;
1260
1261 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1262
1263 dst.type = BRW_REGISTER_TYPE_D;
1264 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1265 }
1266
1267 void
1268 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1269 src_reg src0, src_reg src1)
1270 {
1271 vec4_instruction *inst;
1272
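   /* On gen6+, SEL with a conditional mod computes the min/max directly;
    * earlier generations need an explicit CMP followed by a predicated SEL.
    */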
1273 if (intel->gen >= 6) {
1274 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1275 inst->conditional_mod = conditionalmod;
1276 } else {
1277 emit(CMP(dst, src0, src1, conditionalmod));
1278
1279 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1280 inst->predicate = BRW_PREDICATE_NORMAL;
1281 }
1282 }
1283
1284 void
1285 vec4_visitor::visit(ir_expression *ir)
1286 {
1287 unsigned int operand;
1288 src_reg op[Elements(ir->operands)];
1289 src_reg result_src;
1290 dst_reg result_dst;
1291 vec4_instruction *inst;
1292
1293 if (try_emit_sat(ir))
1294 return;
1295
1296 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1297 this->result.file = BAD_FILE;
1298 ir->operands[operand]->accept(this);
1299 if (this->result.file == BAD_FILE) {
1300 printf("Failed to get tree for expression operand:\n");
1301 ir->operands[operand]->print();
1302 exit(1);
1303 }
1304 op[operand] = this->result;
1305
1306 /* Matrix expression operands should have been broken down to vector
1307 * operations already.
1308 */
1309 assert(!ir->operands[operand]->type->is_matrix());
1310 }
1311
1312 int vector_elements = ir->operands[0]->type->vector_elements;
1313 if (ir->operands[1]) {
1314 vector_elements = MAX2(vector_elements,
1315 ir->operands[1]->type->vector_elements);
1316 }
1317
1318 this->result.file = BAD_FILE;
1319
1320 /* Storage for our result. Ideally for an assignment we'd be using
1321 * the actual storage for the result here, instead.
1322 */
1323 result_src = src_reg(this, ir->type);
1324 /* convenience for the emit functions below. */
1325 result_dst = dst_reg(result_src);
1326 /* If nothing special happens, this is the result. */
1327 this->result = result_src;
1328 /* Limit writes to the channels that will be used by result_src later.
1329 * This does limit this temp's use as a temporary for multi-instruction
1330 * sequences.
1331 */
1332 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1333
1334 switch (ir->operation) {
1335 case ir_unop_logic_not:
1336 /* Note that BRW_OPCODE_NOT is not appropriate here, since it computes
1337 * the ones' complement of the whole register, not just bit 0.
1338 */
1339 emit(XOR(result_dst, op[0], src_reg(1)));
1340 break;
1341 case ir_unop_neg:
1342 op[0].negate = !op[0].negate;
1343 this->result = op[0];
1344 break;
1345 case ir_unop_abs:
1346 op[0].abs = true;
1347 op[0].negate = false;
1348 this->result = op[0];
1349 break;
1350
1351 case ir_unop_sign:
1352 emit(MOV(result_dst, src_reg(0.0f)));
1353
1354 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1355 inst = emit(MOV(result_dst, src_reg(1.0f)));
1356 inst->predicate = BRW_PREDICATE_NORMAL;
1357
1358 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1359 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1360 inst->predicate = BRW_PREDICATE_NORMAL;
1361
1362 break;
1363
1364 case ir_unop_rcp:
1365 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1366 break;
1367
1368 case ir_unop_exp2:
1369 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1370 break;
1371 case ir_unop_log2:
1372 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1373 break;
1374 case ir_unop_exp:
1375 case ir_unop_log:
1376 assert(!"not reached: should be handled by ir_explog_to_explog2");
1377 break;
1378 case ir_unop_sin:
1379 case ir_unop_sin_reduced:
1380 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1381 break;
1382 case ir_unop_cos:
1383 case ir_unop_cos_reduced:
1384 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1385 break;
1386
1387 case ir_unop_dFdx:
1388 case ir_unop_dFdy:
1389 assert(!"derivatives not valid in vertex shader");
1390 break;
1391
1392 case ir_unop_bitfield_reverse:
1393 emit(BFREV(result_dst, op[0]));
1394 break;
1395 case ir_unop_bit_count:
1396 emit(CBIT(result_dst, op[0]));
1397 break;
1398 case ir_unop_find_msb: {
1399 src_reg temp = src_reg(this, glsl_type::uint_type);
1400
1401 inst = emit(FBH(dst_reg(temp), op[0]));
1402 inst->dst.writemask = WRITEMASK_XYZW;
1403
1404 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1405 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1406 * subtract the result from 31 to convert the MSB count into an LSB count.
1407 */
1408
1409 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1410 temp.swizzle = BRW_SWIZZLE_NOOP;
1411 emit(MOV(result_dst, temp));
1412
1413 src_reg src_tmp = src_reg(result_dst);
1414 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1415
1416 src_tmp.negate = true;
1417 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1418 inst->predicate = BRW_PREDICATE_NORMAL;
1419 break;
1420 }
1421 case ir_unop_find_lsb:
1422 emit(FBL(result_dst, op[0]));
1423 break;
1424
1425 case ir_unop_noise:
1426 assert(!"not reached: should be handled by lower_noise");
1427 break;
1428
1429 case ir_binop_add:
1430 emit(ADD(result_dst, op[0], op[1]));
1431 break;
1432 case ir_binop_sub:
1433 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1434 break;
1435
1436 case ir_binop_mul:
1437 if (ir->type->is_integer()) {
1438 /* For integer multiplication, the MUL uses the low 16 bits
1439 * of one of the operands (src0 on gen6, src1 on gen7). The
1440 * MACH accumulates in the contribution of the upper 16 bits
1441 * of that operand.
1442 *
1443 * FINISHME: Emit just the MUL if we know an operand is small
1444 * enough.
1445 */
1446 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1447
1448 emit(MUL(acc, op[0], op[1]));
1449 emit(MACH(dst_null_d(), op[0], op[1]));
1450 emit(MOV(result_dst, src_reg(acc)));
1451 } else {
1452 emit(MUL(result_dst, op[0], op[1]));
1453 }
1454 break;
1455 case ir_binop_div:
1456 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1457 assert(ir->type->is_integer());
1458 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1459 break;
1460 case ir_binop_mod:
1461 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1462 assert(ir->type->is_integer());
1463 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1464 break;
1465
1466 case ir_binop_less:
1467 case ir_binop_greater:
1468 case ir_binop_lequal:
1469 case ir_binop_gequal:
1470 case ir_binop_equal:
1471 case ir_binop_nequal: {
1472 emit(CMP(result_dst, op[0], op[1],
1473 brw_conditional_for_comparison(ir->operation)));
1474 emit(AND(result_dst, result_src, src_reg(0x1)));
1475 break;
1476 }
1477
1478 case ir_binop_all_equal:
1479 /* "==" operator producing a scalar boolean. */
1480 if (ir->operands[0]->type->is_vector() ||
1481 ir->operands[1]->type->is_vector()) {
1482 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1483 emit(MOV(result_dst, src_reg(0)));
1484 inst = emit(MOV(result_dst, src_reg(1)));
1485 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1486 } else {
1487 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1488 emit(AND(result_dst, result_src, src_reg(0x1)));
1489 }
1490 break;
1491 case ir_binop_any_nequal:
1492 /* "!=" operator producing a scalar boolean. */
1493 if (ir->operands[0]->type->is_vector() ||
1494 ir->operands[1]->type->is_vector()) {
1495 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1496
1497 emit(MOV(result_dst, src_reg(0)));
1498 inst = emit(MOV(result_dst, src_reg(1)));
1499 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1500 } else {
1501 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1502 emit(AND(result_dst, result_src, src_reg(0x1)));
1503 }
1504 break;
1505
1506 case ir_unop_any:
1507 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1508 emit(MOV(result_dst, src_reg(0)));
1509
1510 inst = emit(MOV(result_dst, src_reg(1)));
1511 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1512 break;
1513
1514 case ir_binop_logic_xor:
1515 emit(XOR(result_dst, op[0], op[1]));
1516 break;
1517
1518 case ir_binop_logic_or:
1519 emit(OR(result_dst, op[0], op[1]));
1520 break;
1521
1522 case ir_binop_logic_and:
1523 emit(AND(result_dst, op[0], op[1]));
1524 break;
1525
1526 case ir_binop_dot:
1527 assert(ir->operands[0]->type->is_vector());
1528 assert(ir->operands[0]->type == ir->operands[1]->type);
1529 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1530 break;
1531
1532 case ir_unop_sqrt:
1533 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1534 break;
1535 case ir_unop_rsq:
1536 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1537 break;
1538
1539 case ir_unop_bitcast_i2f:
1540 case ir_unop_bitcast_u2f:
1541 this->result = op[0];
1542 this->result.type = BRW_REGISTER_TYPE_F;
1543 break;
1544
1545 case ir_unop_bitcast_f2i:
1546 this->result = op[0];
1547 this->result.type = BRW_REGISTER_TYPE_D;
1548 break;
1549
1550 case ir_unop_bitcast_f2u:
1551 this->result = op[0];
1552 this->result.type = BRW_REGISTER_TYPE_UD;
1553 break;
1554
1555 case ir_unop_i2f:
1556 case ir_unop_i2u:
1557 case ir_unop_u2i:
1558 case ir_unop_u2f:
1559 case ir_unop_b2f:
1560 case ir_unop_b2i:
1561 case ir_unop_f2i:
1562 case ir_unop_f2u:
1563 emit(MOV(result_dst, op[0]));
1564 break;
1565 case ir_unop_f2b:
1566 case ir_unop_i2b: {
1567 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1568 emit(AND(result_dst, result_src, src_reg(1)));
1569 break;
1570 }
1571
1572 case ir_unop_trunc:
1573 emit(RNDZ(result_dst, op[0]));
1574 break;
1575 case ir_unop_ceil:
1576 op[0].negate = !op[0].negate;
1577 inst = emit(RNDD(result_dst, op[0]));
1578 this->result.negate = true;
1579 break;
1580 case ir_unop_floor:
1581 inst = emit(RNDD(result_dst, op[0]));
1582 break;
1583 case ir_unop_fract:
1584 inst = emit(FRC(result_dst, op[0]));
1585 break;
1586 case ir_unop_round_even:
1587 emit(RNDE(result_dst, op[0]));
1588 break;
1589
1590 case ir_binop_min:
1591 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1592 break;
1593 case ir_binop_max:
1594 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1595 break;
1596
1597 case ir_binop_pow:
1598 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1599 break;
1600
1601 case ir_unop_bit_not:
1602 inst = emit(NOT(result_dst, op[0]));
1603 break;
1604 case ir_binop_bit_and:
1605 inst = emit(AND(result_dst, op[0], op[1]));
1606 break;
1607 case ir_binop_bit_xor:
1608 inst = emit(XOR(result_dst, op[0], op[1]));
1609 break;
1610 case ir_binop_bit_or:
1611 inst = emit(OR(result_dst, op[0], op[1]));
1612 break;
1613
1614 case ir_binop_lshift:
1615 inst = emit(SHL(result_dst, op[0], op[1]));
1616 break;
1617
1618 case ir_binop_rshift:
1619 if (ir->type->base_type == GLSL_TYPE_INT)
1620 inst = emit(ASR(result_dst, op[0], op[1]));
1621 else
1622 inst = emit(SHR(result_dst, op[0], op[1]));
1623 break;
1624
1625 case ir_binop_bfm:
1626 emit(BFI1(result_dst, op[0], op[1]));
1627 break;
1628
1629 case ir_binop_ubo_load: {
1630 ir_constant *uniform_block = ir->operands[0]->as_constant();
1631 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1632 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1633 src_reg offset = op[1];
1634
1635 /* Now, load the vector from that offset. */
1636 assert(ir->type->is_vector() || ir->type->is_scalar());
1637
1638 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1639 packed_consts.type = result.type;
1640 src_reg surf_index =
1641 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
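      /* The pull constant load addresses the buffer in 16-byte (vec4) units:
       * divide a constant byte offset by 16, or shift a dynamic one right by 4.
       */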
1642 if (const_offset_ir) {
1643 offset = src_reg(const_offset / 16);
1644 } else {
1645 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1646 }
1647
1648 vec4_instruction *pull =
1649 emit(new(mem_ctx) vec4_instruction(this,
1650 VS_OPCODE_PULL_CONSTANT_LOAD,
1651 dst_reg(packed_consts),
1652 surf_index,
1653 offset));
1654 pull->base_mrf = 14;
1655 pull->mlen = 1;
1656
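      /* Offset each channel of the swizzle by the starting dword within the
       * fetched 16-byte block so unaligned scalars/vectors read the right
       * components.
       */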
1657 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1658 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1659 const_offset % 16 / 4,
1660 const_offset % 16 / 4,
1661 const_offset % 16 / 4);
1662
1663 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1664 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1665 emit(CMP(result_dst, packed_consts, src_reg(0u),
1666 BRW_CONDITIONAL_NZ));
1667 emit(AND(result_dst, result, src_reg(0x1)));
1668 } else {
1669 emit(MOV(result_dst, packed_consts));
1670 }
1671 break;
1672 }
1673
1674 case ir_binop_vector_extract:
1675 assert(!"should have been lowered by vec_index_to_cond_assign");
1676 break;
1677
1678 case ir_triop_lrp:
1679 op[0] = fix_3src_operand(op[0]);
1680 op[1] = fix_3src_operand(op[1]);
1681 op[2] = fix_3src_operand(op[2]);
1682 /* Note that the instruction's argument order is reversed from GLSL
1683 * and the IR.
1684 */
1685 emit(LRP(result_dst, op[2], op[1], op[0]));
1686 break;
1687
1688 case ir_triop_bfi:
1689 op[0] = fix_3src_operand(op[0]);
1690 op[1] = fix_3src_operand(op[1]);
1691 op[2] = fix_3src_operand(op[2]);
1692 emit(BFI2(result_dst, op[0], op[1], op[2]));
1693 break;
1694
1695 case ir_triop_bitfield_extract:
1696 op[0] = fix_3src_operand(op[0]);
1697 op[1] = fix_3src_operand(op[1]);
1698 op[2] = fix_3src_operand(op[2]);
1699 /* Note that the instruction's argument order is reversed from GLSL
1700 * and the IR.
1701 */
1702 emit(BFE(result_dst, op[2], op[1], op[0]));
1703 break;
1704
1705 case ir_quadop_bitfield_insert:
1706 assert(!"not reached: should be handled by "
1707 "bitfield_insert_to_bfm_bfi\n");
1708 break;
1709
1710 case ir_quadop_vector:
1711 assert(!"not reached: should be handled by lower_quadop_vector");
1712 break;
1713
1714 case ir_unop_pack_half_2x16:
1715 emit_pack_half_2x16(result_dst, op[0]);
1716 break;
1717 case ir_unop_unpack_half_2x16:
1718 emit_unpack_half_2x16(result_dst, op[0]);
1719 break;
1720 case ir_unop_pack_snorm_2x16:
1721 case ir_unop_pack_snorm_4x8:
1722 case ir_unop_pack_unorm_2x16:
1723 case ir_unop_pack_unorm_4x8:
1724 case ir_unop_unpack_snorm_2x16:
1725 case ir_unop_unpack_snorm_4x8:
1726 case ir_unop_unpack_unorm_2x16:
1727 case ir_unop_unpack_unorm_4x8:
1728 assert(!"not reached: should be handled by lower_packing_builtins");
1729 break;
1730 case ir_unop_unpack_half_2x16_split_x:
1731 case ir_unop_unpack_half_2x16_split_y:
1732 case ir_binop_pack_half_2x16_split:
1733 assert(!"not reached: should not occur in vertex shader");
1734 break;
1735 }
1736 }
1737
1738
1739 void
1740 vec4_visitor::visit(ir_swizzle *ir)
1741 {
1742 src_reg src;
1743 int i = 0;
1744 int swizzle[4];
1745
1746 /* Note that this handles only swizzles in expressions, not those on the left
1747 * hand side of an assignment, which are handled as write masking. See ir_assignment
1748 * for that.
1749 */
1750
1751 ir->val->accept(this);
1752 src = this->result;
1753 assert(src.file != BAD_FILE);
1754
1755 for (i = 0; i < ir->type->vector_elements; i++) {
1756 switch (i) {
1757 case 0:
1758 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1759 break;
1760 case 1:
1761 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1762 break;
1763 case 2:
1764 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1765 break;
1766 case 3:
1767 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1768 break;
1769 }
1770 }
1771 for (; i < 4; i++) {
1772 /* Replicate the last channel out. */
1773 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1774 }
1775
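   /* Compose this swizzle with whatever swizzle the source already carried. */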
1776 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1777
1778 this->result = src;
1779 }
1780
1781 void
1782 vec4_visitor::visit(ir_dereference_variable *ir)
1783 {
1784 const struct glsl_type *type = ir->type;
1785 dst_reg *reg = variable_storage(ir->var);
1786
1787 if (!reg) {
1788 fail("Failed to find variable storage for %s\n", ir->var->name);
1789 this->result = src_reg(brw_null_reg());
1790 return;
1791 }
1792
1793 this->result = src_reg(*reg);
1794
1795 /* System values get their swizzle from the dst_reg writemask */
1796 if (ir->var->mode == ir_var_system_value)
1797 return;
1798
1799 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1800 this->result.swizzle = swizzle_for_size(type->vector_elements);
1801 }
1802
1803
1804 int
1805 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1806 {
1807 /* Under normal circumstances array elements are stored consecutively, so
1808 * the stride is equal to the size of the array element.
1809 */
1810 return type_size(ir->type);
1811 }
1812
1813
1814 void
1815 vec4_visitor::visit(ir_dereference_array *ir)
1816 {
1817 ir_constant *constant_index;
1818 src_reg src;
1819 int array_stride = compute_array_stride(ir);
1820
1821 constant_index = ir->array_index->constant_expression_value();
1822
1823 ir->array->accept(this);
1824 src = this->result;
1825
1826 if (constant_index) {
1827 src.reg_offset += constant_index->value.i[0] * array_stride;
1828 } else {
1829 /* Variable index array dereference. It eats the "vec4" of the
1830 * base of the array and an index that offsets the Mesa register
1831 * index.
1832 */
1833 ir->array_index->accept(this);
1834
1835 src_reg index_reg;
1836
1837 if (array_stride == 1) {
1838 index_reg = this->result;
1839 } else {
1840 index_reg = src_reg(this, glsl_type::int_type);
1841
1842 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1843 }
1844
1845 if (src.reladdr) {
1846 src_reg temp = src_reg(this, glsl_type::int_type);
1847
1848 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1849
1850 index_reg = temp;
1851 }
1852
1853 src.reladdr = ralloc(mem_ctx, src_reg);
1854 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1855 }
1856
1857 /* If the type is smaller than a vec4, replicate the last channel out. */
1858 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1859 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1860 else
1861 src.swizzle = BRW_SWIZZLE_NOOP;
1862 src.type = brw_type_for_base_type(ir->type);
1863
1864 this->result = src;
1865 }
1866
1867 void
1868 vec4_visitor::visit(ir_dereference_record *ir)
1869 {
1870 unsigned int i;
1871 const glsl_type *struct_type = ir->record->type;
1872 int offset = 0;
1873
1874 ir->record->accept(this);
1875
1876 for (i = 0; i < struct_type->length; i++) {
1877 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1878 break;
1879 offset += type_size(struct_type->fields.structure[i].type);
1880 }
1881
1882 /* If the type is smaller than a vec4, replicate the last channel out. */
1883 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1884 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1885 else
1886 this->result.swizzle = BRW_SWIZZLE_NOOP;
1887 this->result.type = brw_type_for_base_type(ir->type);
1888
1889 this->result.reg_offset += offset;
1890 }
1891
1892 /**
1893 * We want to be careful in assignment setup to hit the actual storage
1894 * instead of potentially using a temporary like we might with the
1895 * ir_dereference handler.
1896 */
1897 static dst_reg
1898 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1899 {
1900 /* The LHS must be a dereference. If the LHS is a variable indexed array
1901 * access of a vector, it must be separated into a series of conditional moves
1902 * before reaching this point (see ir_vec_index_to_cond_assign).
1903 */
1904 assert(ir->as_dereference());
1905 ir_dereference_array *deref_array = ir->as_dereference_array();
1906 if (deref_array) {
1907 assert(!deref_array->array->type->is_vector());
1908 }
1909
1910 /* Use the rvalue deref handler for the most part. We'll ignore
1911 * swizzles in it and write swizzles using writemask, though.
1912 */
1913 ir->accept(v);
1914 return dst_reg(v->result);
1915 }
1916
1917 void
1918 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1919 const struct glsl_type *type, uint32_t predicate)
1920 {
1921 if (type->base_type == GLSL_TYPE_STRUCT) {
1922 for (unsigned int i = 0; i < type->length; i++) {
1923 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1924 }
1925 return;
1926 }
1927
1928 if (type->is_array()) {
1929 for (unsigned int i = 0; i < type->length; i++) {
1930 emit_block_move(dst, src, type->fields.array, predicate);
1931 }
1932 return;
1933 }
1934
1935 if (type->is_matrix()) {
1936 const struct glsl_type *vec_type;
1937
1938 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1939 type->vector_elements, 1);
1940
1941 for (int i = 0; i < type->matrix_columns; i++) {
1942 emit_block_move(dst, src, vec_type, predicate);
1943 }
1944 return;
1945 }
1946
1947 assert(type->is_scalar() || type->is_vector());
1948
1949 dst->type = brw_type_for_base_type(type);
1950 src->type = dst->type;
1951
1952 dst->writemask = (1 << type->vector_elements) - 1;
1953
1954 src->swizzle = swizzle_for_size(type->vector_elements);
1955
1956 vec4_instruction *inst = emit(MOV(*dst, *src));
1957 inst->predicate = predicate;
1958
1959 dst->reg_offset++;
1960 src->reg_offset++;
1961 }
1962
1963
1964 /* If the RHS processing resulted in an instruction generating a
1965 * temporary value, and it would be easy to rewrite the instruction to
1966 * generate its result right into the LHS instead, do so. This ends
1967 * up reliably removing instructions where it can be tricky to do so
1968 * later without real UD chain information.
1969 */
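/* Illustrative example: for a GLSL assignment such as "v = a + b;",
 * visiting the RHS emits roughly
 *
 *    ADD tmp.xyzw, a.xyzw, b.xyzw
 *
 * and the assignment would normally append "MOV v, tmp".  When the
 * checks in this function succeed, the ADD's destination is rewritten
 * to v directly and the trailing MOV is never emitted.
 */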
1970 bool
1971 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1972 dst_reg dst,
1973 src_reg src,
1974 vec4_instruction *pre_rhs_inst,
1975 vec4_instruction *last_rhs_inst)
1976 {
1977 /* This could be supported, but it would take more smarts. */
1978 if (ir->condition)
1979 return false;
1980
1981 if (pre_rhs_inst == last_rhs_inst)
1982 return false; /* No instructions generated to work with. */
1983
1984 /* Make sure the last instruction generated our source reg. */
1985 if (src.file != GRF ||
1986 src.file != last_rhs_inst->dst.file ||
1987 src.reg != last_rhs_inst->dst.reg ||
1988 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1989 src.reladdr ||
1990 src.abs ||
1991 src.negate ||
1992 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1993 return false;
1994
1995 /* Check that the last instruction fully initialized the channels
1996 * we want to use, in the order we want to use them. We could
1997 * potentially reswizzle the operands of many instructions so that
1998 * we could handle out of order channels, but don't yet.
1999 */
2000
2001 for (unsigned i = 0; i < 4; i++) {
2002 if (dst.writemask & (1 << i)) {
2003 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2004 return false;
2005
2006 if (BRW_GET_SWZ(src.swizzle, i) != i)
2007 return false;
2008 }
2009 }
2010
2011 /* Success! Rewrite the instruction. */
2012 last_rhs_inst->dst.file = dst.file;
2013 last_rhs_inst->dst.reg = dst.reg;
2014 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2015 last_rhs_inst->dst.reladdr = dst.reladdr;
2016 last_rhs_inst->dst.writemask &= dst.writemask;
2017
2018 return true;
2019 }
2020
2021 void
2022 vec4_visitor::visit(ir_assignment *ir)
2023 {
2024 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2025 uint32_t predicate = BRW_PREDICATE_NONE;
2026
2027 if (!ir->lhs->type->is_scalar() &&
2028 !ir->lhs->type->is_vector()) {
2029 ir->rhs->accept(this);
2030 src_reg src = this->result;
2031
2032 if (ir->condition) {
2033 emit_bool_to_cond_code(ir->condition, &predicate);
2034 }
2035
2036 /* emit_block_move doesn't account for swizzles in the source register.
2037 * This should be ok, since the source register is a structure or an
2038 * array, and those can't be swizzled. But double-check to be sure.
2039 */
2040 assert(src.swizzle ==
2041 (ir->rhs->type->is_matrix()
2042 ? swizzle_for_size(ir->rhs->type->vector_elements)
2043 : BRW_SWIZZLE_NOOP));
2044
2045 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2046 return;
2047 }
2048
2049 /* Now we're down to just a scalar/vector with writemasks. */
2050 int i;
2051
2052 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2053 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2054
2055 ir->rhs->accept(this);
2056
2057 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2058
2059 src_reg src = this->result;
2060
2061 int swizzles[4];
2062 int first_enabled_chan = 0;
2063 int src_chan = 0;
2064
2065 assert(ir->lhs->type->is_vector() ||
2066 ir->lhs->type->is_scalar());
2067 dst.writemask = ir->write_mask;
2068
2069 for (int i = 0; i < 4; i++) {
2070 if (dst.writemask & (1 << i)) {
2071 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2072 break;
2073 }
2074 }
2075
2076 /* Swizzle a small RHS vector into the channels being written.
2077 *
2078 * glsl ir treats write_mask as dictating how many channels are
2079 * present on the RHS, while in our instructions we need to make
2080 * those channels appear in the slots of the vec4 they're written to.
2081 */
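/* Worked example (illustrative): writing a vec2 RHS into v.yw gives
 * write_mask = YW and an incoming swizzle of .xyyy from
 * swizzle_for_size(2).  The loop below then builds
 * swizzles[] = { Y, X, Y, Y }, placing the RHS's X and Y components in
 * the Y and W slots that are actually written; the other two entries
 * are don't-cares.
 */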
2082 for (int i = 0; i < 4; i++) {
2083 if (dst.writemask & (1 << i))
2084 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2085 else
2086 swizzles[i] = first_enabled_chan;
2087 }
2088 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2089 swizzles[2], swizzles[3]);
2090
2091 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2092 return;
2093 }
2094
2095 if (ir->condition) {
2096 emit_bool_to_cond_code(ir->condition, &predicate);
2097 }
2098
2099 for (i = 0; i < type_size(ir->lhs->type); i++) {
2100 vec4_instruction *inst = emit(MOV(dst, src));
2101 inst->predicate = predicate;
2102
2103 dst.reg_offset++;
2104 src.reg_offset++;
2105 }
2106 }
2107
2108 void
2109 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2110 {
2111 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2112 foreach_list(node, &ir->components) {
2113 ir_constant *field_value = (ir_constant *)node;
2114
2115 emit_constant_values(dst, field_value);
2116 }
2117 return;
2118 }
2119
2120 if (ir->type->is_array()) {
2121 for (unsigned int i = 0; i < ir->type->length; i++) {
2122 emit_constant_values(dst, ir->array_elements[i]);
2123 }
2124 return;
2125 }
2126
2127 if (ir->type->is_matrix()) {
2128 for (int i = 0; i < ir->type->matrix_columns; i++) {
2129 float *vec = &ir->value.f[i * ir->type->vector_elements];
2130
2131 for (int j = 0; j < ir->type->vector_elements; j++) {
2132 dst->writemask = 1 << j;
2133 dst->type = BRW_REGISTER_TYPE_F;
2134
2135 emit(MOV(*dst, src_reg(vec[j])));
2136 }
2137 dst->reg_offset++;
2138 }
2139 return;
2140 }
2141
2142 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2143
2144 for (int i = 0; i < ir->type->vector_elements; i++) {
2145 if (!(remaining_writemask & (1 << i)))
2146 continue;
2147
2148 dst->writemask = 1 << i;
2149 dst->type = brw_type_for_base_type(ir->type);
2150
2151 /* Find other components that match the one we're about to
2152 * write. Emits fewer instructions for things like vec4(0.5,
2153 * 1.5, 1.5, 1.5).
2154 */
2155 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2156 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2157 if (ir->value.b[i] == ir->value.b[j])
2158 dst->writemask |= (1 << j);
2159 } else {
2160 /* u, i, and f storage all line up, so no need for a
2161 * switch case for comparing each type.
2162 */
2163 if (ir->value.u[i] == ir->value.u[j])
2164 dst->writemask |= (1 << j);
2165 }
2166 }
2167
2168 switch (ir->type->base_type) {
2169 case GLSL_TYPE_FLOAT:
2170 emit(MOV(*dst, src_reg(ir->value.f[i])));
2171 break;
2172 case GLSL_TYPE_INT:
2173 emit(MOV(*dst, src_reg(ir->value.i[i])));
2174 break;
2175 case GLSL_TYPE_UINT:
2176 emit(MOV(*dst, src_reg(ir->value.u[i])));
2177 break;
2178 case GLSL_TYPE_BOOL:
2179 emit(MOV(*dst, src_reg(ir->value.b[i])));
2180 break;
2181 default:
2182 assert(!"Non-float/uint/int/bool constant");
2183 break;
2184 }
2185
2186 remaining_writemask &= ~dst->writemask;
2187 }
2188 dst->reg_offset++;
2189 }
2190
2191 void
2192 vec4_visitor::visit(ir_constant *ir)
2193 {
2194 dst_reg dst = dst_reg(this, ir->type);
2195 this->result = src_reg(dst);
2196
2197 emit_constant_values(&dst, ir);
2198 }
2199
2200 void
2201 vec4_visitor::visit(ir_call *ir)
2202 {
2203 assert(!"not reached");
2204 }
2205
2206 void
2207 vec4_visitor::visit(ir_texture *ir)
2208 {
2209 int sampler =
2210 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2211
2212 /* Should be lowered by do_lower_texture_projection */
2213 assert(!ir->projector);
2214
2215 /* Generate code to compute all the subexpression trees. This has to be
2216 * done before loading any values into MRFs for the sampler message since
2217 * generating these values may involve SEND messages that need the MRFs.
2218 */
2219 src_reg coordinate;
2220 if (ir->coordinate) {
2221 ir->coordinate->accept(this);
2222 coordinate = this->result;
2223 }
2224
2225 src_reg shadow_comparitor;
2226 if (ir->shadow_comparitor) {
2227 ir->shadow_comparitor->accept(this);
2228 shadow_comparitor = this->result;
2229 }
2230
2231 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2232 src_reg lod, dPdx, dPdy, sample_index;
2233 switch (ir->op) {
2234 case ir_tex:
2235 lod = src_reg(0.0f);
2236 lod_type = glsl_type::float_type;
2237 break;
2238 case ir_txf:
2239 case ir_txl:
2240 case ir_txs:
2241 ir->lod_info.lod->accept(this);
2242 lod = this->result;
2243 lod_type = ir->lod_info.lod->type;
2244 break;
2245 case ir_txf_ms:
2246 ir->lod_info.sample_index->accept(this);
2247 sample_index = this->result;
2248 sample_index_type = ir->lod_info.sample_index->type;
2249 break;
2250 case ir_txd:
2251 ir->lod_info.grad.dPdx->accept(this);
2252 dPdx = this->result;
2253
2254 ir->lod_info.grad.dPdy->accept(this);
2255 dPdy = this->result;
2256
2257 lod_type = ir->lod_info.grad.dPdx->type;
2258 break;
2259 case ir_txb:
2260 case ir_lod:
2261 break;
2262 }
2263
2264 vec4_instruction *inst = NULL;
2265 switch (ir->op) {
2266 case ir_tex:
2267 case ir_txl:
2268 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2269 break;
2270 case ir_txd:
2271 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2272 break;
2273 case ir_txf:
2274 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2275 break;
2276 case ir_txf_ms:
2277 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2278 break;
2279 case ir_txs:
2280 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2281 break;
2282 case ir_txb:
2283 assert(!"TXB is not valid for vertex shaders.");
2284 break;
2285 case ir_lod:
2286 assert(!"LOD is not valid for vertex shaders.");
2287 break;
2288 }
2289
2290 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2291
2292 /* Texel offsets go in the message header; Gen4 also requires headers. */
2293 inst->header_present = use_texture_offset || intel->gen < 5;
2294 inst->base_mrf = 2;
2295 inst->mlen = inst->header_present + 1; /* always at least one */
2296 inst->sampler = sampler;
2297 inst->dst = dst_reg(this, ir->type);
2298 inst->dst.writemask = WRITEMASK_XYZW;
2299 inst->shadow_compare = ir->shadow_comparitor != NULL;
2300
2301 if (use_texture_offset)
2302 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2303
2304 /* MRF for the first parameter */
2305 int param_base = inst->base_mrf + inst->header_present;
2306
2307 if (ir->op == ir_txs) {
2308 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2309 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2310 } else {
2311 int i, coord_mask = 0, zero_mask = 0;
2312 /* Load the coordinate */
2313 /* FINISHME: gl_clamp_mask and saturate */
2314 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2315 coord_mask |= (1 << i);
2316 for (; i < 4; i++)
2317 zero_mask |= (1 << i);
2318
2319 if (ir->offset && ir->op == ir_txf) {
2320 /* It appears that the ld instruction used for txf does its
2321 * address bounds check before adding in the offset. To work
2322 * around this, just add the integer offset to the integer
2323 * texel coordinate, and don't put the offset in the header.
2324 */
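/* Illustrative example: for texelFetchOffset(s, coord, lod, ivec2(1, 2)),
 * the loop below emits one ADD per coordinate component
 * (coord.x + 1, coord.y + 2) into the coordinate MRF rather than
 * placing the offset in the message header.
 */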
2325 ir_constant *offset = ir->offset->as_constant();
2326 assert(offset);
2327
2328 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2329 src_reg src = coordinate;
2330 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2331 BRW_GET_SWZ(src.swizzle, j),
2332 BRW_GET_SWZ(src.swizzle, j),
2333 BRW_GET_SWZ(src.swizzle, j));
2334 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2335 src, offset->value.i[j]));
2336 }
2337 } else {
2338 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2339 coordinate));
2340 }
2341 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2342 src_reg(0)));
2343 /* Load the shadow comparitor */
2344 if (ir->shadow_comparitor && ir->op != ir_txd) {
2345 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2346 WRITEMASK_X),
2347 shadow_comparitor));
2348 inst->mlen++;
2349 }
2350
2351 /* Load the LOD info */
2352 if (ir->op == ir_tex || ir->op == ir_txl) {
2353 int mrf, writemask;
2354 if (intel->gen >= 5) {
2355 mrf = param_base + 1;
2356 if (ir->shadow_comparitor) {
2357 writemask = WRITEMASK_Y;
2358 /* mlen already incremented */
2359 } else {
2360 writemask = WRITEMASK_X;
2361 inst->mlen++;
2362 }
2363 } else /* intel->gen == 4 */ {
2364 mrf = param_base;
2365 writemask = WRITEMASK_Z;
2366 }
2367 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2368 } else if (ir->op == ir_txf) {
2369 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2370 } else if (ir->op == ir_txf_ms) {
2371 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2372 sample_index));
2373 inst->mlen++;
2374
2375 /* On Gen7, there is an additional MCS parameter here after SI,
2376 * but we don't bother to emit it since it's always zero. If
2377 * we start supporting texturing from CMS surfaces, this will have
2378 * to change.
2379 */
2380 } else if (ir->op == ir_txd) {
2381 const glsl_type *type = lod_type;
2382
2383 if (intel->gen >= 5) {
2384 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2385 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2386 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2387 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2388 inst->mlen++;
2389
2390 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2391 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2392 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2393 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2394 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2395 inst->mlen++;
2396
2397 if (ir->shadow_comparitor) {
2398 emit(MOV(dst_reg(MRF, param_base + 2,
2399 ir->shadow_comparitor->type, WRITEMASK_Z),
2400 shadow_comparitor));
2401 }
2402 }
2403 } else /* intel->gen == 4 */ {
2404 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2405 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2406 inst->mlen += 2;
2407 }
2408 }
2409 }
2410
2411 emit(inst);
2412
2413 /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
2414 * spec requires layers.
2415 */
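/* Worked example (illustrative): a samplerCubeArray allocated with 4
 * layers reports 6 * 4 = 24 in the Z channel of the txs result; the
 * INT_QUOTIENT by 6 below makes textureSize() return the 4 layers the
 * spec asks for.
 */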
2416 if (ir->op == ir_txs) {
2417 glsl_type const *type = ir->sampler->type;
2418 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2419 type->sampler_array) {
2420 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2421 with_writemask(inst->dst, WRITEMASK_Z),
2422 src_reg(inst->dst), src_reg(6));
2423 }
2424 }
2425
2426 swizzle_result(ir, src_reg(inst->dst), sampler);
2427 }
2428
2429 void
2430 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2431 {
2432 int s = key->tex.swizzles[sampler];
2433
2434 this->result = src_reg(this, ir->type);
2435 dst_reg swizzled_result(this->result);
2436
2437 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2438 || s == SWIZZLE_NOOP) {
2439 emit(MOV(swizzled_result, orig_val));
2440 return;
2441 }
2442
2443 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2444 int swizzle[4];
2445
2446 for (int i = 0; i < 4; i++) {
2447 switch (GET_SWZ(s, i)) {
2448 case SWIZZLE_ZERO:
2449 zero_mask |= (1 << i);
2450 break;
2451 case SWIZZLE_ONE:
2452 one_mask |= (1 << i);
2453 break;
2454 default:
2455 copy_mask |= (1 << i);
2456 swizzle[i] = GET_SWZ(s, i);
2457 break;
2458 }
2459 }
2460
2461 if (copy_mask) {
2462 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2463 swizzled_result.writemask = copy_mask;
2464 emit(MOV(swizzled_result, orig_val));
2465 }
2466
2467 if (zero_mask) {
2468 swizzled_result.writemask = zero_mask;
2469 emit(MOV(swizzled_result, src_reg(0.0f)));
2470 }
2471
2472 if (one_mask) {
2473 swizzled_result.writemask = one_mask;
2474 emit(MOV(swizzled_result, src_reg(1.0f)));
2475 }
2476 }
2477
2478 void
2479 vec4_visitor::visit(ir_return *ir)
2480 {
2481 assert(!"not reached");
2482 }
2483
2484 void
2485 vec4_visitor::visit(ir_discard *ir)
2486 {
2487 assert(!"not reached");
2488 }
2489
2490 void
2491 vec4_visitor::visit(ir_if *ir)
2492 {
2493 /* Don't point the annotation at the if statement, because then it plus
2494 * the then and else blocks get printed.
2495 */
2496 this->base_ir = ir->condition;
2497
2498 if (intel->gen == 6) {
2499 emit_if_gen6(ir);
2500 } else {
2501 uint32_t predicate;
2502 emit_bool_to_cond_code(ir->condition, &predicate);
2503 emit(IF(predicate));
2504 }
2505
2506 visit_instructions(&ir->then_instructions);
2507
2508 if (!ir->else_instructions.is_empty()) {
2509 this->base_ir = ir->condition;
2510 emit(BRW_OPCODE_ELSE);
2511
2512 visit_instructions(&ir->else_instructions);
2513 }
2514
2515 this->base_ir = ir->condition;
2516 emit(BRW_OPCODE_ENDIF);
2517 }
2518
2519 void
2520 vec4_visitor::emit_ndc_computation()
2521 {
2522 /* Get the position */
2523 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2524
2525 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
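/* Illustrative example: a clip-space position of (2, 4, 6, 2) becomes
 * an NDC value of (1, 2, 3, 0.5) via the RCP and MUL emitted below.
 */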
2526 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2527 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2528
2529 current_annotation = "NDC";
2530 dst_reg ndc_w = ndc;
2531 ndc_w.writemask = WRITEMASK_W;
2532 src_reg pos_w = pos;
2533 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2534 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2535
2536 dst_reg ndc_xyz = ndc;
2537 ndc_xyz.writemask = WRITEMASK_XYZ;
2538
2539 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2540 }
2541
2542 void
2543 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2544 {
2545 if (intel->gen < 6 &&
2546 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2547 key->userclip_active || brw->has_negative_rhw_bug)) {
2548 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2549 dst_reg header1_w = header1;
2550 header1_w.writemask = WRITEMASK_W;
2551 GLuint i;
2552
2553 emit(MOV(header1, 0u));
2554
2555 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2556 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2557
2558 current_annotation = "Point size";
2559 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2560 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2561 }
2562
2563 current_annotation = "Clipping flags";
2564 for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2565 vec4_instruction *inst;
2566
2567 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2568 src_reg(this->userplane[i])));
2569 inst->conditional_mod = BRW_CONDITIONAL_L;
2570
2571 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2572 inst->predicate = BRW_PREDICATE_NORMAL;
2573 }
2574
2575 /* i965 clipping workaround:
2576 * 1) Test for -ve rhw
2577 * 2) If set,
2578 * set ndc = (0,0,0,0)
2579 * set ucp[6] = 1
2580 *
2581 * Later, clipping will detect ucp[6] and ensure the primitive is
2582 * clipped against all fixed planes.
2583 */
2584 if (brw->has_negative_rhw_bug) {
2585 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2586 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2587 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2588 vec4_instruction *inst;
2589 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2590 inst->predicate = BRW_PREDICATE_NORMAL;
2591 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2592 inst->predicate = BRW_PREDICATE_NORMAL;
2593 }
2594
2595 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2596 } else if (intel->gen < 6) {
2597 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2598 } else {
2599 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2600 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2601 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2602 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2603 }
2604 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2605 emit(MOV(retype(brw_writemask(reg, WRITEMASK_Y), BRW_REGISTER_TYPE_D),
2606 src_reg(output_reg[VARYING_SLOT_LAYER])));
2607 }
2608 }
2609 }
2610
2611 void
2612 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2613 {
2614 if (intel->gen < 6) {
2615 /* Clip distance slots are set aside in gen5, but they are not used. It
2616 * is not clear whether we actually need to set aside space for them,
2617 * but the performance cost is negligible.
2618 */
2619 return;
2620 }
2621
2622 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2623 *
2624 * "If a linked set of shaders forming the vertex stage contains no
2625 * static write to gl_ClipVertex or gl_ClipDistance, but the
2626 * application has requested clipping against user clip planes through
2627 * the API, then the coordinate written to gl_Position is used for
2628 * comparison against the user clip planes."
2629 *
2630 * This function is only called if the shader didn't write to
2631 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2632 * if the user wrote to it; otherwise we use gl_Position.
2633 */
2634 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2635 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2636 clip_vertex = VARYING_SLOT_POS;
2637 }
2638
2639 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2640 ++i) {
2641 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2642 src_reg(output_reg[clip_vertex]),
2643 src_reg(this->userplane[i + offset])));
2644 }
2645 }
2646
2647 void
2648 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2649 {
2650 assert (varying < VARYING_SLOT_MAX);
2651 reg.type = output_reg[varying].type;
2652 current_annotation = output_reg_annotation[varying];
2653 /* Copy the register, saturating if necessary */
2654 vec4_instruction *inst = emit(MOV(reg,
2655 src_reg(output_reg[varying])));
2656 if ((varying == VARYING_SLOT_COL0 ||
2657 varying == VARYING_SLOT_COL1 ||
2658 varying == VARYING_SLOT_BFC0 ||
2659 varying == VARYING_SLOT_BFC1) &&
2660 key->clamp_vertex_color) {
2661 inst->saturate = true;
2662 }
2663 }
2664
2665 void
2666 vec4_visitor::emit_urb_slot(int mrf, int varying)
2667 {
2668 struct brw_reg hw_reg = brw_message_reg(mrf);
2669 dst_reg reg = dst_reg(MRF, mrf);
2670 reg.type = BRW_REGISTER_TYPE_F;
2671
2672 switch (varying) {
2673 case VARYING_SLOT_PSIZ:
2674 /* PSIZ is always in slot 0, and is coupled with other flags. */
2675 current_annotation = "indices, point width, clip flags";
2676 emit_psiz_and_flags(hw_reg);
2677 break;
2678 case BRW_VARYING_SLOT_NDC:
2679 current_annotation = "NDC";
2680 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2681 break;
2682 case BRW_VARYING_SLOT_POS_DUPLICATE:
2683 case VARYING_SLOT_POS:
2684 current_annotation = "gl_Position";
2685 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2686 break;
2687 case VARYING_SLOT_CLIP_DIST0:
2688 case VARYING_SLOT_CLIP_DIST1:
2689 if (this->key->uses_clip_distance) {
2690 emit_generic_urb_slot(reg, varying);
2691 } else {
2692 current_annotation = "user clip distances";
2693 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2694 }
2695 break;
2696 case VARYING_SLOT_EDGE:
2697 /* This is present when doing unfilled polygons. We're supposed to copy
2698 * the edge flag from the user-provided vertex array
2699 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
2700 * of that attribute (starts as 1.0f). This is then used in clipping to
2701 * determine which edges should be drawn as wireframe.
2702 */
2703 current_annotation = "edge flag";
2704 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2705 glsl_type::float_type, WRITEMASK_XYZW))));
2706 break;
2707 case BRW_VARYING_SLOT_PAD:
2708 /* No need to write to this slot */
2709 break;
2710 default:
2711 emit_generic_urb_slot(reg, varying);
2712 break;
2713 }
2714 }
2715
2716 static int
2717 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2718 {
2719 struct intel_context *intel = &brw->intel;
2720
2721 if (intel->gen >= 6) {
2722 /* URB data written (does not include the message header reg) must
2723 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2724 * section 5.4.3.2.2: URB_INTERLEAVED.
2725 *
2726 * URB entries are allocated on a multiple of 1024 bits, so an
2727 * extra 128 bits written here to make the end align to 256 is
2728 * no problem.
2729 */
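/* Worked example (illustrative): mlen = 8 is one header register plus
 * 7 data registers; 7 is not a multiple of 2, so the check below bumps
 * mlen to 9 (8 data registers).  An already-odd mlen such as 9 is left
 * unchanged.
 */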
2730 if ((mlen % 2) != 1)
2731 mlen++;
2732 }
2733
2734 return mlen;
2735 }
2736
2737 void
2738 vec4_vs_visitor::emit_urb_write_header(int mrf)
2739 {
2740 /* No need to do anything for VS; an implied write to this MRF will be
2741 * performed by VS_OPCODE_URB_WRITE.
2742 */
2743 (void) mrf;
2744 }
2745
2746 vec4_instruction *
2747 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2748 {
2749 /* For VS, the URB writes end the thread. */
2750 if (complete) {
2751 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2752 emit_shader_time_end();
2753 }
2754
2755 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2756 inst->eot = complete;
2757
2758 return inst;
2759 }
2760
2761 /**
2762 * Generates the VUE payload plus the necessary URB write instructions to
2763 * output it.
2764 *
2765 * The VUE layout is documented in Volume 2a.
2766 */
2767 void
2768 vec4_visitor::emit_vertex()
2769 {
2770 /* MRF 0 is reserved for the debugger, so start with message header
2771 * in MRF 1.
2772 */
2773 int base_mrf = 1;
2774 int mrf = base_mrf;
2775 /* In the process of generating our URB write message contents, we
2776 * may need to unspill a register or load from an array. Those
2777 * reads would use MRFs 14-15.
2778 */
2779 int max_usable_mrf = 13;
2780
2781 /* The following assertion verifies that max_usable_mrf causes an
2782 * even-numbered amount of URB write data, which will meet gen6's
2783 * requirements for length alignment.
2784 */
2785 assert ((max_usable_mrf - base_mrf) % 2 == 0);
2786
2787 /* First mrf is the g0-based message header containing URB handles and
2788 * such.
2789 */
2790 emit_urb_write_header(mrf++);
2791
2792 if (intel->gen < 6) {
2793 emit_ndc_computation();
2794 }
2795
2796 /* Set up the VUE data for the first URB write */
2797 int slot;
2798 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2799 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2800
2801 /* If this was max_usable_mrf, we can't fit anything more into this URB
2802 * WRITE.
2803 */
2804 if (mrf > max_usable_mrf) {
2805 slot++;
2806 break;
2807 }
2808 }
2809
2810 bool complete = slot >= prog_data->vue_map.num_slots;
2811 current_annotation = "URB write";
2812 vec4_instruction *inst = emit_urb_write_opcode(complete);
2813 inst->base_mrf = base_mrf;
2814 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2815
2816 /* Optional second URB write */
2817 if (!complete) {
2818 mrf = base_mrf + 1;
2819
2820 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2821 assert(mrf < max_usable_mrf);
2822
2823 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2824 }
2825
2826 current_annotation = "URB write";
2827 inst = emit_urb_write_opcode(true /* complete */);
2828 inst->base_mrf = base_mrf;
2829 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2830 /* URB destination offset. In the previous write, we filled data MRFs
2831 * 2-13 (MRF 1 being the header), so 12 regs. URB offset is in
2832 * URB row increments, and each of our MRFs is half of one of
2833 * those, since we're doing interleaved writes.
2834 */
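/* Illustrative arithmetic: with base_mrf = 1 and max_usable_mrf = 13,
 * the first write covered 12 data MRFs, i.e. (13 - 1) / 2 = 6
 * interleaved URB rows, so the second write starts at offset 6.
 */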
2835 inst->offset = (max_usable_mrf - base_mrf) / 2;
2836 }
2837 }
2838
2839 void
2840 vec4_vs_visitor::emit_thread_end()
2841 {
2842 /* For VS, we always end the thread by emitting a single vertex.
2843 * emit_urb_write_opcode() will take care of setting the eot flag on the
2844 * SEND instruction.
2845 */
2846 emit_vertex();
2847 }
2848
2849 src_reg
2850 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2851 src_reg *reladdr, int reg_offset)
2852 {
2853 /* Because we store the values to scratch interleaved like our
2854 * vertex data, we need to scale the vec4 index by 2.
2855 */
2856 int message_header_scale = 2;
2857
2858 /* Pre-gen6, the message header uses byte offsets instead of vec4
2859 * (16-byte) offset units.
2860 */
2861 if (intel->gen < 6)
2862 message_header_scale *= 16;
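/* Illustrative example: reg_offset = 3 with no reladdr yields an
 * immediate offset of 3 * 2 = 6 on gen6+ (interleaved vec4 units), or
 * 3 * 2 * 16 = 96 on gen4/5 (bytes).
 */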
2863
2864 if (reladdr) {
2865 src_reg index = src_reg(this, glsl_type::int_type);
2866
2867 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2868 emit_before(inst, MUL(dst_reg(index),
2869 index, src_reg(message_header_scale)));
2870
2871 return index;
2872 } else {
2873 return src_reg(reg_offset * message_header_scale);
2874 }
2875 }
2876
2877 src_reg
2878 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2879 src_reg *reladdr, int reg_offset)
2880 {
2881 if (reladdr) {
2882 src_reg index = src_reg(this, glsl_type::int_type);
2883
2884 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2885
2886 /* Pre-gen6, the message header uses byte offsets instead of vec4
2887 * (16-byte) offset units.
2888 */
2889 if (intel->gen < 6) {
2890 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2891 }
2892
2893 return index;
2894 } else {
2895 int message_header_scale = intel->gen < 6 ? 16 : 1;
2896 return src_reg(reg_offset * message_header_scale);
2897 }
2898 }
2899
2900 /**
2901 * Emits an instruction before @inst to load the value named by @orig_src
2902 * from scratch space at @base_offset to @temp.
2903 *
2904 * @base_offset is measured in 32-byte units (the size of a register).
2905 */
2906 void
2907 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2908 dst_reg temp, src_reg orig_src,
2909 int base_offset)
2910 {
2911 int reg_offset = base_offset + orig_src.reg_offset;
2912 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2913
2914 emit_before(inst, SCRATCH_READ(temp, index));
2915 }
2916
2917 /**
2918 * Emits an instruction after @inst to store the value to be written
2919 * to @orig_dst to scratch space at @base_offset, from @temp.
2920 *
2921 * @base_offset is measured in 32-byte units (the size of a register).
2922 */
2923 void
2924 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2925 {
2926 int reg_offset = base_offset + inst->dst.reg_offset;
2927 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2928
2929 /* Create a temporary register to store *inst's result in.
2930 *
2931 * We have to be careful in MOVing from our temporary result register in
2932 * the scratch write. If we swizzle from channels of the temporary that
2933 * weren't initialized, it will confuse live interval analysis, which will
2934 * make spilling fail to make progress.
2935 */
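/* Worked example (illustrative): if inst->dst.writemask is XZ,
 * first_writemask_chan is X and the loop below produces the swizzle
 * .xxzx, so the scratch write never reads the uninitialized Y and W
 * channels of the temporary.
 */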
2936 src_reg temp = src_reg(this, glsl_type::vec4_type);
2937 temp.type = inst->dst.type;
2938 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2939 int swizzles[4];
2940 for (int i = 0; i < 4; i++)
2941 if (inst->dst.writemask & (1 << i))
2942 swizzles[i] = i;
2943 else
2944 swizzles[i] = first_writemask_chan;
2945 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2946 swizzles[2], swizzles[3]);
2947
2948 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2949 inst->dst.writemask));
2950 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2951 write->predicate = inst->predicate;
2952 write->ir = inst->ir;
2953 write->annotation = inst->annotation;
2954 inst->insert_after(write);
2955
2956 inst->dst.file = temp.file;
2957 inst->dst.reg = temp.reg;
2958 inst->dst.reg_offset = temp.reg_offset;
2959 inst->dst.reladdr = NULL;
2960 }
2961
2962 /**
2963 * We can't generally support array access in GRF space, because a
2964 * single instruction's destination can only span 2 contiguous
2965 * registers. So, we send all GRF arrays that get variable index
2966 * access to scratch space.
2967 */
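/* Illustrative example: GLSL along the lines of
 *
 *    float a[4];
 *    ...
 *    value = a[i];   // i not known at compile time
 *
 * leaves a reladdr on the virtual GRF holding "a", so this pass gives
 * that GRF a scratch location and turns each access into a
 * SCRATCH_READ or SCRATCH_WRITE through a fresh temporary.
 */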
2968 void
2969 vec4_visitor::move_grf_array_access_to_scratch()
2970 {
2971 int scratch_loc[this->virtual_grf_count];
2972
2973 for (int i = 0; i < this->virtual_grf_count; i++) {
2974 scratch_loc[i] = -1;
2975 }
2976
2977 /* First, calculate the set of virtual GRFs that need to be punted
2978 * to scratch due to having any array access on them, and where in
2979 * scratch.
2980 */
2981 foreach_list(node, &this->instructions) {
2982 vec4_instruction *inst = (vec4_instruction *)node;
2983
2984 if (inst->dst.file == GRF && inst->dst.reladdr &&
2985 scratch_loc[inst->dst.reg] == -1) {
2986 scratch_loc[inst->dst.reg] = c->last_scratch;
2987 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2988 }
2989
2990 for (int i = 0 ; i < 3; i++) {
2991 src_reg *src = &inst->src[i];
2992
2993 if (src->file == GRF && src->reladdr &&
2994 scratch_loc[src->reg] == -1) {
2995 scratch_loc[src->reg] = c->last_scratch;
2996 c->last_scratch += this->virtual_grf_sizes[src->reg];
2997 }
2998 }
2999 }
3000
3001 /* Now, for anything that will be accessed through scratch, rewrite
3002 * it to load/store. Note that this is a _safe list walk, because
3003 * we may generate a new scratch_write instruction after the one
3004 * we're processing.
3005 */
3006 foreach_list_safe(node, &this->instructions) {
3007 vec4_instruction *inst = (vec4_instruction *)node;
3008
3009 /* Set up the annotation tracking for new generated instructions. */
3010 base_ir = inst->ir;
3011 current_annotation = inst->annotation;
3012
3013 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3014 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
3015 }
3016
3017 for (int i = 0 ; i < 3; i++) {
3018 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3019 continue;
3020
3021 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3022
3023 emit_scratch_read(inst, temp, inst->src[i],
3024 scratch_loc[inst->src[i].reg]);
3025
3026 inst->src[i].file = temp.file;
3027 inst->src[i].reg = temp.reg;
3028 inst->src[i].reg_offset = temp.reg_offset;
3029 inst->src[i].reladdr = NULL;
3030 }
3031 }
3032 }
3033
3034 /**
3035 * Emits an instruction before @inst to load the value named by @orig_src
3036 * from the pull constant buffer (surface) at @base_offset to @temp.
3037 */
3038 void
3039 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
3040 dst_reg temp, src_reg orig_src,
3041 int base_offset)
3042 {
3043 int reg_offset = base_offset + orig_src.reg_offset;
3044 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
3045 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
3046 vec4_instruction *load;
3047
3048 if (intel->gen >= 7) {
3049 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3050 grf_offset.type = offset.type;
3051 emit_before(inst, MOV(grf_offset, offset));
3052
3053 load = new(mem_ctx) vec4_instruction(this,
3054 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3055 temp, index, src_reg(grf_offset));
3056 } else {
3057 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3058 temp, index, offset);
3059 load->base_mrf = 14;
3060 load->mlen = 1;
3061 }
3062 emit_before(inst, load);
3063 }
3064
3065 /**
3066 * Implements array access of uniforms by inserting a
3067 * PULL_CONSTANT_LOAD instruction.
3068 *
3069 * Unlike temporary GRF array access (which we don't support due to
3070 * the difficulty of doing relative addressing on instruction
3071 * destinations), we could potentially do array access of uniforms
3072 * that were loaded in GRF space as push constants. In real-world
3073 * usage we've seen, though, the arrays being used are always larger
3074 * than we could load as push constants, so just always move all
3075 * uniform array access out to a pull constant buffer.
3076 */
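/* Illustrative example: a shader containing
 *
 *    uniform vec4 colors[32];
 *    ...
 *    c = colors[i];   // i not known at compile time
 *
 * gets the array's storage appended to prog_data->pull_param and the
 * access rewritten into a pull constant load into a temporary GRF.
 */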
3077 void
3078 vec4_visitor::move_uniform_array_access_to_pull_constants()
3079 {
3080 int pull_constant_loc[this->uniforms];
3081
3082 for (int i = 0; i < this->uniforms; i++) {
3083 pull_constant_loc[i] = -1;
3084 }
3085
3086 /* Walk through and find array access of uniforms. Put a copy of that
3087 * uniform in the pull constant buffer.
3088 *
3089 * Note that we don't move constant-indexed accesses to arrays. No
3090 * testing has been done of the performance impact of this choice.
3091 */
3092 foreach_list_safe(node, &this->instructions) {
3093 vec4_instruction *inst = (vec4_instruction *)node;
3094
3095 for (int i = 0 ; i < 3; i++) {
3096 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3097 continue;
3098
3099 int uniform = inst->src[i].reg;
3100
3101 /* If this array isn't already present in the pull constant buffer,
3102 * add it.
3103 */
3104 if (pull_constant_loc[uniform] == -1) {
3105 const float **values = &prog_data->param[uniform * 4];
3106
3107 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3108
3109 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3110 prog_data->pull_param[prog_data->nr_pull_params++]
3111 = values[j];
3112 }
3113 }
3114
3115 /* Set up the annotation tracking for new generated instructions. */
3116 base_ir = inst->ir;
3117 current_annotation = inst->annotation;
3118
3119 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3120
3121 emit_pull_constant_load(inst, temp, inst->src[i],
3122 pull_constant_loc[uniform]);
3123
3124 inst->src[i].file = temp.file;
3125 inst->src[i].reg = temp.reg;
3126 inst->src[i].reg_offset = temp.reg_offset;
3127 inst->src[i].reladdr = NULL;
3128 }
3129 }
3130
3131 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3132 * no need to track them as larger-than-vec4 objects. This will be
3133 * relied on in cutting out unused uniform vectors from push
3134 * constants.
3135 */
3136 split_uniform_registers();
3137 }
3138
3139 void
3140 vec4_visitor::resolve_ud_negate(src_reg *reg)
3141 {
3142 if (reg->type != BRW_REGISTER_TYPE_UD ||
3143 !reg->negate)
3144 return;
3145
3146 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3147 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3148 *reg = temp;
3149 }
3150
3151 vec4_visitor::vec4_visitor(struct brw_context *brw,
3152 struct brw_vec4_compile *c,
3153 struct gl_program *prog,
3154 const struct brw_vec4_prog_key *key,
3155 struct brw_vec4_prog_data *prog_data,
3156 struct gl_shader_program *shader_prog,
3157 struct brw_shader *shader,
3158 void *mem_ctx,
3159 bool debug_flag)
3160 : debug_flag(debug_flag)
3161 {
3162 this->brw = brw;
3163 this->intel = &brw->intel;
3164 this->ctx = &intel->ctx;
3165 this->shader_prog = shader_prog;
3166 this->shader = shader;
3167
3168 this->mem_ctx = mem_ctx;
3169 this->failed = false;
3170
3171 this->base_ir = NULL;
3172 this->current_annotation = NULL;
3173 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3174
3175 this->c = c;
3176 this->prog = prog;
3177 this->key = key;
3178 this->prog_data = prog_data;
3179
3180 this->variable_ht = hash_table_ctor(0,
3181 hash_table_pointer_hash,
3182 hash_table_pointer_compare);
3183
3184 this->virtual_grf_start = NULL;
3185 this->virtual_grf_end = NULL;
3186 this->virtual_grf_sizes = NULL;
3187 this->virtual_grf_count = 0;
3188 this->virtual_grf_reg_map = NULL;
3189 this->virtual_grf_reg_count = 0;
3190 this->virtual_grf_array_size = 0;
3191 this->live_intervals_valid = false;
3192
3193 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3194
3195 this->uniforms = 0;
3196 }
3197
3198 vec4_visitor::~vec4_visitor()
3199 {
3200 hash_table_dtor(this->variable_ht);
3201 }
3202
3203
3204 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3205 struct brw_vs_compile *vs_compile,
3206 struct brw_vs_prog_data *vs_prog_data,
3207 struct gl_shader_program *prog,
3208 struct brw_shader *shader,
3209 void *mem_ctx)
3210 : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3211 &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3212 mem_ctx, INTEL_DEBUG & DEBUG_VS),
3213 vs_compile(vs_compile),
3214 vs_prog_data(vs_prog_data)
3215 {
3216 }
3217
3218
3219 void
3220 vec4_visitor::fail(const char *format, ...)
3221 {
3222 va_list va;
3223 char *msg;
3224
3225 if (failed)
3226 return;
3227
3228 failed = true;
3229
3230 va_start(va, format);
3231 msg = ralloc_vasprintf(mem_ctx, format, va);
3232 va_end(va);
3233 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3234
3235 this->fail_msg = msg;
3236
3237 if (debug_flag) {
3238 fprintf(stderr, "%s", msg);
3239 }
3240 }
3241
3242 } /* namespace brw */