i965/vs: Add a function to fix-up uniform arguments for 3-src insts.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* original gen4 does type conversion to the destination type
172 * before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_3src_operand(src_reg src)
228 {
229 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
230 * able to use vertical stride of zero to replicate the vec4 uniform, like
231 *
232 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
233 *
234 * But you can't, since vertical stride is always four in three-source
235 * instructions. Instead, insert a MOV instruction to do the replication so
236 * that the three-source instruction can consume it.
237 */
238
239 /* The MOV is only needed if the source is a uniform or immediate. */
240 if (src.file != UNIFORM && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
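/* As a sketch of what this fix-up buys a caller (the MAD below is only an
 * illustrative three-source opcode, not code emitted in this function):
 * instead of the illegal
 *
 *    mad dst, src0, u1<0;4,1>:f, src2
 *
 * one would write
 *
 *    src_reg b = fix_3src_operand(uniform_b);  // MOV tmp, uniform_b
 *    emit(BRW_OPCODE_MAD, dst, src0, b, src2); // MAD reads tmp, a plain GRF
 *
 * so the three-source instruction never sees a uniform or immediate operand.
 */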
248
249 src_reg
250 vec4_visitor::fix_math_operand(src_reg src)
251 {
252 /* The gen6 math instruction ignores the source modifiers --
253 * swizzle, abs, negate, and at least some parts of the register
254 * region description.
255 *
256 * Rather than trying to enumerate all these cases, *always* expand the
257 * operand to a temp GRF for gen6.
258 *
259 * For gen7, keep the operand as-is, except if immediate, which gen7 still
260 * can't use.
261 */
262
263 if (intel->gen == 7 && src.file != IMM)
264 return src;
265
266 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
267 expanded.type = src.type;
268 emit(MOV(expanded, src));
269 return src_reg(expanded);
270 }
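/* Concretely: on gen6 every math operand is first copied into a fresh
 * temporary, so the math instruction never sees a swizzle, negate or
 * immediate directly; on gen7 only an immediate operand gets that extra MOV,
 * and GRF or uniform sources pass through unchanged.
 */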
271
272 void
273 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
274 {
275 src = fix_math_operand(src);
276
277 if (dst.writemask != WRITEMASK_XYZW) {
278 /* The gen6 math instruction must be align1, so we can't do
279 * writemasks.
280 */
281 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
282
283 emit(opcode, temp_dst, src);
284
285 emit(MOV(dst, src_reg(temp_dst)));
286 } else {
287 emit(opcode, dst, src);
288 }
289 }
290
291 void
292 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
293 {
294 vec4_instruction *inst = emit(opcode, dst, src);
295 inst->base_mrf = 1;
296 inst->mlen = 1;
297 }
298
299 void
300 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
301 {
302 switch (opcode) {
303 case SHADER_OPCODE_RCP:
304 case SHADER_OPCODE_RSQ:
305 case SHADER_OPCODE_SQRT:
306 case SHADER_OPCODE_EXP2:
307 case SHADER_OPCODE_LOG2:
308 case SHADER_OPCODE_SIN:
309 case SHADER_OPCODE_COS:
310 break;
311 default:
312 assert(!"not reached: bad math opcode");
313 return;
314 }
315
316 if (intel->gen >= 6) {
317 return emit_math1_gen6(opcode, dst, src);
318 } else {
319 return emit_math1_gen4(opcode, dst, src);
320 }
321 }
322
323 void
324 vec4_visitor::emit_math2_gen6(enum opcode opcode,
325 dst_reg dst, src_reg src0, src_reg src1)
326 {
327 src0 = fix_math_operand(src0);
328 src1 = fix_math_operand(src1);
329
330 if (dst.writemask != WRITEMASK_XYZW) {
331 /* The gen6 math instruction must be align1, so we can't do
332 * writemasks.
333 */
334 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
335 temp_dst.type = dst.type;
336
337 emit(opcode, temp_dst, src0, src1);
338
339 emit(MOV(dst, src_reg(temp_dst)));
340 } else {
341 emit(opcode, dst, src0, src1);
342 }
343 }
344
345 void
346 vec4_visitor::emit_math2_gen4(enum opcode opcode,
347 dst_reg dst, src_reg src0, src_reg src1)
348 {
349 vec4_instruction *inst = emit(opcode, dst, src0, src1);
350 inst->base_mrf = 1;
351 inst->mlen = 2;
352 }
353
354 void
355 vec4_visitor::emit_math(enum opcode opcode,
356 dst_reg dst, src_reg src0, src_reg src1)
357 {
358 switch (opcode) {
359 case SHADER_OPCODE_POW:
360 case SHADER_OPCODE_INT_QUOTIENT:
361 case SHADER_OPCODE_INT_REMAINDER:
362 break;
363 default:
364 assert(!"not reached: unsupported binary math opcode");
365 return;
366 }
367
368 if (intel->gen >= 6) {
369 return emit_math2_gen6(opcode, dst, src0, src1);
370 } else {
371 return emit_math2_gen4(opcode, dst, src0, src1);
372 }
373 }
374
375 void
376 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
377 {
378 if (intel->gen < 7)
379 assert(!"ir_unop_pack_half_2x16 should be lowered");
380
381 assert(dst.type == BRW_REGISTER_TYPE_UD);
382 assert(src0.type == BRW_REGISTER_TYPE_F);
383
384 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
385 *
386 * Because this instruction does not have a 16-bit floating-point type,
387 * the destination data type must be Word (W).
388 *
389 * The destination must be DWord-aligned and specify a horizontal stride
390 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
391 * each destination channel and the upper word is not modified.
392 *
393 * The above restriction implies that the f32to16 instruction must use
394 * align1 mode, because only in align1 mode is it possible to specify
395 * horizontal stride. We choose here to defy the hardware docs and emit
396 * align16 instructions.
397 *
398 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
399 * instructions. I was partially successful in that the code passed all
400 * tests. However, the code was dubiously correct and fragile, and the
401 * tests were not harsh enough to probe that frailty. Not trusting the
402 * code, I chose instead to remain in align16 mode in defiance of the hw
403 * docs).
404 *
405 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
406 * simulator, emitting a f32to16 in align16 mode with UD as destination
407 * data type is safe. The behavior differs from that specified in the PRM
408 * in that the upper word of each destination channel is cleared to 0.
409 */
410
411 dst_reg tmp_dst(this, glsl_type::uvec2_type);
412 src_reg tmp_src(tmp_dst);
413
414 #if 0
415 /* Verify the undocumented behavior on which the following instructions
416 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
417 * then the result of the bit-or instruction below will be incorrect.
418 *
419 * You should inspect the disasm output in order to verify that the MOV is
420 * not optimized away.
421 */
422 emit(MOV(tmp_dst, src_reg(0x12345678u)));
423 #endif
424
425 /* Give tmp the form below, where "." means untouched.
426 *
427 * w z y x w z y x
428 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
429 *
430 * That the upper word of each write-channel be 0 is required for the
431 * following bit-shift and bit-or instructions to work. Note that this
432 * relies on the undocumented hardware behavior mentioned above.
433 */
434 tmp_dst.writemask = WRITEMASK_XY;
435 emit(F32TO16(tmp_dst, src0));
436
437 /* Give the write-channels of dst the form:
438 * 0xhhhh0000
439 */
440 tmp_src.swizzle = SWIZZLE_Y;
441 emit(SHL(dst, tmp_src, src_reg(16u)));
442
443 /* Finally, give the write-channels of dst the form of packHalf2x16's
444 * output:
445 * 0xhhhhllll
446 */
447 tmp_src.swizzle = SWIZZLE_X;
448 emit(OR(dst, src_reg(dst), tmp_src));
449 }
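/* Worked example of the packing above, with values chosen purely for
 * illustration: if src0.x half-converts to 0x3c00 (1.0) and src0.y to 0xc000
 * (-2.0), the F32TO16 leaves tmp = |.|.|0x0000c000|0x00003c00|, the SHL of
 * tmp.y by 16 writes 0xc0000000 into dst, and the final OR with tmp.x yields
 * 0xc0003c00, which matches packHalf2x16(vec2(1.0, -2.0)).
 */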
450
451 void
452 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
453 {
454 if (intel->gen < 7)
455 assert(!"ir_unop_unpack_half_2x16 should be lowered");
456
457 assert(dst.type == BRW_REGISTER_TYPE_F);
458 assert(src0.type == BRW_REGISTER_TYPE_UD);
459
460 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
461 *
462 * Because this instruction does not have a 16-bit floating-point type,
463 * the source data type must be Word (W). The destination type must be
464 * F (Float).
465 *
466 * To use W as the source data type, we must adjust horizontal strides,
467 * which is only possible in align1 mode. All my [chadv] attempts at
468 * emitting align1 instructions for unpackHalf2x16 failed to pass the
469 * Piglit tests, so I gave up.
470 *
471 * I've verified that, on gen7 hardware and the simulator, it is safe to
472 * emit f16to32 in align16 mode with UD as source data type.
473 */
474
475 dst_reg tmp_dst(this, glsl_type::uvec2_type);
476 src_reg tmp_src(tmp_dst);
477
478 tmp_dst.writemask = WRITEMASK_X;
479 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
480
481 tmp_dst.writemask = WRITEMASK_Y;
482 emit(SHR(tmp_dst, src0, src_reg(16u)));
483
484 dst.writemask = WRITEMASK_XY;
485 emit(F16TO32(dst, tmp_src));
486 }
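/* Worked example of the unpacking above, again with illustrative values: for
 * src0 = 0xc0003c00 the AND leaves tmp.x = 0x00003c00, the SHR leaves
 * tmp.y = 0x0000c000, and F16TO32 expands those to dst.xy = (1.0, -2.0),
 * matching unpackHalf2x16(0xc0003c00).
 */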
487
488 void
489 vec4_visitor::visit_instructions(const exec_list *list)
490 {
491 foreach_list(node, list) {
492 ir_instruction *ir = (ir_instruction *)node;
493
494 base_ir = ir;
495 ir->accept(this);
496 }
497 }
498
499
500 static int
501 type_size(const struct glsl_type *type)
502 {
503 unsigned int i;
504 int size;
505
506 switch (type->base_type) {
507 case GLSL_TYPE_UINT:
508 case GLSL_TYPE_INT:
509 case GLSL_TYPE_FLOAT:
510 case GLSL_TYPE_BOOL:
511 if (type->is_matrix()) {
512 return type->matrix_columns;
513 } else {
514 /* Regardless of the size of the vector, it gets a vec4. This is bad
515 * packing for things like floats, but otherwise arrays become a
516 * mess. Hopefully a later pass over the code can pack scalars
517 * down if appropriate.
518 */
519 return 1;
520 }
521 case GLSL_TYPE_ARRAY:
522 assert(type->length > 0);
523 return type_size(type->fields.array) * type->length;
524 case GLSL_TYPE_STRUCT:
525 size = 0;
526 for (i = 0; i < type->length; i++) {
527 size += type_size(type->fields.structure[i].type);
528 }
529 return size;
530 case GLSL_TYPE_SAMPLER:
531 /* Samplers take up one slot in UNIFORMS[], but they're baked in
532 * at link time.
533 */
534 return 1;
535 case GLSL_TYPE_VOID:
536 case GLSL_TYPE_ERROR:
537 case GLSL_TYPE_INTERFACE:
538 assert(0);
539 break;
540 }
541
542 return 0;
543 }
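/* For example, under the rules above: a float or a vec3 takes one slot, a
 * mat4 takes four, a vec2[3] array takes three, and a struct holding a vec4
 * plus a float[2] takes 1 + 2 = 3 slots, each slot being one vec4 of uniform
 * or register space.
 */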
544
545 int
546 vec4_visitor::virtual_grf_alloc(int size)
547 {
548 if (virtual_grf_array_size <= virtual_grf_count) {
549 if (virtual_grf_array_size == 0)
550 virtual_grf_array_size = 16;
551 else
552 virtual_grf_array_size *= 2;
553 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
554 virtual_grf_array_size);
555 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
556 virtual_grf_array_size);
557 }
558 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
559 virtual_grf_reg_count += size;
560 virtual_grf_sizes[virtual_grf_count] = size;
561 return virtual_grf_count++;
562 }
563
564 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
565 {
566 init();
567
568 this->file = GRF;
569 this->reg = v->virtual_grf_alloc(type_size(type));
570
571 if (type->is_array() || type->is_record()) {
572 this->swizzle = BRW_SWIZZLE_NOOP;
573 } else {
574 this->swizzle = swizzle_for_size(type->vector_elements);
575 }
576
577 this->type = brw_type_for_base_type(type);
578 }
579
580 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
581 {
582 init();
583
584 this->file = GRF;
585 this->reg = v->virtual_grf_alloc(type_size(type));
586
587 if (type->is_array() || type->is_record()) {
588 this->writemask = WRITEMASK_XYZW;
589 } else {
590 this->writemask = (1 << type->vector_elements) - 1;
591 }
592
593 this->type = brw_type_for_base_type(type);
594 }
595
596 /* Our support for uniforms is piggy-backed on the struct
597 * gl_fragment_program, because that's where the values actually
598 * get stored, rather than in some global gl_shader_program uniform
599 * store.
600 */
601 void
602 vec4_visitor::setup_uniform_values(ir_variable *ir)
603 {
604 int namelen = strlen(ir->name);
605
606 /* The data for our (non-builtin) uniforms is stored in a series of
607 * gl_uniform_driver_storage structs for each subcomponent that
608 * glGetUniformLocation() could name. We know it's been set up in the same
609 * order we'd walk the type, so walk the list of storage and find anything
610 * with our name, or the prefix of a component that starts with our name.
611 */
612 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
613 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
614
615 if (strncmp(ir->name, storage->name, namelen) != 0 ||
616 (storage->name[namelen] != 0 &&
617 storage->name[namelen] != '.' &&
618 storage->name[namelen] != '[')) {
619 continue;
620 }
621
622 gl_constant_value *components = storage->storage;
623 unsigned vector_count = (MAX2(storage->array_elements, 1) *
624 storage->type->matrix_columns);
625
626 for (unsigned s = 0; s < vector_count; s++) {
627 uniform_vector_size[uniforms] = storage->type->vector_elements;
628
629 int i;
630 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
631 prog_data->param[uniforms * 4 + i] = &components->f;
632 components++;
633 }
634 for (; i < 4; i++) {
635 static float zero = 0;
636 prog_data->param[uniforms * 4 + i] = &zero;
637 }
638
639 uniforms++;
640 }
641 }
642 }
643
644 void
645 vec4_visitor::setup_uniform_clipplane_values()
646 {
647 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
648
649 if (intel->gen < 6) {
650 /* Pre-Gen6, we compact clip planes. For example, if the user
651 * enables just clip planes 0, 1, and 3, we will enable clip planes
652 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
653 * plane 2. This simplifies the implementation of the Gen6 clip
654 * thread.
655 */
656 int compacted_clipplane_index = 0;
657 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
658 if (!(key->userclip_planes_enabled_gen_4_5 & (1 << i)))
659 continue;
660
661 this->uniform_vector_size[this->uniforms] = 4;
662 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
663 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
664 for (int j = 0; j < 4; ++j) {
665 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
666 }
667 ++compacted_clipplane_index;
668 ++this->uniforms;
669 }
670 } else {
671 /* In Gen6 and later, we don't compact clip planes, because this
672 * simplifies the implementation of gl_ClipDistance.
673 */
674 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
675 this->uniform_vector_size[this->uniforms] = 4;
676 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
677 this->userplane[i].type = BRW_REGISTER_TYPE_F;
678 for (int j = 0; j < 4; ++j) {
679 prog_data->param[this->uniforms * 4 + j] = &clip_planes[i][j];
680 }
681 ++this->uniforms;
682 }
683 }
684 }
685
686 /* Our support for builtin uniforms is even scarier than non-builtin.
687 * It sits on top of the PROG_STATE_VAR parameters that are
688 * automatically updated from GL context state.
689 */
690 void
691 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
692 {
693 const ir_state_slot *const slots = ir->state_slots;
694 assert(ir->state_slots != NULL);
695
696 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
697 /* This state reference has already been setup by ir_to_mesa,
698 * but we'll get the same index back here. We can reference
699 * ParameterValues directly, since unlike brw_fs.cpp, we never
700 * add new state references during compile.
701 */
702 int index = _mesa_add_state_reference(this->prog->Parameters,
703 (gl_state_index *)slots[i].tokens);
704 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
705
706 this->uniform_vector_size[this->uniforms] = 0;
707 /* Add each of the unique swizzled channels of the element.
708 * This will end up matching the size of the glsl_type of this field.
709 */
710 int last_swiz = -1;
711 for (unsigned int j = 0; j < 4; j++) {
712 int swiz = GET_SWZ(slots[i].swizzle, j);
713 last_swiz = swiz;
714
715 prog_data->param[this->uniforms * 4 + j] = &values[swiz];
716 if (swiz <= last_swiz)
717 this->uniform_vector_size[this->uniforms]++;
718 }
719 this->uniforms++;
720 }
721 }
722
723 dst_reg *
724 vec4_visitor::variable_storage(ir_variable *var)
725 {
726 return (dst_reg *)hash_table_find(this->variable_ht, var);
727 }
728
729 void
730 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
731 {
732 ir_expression *expr = ir->as_expression();
733
734 *predicate = BRW_PREDICATE_NORMAL;
735
736 if (expr) {
737 src_reg op[2];
738 vec4_instruction *inst;
739
740 assert(expr->get_num_operands() <= 2);
741 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
742 expr->operands[i]->accept(this);
743 op[i] = this->result;
744
745 resolve_ud_negate(&op[i]);
746 }
747
748 switch (expr->operation) {
749 case ir_unop_logic_not:
750 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
751 inst->conditional_mod = BRW_CONDITIONAL_Z;
752 break;
753
754 case ir_binop_logic_xor:
755 inst = emit(XOR(dst_null_d(), op[0], op[1]));
756 inst->conditional_mod = BRW_CONDITIONAL_NZ;
757 break;
758
759 case ir_binop_logic_or:
760 inst = emit(OR(dst_null_d(), op[0], op[1]));
761 inst->conditional_mod = BRW_CONDITIONAL_NZ;
762 break;
763
764 case ir_binop_logic_and:
765 inst = emit(AND(dst_null_d(), op[0], op[1]));
766 inst->conditional_mod = BRW_CONDITIONAL_NZ;
767 break;
768
769 case ir_unop_f2b:
770 if (intel->gen >= 6) {
771 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
772 } else {
773 inst = emit(MOV(dst_null_f(), op[0]));
774 inst->conditional_mod = BRW_CONDITIONAL_NZ;
775 }
776 break;
777
778 case ir_unop_i2b:
779 if (intel->gen >= 6) {
780 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
781 } else {
782 inst = emit(MOV(dst_null_d(), op[0]));
783 inst->conditional_mod = BRW_CONDITIONAL_NZ;
784 }
785 break;
786
787 case ir_binop_all_equal:
788 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
789 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
790 break;
791
792 case ir_binop_any_nequal:
793 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
794 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
795 break;
796
797 case ir_unop_any:
798 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
799 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
800 break;
801
802 case ir_binop_greater:
803 case ir_binop_gequal:
804 case ir_binop_less:
805 case ir_binop_lequal:
806 case ir_binop_equal:
807 case ir_binop_nequal:
808 emit(CMP(dst_null_d(), op[0], op[1],
809 brw_conditional_for_comparison(expr->operation)));
810 break;
811
812 default:
813 assert(!"not reached");
814 break;
815 }
816 return;
817 }
818
819 ir->accept(this);
820
821 resolve_ud_negate(&this->result);
822
823 if (intel->gen >= 6) {
824 vec4_instruction *inst = emit(AND(dst_null_d(),
825 this->result, src_reg(1)));
826 inst->conditional_mod = BRW_CONDITIONAL_NZ;
827 } else {
828 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
829 inst->conditional_mod = BRW_CONDITIONAL_NZ;
830 }
831 }
832
833 /**
834 * Emit a gen6 IF statement with the comparison folded into the IF
835 * instruction.
836 */
837 void
838 vec4_visitor::emit_if_gen6(ir_if *ir)
839 {
840 ir_expression *expr = ir->condition->as_expression();
841
842 if (expr) {
843 src_reg op[2];
844 dst_reg temp;
845
846 assert(expr->get_num_operands() <= 2);
847 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
848 expr->operands[i]->accept(this);
849 op[i] = this->result;
850 }
851
852 switch (expr->operation) {
853 case ir_unop_logic_not:
854 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
855 return;
856
857 case ir_binop_logic_xor:
858 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
859 return;
860
861 case ir_binop_logic_or:
862 temp = dst_reg(this, glsl_type::bool_type);
863 emit(OR(temp, op[0], op[1]));
864 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
865 return;
866
867 case ir_binop_logic_and:
868 temp = dst_reg(this, glsl_type::bool_type);
869 emit(AND(temp, op[0], op[1]));
870 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
871 return;
872
873 case ir_unop_f2b:
874 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
875 return;
876
877 case ir_unop_i2b:
878 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
879 return;
880
881 case ir_binop_greater:
882 case ir_binop_gequal:
883 case ir_binop_less:
884 case ir_binop_lequal:
885 case ir_binop_equal:
886 case ir_binop_nequal:
887 emit(IF(op[0], op[1],
888 brw_conditional_for_comparison(expr->operation)));
889 return;
890
891 case ir_binop_all_equal:
892 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
893 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
894 return;
895
896 case ir_binop_any_nequal:
897 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
898 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
899 return;
900
901 case ir_unop_any:
902 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
903 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
904 return;
905
906 default:
907 assert(!"not reached");
908 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
909 return;
910 }
911 return;
912 }
913
914 ir->condition->accept(this);
915
916 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
917 }
918
919 static dst_reg
920 with_writemask(dst_reg const & r, int mask)
921 {
922 dst_reg result = r;
923 result.writemask = mask;
924 return result;
925 }
926
927 void
928 vec4_vs_visitor::emit_prolog()
929 {
930 dst_reg sign_recovery_shift;
931 dst_reg normalize_factor;
932 dst_reg es3_normalize_factor;
933
934 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
935 if (vs_prog_data->inputs_read & BITFIELD64_BIT(i)) {
936 uint8_t wa_flags = vs_compile->key.gl_attrib_wa_flags[i];
937 dst_reg reg(ATTR, i);
938 dst_reg reg_d = reg;
939 reg_d.type = BRW_REGISTER_TYPE_D;
940 dst_reg reg_ud = reg;
941 reg_ud.type = BRW_REGISTER_TYPE_UD;
942
943 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
944 * come in as floating point conversions of the integer values.
945 */
946 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
947 dst_reg dst = reg;
948 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
949 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
950 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
951 }
952
953 /* Do sign recovery for 2101010 formats if required. */
954 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
955 if (sign_recovery_shift.file == BAD_FILE) {
956 /* shift constant: <22,22,22,30> */
957 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
958 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
959 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
960 }
961
962 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
963 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
964 }
965
966 /* Apply BGRA swizzle if required. */
967 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
968 src_reg temp = src_reg(reg);
969 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
970 emit(MOV(reg, temp));
971 }
972
973 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
974 /* ES 3.0 has different rules for converting signed normalized
975 * fixed-point numbers than desktop GL.
976 */
977 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
978 /* According to equation 2.2 of the ES 3.0 specification,
979 * signed normalization conversion is done by:
980 *
981 * f = c / (2^(b-1)-1)
982 */
983 if (es3_normalize_factor.file == BAD_FILE) {
984 /* mul constant: 1 / (2^(b-1) - 1) */
985 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
986 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
987 src_reg(1.0f / ((1<<9) - 1))));
988 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
989 src_reg(1.0f / ((1<<1) - 1))));
990 }
991
992 dst_reg dst = reg;
993 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
994 emit(MOV(dst, src_reg(reg_d)));
995 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
996 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
997 } else {
998 /* The following equations are from the OpenGL 3.2 specification:
999 *
1000 * 2.1 unsigned normalization
1001 * f = c/(2^n-1)
1002 *
1003 * 2.2 signed normalization
1004 * f = (2c+1)/(2^n-1)
1005 *
1006 * Both of these share a common divisor, which is represented by
1007 * "normalize_factor" in the code below.
1008 */
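/* A quick numeric check of those formulas, with an illustrative value: for a
 * 10-bit signed component c = -256, desktop GL gives
 * f = (2*(-256) + 1) / (2^10 - 1) = -511/1023, roughly -0.4995, which is
 * exactly what the 2c+1 adjustment and the 1/(2^10 - 1) normalize_factor
 * below compute; the 2-bit W component uses 1/(2^2 - 1) = 1/3 instead.
 */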
1009 if (normalize_factor.file == BAD_FILE) {
1010 /* 1 / (2^b - 1) for b=<10,10,10,2> */
1011 normalize_factor = dst_reg(this, glsl_type::vec4_type);
1012 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
1013 src_reg(1.0f / ((1<<10) - 1))));
1014 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
1015 src_reg(1.0f / ((1<<2) - 1))));
1016 }
1017
1018 dst_reg dst = reg;
1019 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1020 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1021
1022 /* For signed normalization, we want the numerator to be 2c+1. */
1023 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1024 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1025 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1026 }
1027
1028 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1029 }
1030 }
1031
1032 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1033 dst_reg dst = reg;
1034 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1035 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1036 }
1037 }
1038 }
1039 }
1040
1041
1042 dst_reg *
1043 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1044 {
1045 /* VertexID is stored by the VF as the last vertex element, but
1046 * we don't represent it with a flag in inputs_read, so we call
1047 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1048 */
1049 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1050 vs_prog_data->uses_vertexid = true;
1051
1052 switch (ir->location) {
1053 case SYSTEM_VALUE_VERTEX_ID:
1054 reg->writemask = WRITEMASK_X;
1055 break;
1056 case SYSTEM_VALUE_INSTANCE_ID:
1057 reg->writemask = WRITEMASK_Y;
1058 break;
1059 default:
1060 assert(!"not reached");
1061 break;
1062 }
1063
1064 return reg;
1065 }
1066
1067
1068 void
1069 vec4_visitor::visit(ir_variable *ir)
1070 {
1071 dst_reg *reg = NULL;
1072
1073 if (variable_storage(ir))
1074 return;
1075
1076 switch (ir->mode) {
1077 case ir_var_shader_in:
1078 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1079 break;
1080
1081 case ir_var_shader_out:
1082 reg = new(mem_ctx) dst_reg(this, ir->type);
1083
1084 for (int i = 0; i < type_size(ir->type); i++) {
1085 output_reg[ir->location + i] = *reg;
1086 output_reg[ir->location + i].reg_offset = i;
1087 output_reg[ir->location + i].type =
1088 brw_type_for_base_type(ir->type->get_scalar_type());
1089 output_reg_annotation[ir->location + i] = ir->name;
1090 }
1091 break;
1092
1093 case ir_var_auto:
1094 case ir_var_temporary:
1095 reg = new(mem_ctx) dst_reg(this, ir->type);
1096 break;
1097
1098 case ir_var_uniform:
1099 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1100
1101 /* Thanks to the lower_ubo_reference pass, we will see only
1102 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1103 * variables, so no need for them to be in variable_ht.
1104 */
1105 if (ir->is_in_uniform_block())
1106 return;
1107
1108 /* Track how big the whole uniform variable is, in case we need to put a
1109 * copy of its data into pull constants for array access.
1110 */
1111 this->uniform_size[this->uniforms] = type_size(ir->type);
1112
1113 if (!strncmp(ir->name, "gl_", 3)) {
1114 setup_builtin_uniform_values(ir);
1115 } else {
1116 setup_uniform_values(ir);
1117 }
1118 break;
1119
1120 case ir_var_system_value:
1121 reg = make_reg_for_system_value(ir);
1122 break;
1123
1124 default:
1125 assert(!"not reached");
1126 }
1127
1128 reg->type = brw_type_for_base_type(ir->type);
1129 hash_table_insert(this->variable_ht, reg, ir);
1130 }
1131
1132 void
1133 vec4_visitor::visit(ir_loop *ir)
1134 {
1135 dst_reg counter;
1136
1137 /* We don't want debugging output to print the whole body of the
1138 * loop as the annotation.
1139 */
1140 this->base_ir = NULL;
1141
1142 if (ir->counter != NULL) {
1143 this->base_ir = ir->counter;
1144 ir->counter->accept(this);
1145 counter = *(variable_storage(ir->counter));
1146
1147 if (ir->from != NULL) {
1148 this->base_ir = ir->from;
1149 ir->from->accept(this);
1150
1151 emit(MOV(counter, this->result));
1152 }
1153 }
1154
1155 emit(BRW_OPCODE_DO);
1156
1157 if (ir->to) {
1158 this->base_ir = ir->to;
1159 ir->to->accept(this);
1160
1161 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1162 brw_conditional_for_comparison(ir->cmp)));
1163
1164 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1165 inst->predicate = BRW_PREDICATE_NORMAL;
1166 }
1167
1168 visit_instructions(&ir->body_instructions);
1169
1170
1171 if (ir->increment) {
1172 this->base_ir = ir->increment;
1173 ir->increment->accept(this);
1174 emit(ADD(counter, src_reg(counter), this->result));
1175 }
1176
1177 emit(BRW_OPCODE_WHILE);
1178 }
1179
1180 void
1181 vec4_visitor::visit(ir_loop_jump *ir)
1182 {
1183 switch (ir->mode) {
1184 case ir_loop_jump::jump_break:
1185 emit(BRW_OPCODE_BREAK);
1186 break;
1187 case ir_loop_jump::jump_continue:
1188 emit(BRW_OPCODE_CONTINUE);
1189 break;
1190 }
1191 }
1192
1193
1194 void
1195 vec4_visitor::visit(ir_function_signature *ir)
1196 {
1197 assert(0);
1198 (void)ir;
1199 }
1200
1201 void
1202 vec4_visitor::visit(ir_function *ir)
1203 {
1204 /* Ignore function bodies other than main() -- we shouldn't see calls to
1205 * them since they should all be inlined.
1206 */
1207 if (strcmp(ir->name, "main") == 0) {
1208 const ir_function_signature *sig;
1209 exec_list empty;
1210
1211 sig = ir->matching_signature(&empty);
1212
1213 assert(sig);
1214
1215 visit_instructions(&sig->body);
1216 }
1217 }
1218
1219 bool
1220 vec4_visitor::try_emit_sat(ir_expression *ir)
1221 {
1222 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1223 if (!sat_src)
1224 return false;
1225
1226 sat_src->accept(this);
1227 src_reg src = this->result;
1228
1229 this->result = src_reg(this, ir->type);
1230 vec4_instruction *inst;
1231 inst = emit(MOV(dst_reg(this->result), src));
1232 inst->saturate = true;
1233
1234 return true;
1235 }
1236
1237 void
1238 vec4_visitor::emit_bool_comparison(unsigned int op,
1239 dst_reg dst, src_reg src0, src_reg src1)
1240 {
1241 /* original gen4 does destination conversion before comparison. */
1242 if (intel->gen < 5)
1243 dst.type = src0.type;
1244
1245 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1246
1247 dst.type = BRW_REGISTER_TYPE_D;
1248 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1249 }
1250
1251 void
1252 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1253 src_reg src0, src_reg src1)
1254 {
1255 vec4_instruction *inst;
1256
1257 if (intel->gen >= 6) {
1258 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1259 inst->conditional_mod = conditionalmod;
1260 } else {
1261 emit(CMP(dst, src0, src1, conditionalmod));
1262
1263 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1264 inst->predicate = BRW_PREDICATE_NORMAL;
1265 }
1266 }
1267
1268 void
1269 vec4_visitor::visit(ir_expression *ir)
1270 {
1271 unsigned int operand;
1272 src_reg op[Elements(ir->operands)];
1273 src_reg result_src;
1274 dst_reg result_dst;
1275 vec4_instruction *inst;
1276
1277 if (try_emit_sat(ir))
1278 return;
1279
1280 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1281 this->result.file = BAD_FILE;
1282 ir->operands[operand]->accept(this);
1283 if (this->result.file == BAD_FILE) {
1284 printf("Failed to get tree for expression operand:\n");
1285 ir->operands[operand]->print();
1286 exit(1);
1287 }
1288 op[operand] = this->result;
1289
1290 /* Matrix expression operands should have been broken down to vector
1291 * operations already.
1292 */
1293 assert(!ir->operands[operand]->type->is_matrix());
1294 }
1295
1296 int vector_elements = ir->operands[0]->type->vector_elements;
1297 if (ir->operands[1]) {
1298 vector_elements = MAX2(vector_elements,
1299 ir->operands[1]->type->vector_elements);
1300 }
1301
1302 this->result.file = BAD_FILE;
1303
1304 /* Storage for our result. Ideally for an assignment we'd be using
1305 * the actual storage for the result here, instead.
1306 */
1307 result_src = src_reg(this, ir->type);
1308 /* convenience for the emit functions below. */
1309 result_dst = dst_reg(result_src);
1310 /* If nothing special happens, this is the result. */
1311 this->result = result_src;
1312 /* Limit writes to the channels that will be used by result_src later.
1313 * This does limit this temp's use as a temporary for multi-instruction
1314 * sequences.
1315 */
1316 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1317
1318 switch (ir->operation) {
1319 case ir_unop_logic_not:
1320 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1321 * the one's complement of the whole register, not just bit 0.
1322 */
1323 emit(XOR(result_dst, op[0], src_reg(1)));
1324 break;
1325 case ir_unop_neg:
1326 op[0].negate = !op[0].negate;
1327 this->result = op[0];
1328 break;
1329 case ir_unop_abs:
1330 op[0].abs = true;
1331 op[0].negate = false;
1332 this->result = op[0];
1333 break;
1334
1335 case ir_unop_sign:
1336 emit(MOV(result_dst, src_reg(0.0f)));
1337
1338 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1339 inst = emit(MOV(result_dst, src_reg(1.0f)));
1340 inst->predicate = BRW_PREDICATE_NORMAL;
1341
1342 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1343 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1344 inst->predicate = BRW_PREDICATE_NORMAL;
1345
1346 break;
1347
1348 case ir_unop_rcp:
1349 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1350 break;
1351
1352 case ir_unop_exp2:
1353 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1354 break;
1355 case ir_unop_log2:
1356 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1357 break;
1358 case ir_unop_exp:
1359 case ir_unop_log:
1360 assert(!"not reached: should be handled by ir_explog_to_explog2");
1361 break;
1362 case ir_unop_sin:
1363 case ir_unop_sin_reduced:
1364 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1365 break;
1366 case ir_unop_cos:
1367 case ir_unop_cos_reduced:
1368 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1369 break;
1370
1371 case ir_unop_dFdx:
1372 case ir_unop_dFdy:
1373 assert(!"derivatives not valid in vertex shader");
1374 break;
1375
1376 case ir_unop_noise:
1377 assert(!"not reached: should be handled by lower_noise");
1378 break;
1379
1380 case ir_binop_add:
1381 emit(ADD(result_dst, op[0], op[1]));
1382 break;
1383 case ir_binop_sub:
1384 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1385 break;
1386
1387 case ir_binop_mul:
1388 if (ir->type->is_integer()) {
1389 /* For integer multiplication, the MUL uses the low 16 bits
1390 * of one of the operands (src0 on gen6, src1 on gen7). The
1391 * MACH accumulates in the contribution of the upper 16 bits
1392 * of that operand.
1393 *
1394 * FINISHME: Emit just the MUL if we know an operand is small
1395 * enough.
1396 */
1397 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1398
1399 emit(MUL(acc, op[0], op[1]));
1400 emit(MACH(dst_null_d(), op[0], op[1]));
1401 emit(MOV(result_dst, src_reg(acc)));
1402 } else {
1403 emit(MUL(result_dst, op[0], op[1]));
1404 }
1405 break;
1406 case ir_binop_div:
1407 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1408 assert(ir->type->is_integer());
1409 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1410 break;
1411 case ir_binop_mod:
1412 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1413 assert(ir->type->is_integer());
1414 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1415 break;
1416
1417 case ir_binop_less:
1418 case ir_binop_greater:
1419 case ir_binop_lequal:
1420 case ir_binop_gequal:
1421 case ir_binop_equal:
1422 case ir_binop_nequal: {
1423 emit(CMP(result_dst, op[0], op[1],
1424 brw_conditional_for_comparison(ir->operation)));
1425 emit(AND(result_dst, result_src, src_reg(0x1)));
1426 break;
1427 }
1428
1429 case ir_binop_all_equal:
1430 /* "==" operator producing a scalar boolean. */
1431 if (ir->operands[0]->type->is_vector() ||
1432 ir->operands[1]->type->is_vector()) {
1433 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1434 emit(MOV(result_dst, src_reg(0)));
1435 inst = emit(MOV(result_dst, src_reg(1)));
1436 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1437 } else {
1438 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1439 emit(AND(result_dst, result_src, src_reg(0x1)));
1440 }
1441 break;
1442 case ir_binop_any_nequal:
1443 /* "!=" operator producing a scalar boolean. */
1444 if (ir->operands[0]->type->is_vector() ||
1445 ir->operands[1]->type->is_vector()) {
1446 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1447
1448 emit(MOV(result_dst, src_reg(0)));
1449 inst = emit(MOV(result_dst, src_reg(1)));
1450 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1451 } else {
1452 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1453 emit(AND(result_dst, result_src, src_reg(0x1)));
1454 }
1455 break;
1456
1457 case ir_unop_any:
1458 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1459 emit(MOV(result_dst, src_reg(0)));
1460
1461 inst = emit(MOV(result_dst, src_reg(1)));
1462 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1463 break;
1464
1465 case ir_binop_logic_xor:
1466 emit(XOR(result_dst, op[0], op[1]));
1467 break;
1468
1469 case ir_binop_logic_or:
1470 emit(OR(result_dst, op[0], op[1]));
1471 break;
1472
1473 case ir_binop_logic_and:
1474 emit(AND(result_dst, op[0], op[1]));
1475 break;
1476
1477 case ir_binop_dot:
1478 assert(ir->operands[0]->type->is_vector());
1479 assert(ir->operands[0]->type == ir->operands[1]->type);
1480 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1481 break;
1482
1483 case ir_unop_sqrt:
1484 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1485 break;
1486 case ir_unop_rsq:
1487 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1488 break;
1489
1490 case ir_unop_bitcast_i2f:
1491 case ir_unop_bitcast_u2f:
1492 this->result = op[0];
1493 this->result.type = BRW_REGISTER_TYPE_F;
1494 break;
1495
1496 case ir_unop_bitcast_f2i:
1497 this->result = op[0];
1498 this->result.type = BRW_REGISTER_TYPE_D;
1499 break;
1500
1501 case ir_unop_bitcast_f2u:
1502 this->result = op[0];
1503 this->result.type = BRW_REGISTER_TYPE_UD;
1504 break;
1505
1506 case ir_unop_i2f:
1507 case ir_unop_i2u:
1508 case ir_unop_u2i:
1509 case ir_unop_u2f:
1510 case ir_unop_b2f:
1511 case ir_unop_b2i:
1512 case ir_unop_f2i:
1513 case ir_unop_f2u:
1514 emit(MOV(result_dst, op[0]));
1515 break;
1516 case ir_unop_f2b:
1517 case ir_unop_i2b: {
1518 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1519 emit(AND(result_dst, result_src, src_reg(1)));
1520 break;
1521 }
1522
1523 case ir_unop_trunc:
1524 emit(RNDZ(result_dst, op[0]));
1525 break;
1526 case ir_unop_ceil:
1527 op[0].negate = !op[0].negate;
1528 inst = emit(RNDD(result_dst, op[0]));
1529 this->result.negate = true;
1530 break;
1531 case ir_unop_floor:
1532 inst = emit(RNDD(result_dst, op[0]));
1533 break;
1534 case ir_unop_fract:
1535 inst = emit(FRC(result_dst, op[0]));
1536 break;
1537 case ir_unop_round_even:
1538 emit(RNDE(result_dst, op[0]));
1539 break;
1540
1541 case ir_binop_min:
1542 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1543 break;
1544 case ir_binop_max:
1545 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1546 break;
1547
1548 case ir_binop_pow:
1549 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1550 break;
1551
1552 case ir_unop_bit_not:
1553 inst = emit(NOT(result_dst, op[0]));
1554 break;
1555 case ir_binop_bit_and:
1556 inst = emit(AND(result_dst, op[0], op[1]));
1557 break;
1558 case ir_binop_bit_xor:
1559 inst = emit(XOR(result_dst, op[0], op[1]));
1560 break;
1561 case ir_binop_bit_or:
1562 inst = emit(OR(result_dst, op[0], op[1]));
1563 break;
1564
1565 case ir_binop_lshift:
1566 inst = emit(SHL(result_dst, op[0], op[1]));
1567 break;
1568
1569 case ir_binop_rshift:
1570 if (ir->type->base_type == GLSL_TYPE_INT)
1571 inst = emit(ASR(result_dst, op[0], op[1]));
1572 else
1573 inst = emit(SHR(result_dst, op[0], op[1]));
1574 break;
1575
1576 case ir_binop_ubo_load: {
1577 ir_constant *uniform_block = ir->operands[0]->as_constant();
1578 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1579 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1580 src_reg offset = op[1];
1581
1582 /* Now, load the vector from that offset. */
1583 assert(ir->type->is_vector() || ir->type->is_scalar());
1584
1585 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1586 packed_consts.type = result.type;
1587 src_reg surf_index =
1588 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1589 if (const_offset_ir) {
1590 offset = src_reg(const_offset / 16);
1591 } else {
1592 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1593 }
1594
1595 vec4_instruction *pull =
1596 emit(new(mem_ctx) vec4_instruction(this,
1597 VS_OPCODE_PULL_CONSTANT_LOAD,
1598 dst_reg(packed_consts),
1599 surf_index,
1600 offset));
1601 pull->base_mrf = 14;
1602 pull->mlen = 1;
1603
1604 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1605 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1606 const_offset % 16 / 4,
1607 const_offset % 16 / 4,
1608 const_offset % 16 / 4);
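/* As an illustration of that arithmetic: a scalar float UBO member at byte
 * offset 24 pulls the vec4 at byte offset 16, and const_offset % 16 / 4 = 2
 * selects channel z, so the swizzle above becomes .zzzz on top of the
 * size-based swizzle.
 */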
1609
1610 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1611 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1612 emit(CMP(result_dst, packed_consts, src_reg(0u),
1613 BRW_CONDITIONAL_NZ));
1614 emit(AND(result_dst, result, src_reg(0x1)));
1615 } else {
1616 emit(MOV(result_dst, packed_consts));
1617 }
1618 break;
1619 }
1620
1621 case ir_triop_lrp:
1622 assert(!"not reached: should be handled by lrp_to_arith");
1623 break;
1624
1625 case ir_quadop_vector:
1626 assert(!"not reached: should be handled by lower_quadop_vector");
1627 break;
1628
1629 case ir_unop_pack_half_2x16:
1630 emit_pack_half_2x16(result_dst, op[0]);
1631 break;
1632 case ir_unop_unpack_half_2x16:
1633 emit_unpack_half_2x16(result_dst, op[0]);
1634 break;
1635 case ir_unop_pack_snorm_2x16:
1636 case ir_unop_pack_snorm_4x8:
1637 case ir_unop_pack_unorm_2x16:
1638 case ir_unop_pack_unorm_4x8:
1639 case ir_unop_unpack_snorm_2x16:
1640 case ir_unop_unpack_snorm_4x8:
1641 case ir_unop_unpack_unorm_2x16:
1642 case ir_unop_unpack_unorm_4x8:
1643 assert(!"not reached: should be handled by lower_packing_builtins");
1644 break;
1645 case ir_unop_unpack_half_2x16_split_x:
1646 case ir_unop_unpack_half_2x16_split_y:
1647 case ir_binop_pack_half_2x16_split:
1648 assert(!"not reached: should not occur in vertex shader");
1649 break;
1650 }
1651 }
1652
1653
1654 void
1655 vec4_visitor::visit(ir_swizzle *ir)
1656 {
1657 src_reg src;
1658 int i = 0;
1659 int swizzle[4];
1660
1661 /* Note that this is only swizzles in expressions, not those on the left
1662 * hand side of an assignment, which do write masking. See ir_assignment
1663 * for that.
1664 */
1665
1666 ir->val->accept(this);
1667 src = this->result;
1668 assert(src.file != BAD_FILE);
1669
1670 for (i = 0; i < ir->type->vector_elements; i++) {
1671 switch (i) {
1672 case 0:
1673 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1674 break;
1675 case 1:
1676 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1677 break;
1678 case 2:
1679 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1680 break;
1681 case 3:
1682 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1683 break;
1684 }
1685 }
1686 for (; i < 4; i++) {
1687 /* Replicate the last channel out. */
1688 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1689 }
1690
1691 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1692
1693 this->result = src;
1694 }
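/* Example of the composition above: if the value being swizzled already
 * carries src.swizzle = .yzww and the ir_swizzle asks for .xz, the loop picks
 * channels y and w, and the trailing replication turns the result into
 * .ywww.
 */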
1695
1696 void
1697 vec4_visitor::visit(ir_dereference_variable *ir)
1698 {
1699 const struct glsl_type *type = ir->type;
1700 dst_reg *reg = variable_storage(ir->var);
1701
1702 if (!reg) {
1703 fail("Failed to find variable storage for %s\n", ir->var->name);
1704 this->result = src_reg(brw_null_reg());
1705 return;
1706 }
1707
1708 this->result = src_reg(*reg);
1709
1710 /* System values get their swizzle from the dst_reg writemask */
1711 if (ir->var->mode == ir_var_system_value)
1712 return;
1713
1714 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1715 this->result.swizzle = swizzle_for_size(type->vector_elements);
1716 }
1717
1718
1719 int
1720 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1721 {
1722 /* Under normal circumstances array elements are stored consecutively, so
1723 * the stride is equal to the size of the array element.
1724 */
1725 return type_size(ir->type);
1726 }
1727
1728
1729 void
1730 vec4_visitor::visit(ir_dereference_array *ir)
1731 {
1732 ir_constant *constant_index;
1733 src_reg src;
1734 int array_stride = compute_array_stride(ir);
1735
1736 constant_index = ir->array_index->constant_expression_value();
1737
1738 ir->array->accept(this);
1739 src = this->result;
1740
1741 if (constant_index) {
1742 src.reg_offset += constant_index->value.i[0] * array_stride;
1743 } else {
1744 /* Variable index array dereference. It eats the "vec4" of the
1745 * base of the array and an index that offsets the Mesa register
1746 * index.
1747 */
1748 ir->array_index->accept(this);
1749
1750 src_reg index_reg;
1751
1752 if (array_stride == 1) {
1753 index_reg = this->result;
1754 } else {
1755 index_reg = src_reg(this, glsl_type::int_type);
1756
1757 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1758 }
1759
1760 if (src.reladdr) {
1761 src_reg temp = src_reg(this, glsl_type::int_type);
1762
1763 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1764
1765 index_reg = temp;
1766 }
1767
1768 src.reladdr = ralloc(mem_ctx, src_reg);
1769 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1770 }
1771
1772 /* If the type is smaller than a vec4, replicate the last channel out. */
1773 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1774 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1775 else
1776 src.swizzle = BRW_SWIZZLE_NOOP;
1777 src.type = brw_type_for_base_type(ir->type);
1778
1779 this->result = src;
1780 }
1781
1782 void
1783 vec4_visitor::visit(ir_dereference_record *ir)
1784 {
1785 unsigned int i;
1786 const glsl_type *struct_type = ir->record->type;
1787 int offset = 0;
1788
1789 ir->record->accept(this);
1790
1791 for (i = 0; i < struct_type->length; i++) {
1792 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1793 break;
1794 offset += type_size(struct_type->fields.structure[i].type);
1795 }
1796
1797 /* If the type is smaller than a vec4, replicate the last channel out. */
1798 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1799 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1800 else
1801 this->result.swizzle = BRW_SWIZZLE_NOOP;
1802 this->result.type = brw_type_for_base_type(ir->type);
1803
1804 this->result.reg_offset += offset;
1805 }
1806
1807 /**
1808 * We want to be careful in assignment setup to hit the actual storage
1809 * instead of potentially using a temporary like we might with the
1810 * ir_dereference handler.
1811 */
1812 static dst_reg
1813 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1814 {
1815 /* The LHS must be a dereference. If the LHS is a variable indexed array
1816 * access of a vector, it must be separated into a series of conditional moves
1817 * before reaching this point (see ir_vec_index_to_cond_assign).
1818 */
1819 assert(ir->as_dereference());
1820 ir_dereference_array *deref_array = ir->as_dereference_array();
1821 if (deref_array) {
1822 assert(!deref_array->array->type->is_vector());
1823 }
1824
1825 /* Use the rvalue deref handler for the most part. We'll ignore
1826 * swizzles in it and write swizzles using writemask, though.
1827 */
1828 ir->accept(v);
1829 return dst_reg(v->result);
1830 }
1831
1832 void
1833 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1834 const struct glsl_type *type, uint32_t predicate)
1835 {
1836 if (type->base_type == GLSL_TYPE_STRUCT) {
1837 for (unsigned int i = 0; i < type->length; i++) {
1838 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1839 }
1840 return;
1841 }
1842
1843 if (type->is_array()) {
1844 for (unsigned int i = 0; i < type->length; i++) {
1845 emit_block_move(dst, src, type->fields.array, predicate);
1846 }
1847 return;
1848 }
1849
1850 if (type->is_matrix()) {
1851 const struct glsl_type *vec_type;
1852
1853 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1854 type->vector_elements, 1);
1855
1856 for (int i = 0; i < type->matrix_columns; i++) {
1857 emit_block_move(dst, src, vec_type, predicate);
1858 }
1859 return;
1860 }
1861
1862 assert(type->is_scalar() || type->is_vector());
1863
1864 dst->type = brw_type_for_base_type(type);
1865 src->type = dst->type;
1866
1867 dst->writemask = (1 << type->vector_elements) - 1;
1868
1869 src->swizzle = swizzle_for_size(type->vector_elements);
1870
1871 vec4_instruction *inst = emit(MOV(*dst, *src));
1872 inst->predicate = predicate;
1873
1874 dst->reg_offset++;
1875 src->reg_offset++;
1876 }
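/* As an example of the recursion above: moving a mat3 emits three MOVs, one
 * per vec3 column, each advancing dst and src by one register; a struct
 * holding a vec2 and a float[2] likewise becomes three MOVs with writemasks
 * .xy, .x and .x.
 */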
1877
1878
1879 /* If the RHS processing resulted in an instruction generating a
1880 * temporary value, and it would be easy to rewrite the instruction to
1881 * generate its result right into the LHS instead, do so. This ends
1882 * up reliably removing instructions where it can be tricky to do so
1883 * later without real UD chain information.
1884 */
1885 bool
1886 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1887 dst_reg dst,
1888 src_reg src,
1889 vec4_instruction *pre_rhs_inst,
1890 vec4_instruction *last_rhs_inst)
1891 {
1892 /* This could be supported, but it would take more smarts. */
1893 if (ir->condition)
1894 return false;
1895
1896 if (pre_rhs_inst == last_rhs_inst)
1897 return false; /* No instructions generated to work with. */
1898
1899 /* Make sure the last instruction generated our source reg. */
1900 if (src.file != GRF ||
1901 src.file != last_rhs_inst->dst.file ||
1902 src.reg != last_rhs_inst->dst.reg ||
1903 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1904 src.reladdr ||
1905 src.abs ||
1906 src.negate ||
1907 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1908 return false;
1909
1910 /* Check that the last instruction fully initialized the channels
1911 * we want to use, in the order we want to use them. We could
1912 * potentially reswizzle the operands of many instructions so that
1913 * we could handle out of order channels, but don't yet.
1914 */
1915
1916 for (unsigned i = 0; i < 4; i++) {
1917 if (dst.writemask & (1 << i)) {
1918 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1919 return false;
1920
1921 if (BRW_GET_SWZ(src.swizzle, i) != i)
1922 return false;
1923 }
1924 }
1925
1926 /* Success! Rewrite the instruction. */
1927 last_rhs_inst->dst.file = dst.file;
1928 last_rhs_inst->dst.reg = dst.reg;
1929 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1930 last_rhs_inst->dst.reladdr = dst.reladdr;
1931 last_rhs_inst->dst.writemask &= dst.writemask;
1932
1933 return true;
1934 }
1935
1936 void
1937 vec4_visitor::visit(ir_assignment *ir)
1938 {
1939 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1940 uint32_t predicate = BRW_PREDICATE_NONE;
1941
1942 if (!ir->lhs->type->is_scalar() &&
1943 !ir->lhs->type->is_vector()) {
1944 ir->rhs->accept(this);
1945 src_reg src = this->result;
1946
1947 if (ir->condition) {
1948 emit_bool_to_cond_code(ir->condition, &predicate);
1949 }
1950
1951 /* emit_block_move doesn't account for swizzles in the source register.
1952 * This should be ok, since the source register is a structure, array,
1953 * or matrix, and those can't be arbitrarily swizzled. But double-check to be sure.
1954 */
1955 assert(src.swizzle ==
1956 (ir->rhs->type->is_matrix()
1957 ? swizzle_for_size(ir->rhs->type->vector_elements)
1958 : BRW_SWIZZLE_NOOP));
1959
1960 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1961 return;
1962 }
1963
1964 /* Now we're down to just a scalar/vector with writemasks. */
1965 int i;
1966
1967 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1968 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1969
1970 ir->rhs->accept(this);
1971
1972 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1973
1974 src_reg src = this->result;
1975
1976 int swizzles[4];
1977 int first_enabled_chan = 0;
1978 int src_chan = 0;
1979
1980 assert(ir->lhs->type->is_vector() ||
1981 ir->lhs->type->is_scalar());
1982 dst.writemask = ir->write_mask;
1983
1984 for (int i = 0; i < 4; i++) {
1985 if (dst.writemask & (1 << i)) {
1986 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1987 break;
1988 }
1989 }
1990
1991 /* Swizzle a small RHS vector into the channels being written.
1992 *
1993 * GLSL IR treats write_mask as dictating how many channels are
1994 * present on the RHS, while in our instructions we need to make
1995 * those channels appear in the slots of the vec4 they're written to.
1996 */
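/* For example, with "v.zw = u.xy" the write_mask covers Z and W while the
 * RHS value arrives in channels X and Y, so the swizzle built below ends
 * up roughly as (Y, Y, X, Y): channel Z reads the RHS X component, channel
 * W reads the RHS Y component, and the disabled channels simply repeat an
 * enabled one.
 */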
1997 for (int i = 0; i < 4; i++) {
1998 if (dst.writemask & (1 << i))
1999 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2000 else
2001 swizzles[i] = first_enabled_chan;
2002 }
2003 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2004 swizzles[2], swizzles[3]);
2005
2006 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2007 return;
2008 }
2009
2010 if (ir->condition) {
2011 emit_bool_to_cond_code(ir->condition, &predicate);
2012 }
2013
2014 for (i = 0; i < type_size(ir->lhs->type); i++) {
2015 vec4_instruction *inst = emit(MOV(dst, src));
2016 inst->predicate = predicate;
2017
2018 dst.reg_offset++;
2019 src.reg_offset++;
2020 }
2021 }
2022
2023 void
2024 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2025 {
2026 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2027 foreach_list(node, &ir->components) {
2028 ir_constant *field_value = (ir_constant *)node;
2029
2030 emit_constant_values(dst, field_value);
2031 }
2032 return;
2033 }
2034
2035 if (ir->type->is_array()) {
2036 for (unsigned int i = 0; i < ir->type->length; i++) {
2037 emit_constant_values(dst, ir->array_elements[i]);
2038 }
2039 return;
2040 }
2041
2042 if (ir->type->is_matrix()) {
2043 for (int i = 0; i < ir->type->matrix_columns; i++) {
2044 float *vec = &ir->value.f[i * ir->type->vector_elements];
2045
2046 for (int j = 0; j < ir->type->vector_elements; j++) {
2047 dst->writemask = 1 << j;
2048 dst->type = BRW_REGISTER_TYPE_F;
2049
2050 emit(MOV(*dst, src_reg(vec[j])));
2051 }
2052 dst->reg_offset++;
2053 }
2054 return;
2055 }
2056
2057 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2058
2059 for (int i = 0; i < ir->type->vector_elements; i++) {
2060 if (!(remaining_writemask & (1 << i)))
2061 continue;
2062
2063 dst->writemask = 1 << i;
2064 dst->type = brw_type_for_base_type(ir->type);
2065
2066 /* Find other components that match the one we're about to
2067 * write. Emits fewer instructions for things like vec4(0.5,
2068 * 1.5, 1.5, 1.5).
2069 */
2070 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2071 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2072 if (ir->value.b[i] == ir->value.b[j])
2073 dst->writemask |= (1 << j);
2074 } else {
2075 /* u, i, and f storage all line up, so no need for a
2076 * switch case for comparing each type.
2077 */
2078 if (ir->value.u[i] == ir->value.u[j])
2079 dst->writemask |= (1 << j);
2080 }
2081 }
2082
2083 switch (ir->type->base_type) {
2084 case GLSL_TYPE_FLOAT:
2085 emit(MOV(*dst, src_reg(ir->value.f[i])));
2086 break;
2087 case GLSL_TYPE_INT:
2088 emit(MOV(*dst, src_reg(ir->value.i[i])));
2089 break;
2090 case GLSL_TYPE_UINT:
2091 emit(MOV(*dst, src_reg(ir->value.u[i])));
2092 break;
2093 case GLSL_TYPE_BOOL:
2094 emit(MOV(*dst, src_reg(ir->value.b[i])));
2095 break;
2096 default:
2097 assert(!"Non-float/uint/int/bool constant");
2098 break;
2099 }
2100
2101 remaining_writemask &= ~dst->writemask;
2102 }
2103 dst->reg_offset++;
2104 }
2105
2106 void
2107 vec4_visitor::visit(ir_constant *ir)
2108 {
2109 dst_reg dst = dst_reg(this, ir->type);
2110 this->result = src_reg(dst);
2111
2112 emit_constant_values(&dst, ir);
2113 }
2114
2115 void
2116 vec4_visitor::visit(ir_call *ir)
2117 {
2118 assert(!"not reached");
2119 }
2120
2121 void
2122 vec4_visitor::visit(ir_texture *ir)
2123 {
2124 int sampler =
2125 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2126
2127 /* Should be lowered by do_lower_texture_projection */
2128 assert(!ir->projector);
2129
2130 /* Generate code to compute all the subexpression trees. This has to be
2131 * done before loading any values into MRFs for the sampler message since
2132 * generating these values may involve SEND messages that need the MRFs.
2133 */
2134 src_reg coordinate;
2135 if (ir->coordinate) {
2136 ir->coordinate->accept(this);
2137 coordinate = this->result;
2138 }
2139
2140 src_reg shadow_comparitor;
2141 if (ir->shadow_comparitor) {
2142 ir->shadow_comparitor->accept(this);
2143 shadow_comparitor = this->result;
2144 }
2145
2146 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2147 src_reg lod, dPdx, dPdy, sample_index;
2148 switch (ir->op) {
2149 case ir_tex:
2150 lod = src_reg(0.0f);
2151 lod_type = glsl_type::float_type;
2152 break;
2153 case ir_txf:
2154 case ir_txl:
2155 case ir_txs:
2156 ir->lod_info.lod->accept(this);
2157 lod = this->result;
2158 lod_type = ir->lod_info.lod->type;
2159 break;
2160 case ir_txf_ms:
2161 ir->lod_info.sample_index->accept(this);
2162 sample_index = this->result;
2163 sample_index_type = ir->lod_info.sample_index->type;
2164 break;
2165 case ir_txd:
2166 ir->lod_info.grad.dPdx->accept(this);
2167 dPdx = this->result;
2168
2169 ir->lod_info.grad.dPdy->accept(this);
2170 dPdy = this->result;
2171
2172 lod_type = ir->lod_info.grad.dPdx->type;
2173 break;
2174 case ir_txb:
2175 case ir_lod:
2176 break;
2177 }
2178
2179 vec4_instruction *inst = NULL;
2180 switch (ir->op) {
2181 case ir_tex:
2182 case ir_txl:
2183 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2184 break;
2185 case ir_txd:
2186 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2187 break;
2188 case ir_txf:
2189 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2190 break;
2191 case ir_txf_ms:
2192 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2193 break;
2194 case ir_txs:
2195 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2196 break;
2197 case ir_txb:
2198 assert(!"TXB is not valid for vertex shaders.");
2199 break;
2200 case ir_lod:
2201 assert(!"LOD is not valid for vertex shaders.");
2202 break;
2203 }
2204
2205 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2206
2207 /* Texel offsets go in the message header; Gen4 also requires headers. */
2208 inst->header_present = use_texture_offset || intel->gen < 5;
2209 inst->base_mrf = 2;
2210 inst->mlen = inst->header_present + 1; /* always at least one */
2211 inst->sampler = sampler;
2212 inst->dst = dst_reg(this, ir->type);
2213 inst->dst.writemask = WRITEMASK_XYZW;
2214 inst->shadow_compare = ir->shadow_comparitor != NULL;
2215
2216 if (use_texture_offset)
2217 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2218
2219 /* MRF for the first parameter */
2220 int param_base = inst->base_mrf + inst->header_present;
2221
2222 if (ir->op == ir_txs) {
2223 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2224 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2225 } else {
2226 int i, coord_mask = 0, zero_mask = 0;
2227 /* Load the coordinate */
2228 /* FINISHME: gl_clamp_mask and saturate */
2229 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2230 coord_mask |= (1 << i);
2231 for (; i < 4; i++)
2232 zero_mask |= (1 << i);
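/* For a vec2 coordinate, for instance, coord_mask ends up as XY and
 * zero_mask as ZW; the unused channels are written with 0 below.
 */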
2233
2234 if (ir->offset && ir->op == ir_txf) {
2235 /* It appears that the ld instruction used for txf does its
2236 * address bounds check before adding in the offset. To work
2237 * around this, just add the integer offset to the integer
2238 * texel coordinate, and don't put the offset in the header.
2239 */
2240 ir_constant *offset = ir->offset->as_constant();
2241 assert(offset);
2242
2243 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2244 src_reg src = coordinate;
2245 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2246 BRW_GET_SWZ(src.swizzle, j),
2247 BRW_GET_SWZ(src.swizzle, j),
2248 BRW_GET_SWZ(src.swizzle, j));
2249 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2250 src, offset->value.i[j]));
2251 }
2252 } else {
2253 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2254 coordinate));
2255 }
2256 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2257 src_reg(0)));
2258 /* Load the shadow comparator */
2259 if (ir->shadow_comparitor) {
2260 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2261 WRITEMASK_X),
2262 shadow_comparitor));
2263 inst->mlen++;
2264 }
2265
2266 /* Load the LOD info */
2267 if (ir->op == ir_tex || ir->op == ir_txl) {
2268 int mrf, writemask;
2269 if (intel->gen >= 5) {
2270 mrf = param_base + 1;
2271 if (ir->shadow_comparitor) {
2272 writemask = WRITEMASK_Y;
2273 /* mlen already incremented */
2274 } else {
2275 writemask = WRITEMASK_X;
2276 inst->mlen++;
2277 }
2278 } else /* intel->gen == 4 */ {
2279 mrf = param_base;
2280 writemask = WRITEMASK_Z;
2281 }
2282 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2283 } else if (ir->op == ir_txf) {
2284 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2285 } else if (ir->op == ir_txf_ms) {
2286 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2287 sample_index));
2288 inst->mlen++;
2289
2290 /* On Gen7, there is an additional MCS parameter here after the sample
2291 * index (SI), but we don't bother to emit it since it's always zero. If
2292 * we start supporting texturing from CMS surfaces, this will have
2293 * to change.
2294 */
2295 } else if (ir->op == ir_txd) {
2296 const glsl_type *type = lod_type;
2297
2298 if (intel->gen >= 5) {
2299 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2300 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2301 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2302 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2303 inst->mlen++;
2304
2305 if (ir->type->vector_elements == 3) {
2306 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2307 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2308 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2309 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2310 inst->mlen++;
2311 }
2312 } else /* intel->gen == 4 */ {
2313 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2314 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2315 inst->mlen += 2;
2316 }
2317 }
2318 }
2319
2320 emit(inst);
2321
2322 /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2323 * faces * layers, but the spec requires just layers.
2324 */
2325 if (ir->op == ir_txs) {
2326 glsl_type const *type = ir->sampler->type;
2327 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2328 type->sampler_array) {
2329 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2330 with_writemask(inst->dst, WRITEMASK_Z),
2331 src_reg(inst->dst), src_reg(6));
2332 }
2333 }
2334
2335 swizzle_result(ir, src_reg(inst->dst), sampler);
2336 }
2337
2338 void
2339 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2340 {
2341 int s = key->tex.swizzles[sampler];
2342
2343 this->result = src_reg(this, ir->type);
2344 dst_reg swizzled_result(this->result);
2345
2346 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2347 || s == SWIZZLE_NOOP) {
2348 emit(MOV(swizzled_result, orig_val));
2349 return;
2350 }
2351
2352 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2353 int swizzle[4];
2354
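/* Split the application-requested swizzle into channels copied from the
 * texture result, channels forced to 0.0, and channels forced to 1.0.
 * For example, a swizzle of (R, R, R, ONE) -- as might be used to emulate
 * a LUMINANCE texture -- copies X into XYZ and writes 1.0 into W.
 */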
2355 for (int i = 0; i < 4; i++) {
2356 switch (GET_SWZ(s, i)) {
2357 case SWIZZLE_ZERO:
2358 zero_mask |= (1 << i);
2359 break;
2360 case SWIZZLE_ONE:
2361 one_mask |= (1 << i);
2362 break;
2363 default:
2364 copy_mask |= (1 << i);
2365 swizzle[i] = GET_SWZ(s, i);
2366 break;
2367 }
2368 }
2369
2370 if (copy_mask) {
2371 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2372 swizzled_result.writemask = copy_mask;
2373 emit(MOV(swizzled_result, orig_val));
2374 }
2375
2376 if (zero_mask) {
2377 swizzled_result.writemask = zero_mask;
2378 emit(MOV(swizzled_result, src_reg(0.0f)));
2379 }
2380
2381 if (one_mask) {
2382 swizzled_result.writemask = one_mask;
2383 emit(MOV(swizzled_result, src_reg(1.0f)));
2384 }
2385 }
2386
2387 void
2388 vec4_visitor::visit(ir_return *ir)
2389 {
2390 assert(!"not reached");
2391 }
2392
2393 void
2394 vec4_visitor::visit(ir_discard *ir)
2395 {
2396 assert(!"not reached");
2397 }
2398
2399 void
2400 vec4_visitor::visit(ir_if *ir)
2401 {
2402 /* Don't point the annotation at the if statement, because then the whole
2403 * statement, including the then and else blocks, gets printed.
2404 */
2405 this->base_ir = ir->condition;
2406
2407 if (intel->gen == 6) {
2408 emit_if_gen6(ir);
2409 } else {
2410 uint32_t predicate;
2411 emit_bool_to_cond_code(ir->condition, &predicate);
2412 emit(IF(predicate));
2413 }
2414
2415 visit_instructions(&ir->then_instructions);
2416
2417 if (!ir->else_instructions.is_empty()) {
2418 this->base_ir = ir->condition;
2419 emit(BRW_OPCODE_ELSE);
2420
2421 visit_instructions(&ir->else_instructions);
2422 }
2423
2424 this->base_ir = ir->condition;
2425 emit(BRW_OPCODE_ENDIF);
2426 }
2427
2428 void
2429 vec4_visitor::emit_ndc_computation()
2430 {
2431 /* Get the position */
2432 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2433
2434 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2435 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2436 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2437
2438 current_annotation = "NDC";
2439 dst_reg ndc_w = ndc;
2440 ndc_w.writemask = WRITEMASK_W;
2441 src_reg pos_w = pos;
2442 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2443 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2444
2445 dst_reg ndc_xyz = ndc;
2446 ndc_xyz.writemask = WRITEMASK_XYZ;
2447
2448 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2449 }
2450
2451 void
2452 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2453 {
2454 if (intel->gen < 6 &&
2455 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2456 key->userclip_active || brw->has_negative_rhw_bug)) {
2457 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2458 dst_reg header1_w = header1;
2459 header1_w.writemask = WRITEMASK_W;
2460 GLuint i;
2461
2462 emit(MOV(header1, 0u));
2463
2464 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2465 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2466
2467 current_annotation = "Point size";
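/* Scale the point size by 2^11 and keep bits 8..18 of the result, which
 * presumably matches the fixed-point point-width field of the header
 * dword on these generations.
 */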
2468 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2469 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2470 }
2471
2472 current_annotation = "Clipping flags";
2473 for (i = 0; i < key->nr_userclip_plane_consts; i++) {
2474 vec4_instruction *inst;
2475
2476 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2477 src_reg(this->userplane[i])));
2478 inst->conditional_mod = BRW_CONDITIONAL_L;
2479
2480 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2481 inst->predicate = BRW_PREDICATE_NORMAL;
2482 }
2483
2484 /* i965 clipping workaround:
2485 * 1) Test for a negative RHW coordinate.
2486 * 2) If the test passes,
2487 * set ndc = (0,0,0,0)
2488 * set ucp[6] = 1
2489 *
2490 * Later, clipping will detect ucp[6] and ensure the primitive is
2491 * clipped against all fixed planes.
2492 */
2493 if (brw->has_negative_rhw_bug) {
2494 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2495 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2496 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2497 vec4_instruction *inst;
2498 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2499 inst->predicate = BRW_PREDICATE_NORMAL;
2500 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2501 inst->predicate = BRW_PREDICATE_NORMAL;
2502 }
2503
2504 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2505 } else if (intel->gen < 6) {
2506 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2507 } else {
2508 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2509 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2510 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2511 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2512 }
2513 }
2514 }
2515
2516 void
2517 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2518 {
2519 if (intel->gen < 6) {
2520 /* Clip distance slots are set aside in gen5, but they are not used. It
2521 * is not clear whether we actually need to set aside space for them,
2522 * but the performance cost is negligible.
2523 */
2524 return;
2525 }
2526
2527 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2528 *
2529 * "If a linked set of shaders forming the vertex stage contains no
2530 * static write to gl_ClipVertex or gl_ClipDistance, but the
2531 * application has requested clipping against user clip planes through
2532 * the API, then the coordinate written to gl_Position is used for
2533 * comparison against the user clip planes."
2534 *
2535 * This function is only called if the shader didn't write to
2536 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2537 * if the user wrote to it; otherwise we use gl_Position.
2538 */
2539 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2540 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2541 clip_vertex = VARYING_SLOT_POS;
2542 }
2543
2544 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2545 ++i) {
2546 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2547 src_reg(output_reg[clip_vertex]),
2548 src_reg(this->userplane[i + offset])));
2549 }
2550 }
2551
2552 void
2553 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2554 {
2555 assert(varying < VARYING_SLOT_MAX);
2556 reg.type = output_reg[varying].type;
2557 current_annotation = output_reg_annotation[varying];
2558 /* Copy the register, saturating if necessary */
2559 vec4_instruction *inst = emit(MOV(reg,
2560 src_reg(output_reg[varying])));
2561 if ((varying == VARYING_SLOT_COL0 ||
2562 varying == VARYING_SLOT_COL1 ||
2563 varying == VARYING_SLOT_BFC0 ||
2564 varying == VARYING_SLOT_BFC1) &&
2565 key->clamp_vertex_color) {
2566 inst->saturate = true;
2567 }
2568 }
2569
2570 void
2571 vec4_visitor::emit_urb_slot(int mrf, int varying)
2572 {
2573 struct brw_reg hw_reg = brw_message_reg(mrf);
2574 dst_reg reg = dst_reg(MRF, mrf);
2575 reg.type = BRW_REGISTER_TYPE_F;
2576
2577 switch (varying) {
2578 case VARYING_SLOT_PSIZ:
2579 /* PSIZ is always in slot 0, and is coupled with other flags. */
2580 current_annotation = "indices, point width, clip flags";
2581 emit_psiz_and_flags(hw_reg);
2582 break;
2583 case BRW_VARYING_SLOT_NDC:
2584 current_annotation = "NDC";
2585 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2586 break;
2587 case BRW_VARYING_SLOT_POS_DUPLICATE:
2588 case VARYING_SLOT_POS:
2589 current_annotation = "gl_Position";
2590 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2591 break;
2592 case VARYING_SLOT_CLIP_DIST0:
2593 case VARYING_SLOT_CLIP_DIST1:
2594 if (this->key->uses_clip_distance) {
2595 emit_generic_urb_slot(reg, varying);
2596 } else {
2597 current_annotation = "user clip distances";
2598 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2599 }
2600 break;
2601 case VARYING_SLOT_EDGE:
2602 /* This is present when doing unfilled polygons. We're supposed to copy
2603 * the edge flag from the user-provided vertex array
2604 * (glEdgeFlagPointer); otherwise we copy the current value of that
2605 * attribute (which starts as 1.0f). This is then used in clipping to
2606 * determine which edges should be drawn as wireframe.
2607 */
2608 current_annotation = "edge flag";
2609 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2610 glsl_type::float_type, WRITEMASK_XYZW))));
2611 break;
2612 case BRW_VARYING_SLOT_PAD:
2613 /* No need to write to this slot */
2614 break;
2615 default:
2616 emit_generic_urb_slot(reg, varying);
2617 break;
2618 }
2619 }
2620
2621 static int
2622 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2623 {
2624 struct intel_context *intel = &brw->intel;
2625
2626 if (intel->gen >= 6) {
2627 /* URB data written (does not include the message header reg) must
2628 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2629 * section 5.4.3.2.2: URB_INTERLEAVED.
2630 *
2631 * URB entries are allocated on a multiple of 1024 bits, so an
2632 * extra 128 bits written here to make the end align to 256 is
2633 * no problem.
2634 */
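/* Note that mlen includes the header register, so the data length is
 * mlen - 1; rounding mlen up to the next odd value keeps that data length
 * even. For example, an mlen of 6 (header plus 5 data regs) gets bumped
 * to 7 so that 6 data registers are written.
 */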
2635 if ((mlen % 2) != 1)
2636 mlen++;
2637 }
2638
2639 return mlen;
2640 }
2641
2642 void
2643 vec4_vs_visitor::emit_urb_write_header(int mrf)
2644 {
2645 /* No need to do anything for VS; an implied write to this MRF will be
2646 * performed by VS_OPCODE_URB_WRITE.
2647 */
2648 (void) mrf;
2649 }
2650
2651 vec4_instruction *
2652 vec4_vs_visitor::emit_urb_write_opcode(bool complete)
2653 {
2654 /* For VS, the URB writes end the thread. */
2655 if (complete) {
2656 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2657 emit_shader_time_end();
2658 }
2659
2660 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2661 inst->eot = complete;
2662
2663 return inst;
2664 }
2665
2666 /**
2667 * Generates the VUE payload plus the necessary URB write instructions to
2668 * output it.
2669 *
2670 * The VUE layout is documented in Volume 2a.
2671 */
2672 void
2673 vec4_visitor::emit_vertex()
2674 {
2675 /* MRF 0 is reserved for the debugger, so start with message header
2676 * in MRF 1.
2677 */
2678 int base_mrf = 1;
2679 int mrf = base_mrf;
2680 /* In the process of generating our URB write message contents, we
2681 * may need to unspill a register or load from an array. Those
2682 * reads would use MRFs 14-15.
2683 */
2684 int max_usable_mrf = 13;
2685
2686 /* The following assertion verifies that max_usable_mrf causes an
2687 * even-numbered amount of URB write data, which will meet gen6's
2688 * requirements for length alignment.
2689 */
2690 assert((max_usable_mrf - base_mrf) % 2 == 0);
2691
2692 /* First mrf is the g0-based message header containing URB handles and
2693 * such.
2694 */
2695 emit_urb_write_header(mrf++);
2696
2697 if (intel->gen < 6) {
2698 emit_ndc_computation();
2699 }
2700
2701 /* Set up the VUE data for the first URB write */
2702 int slot;
2703 for (slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
2704 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2705
2706 /* If this was max_usable_mrf, we can't fit anything more into this URB
2707 * WRITE.
2708 */
2709 if (mrf > max_usable_mrf) {
2710 slot++;
2711 break;
2712 }
2713 }
2714
2715 bool complete = slot >= prog_data->vue_map.num_slots;
2716 current_annotation = "URB write";
2717 vec4_instruction *inst = emit_urb_write_opcode(complete);
2718 inst->base_mrf = base_mrf;
2719 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2720
2721 /* Optional second URB write */
2722 if (!complete) {
2723 mrf = base_mrf + 1;
2724
2725 for (; slot < prog_data->vue_map.num_slots; ++slot) {
2726 assert(mrf < max_usable_mrf);
2727
2728 emit_urb_slot(mrf++, prog_data->vue_map.slot_to_varying[slot]);
2729 }
2730
2731 current_annotation = "URB write";
2732 inst = emit_urb_write_opcode(true /* complete */);
2733 inst->base_mrf = base_mrf;
2734 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2735 /* URB destination offset. The previous write used MRFs 1-13, which
2736 * minus the one header MRF leaves 12 data regs. URB offset is in
2737 * URB row increments, and each of our MRFs is half of one of
2738 * those, since we're doing interleaved writes.
2739 */
2740 inst->offset = (max_usable_mrf - base_mrf) / 2;
2741 }
2742 }
2743
2744 void
2745 vec4_vs_visitor::emit_thread_end()
2746 {
2747 /* For VS, we always end the thread by emitting a single vertex.
2748 * emit_urb_write_opcode() will take care of setting the eot flag on the
2749 * SEND instruction.
2750 */
2751 emit_vertex();
2752 }
2753
2754 src_reg
2755 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2756 src_reg *reladdr, int reg_offset)
2757 {
2758 /* Because we store the values to scratch interleaved like our
2759 * vertex data, we need to scale the vec4 index by 2.
2760 */
2761 int message_header_scale = 2;
2762
2763 /* Pre-gen6, the message header uses byte offsets instead of vec4
2764 * (16-byte) offset units.
2765 */
2766 if (intel->gen < 6)
2767 message_header_scale *= 16;
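/* For example, a reg_offset of 3 with no reladdr becomes an immediate
 * offset of 6 on gen6+ (interleaved vec4 slots), or 96 on older
 * generations (byte units).
 */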
2768
2769 if (reladdr) {
2770 src_reg index = src_reg(this, glsl_type::int_type);
2771
2772 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2773 emit_before(inst, MUL(dst_reg(index),
2774 index, src_reg(message_header_scale)));
2775
2776 return index;
2777 } else {
2778 return src_reg(reg_offset * message_header_scale);
2779 }
2780 }
2781
2782 src_reg
2783 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2784 src_reg *reladdr, int reg_offset)
2785 {
2786 if (reladdr) {
2787 src_reg index = src_reg(this, glsl_type::int_type);
2788
2789 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2790
2791 /* Pre-gen6, the message header uses byte offsets instead of vec4
2792 * (16-byte) offset units.
2793 */
2794 if (intel->gen < 6) {
2795 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2796 }
2797
2798 return index;
2799 } else {
2800 int message_header_scale = intel->gen < 6 ? 16 : 1;
2801 return src_reg(reg_offset * message_header_scale);
2802 }
2803 }
2804
2805 /**
2806 * Emits an instruction before @inst to load the value named by @orig_src
2807 * from scratch space at @base_offset to @temp.
2808 *
2809 * @base_offset is measured in 32-byte units (the size of a register).
2810 */
2811 void
2812 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2813 dst_reg temp, src_reg orig_src,
2814 int base_offset)
2815 {
2816 int reg_offset = base_offset + orig_src.reg_offset;
2817 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2818
2819 emit_before(inst, SCRATCH_READ(temp, index));
2820 }
2821
2822 /**
2823 * Emits an instruction after @inst to store the value to be written
2824 * to @orig_dst to scratch space at @base_offset, from @temp.
2825 *
2826 * @base_offset is measured in 32-byte units (the size of a register).
2827 */
2828 void
2829 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2830 {
2831 int reg_offset = base_offset + inst->dst.reg_offset;
2832 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2833
2834 /* Create a temporary register to store *inst's result in.
2835 *
2836 * We have to be careful in MOVing from our temporary result register in
2837 * the scratch write. If we swizzle from channels of the temporary that
2838 * weren't initialized, it will confuse live interval analysis, which will
2839 * make spilling fail to make progress.
2840 */
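/* For example, assuming a destination writemask of YW, the swizzle built
 * below is (Y, Y, Y, W): the disabled X and Z channels just re-read the
 * first enabled channel instead of an uninitialized one.
 */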
2841 src_reg temp = src_reg(this, glsl_type::vec4_type);
2842 temp.type = inst->dst.type;
2843 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2844 int swizzles[4];
2845 for (int i = 0; i < 4; i++)
2846 if (inst->dst.writemask & (1 << i))
2847 swizzles[i] = i;
2848 else
2849 swizzles[i] = first_writemask_chan;
2850 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2851 swizzles[2], swizzles[3]);
2852
2853 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2854 inst->dst.writemask));
2855 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2856 write->predicate = inst->predicate;
2857 write->ir = inst->ir;
2858 write->annotation = inst->annotation;
2859 inst->insert_after(write);
2860
2861 inst->dst.file = temp.file;
2862 inst->dst.reg = temp.reg;
2863 inst->dst.reg_offset = temp.reg_offset;
2864 inst->dst.reladdr = NULL;
2865 }
2866
2867 /**
2868 * We can't generally support array access in GRF space, because a
2869 * single instruction's destination can only span 2 contiguous
2870 * registers. So, we send all GRF arrays that get variable index
2871 * access to scratch space.
2872 */
2873 void
2874 vec4_visitor::move_grf_array_access_to_scratch()
2875 {
2876 int scratch_loc[this->virtual_grf_count];
2877
2878 for (int i = 0; i < this->virtual_grf_count; i++) {
2879 scratch_loc[i] = -1;
2880 }
2881
2882 /* First, calculate the set of virtual GRFs that need to be punted
2883 * to scratch due to having any array access on them, and where in
2884 * scratch each one will live.
2885 */
2886 foreach_list(node, &this->instructions) {
2887 vec4_instruction *inst = (vec4_instruction *)node;
2888
2889 if (inst->dst.file == GRF && inst->dst.reladdr &&
2890 scratch_loc[inst->dst.reg] == -1) {
2891 scratch_loc[inst->dst.reg] = c->last_scratch;
2892 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2893 }
2894
2895 for (int i = 0 ; i < 3; i++) {
2896 src_reg *src = &inst->src[i];
2897
2898 if (src->file == GRF && src->reladdr &&
2899 scratch_loc[src->reg] == -1) {
2900 scratch_loc[src->reg] = c->last_scratch;
2901 c->last_scratch += this->virtual_grf_sizes[src->reg];
2902 }
2903 }
2904 }
2905
2906 /* Now, for anything that will be accessed through scratch, rewrite
2907 * it to load/store. Note that this is a _safe list walk, because
2908 * we may generate a new scratch_write instruction after the one
2909 * we're processing.
2910 */
2911 foreach_list_safe(node, &this->instructions) {
2912 vec4_instruction *inst = (vec4_instruction *)node;
2913
2914 /* Set up the annotation tracking for newly generated instructions. */
2915 base_ir = inst->ir;
2916 current_annotation = inst->annotation;
2917
2918 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2919 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2920 }
2921
2922 for (int i = 0 ; i < 3; i++) {
2923 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2924 continue;
2925
2926 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2927
2928 emit_scratch_read(inst, temp, inst->src[i],
2929 scratch_loc[inst->src[i].reg]);
2930
2931 inst->src[i].file = temp.file;
2932 inst->src[i].reg = temp.reg;
2933 inst->src[i].reg_offset = temp.reg_offset;
2934 inst->src[i].reladdr = NULL;
2935 }
2936 }
2937 }
2938
2939 /**
2940 * Emits an instruction before @inst to load the value named by @orig_src
2941 * from the pull constant buffer (surface) at @base_offset to @temp.
2942 */
2943 void
2944 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2945 dst_reg temp, src_reg orig_src,
2946 int base_offset)
2947 {
2948 int reg_offset = base_offset + orig_src.reg_offset;
2949 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2950 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2951 vec4_instruction *load;
2952
2953 if (intel->gen >= 7) {
2954 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2955 grf_offset.type = offset.type;
2956 emit_before(inst, MOV(grf_offset, offset));
2957
2958 load = new(mem_ctx) vec4_instruction(this,
2959 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2960 temp, index, src_reg(grf_offset));
2961 } else {
2962 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2963 temp, index, offset);
2964 load->base_mrf = 14;
2965 load->mlen = 1;
2966 }
2967 emit_before(inst, load);
2968 }
2969
2970 /**
2971 * Implements array access of uniforms by inserting a
2972 * PULL_CONSTANT_LOAD instruction.
2973 *
2974 * Unlike temporary GRF array access (where we don't support it due to
2975 * the difficulty of doing relative addressing on instruction
2976 * destinations), we could potentially do array access of uniforms
2977 * that were loaded in GRF space as push constants. In real-world
2978 * usage we've seen, though, the arrays being used are always larger
2979 * than we could load as push constants, so just always move all
2980 * uniform array access out to a pull constant buffer.
2981 */
2982 void
2983 vec4_visitor::move_uniform_array_access_to_pull_constants()
2984 {
2985 int pull_constant_loc[this->uniforms];
2986
2987 for (int i = 0; i < this->uniforms; i++) {
2988 pull_constant_loc[i] = -1;
2989 }
2990
2991 /* Walk through and find array access of uniforms. Put a copy of that
2992 * uniform in the pull constant buffer.
2993 *
2994 * Note that we don't move constant-indexed accesses to arrays. No
2995 * testing has been done of the performance impact of this choice.
2996 */
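/* So for a GLSL access like "u[i]" with a dynamically computed index, the
 * whole array u is copied into pull_param and the operand is rewritten to
 * read a temporary filled by a pull constant load, while a constant-indexed
 * access like "u[2]" stays in push constant space.
 */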
2997 foreach_list_safe(node, &this->instructions) {
2998 vec4_instruction *inst = (vec4_instruction *)node;
2999
3000 for (int i = 0 ; i < 3; i++) {
3001 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3002 continue;
3003
3004 int uniform = inst->src[i].reg;
3005
3006 /* If this array isn't already present in the pull constant buffer,
3007 * add it.
3008 */
3009 if (pull_constant_loc[uniform] == -1) {
3010 const float **values = &prog_data->param[uniform * 4];
3011
3012 pull_constant_loc[uniform] = prog_data->nr_pull_params / 4;
3013
3014 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3015 prog_data->pull_param[prog_data->nr_pull_params++]
3016 = values[j];
3017 }
3018 }
3019
3020 /* Set up the annotation tracking for newly generated instructions. */
3021 base_ir = inst->ir;
3022 current_annotation = inst->annotation;
3023
3024 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3025
3026 emit_pull_constant_load(inst, temp, inst->src[i],
3027 pull_constant_loc[uniform]);
3028
3029 inst->src[i].file = temp.file;
3030 inst->src[i].reg = temp.reg;
3031 inst->src[i].reg_offset = temp.reg_offset;
3032 inst->src[i].reladdr = NULL;
3033 }
3034 }
3035
3036 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3037 * no need to track them as larger-than-vec4 objects. This will be
3038 * relied on in cutting out unused uniform vectors from push
3039 * constants.
3040 */
3041 split_uniform_registers();
3042 }
3043
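/* Presumably the negate source modifier cannot be relied on for unsigned
 * (UD) operands in every context, so the negation is resolved here: the MOV
 * below applies the modifier into a temporary, and the caller then uses
 * that temporary without any modifier.
 */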
3044 void
3045 vec4_visitor::resolve_ud_negate(src_reg *reg)
3046 {
3047 if (reg->type != BRW_REGISTER_TYPE_UD ||
3048 !reg->negate)
3049 return;
3050
3051 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3052 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3053 *reg = temp;
3054 }
3055
3056 vec4_visitor::vec4_visitor(struct brw_context *brw,
3057 struct brw_vec4_compile *c,
3058 struct gl_program *prog,
3059 const struct brw_vec4_prog_key *key,
3060 struct brw_vec4_prog_data *prog_data,
3061 struct gl_shader_program *shader_prog,
3062 struct brw_shader *shader,
3063 void *mem_ctx,
3064 bool debug_flag)
3065 : debug_flag(debug_flag)
3066 {
3067 this->brw = brw;
3068 this->intel = &brw->intel;
3069 this->ctx = &intel->ctx;
3070 this->shader_prog = shader_prog;
3071 this->shader = shader;
3072
3073 this->mem_ctx = mem_ctx;
3074 this->failed = false;
3075
3076 this->base_ir = NULL;
3077 this->current_annotation = NULL;
3078 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3079
3080 this->c = c;
3081 this->prog = prog;
3082 this->key = key;
3083 this->prog_data = prog_data;
3084
3085 this->variable_ht = hash_table_ctor(0,
3086 hash_table_pointer_hash,
3087 hash_table_pointer_compare);
3088
3089 this->virtual_grf_def = NULL;
3090 this->virtual_grf_use = NULL;
3091 this->virtual_grf_sizes = NULL;
3092 this->virtual_grf_count = 0;
3093 this->virtual_grf_reg_map = NULL;
3094 this->virtual_grf_reg_count = 0;
3095 this->virtual_grf_array_size = 0;
3096 this->live_intervals_valid = false;
3097
3098 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3099
3100 this->uniforms = 0;
3101 }
3102
3103 vec4_visitor::~vec4_visitor()
3104 {
3105 hash_table_dtor(this->variable_ht);
3106 }
3107
3108
3109 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3110 struct brw_vs_compile *vs_compile,
3111 struct brw_vs_prog_data *vs_prog_data,
3112 struct gl_shader_program *prog,
3113 struct brw_shader *shader,
3114 void *mem_ctx)
3115 : vec4_visitor(brw, &vs_compile->base, &vs_compile->vp->program.Base,
3116 &vs_compile->key.base, &vs_prog_data->base, prog, shader,
3117 mem_ctx, INTEL_DEBUG & DEBUG_VS),
3118 vs_compile(vs_compile),
3119 vs_prog_data(vs_prog_data)
3120 {
3121 }
3122
3123
3124 void
3125 vec4_visitor::fail(const char *format, ...)
3126 {
3127 va_list va;
3128 char *msg;
3129
3130 if (failed)
3131 return;
3132
3133 failed = true;
3134
3135 va_start(va, format);
3136 msg = ralloc_vasprintf(mem_ctx, format, va);
3137 va_end(va);
3138 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3139
3140 this->fail_msg = msg;
3141
3142 if (debug_flag) {
3143 fprintf(stderr, "%s", msg);
3144 }
3145 }
3146
3147 } /* namespace brw */