i965/vs: Add virtual function make_reg_for_system_value().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "glsl/ir_uniform.h"
26 extern "C" {
27 #include "main/context.h"
28 #include "main/macros.h"
29 #include "program/prog_parameter.h"
30 #include "program/sampler.h"
31 }
32
33 namespace brw {
34
35 vec4_instruction::vec4_instruction(vec4_visitor *v,
36 enum opcode opcode, dst_reg dst,
37 src_reg src0, src_reg src1, src_reg src2)
38 {
39 this->opcode = opcode;
40 this->dst = dst;
41 this->src[0] = src0;
42 this->src[1] = src1;
43 this->src[2] = src2;
44 this->ir = v->base_ir;
45 this->annotation = v->current_annotation;
46 }
47
48 vec4_instruction *
49 vec4_visitor::emit(vec4_instruction *inst)
50 {
51 this->instructions.push_tail(inst);
52
53 return inst;
54 }
55
56 vec4_instruction *
57 vec4_visitor::emit_before(vec4_instruction *inst, vec4_instruction *new_inst)
58 {
59 new_inst->ir = inst->ir;
60 new_inst->annotation = inst->annotation;
61
62 inst->insert_before(new_inst);
63
64 return inst;
65 }
66
67 vec4_instruction *
68 vec4_visitor::emit(enum opcode opcode, dst_reg dst,
69 src_reg src0, src_reg src1, src_reg src2)
70 {
71 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
72 src0, src1, src2));
73 }
74
75
76 vec4_instruction *
77 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0, src_reg src1)
78 {
79 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
80 }
81
82 vec4_instruction *
83 vec4_visitor::emit(enum opcode opcode, dst_reg dst, src_reg src0)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
86 }
87
88 vec4_instruction *
89 vec4_visitor::emit(enum opcode opcode)
90 {
91 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
92 }
93
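/* The ALU1/ALU2 macros below only construct a vec4_instruction; unlike the
 * emit() overloads above they do not append it to the instruction list, so
 * callers are expected to wrap them, e.g. emit(MOV(dst, src)) or
 * emit(ADD(dst, src0, src1)).
 */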
94 #define ALU1(op) \
95 vec4_instruction * \
96 vec4_visitor::op(dst_reg dst, src_reg src0) \
97 { \
98 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
99 src0); \
100 }
101
102 #define ALU2(op) \
103 vec4_instruction * \
104 vec4_visitor::op(dst_reg dst, src_reg src0, src_reg src1) \
105 { \
106 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
107 src0, src1); \
108 }
109
110 ALU1(NOT)
111 ALU1(MOV)
112 ALU1(FRC)
113 ALU1(RNDD)
114 ALU1(RNDE)
115 ALU1(RNDZ)
116 ALU1(F32TO16)
117 ALU1(F16TO32)
118 ALU2(ADD)
119 ALU2(MUL)
120 ALU2(MACH)
121 ALU2(AND)
122 ALU2(OR)
123 ALU2(XOR)
124 ALU2(DP3)
125 ALU2(DP4)
126 ALU2(DPH)
127 ALU2(SHL)
128 ALU2(SHR)
129 ALU2(ASR)
130
131 /** Gen4 predicated IF. */
132 vec4_instruction *
133 vec4_visitor::IF(uint32_t predicate)
134 {
135 vec4_instruction *inst;
136
137 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
138 inst->predicate = predicate;
139
140 return inst;
141 }
142
143 /** Gen6+ IF with embedded comparison. */
144 vec4_instruction *
145 vec4_visitor::IF(src_reg src0, src_reg src1, uint32_t condition)
146 {
147 assert(intel->gen >= 6);
148
149 vec4_instruction *inst;
150
151 resolve_ud_negate(&src0);
152 resolve_ud_negate(&src1);
153
154 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
155 src0, src1);
156 inst->conditional_mod = condition;
157
158 return inst;
159 }
160
161 /**
162 * CMP: Sets the low bit of the destination channels with the result
163 * of the comparison, while the upper bits are undefined, and updates
164 * the flag register with the packed 16 bits of the result.
165 */
166 vec4_instruction *
167 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1, uint32_t condition)
168 {
169 vec4_instruction *inst;
170
171 /* The original gen4 hardware does type conversion to the destination
172 * type before comparison, producing garbage results for floating
173 * point comparisons.
174 */
175 if (intel->gen == 4) {
176 dst.type = src0.type;
177 if (dst.file == HW_REG)
178 dst.fixed_hw_reg.type = dst.type;
179 }
180
181 resolve_ud_negate(&src0);
182 resolve_ud_negate(&src1);
183
184 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
185 inst->conditional_mod = condition;
186
187 return inst;
188 }
189
190 vec4_instruction *
191 vec4_visitor::SCRATCH_READ(dst_reg dst, src_reg index)
192 {
193 vec4_instruction *inst;
194
195 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_READ,
196 dst, index);
197 inst->base_mrf = 14;
198 inst->mlen = 2;
199
200 return inst;
201 }
202
203 vec4_instruction *
204 vec4_visitor::SCRATCH_WRITE(dst_reg dst, src_reg src, src_reg index)
205 {
206 vec4_instruction *inst;
207
208 inst = new(mem_ctx) vec4_instruction(this, VS_OPCODE_SCRATCH_WRITE,
209 dst, src, index);
210 inst->base_mrf = 13;
211 inst->mlen = 3;
212
213 return inst;
214 }
215
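/* Emit a dot product of the first "elements" components: elements == 2, 3
 * and 4 map to DP2, DP3 and DP4 respectively, so e.g. emit_dp(dst, a, b, 3)
 * produces a BRW_OPCODE_DP3.
 */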
216 void
217 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
218 {
219 static enum opcode dot_opcodes[] = {
220 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
221 };
222
223 emit(dot_opcodes[elements - 2], dst, src0, src1);
224 }
225
226 src_reg
227 vec4_visitor::fix_math_operand(src_reg src)
228 {
229 /* The gen6 math instruction ignores the source modifiers --
230 * swizzle, abs, negate, and at least some parts of the register
231 * region description.
232 *
233 * Rather than trying to enumerate all these cases, *always* expand the
234 * operand to a temp GRF for gen6.
235 *
236 * For gen7, keep the operand as-is, except if immediate, which gen7 still
237 * can't use.
238 */
239
240 if (intel->gen == 7 && src.file != IMM)
241 return src;
242
243 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
244 expanded.type = src.type;
245 emit(MOV(expanded, src));
246 return src_reg(expanded);
247 }
248
249 void
250 vec4_visitor::emit_math1_gen6(enum opcode opcode, dst_reg dst, src_reg src)
251 {
252 src = fix_math_operand(src);
253
254 if (dst.writemask != WRITEMASK_XYZW) {
255 /* The gen6 math instruction must be align1, so we can't do
256 * writemasks.
257 */
258 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
259
260 emit(opcode, temp_dst, src);
261
262 emit(MOV(dst, src_reg(temp_dst)));
263 } else {
264 emit(opcode, dst, src);
265 }
266 }
267
268 void
269 vec4_visitor::emit_math1_gen4(enum opcode opcode, dst_reg dst, src_reg src)
270 {
271 vec4_instruction *inst = emit(opcode, dst, src);
272 inst->base_mrf = 1;
273 inst->mlen = 1;
274 }
275
276 void
277 vec4_visitor::emit_math(opcode opcode, dst_reg dst, src_reg src)
278 {
279 switch (opcode) {
280 case SHADER_OPCODE_RCP:
281 case SHADER_OPCODE_RSQ:
282 case SHADER_OPCODE_SQRT:
283 case SHADER_OPCODE_EXP2:
284 case SHADER_OPCODE_LOG2:
285 case SHADER_OPCODE_SIN:
286 case SHADER_OPCODE_COS:
287 break;
288 default:
289 assert(!"not reached: bad math opcode");
290 return;
291 }
292
293 if (intel->gen >= 6) {
294 return emit_math1_gen6(opcode, dst, src);
295 } else {
296 return emit_math1_gen4(opcode, dst, src);
297 }
298 }
299
300 void
301 vec4_visitor::emit_math2_gen6(enum opcode opcode,
302 dst_reg dst, src_reg src0, src_reg src1)
303 {
304 src0 = fix_math_operand(src0);
305 src1 = fix_math_operand(src1);
306
307 if (dst.writemask != WRITEMASK_XYZW) {
308 /* The gen6 math instruction must be align1, so we can't do
309 * writemasks.
310 */
311 dst_reg temp_dst = dst_reg(this, glsl_type::vec4_type);
312 temp_dst.type = dst.type;
313
314 emit(opcode, temp_dst, src0, src1);
315
316 emit(MOV(dst, src_reg(temp_dst)));
317 } else {
318 emit(opcode, dst, src0, src1);
319 }
320 }
321
322 void
323 vec4_visitor::emit_math2_gen4(enum opcode opcode,
324 dst_reg dst, src_reg src0, src_reg src1)
325 {
326 vec4_instruction *inst = emit(opcode, dst, src0, src1);
327 inst->base_mrf = 1;
328 inst->mlen = 2;
329 }
330
331 void
332 vec4_visitor::emit_math(enum opcode opcode,
333 dst_reg dst, src_reg src0, src_reg src1)
334 {
335 switch (opcode) {
336 case SHADER_OPCODE_POW:
337 case SHADER_OPCODE_INT_QUOTIENT:
338 case SHADER_OPCODE_INT_REMAINDER:
339 break;
340 default:
341 assert(!"not reached: unsupported binary math opcode");
342 return;
343 }
344
345 if (intel->gen >= 6) {
346 return emit_math2_gen6(opcode, dst, src0, src1);
347 } else {
348 return emit_math2_gen4(opcode, dst, src0, src1);
349 }
350 }
351
352 void
353 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
354 {
355 if (intel->gen < 7)
356 assert(!"ir_unop_pack_half_2x16 should be lowered");
357
358 assert(dst.type == BRW_REGISTER_TYPE_UD);
359 assert(src0.type == BRW_REGISTER_TYPE_F);
360
361 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
362 *
363 * Because this instruction does not have a 16-bit floating-point type,
364 * the destination data type must be Word (W).
365 *
366 * The destination must be DWord-aligned and specify a horizontal stride
367 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
368 * each destination channel and the upper word is not modified.
369 *
370 * The above restriction implies that the f32to16 instruction must use
371 * align1 mode, because only in align1 mode is it possible to specify
372 * horizontal stride. We choose here to defy the hardware docs and emit
373 * align16 instructions.
374 *
375 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
376 * instructions. I was partially successful in that the code passed all
377 * tests. However, the code was dubiously correct and fragile, and the
378 * tests were not harsh enough to probe that frailty. Not trusting the
379 * code, I chose instead to remain in align16 mode in defiance of the hw
380 * docs).
381 *
382 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
383 * simulator, emitting a f32to16 in align16 mode with UD as destination
384 * data type is safe. The behavior differs from that specified in the PRM
385 * in that the upper word of each destination channel is cleared to 0.
386 */
387
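/* For illustration: packing vec2(1.0, -2.0), whose half-float bit patterns
 * are 0x3C00 and 0xC000, the F32TO16 below leaves tmp = {0x00003C00,
 * 0x0000C000, ...}; the SHL then puts 0xC0000000 in dst and the final OR
 * produces 0xC0003C00, i.e. y in the high word and x in the low word, as
 * packHalf2x16() requires.
 */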
388 dst_reg tmp_dst(this, glsl_type::uvec2_type);
389 src_reg tmp_src(tmp_dst);
390
391 #if 0
392 /* Verify the undocumented behavior on which the following instructions
393 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
394 * then the result of the bit-or instruction below will be incorrect.
395 *
396 * You should inspect the disasm output in order to verify that the MOV is
397 * not optimized away.
398 */
399 emit(MOV(tmp_dst, src_reg(0x12345678u)));
400 #endif
401
402 /* Give tmp the form below, where "." means untouched.
403 *
404 * w z y x w z y x
405 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
406 *
407 * That the upper word of each write-channel be 0 is required for the
408 * following bit-shift and bit-or instructions to work. Note that this
409 * relies on the undocumented hardware behavior mentioned above.
410 */
411 tmp_dst.writemask = WRITEMASK_XY;
412 emit(F32TO16(tmp_dst, src0));
413
414 /* Give the write-channels of dst the form:
415 * 0xhhhh0000
416 */
417 tmp_src.swizzle = SWIZZLE_Y;
418 emit(SHL(dst, tmp_src, src_reg(16u)));
419
420 /* Finally, give the write-channels of dst the form of packHalf2x16's
421 * output:
422 * 0xhhhhllll
423 */
424 tmp_src.swizzle = SWIZZLE_X;
425 emit(OR(dst, src_reg(dst), tmp_src));
426 }
427
428 void
429 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
430 {
431 if (intel->gen < 7)
432 assert(!"ir_unop_unpack_half_2x16 should be lowered");
433
434 assert(dst.type == BRW_REGISTER_TYPE_F);
435 assert(src0.type == BRW_REGISTER_TYPE_UD);
436
437 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
438 *
439 * Because this instruction does not have a 16-bit floating-point type,
440 * the source data type must be Word (W). The destination type must be
441 * F (Float).
442 *
443 * To use W as the source data type, we must adjust horizontal strides,
444 * which is only possible in align1 mode. All my [chadv] attempts at
445 * emitting align1 instructions for unpackHalf2x16 failed to pass the
446 * Piglit tests, so I gave up.
447 *
448 * I've verified that, on gen7 hardware and the simulator, it is safe to
449 * emit f16to32 in align16 mode with UD as source data type.
450 */
451
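/* For illustration: unpacking 0xC0003C00, the AND below leaves tmp.x =
 * 0x3C00 and the SHR leaves tmp.y = 0xC000; F16TO32 then converts those
 * half-float bit patterns into (1.0, -2.0), mirroring the pack example
 * above.
 */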
452 dst_reg tmp_dst(this, glsl_type::uvec2_type);
453 src_reg tmp_src(tmp_dst);
454
455 tmp_dst.writemask = WRITEMASK_X;
456 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
457
458 tmp_dst.writemask = WRITEMASK_Y;
459 emit(SHR(tmp_dst, src0, src_reg(16u)));
460
461 dst.writemask = WRITEMASK_XY;
462 emit(F16TO32(dst, tmp_src));
463 }
464
465 void
466 vec4_visitor::visit_instructions(const exec_list *list)
467 {
468 foreach_list(node, list) {
469 ir_instruction *ir = (ir_instruction *)node;
470
471 base_ir = ir;
472 ir->accept(this);
473 }
474 }
475
476
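/* Returns how many vec4 registers a GLSL type occupies in this backend:
 * scalars and vectors round up to one slot, matrices take one slot per
 * column, and arrays/structs are the sum of their members, so e.g. a mat3
 * is 3 and float[10] is 10.
 */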
477 static int
478 type_size(const struct glsl_type *type)
479 {
480 unsigned int i;
481 int size;
482
483 switch (type->base_type) {
484 case GLSL_TYPE_UINT:
485 case GLSL_TYPE_INT:
486 case GLSL_TYPE_FLOAT:
487 case GLSL_TYPE_BOOL:
488 if (type->is_matrix()) {
489 return type->matrix_columns;
490 } else {
491 /* Regardless of size of vector, it gets a vec4. This is bad
492 * packing for things like floats, but otherwise arrays become a
493 * mess. Hopefully a later pass over the code can pack scalars
494 * down if appropriate.
495 */
496 return 1;
497 }
498 case GLSL_TYPE_ARRAY:
499 assert(type->length > 0);
500 return type_size(type->fields.array) * type->length;
501 case GLSL_TYPE_STRUCT:
502 size = 0;
503 for (i = 0; i < type->length; i++) {
504 size += type_size(type->fields.structure[i].type);
505 }
506 return size;
507 case GLSL_TYPE_SAMPLER:
508 /* Samplers take up one slot in UNIFORMS[], but they're baked in
509 * at link time.
510 */
511 return 1;
512 case GLSL_TYPE_VOID:
513 case GLSL_TYPE_ERROR:
514 case GLSL_TYPE_INTERFACE:
515 assert(0);
516 break;
517 }
518
519 return 0;
520 }
521
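/* Allocate a new virtual GRF of "size" vec4s, growing the bookkeeping
 * arrays as needed, and return its index. For example, a variable of type
 * mat4 (type_size() == 4) reserves four consecutive vec4 slots behind one
 * virtual register number.
 */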
522 int
523 vec4_visitor::virtual_grf_alloc(int size)
524 {
525 if (virtual_grf_array_size <= virtual_grf_count) {
526 if (virtual_grf_array_size == 0)
527 virtual_grf_array_size = 16;
528 else
529 virtual_grf_array_size *= 2;
530 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
531 virtual_grf_array_size);
532 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
533 virtual_grf_array_size);
534 }
535 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
536 virtual_grf_reg_count += size;
537 virtual_grf_sizes[virtual_grf_count] = size;
538 return virtual_grf_count++;
539 }
540
541 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
542 {
543 init();
544
545 this->file = GRF;
546 this->reg = v->virtual_grf_alloc(type_size(type));
547
548 if (type->is_array() || type->is_record()) {
549 this->swizzle = BRW_SWIZZLE_NOOP;
550 } else {
551 this->swizzle = swizzle_for_size(type->vector_elements);
552 }
553
554 this->type = brw_type_for_base_type(type);
555 }
556
557 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
558 {
559 init();
560
561 this->file = GRF;
562 this->reg = v->virtual_grf_alloc(type_size(type));
563
564 if (type->is_array() || type->is_record()) {
565 this->writemask = WRITEMASK_XYZW;
566 } else {
567 this->writemask = (1 << type->vector_elements) - 1;
568 }
569
570 this->type = brw_type_for_base_type(type);
571 }
572
573 /* Our support for uniforms is piggy-backed on the struct
574 * gl_fragment_program, because that's where the values actually
575 * get stored, rather than in some global gl_shader_program uniform
576 * store.
577 */
578 void
579 vec4_visitor::setup_uniform_values(ir_variable *ir)
580 {
581 int namelen = strlen(ir->name);
582
583 /* The data for our (non-builtin) uniforms is stored in a series of
584 * gl_uniform_driver_storage structs for each subcomponent that
585 * glGetUniformLocation() could name. We know it's been set up in the same
586 * order we'd walk the type, so walk the list of storage and find anything
587 * with our name, or the prefix of a component that starts with our name.
588 */
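/* For example, a "uniform mat2 m[2]" shows up as one storage entry with
 * array_elements == 2 and matrix_columns == 2, so vector_count below is 4
 * and four vec4 uniform slots get filled, each using two components and
 * padding the remaining two with zero.
 */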
589 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
590 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
591
592 if (strncmp(ir->name, storage->name, namelen) != 0 ||
593 (storage->name[namelen] != 0 &&
594 storage->name[namelen] != '.' &&
595 storage->name[namelen] != '[')) {
596 continue;
597 }
598
599 gl_constant_value *components = storage->storage;
600 unsigned vector_count = (MAX2(storage->array_elements, 1) *
601 storage->type->matrix_columns);
602
603 for (unsigned s = 0; s < vector_count; s++) {
604 uniform_vector_size[uniforms] = storage->type->vector_elements;
605
606 int i;
607 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
608 prog_data->base.param[uniforms * 4 + i] = &components->f;
609 components++;
610 }
611 for (; i < 4; i++) {
612 static float zero = 0;
613 prog_data->base.param[uniforms * 4 + i] = &zero;
614 }
615
616 uniforms++;
617 }
618 }
619 }
620
621 void
622 vec4_visitor::setup_uniform_clipplane_values()
623 {
624 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
625
626 if (intel->gen < 6) {
627 /* Pre-Gen6, we compact clip planes. For example, if the user
628 * enables just clip planes 0, 1, and 3, we will enable clip planes
629 * 0, 1, and 2 in the hardware, and we'll move clip plane 3 to clip
630 * plane 2. This simplifies the implementation of the Gen6 clip
631 * thread.
632 */
633 int compacted_clipplane_index = 0;
634 for (int i = 0; i < MAX_CLIP_PLANES; ++i) {
635 if (!(c->key.base.userclip_planes_enabled_gen_4_5 & (1 << i)))
636 continue;
637
638 this->uniform_vector_size[this->uniforms] = 4;
639 this->userplane[compacted_clipplane_index] = dst_reg(UNIFORM, this->uniforms);
640 this->userplane[compacted_clipplane_index].type = BRW_REGISTER_TYPE_F;
641 for (int j = 0; j < 4; ++j) {
642 prog_data->base.param[this->uniforms * 4 + j] = &clip_planes[i][j];
643 }
644 ++compacted_clipplane_index;
645 ++this->uniforms;
646 }
647 } else {
648 /* In Gen6 and later, we don't compact clip planes, because this
649 * simplifies the implementation of gl_ClipDistance.
650 */
651 for (int i = 0; i < c->key.base.nr_userclip_plane_consts; ++i) {
652 this->uniform_vector_size[this->uniforms] = 4;
653 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
654 this->userplane[i].type = BRW_REGISTER_TYPE_F;
655 for (int j = 0; j < 4; ++j) {
656 prog_data->base.param[this->uniforms * 4 + j] = &clip_planes[i][j];
657 }
658 ++this->uniforms;
659 }
660 }
661 }
662
663 /* Our support for builtin uniforms is even scarier than non-builtin.
664 * It sits on top of the PROG_STATE_VAR parameters that are
665 * automatically updated from GL context state.
666 */
667 void
668 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
669 {
670 const ir_state_slot *const slots = ir->state_slots;
671 assert(ir->state_slots != NULL);
672
673 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
674 /* This state reference has already been set up by ir_to_mesa,
675 * but we'll get the same index back here. We can reference
676 * ParameterValues directly, since unlike brw_fs.cpp, we never
677 * add new state references during compile.
678 */
679 int index = _mesa_add_state_reference(this->prog->Parameters,
680 (gl_state_index *)slots[i].tokens);
681 float *values = &this->prog->Parameters->ParameterValues[index][0].f;
682
683 this->uniform_vector_size[this->uniforms] = 0;
684 /* Add each of the unique swizzled channels of the element.
685 * This will end up matching the size of the glsl_type of this field.
686 */
687 int last_swiz = -1;
688 for (unsigned int j = 0; j < 4; j++) {
689 int swiz = GET_SWZ(slots[i].swizzle, j);
690 last_swiz = swiz;
691
692 prog_data->base.param[this->uniforms * 4 + j] = &values[swiz];
693 if (swiz <= last_swiz)
694 this->uniform_vector_size[this->uniforms]++;
695 }
696 this->uniforms++;
697 }
698 }
699
700 dst_reg *
701 vec4_visitor::variable_storage(ir_variable *var)
702 {
703 return (dst_reg *)hash_table_find(this->variable_ht, var);
704 }
705
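/* Evaluate a boolean rvalue and emit whatever sets the flag register so
 * that a following instruction can be predicated on it; *predicate is set
 * to the predicate mode the caller should use (e.g. ALIGN16_ALL4H for
 * ir_binop_all_equal, plain BRW_PREDICATE_NORMAL for ordinary comparisons).
 */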
706 void
707 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir, uint32_t *predicate)
708 {
709 ir_expression *expr = ir->as_expression();
710
711 *predicate = BRW_PREDICATE_NORMAL;
712
713 if (expr) {
714 src_reg op[2];
715 vec4_instruction *inst;
716
717 assert(expr->get_num_operands() <= 2);
718 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
719 expr->operands[i]->accept(this);
720 op[i] = this->result;
721
722 resolve_ud_negate(&op[i]);
723 }
724
725 switch (expr->operation) {
726 case ir_unop_logic_not:
727 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
728 inst->conditional_mod = BRW_CONDITIONAL_Z;
729 break;
730
731 case ir_binop_logic_xor:
732 inst = emit(XOR(dst_null_d(), op[0], op[1]));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 break;
735
736 case ir_binop_logic_or:
737 inst = emit(OR(dst_null_d(), op[0], op[1]));
738 inst->conditional_mod = BRW_CONDITIONAL_NZ;
739 break;
740
741 case ir_binop_logic_and:
742 inst = emit(AND(dst_null_d(), op[0], op[1]));
743 inst->conditional_mod = BRW_CONDITIONAL_NZ;
744 break;
745
746 case ir_unop_f2b:
747 if (intel->gen >= 6) {
748 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
749 } else {
750 inst = emit(MOV(dst_null_f(), op[0]));
751 inst->conditional_mod = BRW_CONDITIONAL_NZ;
752 }
753 break;
754
755 case ir_unop_i2b:
756 if (intel->gen >= 6) {
757 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
758 } else {
759 inst = emit(MOV(dst_null_d(), op[0]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 }
762 break;
763
764 case ir_binop_all_equal:
765 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
766 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
767 break;
768
769 case ir_binop_any_nequal:
770 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
771 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
772 break;
773
774 case ir_unop_any:
775 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
776 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
777 break;
778
779 case ir_binop_greater:
780 case ir_binop_gequal:
781 case ir_binop_less:
782 case ir_binop_lequal:
783 case ir_binop_equal:
784 case ir_binop_nequal:
785 emit(CMP(dst_null_d(), op[0], op[1],
786 brw_conditional_for_comparison(expr->operation)));
787 break;
788
789 default:
790 assert(!"not reached");
791 break;
792 }
793 return;
794 }
795
796 ir->accept(this);
797
798 resolve_ud_negate(&this->result);
799
800 if (intel->gen >= 6) {
801 vec4_instruction *inst = emit(AND(dst_null_d(),
802 this->result, src_reg(1)));
803 inst->conditional_mod = BRW_CONDITIONAL_NZ;
804 } else {
805 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
806 inst->conditional_mod = BRW_CONDITIONAL_NZ;
807 }
808 }
809
810 /**
811 * Emit a gen6 IF statement with the comparison folded into the IF
812 * instruction.
813 */
814 void
815 vec4_visitor::emit_if_gen6(ir_if *ir)
816 {
817 ir_expression *expr = ir->condition->as_expression();
818
819 if (expr) {
820 src_reg op[2];
821 dst_reg temp;
822
823 assert(expr->get_num_operands() <= 2);
824 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
825 expr->operands[i]->accept(this);
826 op[i] = this->result;
827 }
828
829 switch (expr->operation) {
830 case ir_unop_logic_not:
831 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
832 return;
833
834 case ir_binop_logic_xor:
835 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
836 return;
837
838 case ir_binop_logic_or:
839 temp = dst_reg(this, glsl_type::bool_type);
840 emit(OR(temp, op[0], op[1]));
841 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
842 return;
843
844 case ir_binop_logic_and:
845 temp = dst_reg(this, glsl_type::bool_type);
846 emit(AND(temp, op[0], op[1]));
847 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
848 return;
849
850 case ir_unop_f2b:
851 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
852 return;
853
854 case ir_unop_i2b:
855 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
856 return;
857
858 case ir_binop_greater:
859 case ir_binop_gequal:
860 case ir_binop_less:
861 case ir_binop_lequal:
862 case ir_binop_equal:
863 case ir_binop_nequal:
864 emit(IF(op[0], op[1],
865 brw_conditional_for_comparison(expr->operation)));
866 return;
867
868 case ir_binop_all_equal:
869 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
870 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
871 return;
872
873 case ir_binop_any_nequal:
874 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
875 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
876 return;
877
878 case ir_unop_any:
879 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
880 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
881 return;
882
883 default:
884 assert(!"not reached");
885 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
886 return;
887 }
888 return;
889 }
890
891 ir->condition->accept(this);
892
893 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
894 }
895
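/* Convenience helper: returns a copy of "r" with its writemask replaced,
 * e.g. with_writemask(reg, WRITEMASK_XYZ) writes only the xyz channels
 * while leaving the original dst_reg untouched.
 */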
896 static dst_reg
897 with_writemask(dst_reg const & r, int mask)
898 {
899 dst_reg result = r;
900 result.writemask = mask;
901 return result;
902 }
903
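/* Emit fix-up code at the top of the VS for vertex attribute formats the
 * fixed-function vertex fetcher can't produce directly: GL_FIXED rescaling,
 * sign recovery for 2_10_10_10 formats, BGRA swizzling, and (un)signed
 * normalization or plain integer-to-float conversion, all driven by the
 * per-attribute wa_flags in the key.
 */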
904 void
905 vec4_vs_visitor::emit_prolog()
906 {
907 dst_reg sign_recovery_shift;
908 dst_reg normalize_factor;
909 dst_reg es3_normalize_factor;
910
911 for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
912 if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
913 uint8_t wa_flags = c->key.gl_attrib_wa_flags[i];
914 dst_reg reg(ATTR, i);
915 dst_reg reg_d = reg;
916 reg_d.type = BRW_REGISTER_TYPE_D;
917 dst_reg reg_ud = reg;
918 reg_ud.type = BRW_REGISTER_TYPE_UD;
919
920 /* Do GL_FIXED rescaling for GLES2.0. Our GL_FIXED attributes
921 * come in as floating point conversions of the integer values.
922 */
923 if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
924 dst_reg dst = reg;
925 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
926 dst.writemask = (1 << (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK)) - 1;
927 emit(MUL(dst, src_reg(dst), src_reg(1.0f / 65536.0f)));
928 }
929
930 /* Do sign recovery for 2101010 formats if required. */
931 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
932 if (sign_recovery_shift.file == BAD_FILE) {
933 /* shift constant: <22,22,22,30> */
934 sign_recovery_shift = dst_reg(this, glsl_type::uvec4_type);
935 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_XYZ), src_reg(22u)));
936 emit(MOV(with_writemask(sign_recovery_shift, WRITEMASK_W), src_reg(30u)));
937 }
938
939 emit(SHL(reg_ud, src_reg(reg_ud), src_reg(sign_recovery_shift)));
940 emit(ASR(reg_d, src_reg(reg_d), src_reg(sign_recovery_shift)));
941 }
942
943 /* Apply BGRA swizzle if required. */
944 if (wa_flags & BRW_ATTRIB_WA_BGRA) {
945 src_reg temp = src_reg(reg);
946 temp.swizzle = BRW_SWIZZLE4(2,1,0,3);
947 emit(MOV(reg, temp));
948 }
949
950 if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
951 /* ES 3.0 has different rules for converting signed normalized
952 * fixed-point numbers than desktop GL.
953 */
954 if (_mesa_is_gles3(ctx) && (wa_flags & BRW_ATTRIB_WA_SIGN)) {
955 /* According to equation 2.2 of the ES 3.0 specification,
956 * signed normalization conversion is done by:
957 *
958 * f = c / (2^(b-1)-1)
959 */
960 if (es3_normalize_factor.file == BAD_FILE) {
961 /* mul constant: 1 / (2^(b-1) - 1) */
962 es3_normalize_factor = dst_reg(this, glsl_type::vec4_type);
963 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_XYZ),
964 src_reg(1.0f / ((1<<9) - 1))));
965 emit(MOV(with_writemask(es3_normalize_factor, WRITEMASK_W),
966 src_reg(1.0f / ((1<<1) - 1))));
967 }
968
969 dst_reg dst = reg;
970 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
971 emit(MOV(dst, src_reg(reg_d)));
972 emit(MUL(dst, src_reg(dst), src_reg(es3_normalize_factor)));
973 emit_minmax(BRW_CONDITIONAL_G, dst, src_reg(dst), src_reg(-1.0f));
974 } else {
975 /* The following equations are from the OpenGL 3.2 specification:
976 *
977 * 2.1 unsigned normalization
978 * f = c/(2^n-1)
979 *
980 * 2.2 signed normalization
981 * f = (2c+1)/(2^n-1)
982 *
983 * Both of these share a common divisor, which is represented by
984 * "normalize_factor" in the code below.
985 */
986 if (normalize_factor.file == BAD_FILE) {
987 /* 1 / (2^b - 1) for b=<10,10,10,2> */
988 normalize_factor = dst_reg(this, glsl_type::vec4_type);
989 emit(MOV(with_writemask(normalize_factor, WRITEMASK_XYZ),
990 src_reg(1.0f / ((1<<10) - 1))));
991 emit(MOV(with_writemask(normalize_factor, WRITEMASK_W),
992 src_reg(1.0f / ((1<<2) - 1))));
993 }
994
995 dst_reg dst = reg;
996 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
997 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
998
999 /* For signed normalization, we want the numerator to be 2c+1. */
1000 if (wa_flags & BRW_ATTRIB_WA_SIGN) {
1001 emit(MUL(dst, src_reg(dst), src_reg(2.0f)));
1002 emit(ADD(dst, src_reg(dst), src_reg(1.0f)));
1003 }
1004
1005 emit(MUL(dst, src_reg(dst), src_reg(normalize_factor)));
1006 }
1007 }
1008
1009 if (wa_flags & BRW_ATTRIB_WA_SCALE) {
1010 dst_reg dst = reg;
1011 dst.type = brw_type_for_base_type(glsl_type::vec4_type);
1012 emit(MOV(dst, src_reg((wa_flags & BRW_ATTRIB_WA_SIGN) ? reg_d : reg_ud)));
1013 }
1014 }
1015 }
1016 }
1017
1018
1019 dst_reg *
1020 vec4_vs_visitor::make_reg_for_system_value(ir_variable *ir)
1021 {
1022 /* VertexID is stored by the VF as the last vertex element, but
1023 * we don't represent it with a flag in inputs_read, so we call
1024 * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
1025 */
1026 dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
1027 prog_data->uses_vertexid = true;
1028
1029 switch (ir->location) {
1030 case SYSTEM_VALUE_VERTEX_ID:
1031 reg->writemask = WRITEMASK_X;
1032 break;
1033 case SYSTEM_VALUE_INSTANCE_ID:
1034 reg->writemask = WRITEMASK_Y;
1035 break;
1036 default:
1037 assert(!"not reached");
1038 break;
1039 }
1040
1041 return reg;
1042 }
1043
1044
1045 void
1046 vec4_visitor::visit(ir_variable *ir)
1047 {
1048 dst_reg *reg = NULL;
1049
1050 if (variable_storage(ir))
1051 return;
1052
1053 switch (ir->mode) {
1054 case ir_var_shader_in:
1055 reg = new(mem_ctx) dst_reg(ATTR, ir->location);
1056 break;
1057
1058 case ir_var_shader_out:
1059 reg = new(mem_ctx) dst_reg(this, ir->type);
1060
1061 for (int i = 0; i < type_size(ir->type); i++) {
1062 output_reg[ir->location + i] = *reg;
1063 output_reg[ir->location + i].reg_offset = i;
1064 output_reg[ir->location + i].type =
1065 brw_type_for_base_type(ir->type->get_scalar_type());
1066 output_reg_annotation[ir->location + i] = ir->name;
1067 }
1068 break;
1069
1070 case ir_var_auto:
1071 case ir_var_temporary:
1072 reg = new(mem_ctx) dst_reg(this, ir->type);
1073 break;
1074
1075 case ir_var_uniform:
1076 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
1077
1078 /* Thanks to the lower_ubo_reference pass, we will see only
1079 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
1080 * variables, so no need for them to be in variable_ht.
1081 */
1082 if (ir->is_in_uniform_block())
1083 return;
1084
1085 /* Track how big the whole uniform variable is, in case we need to put a
1086 * copy of its data into pull constants for array access.
1087 */
1088 this->uniform_size[this->uniforms] = type_size(ir->type);
1089
1090 if (!strncmp(ir->name, "gl_", 3)) {
1091 setup_builtin_uniform_values(ir);
1092 } else {
1093 setup_uniform_values(ir);
1094 }
1095 break;
1096
1097 case ir_var_system_value:
1098 reg = make_reg_for_system_value(ir);
1099 break;
1100
1101 default:
1102 assert(!"not reached");
1103 }
1104
1105 reg->type = brw_type_for_base_type(ir->type);
1106 hash_table_insert(this->variable_ht, reg, ir);
1107 }
1108
1109 void
1110 vec4_visitor::visit(ir_loop *ir)
1111 {
1112 dst_reg counter;
1113
1114 /* We don't want debugging output to print the whole body of the
1115 * loop as the annotation.
1116 */
1117 this->base_ir = NULL;
1118
1119 if (ir->counter != NULL) {
1120 this->base_ir = ir->counter;
1121 ir->counter->accept(this);
1122 counter = *(variable_storage(ir->counter));
1123
1124 if (ir->from != NULL) {
1125 this->base_ir = ir->from;
1126 ir->from->accept(this);
1127
1128 emit(MOV(counter, this->result));
1129 }
1130 }
1131
1132 emit(BRW_OPCODE_DO);
1133
1134 if (ir->to) {
1135 this->base_ir = ir->to;
1136 ir->to->accept(this);
1137
1138 emit(CMP(dst_null_d(), src_reg(counter), this->result,
1139 brw_conditional_for_comparison(ir->cmp)));
1140
1141 vec4_instruction *inst = emit(BRW_OPCODE_BREAK);
1142 inst->predicate = BRW_PREDICATE_NORMAL;
1143 }
1144
1145 visit_instructions(&ir->body_instructions);
1146
1147
1148 if (ir->increment) {
1149 this->base_ir = ir->increment;
1150 ir->increment->accept(this);
1151 emit(ADD(counter, src_reg(counter), this->result));
1152 }
1153
1154 emit(BRW_OPCODE_WHILE);
1155 }
1156
1157 void
1158 vec4_visitor::visit(ir_loop_jump *ir)
1159 {
1160 switch (ir->mode) {
1161 case ir_loop_jump::jump_break:
1162 emit(BRW_OPCODE_BREAK);
1163 break;
1164 case ir_loop_jump::jump_continue:
1165 emit(BRW_OPCODE_CONTINUE);
1166 break;
1167 }
1168 }
1169
1170
1171 void
1172 vec4_visitor::visit(ir_function_signature *ir)
1173 {
1174 assert(0);
1175 (void)ir;
1176 }
1177
1178 void
1179 vec4_visitor::visit(ir_function *ir)
1180 {
1181 /* Ignore function bodies other than main() -- we shouldn't see calls to
1182 * them since they should all be inlined.
1183 */
1184 if (strcmp(ir->name, "main") == 0) {
1185 const ir_function_signature *sig;
1186 exec_list empty;
1187
1188 sig = ir->matching_signature(&empty);
1189
1190 assert(sig);
1191
1192 visit_instructions(&sig->body);
1193 }
1194 }
1195
1196 bool
1197 vec4_visitor::try_emit_sat(ir_expression *ir)
1198 {
1199 ir_rvalue *sat_src = ir->as_rvalue_to_saturate();
1200 if (!sat_src)
1201 return false;
1202
1203 sat_src->accept(this);
1204 src_reg src = this->result;
1205
1206 this->result = src_reg(this, ir->type);
1207 vec4_instruction *inst;
1208 inst = emit(MOV(dst_reg(this->result), src));
1209 inst->saturate = true;
1210
1211 return true;
1212 }
1213
1214 void
1215 vec4_visitor::emit_bool_comparison(unsigned int op,
1216 dst_reg dst, src_reg src0, src_reg src1)
1217 {
1218 /* original gen4 does destination conversion before comparison. */
1219 if (intel->gen < 5)
1220 dst.type = src0.type;
1221
1222 emit(CMP(dst, src0, src1, brw_conditional_for_comparison(op)));
1223
1224 dst.type = BRW_REGISTER_TYPE_D;
1225 emit(AND(dst, src_reg(dst), src_reg(0x1)));
1226 }
1227
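/* Emit a min/max: BRW_CONDITIONAL_L selects the smaller operand and
 * BRW_CONDITIONAL_G the larger. On gen6+ this is a single SEL with a
 * conditional mod; older hardware needs an explicit CMP followed by a
 * predicated SEL.
 */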
1228 void
1229 vec4_visitor::emit_minmax(uint32_t conditionalmod, dst_reg dst,
1230 src_reg src0, src_reg src1)
1231 {
1232 vec4_instruction *inst;
1233
1234 if (intel->gen >= 6) {
1235 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1236 inst->conditional_mod = conditionalmod;
1237 } else {
1238 emit(CMP(dst, src0, src1, conditionalmod));
1239
1240 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1241 inst->predicate = BRW_PREDICATE_NORMAL;
1242 }
1243 }
1244
1245 void
1246 vec4_visitor::visit(ir_expression *ir)
1247 {
1248 unsigned int operand;
1249 src_reg op[Elements(ir->operands)];
1250 src_reg result_src;
1251 dst_reg result_dst;
1252 vec4_instruction *inst;
1253
1254 if (try_emit_sat(ir))
1255 return;
1256
1257 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1258 this->result.file = BAD_FILE;
1259 ir->operands[operand]->accept(this);
1260 if (this->result.file == BAD_FILE) {
1261 printf("Failed to get tree for expression operand:\n");
1262 ir->operands[operand]->print();
1263 exit(1);
1264 }
1265 op[operand] = this->result;
1266
1267 /* Matrix expression operands should have been broken down to vector
1268 * operations already.
1269 */
1270 assert(!ir->operands[operand]->type->is_matrix());
1271 }
1272
1273 int vector_elements = ir->operands[0]->type->vector_elements;
1274 if (ir->operands[1]) {
1275 vector_elements = MAX2(vector_elements,
1276 ir->operands[1]->type->vector_elements);
1277 }
1278
1279 this->result.file = BAD_FILE;
1280
1281 /* Storage for our result. Ideally for an assignment we'd be using
1282 * the actual storage for the result here, instead.
1283 */
1284 result_src = src_reg(this, ir->type);
1285 /* convenience for the emit functions below. */
1286 result_dst = dst_reg(result_src);
1287 /* If nothing special happens, this is the result. */
1288 this->result = result_src;
1289 /* Limit writes to the channels that will be used by result_src later.
1290 * This does limit this temp's use as a temporary for multi-instruction
1291 * sequences.
1292 */
1293 result_dst.writemask = (1 << ir->type->vector_elements) - 1;
1294
1295 switch (ir->operation) {
1296 case ir_unop_logic_not:
1297 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
1298 * the one's complement of the whole register, not just bit 0.
1299 */
1300 emit(XOR(result_dst, op[0], src_reg(1)));
1301 break;
1302 case ir_unop_neg:
1303 op[0].negate = !op[0].negate;
1304 this->result = op[0];
1305 break;
1306 case ir_unop_abs:
1307 op[0].abs = true;
1308 op[0].negate = false;
1309 this->result = op[0];
1310 break;
1311
1312 case ir_unop_sign:
1313 emit(MOV(result_dst, src_reg(0.0f)));
1314
1315 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_G));
1316 inst = emit(MOV(result_dst, src_reg(1.0f)));
1317 inst->predicate = BRW_PREDICATE_NORMAL;
1318
1319 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_L));
1320 inst = emit(MOV(result_dst, src_reg(-1.0f)));
1321 inst->predicate = BRW_PREDICATE_NORMAL;
1322
1323 break;
1324
1325 case ir_unop_rcp:
1326 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1327 break;
1328
1329 case ir_unop_exp2:
1330 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1331 break;
1332 case ir_unop_log2:
1333 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1334 break;
1335 case ir_unop_exp:
1336 case ir_unop_log:
1337 assert(!"not reached: should be handled by ir_explog_to_explog2");
1338 break;
1339 case ir_unop_sin:
1340 case ir_unop_sin_reduced:
1341 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1342 break;
1343 case ir_unop_cos:
1344 case ir_unop_cos_reduced:
1345 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1346 break;
1347
1348 case ir_unop_dFdx:
1349 case ir_unop_dFdy:
1350 assert(!"derivatives not valid in vertex shader");
1351 break;
1352
1353 case ir_unop_noise:
1354 assert(!"not reached: should be handled by lower_noise");
1355 break;
1356
1357 case ir_binop_add:
1358 emit(ADD(result_dst, op[0], op[1]));
1359 break;
1360 case ir_binop_sub:
1361 assert(!"not reached: should be handled by ir_sub_to_add_neg");
1362 break;
1363
1364 case ir_binop_mul:
1365 if (ir->type->is_integer()) {
1366 /* For integer multiplication, the MUL uses the low 16 bits
1367 * of one of the operands (src0 on gen6, src1 on gen7). The
1368 * MACH accumulates in the contribution of the upper 16 bits
1369 * of that operand.
1370 *
1371 * FINISHME: Emit just the MUL if we know an operand is small
1372 * enough.
1373 */
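/* Roughly: MUL into the accumulator forms the partial product from the
 * low 16 bits, MACH folds in the high-16 contribution (its own result is
 * discarded via the null destination), and the final MOV copies the
 * completed low 32 bits of the product out of the accumulator.
 */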
1374 struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_D);
1375
1376 emit(MUL(acc, op[0], op[1]));
1377 emit(MACH(dst_null_d(), op[0], op[1]));
1378 emit(MOV(result_dst, src_reg(acc)));
1379 } else {
1380 emit(MUL(result_dst, op[0], op[1]));
1381 }
1382 break;
1383 case ir_binop_div:
1384 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1385 assert(ir->type->is_integer());
1386 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1387 break;
1388 case ir_binop_mod:
1389 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1390 assert(ir->type->is_integer());
1391 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1392 break;
1393
1394 case ir_binop_less:
1395 case ir_binop_greater:
1396 case ir_binop_lequal:
1397 case ir_binop_gequal:
1398 case ir_binop_equal:
1399 case ir_binop_nequal: {
1400 emit(CMP(result_dst, op[0], op[1],
1401 brw_conditional_for_comparison(ir->operation)));
1402 emit(AND(result_dst, result_src, src_reg(0x1)));
1403 break;
1404 }
1405
1406 case ir_binop_all_equal:
1407 /* "==" operator producing a scalar boolean. */
1408 if (ir->operands[0]->type->is_vector() ||
1409 ir->operands[1]->type->is_vector()) {
1410 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1411 emit(MOV(result_dst, src_reg(0)));
1412 inst = emit(MOV(result_dst, src_reg(1)));
1413 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1414 } else {
1415 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1416 emit(AND(result_dst, result_src, src_reg(0x1)));
1417 }
1418 break;
1419 case ir_binop_any_nequal:
1420 /* "!=" operator producing a scalar boolean. */
1421 if (ir->operands[0]->type->is_vector() ||
1422 ir->operands[1]->type->is_vector()) {
1423 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1424
1425 emit(MOV(result_dst, src_reg(0)));
1426 inst = emit(MOV(result_dst, src_reg(1)));
1427 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1428 } else {
1429 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1430 emit(AND(result_dst, result_src, src_reg(0x1)));
1431 }
1432 break;
1433
1434 case ir_unop_any:
1435 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1436 emit(MOV(result_dst, src_reg(0)));
1437
1438 inst = emit(MOV(result_dst, src_reg(1)));
1439 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1440 break;
1441
1442 case ir_binop_logic_xor:
1443 emit(XOR(result_dst, op[0], op[1]));
1444 break;
1445
1446 case ir_binop_logic_or:
1447 emit(OR(result_dst, op[0], op[1]));
1448 break;
1449
1450 case ir_binop_logic_and:
1451 emit(AND(result_dst, op[0], op[1]));
1452 break;
1453
1454 case ir_binop_dot:
1455 assert(ir->operands[0]->type->is_vector());
1456 assert(ir->operands[0]->type == ir->operands[1]->type);
1457 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1458 break;
1459
1460 case ir_unop_sqrt:
1461 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1462 break;
1463 case ir_unop_rsq:
1464 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1465 break;
1466
1467 case ir_unop_bitcast_i2f:
1468 case ir_unop_bitcast_u2f:
1469 this->result = op[0];
1470 this->result.type = BRW_REGISTER_TYPE_F;
1471 break;
1472
1473 case ir_unop_bitcast_f2i:
1474 this->result = op[0];
1475 this->result.type = BRW_REGISTER_TYPE_D;
1476 break;
1477
1478 case ir_unop_bitcast_f2u:
1479 this->result = op[0];
1480 this->result.type = BRW_REGISTER_TYPE_UD;
1481 break;
1482
1483 case ir_unop_i2f:
1484 case ir_unop_i2u:
1485 case ir_unop_u2i:
1486 case ir_unop_u2f:
1487 case ir_unop_b2f:
1488 case ir_unop_b2i:
1489 case ir_unop_f2i:
1490 case ir_unop_f2u:
1491 emit(MOV(result_dst, op[0]));
1492 break;
1493 case ir_unop_f2b:
1494 case ir_unop_i2b: {
1495 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1496 emit(AND(result_dst, result_src, src_reg(1)));
1497 break;
1498 }
1499
1500 case ir_unop_trunc:
1501 emit(RNDZ(result_dst, op[0]));
1502 break;
1503 case ir_unop_ceil:
1504 op[0].negate = !op[0].negate;
1505 inst = emit(RNDD(result_dst, op[0]));
1506 this->result.negate = true;
1507 break;
1508 case ir_unop_floor:
1509 inst = emit(RNDD(result_dst, op[0]));
1510 break;
1511 case ir_unop_fract:
1512 inst = emit(FRC(result_dst, op[0]));
1513 break;
1514 case ir_unop_round_even:
1515 emit(RNDE(result_dst, op[0]));
1516 break;
1517
1518 case ir_binop_min:
1519 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1520 break;
1521 case ir_binop_max:
1522 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1523 break;
1524
1525 case ir_binop_pow:
1526 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1527 break;
1528
1529 case ir_unop_bit_not:
1530 inst = emit(NOT(result_dst, op[0]));
1531 break;
1532 case ir_binop_bit_and:
1533 inst = emit(AND(result_dst, op[0], op[1]));
1534 break;
1535 case ir_binop_bit_xor:
1536 inst = emit(XOR(result_dst, op[0], op[1]));
1537 break;
1538 case ir_binop_bit_or:
1539 inst = emit(OR(result_dst, op[0], op[1]));
1540 break;
1541
1542 case ir_binop_lshift:
1543 inst = emit(SHL(result_dst, op[0], op[1]));
1544 break;
1545
1546 case ir_binop_rshift:
1547 if (ir->type->base_type == GLSL_TYPE_INT)
1548 inst = emit(ASR(result_dst, op[0], op[1]));
1549 else
1550 inst = emit(SHR(result_dst, op[0], op[1]));
1551 break;
1552
1553 case ir_binop_ubo_load: {
1554 ir_constant *uniform_block = ir->operands[0]->as_constant();
1555 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1556 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1557 src_reg offset = op[1];
1558
1559 /* Now, load the vector from that offset. */
1560 assert(ir->type->is_vector() || ir->type->is_scalar());
1561
1562 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1563 packed_consts.type = result.type;
1564 src_reg surf_index =
1565 src_reg(SURF_INDEX_VS_UBO(uniform_block->value.u[0]));
1566 if (const_offset_ir) {
1567 offset = src_reg(const_offset / 16);
1568 } else {
1569 emit(SHR(dst_reg(offset), offset, src_reg(4)));
1570 }
1571
1572 vec4_instruction *pull =
1573 emit(new(mem_ctx) vec4_instruction(this,
1574 VS_OPCODE_PULL_CONSTANT_LOAD,
1575 dst_reg(packed_consts),
1576 surf_index,
1577 offset));
1578 pull->base_mrf = 14;
1579 pull->mlen = 1;
1580
1581 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1582 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1583 const_offset % 16 / 4,
1584 const_offset % 16 / 4,
1585 const_offset % 16 / 4);
1586
1587 /* UBO bools are any nonzero int. We store bools as either 0 or 1. */
1588 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1589 emit(CMP(result_dst, packed_consts, src_reg(0u),
1590 BRW_CONDITIONAL_NZ));
1591 emit(AND(result_dst, result, src_reg(0x1)));
1592 } else {
1593 emit(MOV(result_dst, packed_consts));
1594 }
1595 break;
1596 }
1597
1598 case ir_triop_lrp:
1599 assert(!"not reached: should be handled by lrp_to_arith");
1600 break;
1601
1602 case ir_quadop_vector:
1603 assert(!"not reached: should be handled by lower_quadop_vector");
1604 break;
1605
1606 case ir_unop_pack_half_2x16:
1607 emit_pack_half_2x16(result_dst, op[0]);
1608 break;
1609 case ir_unop_unpack_half_2x16:
1610 emit_unpack_half_2x16(result_dst, op[0]);
1611 break;
1612 case ir_unop_pack_snorm_2x16:
1613 case ir_unop_pack_snorm_4x8:
1614 case ir_unop_pack_unorm_2x16:
1615 case ir_unop_pack_unorm_4x8:
1616 case ir_unop_unpack_snorm_2x16:
1617 case ir_unop_unpack_snorm_4x8:
1618 case ir_unop_unpack_unorm_2x16:
1619 case ir_unop_unpack_unorm_4x8:
1620 assert(!"not reached: should be handled by lower_packing_builtins");
1621 break;
1622 case ir_unop_unpack_half_2x16_split_x:
1623 case ir_unop_unpack_half_2x16_split_y:
1624 case ir_binop_pack_half_2x16_split:
1625 assert(!"not reached: should not occur in vertex shader");
1626 break;
1627 }
1628 }
1629
1630
1631 void
1632 vec4_visitor::visit(ir_swizzle *ir)
1633 {
1634 src_reg src;
1635 int i = 0;
1636 int swizzle[4];
1637
1638 /* Note that this is only swizzles in expressions, not those on the left
1639 * hand side of an assignment, which do write masking. See ir_assignment
1640 * for that.
1641 */
1642
1643 ir->val->accept(this);
1644 src = this->result;
1645 assert(src.file != BAD_FILE);
1646
1647 for (i = 0; i < ir->type->vector_elements; i++) {
1648 switch (i) {
1649 case 0:
1650 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1651 break;
1652 case 1:
1653 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1654 break;
1655 case 2:
1656 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1657 break;
1658 case 3:
1659 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1660 break;
1661 }
1662 }
1663 for (; i < 4; i++) {
1664 /* Replicate the last channel out. */
1665 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1666 }
1667
1668 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1669
1670 this->result = src;
1671 }
1672
1673 void
1674 vec4_visitor::visit(ir_dereference_variable *ir)
1675 {
1676 const struct glsl_type *type = ir->type;
1677 dst_reg *reg = variable_storage(ir->var);
1678
1679 if (!reg) {
1680 fail("Failed to find variable storage for %s\n", ir->var->name);
1681 this->result = src_reg(brw_null_reg());
1682 return;
1683 }
1684
1685 this->result = src_reg(*reg);
1686
1687 /* System values get their swizzle from the dst_reg writemask */
1688 if (ir->var->mode == ir_var_system_value)
1689 return;
1690
1691 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1692 this->result.swizzle = swizzle_for_size(type->vector_elements);
1693 }
1694
1695 void
1696 vec4_visitor::visit(ir_dereference_array *ir)
1697 {
1698 ir_constant *constant_index;
1699 src_reg src;
1700 int element_size = type_size(ir->type);
1701
1702 constant_index = ir->array_index->constant_expression_value();
1703
1704 ir->array->accept(this);
1705 src = this->result;
1706
1707 if (constant_index) {
1708 src.reg_offset += constant_index->value.i[0] * element_size;
1709 } else {
1710 /* Variable index array dereference. It eats the "vec4" of the
1711 * base of the array and an index that offsets the Mesa register
1712 * index.
1713 */
1714 ir->array_index->accept(this);
1715
1716 src_reg index_reg;
1717
1718 if (element_size == 1) {
1719 index_reg = this->result;
1720 } else {
1721 index_reg = src_reg(this, glsl_type::int_type);
1722
1723 emit(MUL(dst_reg(index_reg), this->result, src_reg(element_size)));
1724 }
1725
1726 if (src.reladdr) {
1727 src_reg temp = src_reg(this, glsl_type::int_type);
1728
1729 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1730
1731 index_reg = temp;
1732 }
1733
1734 src.reladdr = ralloc(mem_ctx, src_reg);
1735 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1736 }
1737
1738 /* If the type is smaller than a vec4, replicate the last channel out. */
1739 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1740 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1741 else
1742 src.swizzle = BRW_SWIZZLE_NOOP;
1743 src.type = brw_type_for_base_type(ir->type);
1744
1745 this->result = src;
1746 }
1747
1748 void
1749 vec4_visitor::visit(ir_dereference_record *ir)
1750 {
1751 unsigned int i;
1752 const glsl_type *struct_type = ir->record->type;
1753 int offset = 0;
1754
1755 ir->record->accept(this);
1756
1757 for (i = 0; i < struct_type->length; i++) {
1758 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1759 break;
1760 offset += type_size(struct_type->fields.structure[i].type);
1761 }
1762
1763 /* If the type is smaller than a vec4, replicate the last channel out. */
1764 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1765 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1766 else
1767 this->result.swizzle = BRW_SWIZZLE_NOOP;
1768 this->result.type = brw_type_for_base_type(ir->type);
1769
1770 this->result.reg_offset += offset;
1771 }
1772
1773 /**
1774 * We want to be careful in assignment setup to hit the actual storage
1775 * instead of potentially using a temporary like we might with the
1776 * ir_dereference handler.
1777 */
1778 static dst_reg
1779 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1780 {
1781 /* The LHS must be a dereference. If the LHS is a variable indexed array
1782 * access of a vector, it must be separated into a series of conditional moves
1783 * before reaching this point (see ir_vec_index_to_cond_assign).
1784 */
1785 assert(ir->as_dereference());
1786 ir_dereference_array *deref_array = ir->as_dereference_array();
1787 if (deref_array) {
1788 assert(!deref_array->array->type->is_vector());
1789 }
1790
1791 /* Use the rvalue deref handler for the most part. We'll ignore
1792 * swizzles in it and write swizzles using writemask, though.
1793 */
1794 ir->accept(v);
1795 return dst_reg(v->result);
1796 }
1797
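/* Copy a whole value that may span several registers: structs, arrays and
 * matrices are decomposed recursively until we reach scalars/vectors, and
 * each step emits one (optionally predicated) MOV and advances both
 * reg_offsets. Copying a mat3, for instance, ends up as three vec3 MOVs.
 */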
1798 void
1799 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1800 const struct glsl_type *type, uint32_t predicate)
1801 {
1802 if (type->base_type == GLSL_TYPE_STRUCT) {
1803 for (unsigned int i = 0; i < type->length; i++) {
1804 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1805 }
1806 return;
1807 }
1808
1809 if (type->is_array()) {
1810 for (unsigned int i = 0; i < type->length; i++) {
1811 emit_block_move(dst, src, type->fields.array, predicate);
1812 }
1813 return;
1814 }
1815
1816 if (type->is_matrix()) {
1817 const struct glsl_type *vec_type;
1818
1819 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1820 type->vector_elements, 1);
1821
1822 for (int i = 0; i < type->matrix_columns; i++) {
1823 emit_block_move(dst, src, vec_type, predicate);
1824 }
1825 return;
1826 }
1827
1828 assert(type->is_scalar() || type->is_vector());
1829
1830 dst->type = brw_type_for_base_type(type);
1831 src->type = dst->type;
1832
1833 dst->writemask = (1 << type->vector_elements) - 1;
1834
1835 src->swizzle = swizzle_for_size(type->vector_elements);
1836
1837 vec4_instruction *inst = emit(MOV(*dst, *src));
1838 inst->predicate = predicate;
1839
1840 dst->reg_offset++;
1841 src->reg_offset++;
1842 }
1843
1844
1845 /* If the RHS processing resulted in an instruction generating a
1846 * temporary value, and it would be easy to rewrite the instruction to
1847 * generate its result right into the LHS instead, do so. This ends
1848 * up reliably removing instructions where it can be tricky to do so
1849 * later without real UD chain information.
1850 */
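/* For example, "ADD temp, a, b" followed by what would be "MOV dst, temp"
 * gets rewritten so the ADD writes dst directly and the MOV is never
 * emitted.
 */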
1851 bool
1852 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
1853 dst_reg dst,
1854 src_reg src,
1855 vec4_instruction *pre_rhs_inst,
1856 vec4_instruction *last_rhs_inst)
1857 {
1858 /* This could be supported, but it would take more smarts. */
1859 if (ir->condition)
1860 return false;
1861
1862 if (pre_rhs_inst == last_rhs_inst)
1863 return false; /* No instructions generated to work with. */
1864
1865 /* Make sure the last instruction generated our source reg. */
1866 if (src.file != GRF ||
1867 src.file != last_rhs_inst->dst.file ||
1868 src.reg != last_rhs_inst->dst.reg ||
1869 src.reg_offset != last_rhs_inst->dst.reg_offset ||
1870 src.reladdr ||
1871 src.abs ||
1872 src.negate ||
1873 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
1874 return false;
1875
1876 /* Check that the last instruction fully initialized the channels
1877 * we want to use, in the order we want to use them. We could
1878 * potentially reswizzle the operands of many instructions so that
1879 * we could handle out of order channels, but don't yet.
1880 */
1881
1882 for (unsigned i = 0; i < 4; i++) {
1883 if (dst.writemask & (1 << i)) {
1884 if (!(last_rhs_inst->dst.writemask & (1 << i)))
1885 return false;
1886
1887 if (BRW_GET_SWZ(src.swizzle, i) != i)
1888 return false;
1889 }
1890 }
1891
1892 /* Success! Rewrite the instruction. */
1893 last_rhs_inst->dst.file = dst.file;
1894 last_rhs_inst->dst.reg = dst.reg;
1895 last_rhs_inst->dst.reg_offset = dst.reg_offset;
1896 last_rhs_inst->dst.reladdr = dst.reladdr;
1897 last_rhs_inst->dst.writemask &= dst.writemask;
1898
1899 return true;
1900 }
1901
1902 void
1903 vec4_visitor::visit(ir_assignment *ir)
1904 {
1905 dst_reg dst = get_assignment_lhs(ir->lhs, this);
1906 uint32_t predicate = BRW_PREDICATE_NONE;
1907
1908 if (!ir->lhs->type->is_scalar() &&
1909 !ir->lhs->type->is_vector()) {
1910 ir->rhs->accept(this);
1911 src_reg src = this->result;
1912
1913 if (ir->condition) {
1914 emit_bool_to_cond_code(ir->condition, &predicate);
1915 }
1916
1917 /* emit_block_move doesn't account for swizzles in the source register.
1918 * This should be ok, since the source register is a structure or an
1919 * array, and those can't be swizzled. But double-check to be sure.
1920 */
1921 assert(src.swizzle ==
1922 (ir->rhs->type->is_matrix()
1923 ? swizzle_for_size(ir->rhs->type->vector_elements)
1924 : BRW_SWIZZLE_NOOP));
1925
1926 emit_block_move(&dst, &src, ir->rhs->type, predicate);
1927 return;
1928 }
1929
1930 /* Now we're down to just a scalar/vector with writemasks. */
1931 int i;
1932
1933 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
1934 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1935
1936 ir->rhs->accept(this);
1937
1938 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
1939
1940 src_reg src = this->result;
1941
1942 int swizzles[4];
1943 int first_enabled_chan = 0;
1944 int src_chan = 0;
1945
1946 assert(ir->lhs->type->is_vector() ||
1947 ir->lhs->type->is_scalar());
1948 dst.writemask = ir->write_mask;
1949
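   /* Find the swizzle component of the first written channel; it is used
    * below to fill the swizzle slots of channels that aren't written.
    */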
1950 for (int i = 0; i < 4; i++) {
1951 if (dst.writemask & (1 << i)) {
1952 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
1953 break;
1954 }
1955 }
1956
1957 /* Swizzle a small RHS vector into the channels being written.
1958 *
1959    * GLSL IR treats write_mask as dictating how many channels are present
1960    * on the RHS, while our instructions need those channels to appear in
1961    * the slots of the vec4 they're written to.
1962 */
1963 for (int i = 0; i < 4; i++) {
1964 if (dst.writemask & (1 << i))
1965 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
1966 else
1967 swizzles[i] = first_enabled_chan;
1968 }
1969 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
1970 swizzles[2], swizzles[3]);
1971
1972 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
1973 return;
1974 }
1975
1976 if (ir->condition) {
1977 emit_bool_to_cond_code(ir->condition, &predicate);
1978 }
1979
1980 for (i = 0; i < type_size(ir->lhs->type); i++) {
1981 vec4_instruction *inst = emit(MOV(dst, src));
1982 inst->predicate = predicate;
1983
1984 dst.reg_offset++;
1985 src.reg_offset++;
1986 }
1987 }
1988
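/**
 * Emit immediate MOVs for a constant, recursing through structures, arrays,
 * and matrices; dst->reg_offset advances by one per vec4 written, and
 * components with identical values are merged into a single MOV.
 */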
1989 void
1990 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
1991 {
1992 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
1993 foreach_list(node, &ir->components) {
1994 ir_constant *field_value = (ir_constant *)node;
1995
1996 emit_constant_values(dst, field_value);
1997 }
1998 return;
1999 }
2000
2001 if (ir->type->is_array()) {
2002 for (unsigned int i = 0; i < ir->type->length; i++) {
2003 emit_constant_values(dst, ir->array_elements[i]);
2004 }
2005 return;
2006 }
2007
2008 if (ir->type->is_matrix()) {
2009 for (int i = 0; i < ir->type->matrix_columns; i++) {
2010 float *vec = &ir->value.f[i * ir->type->vector_elements];
2011
2012 for (int j = 0; j < ir->type->vector_elements; j++) {
2013 dst->writemask = 1 << j;
2014 dst->type = BRW_REGISTER_TYPE_F;
2015
2016 emit(MOV(*dst, src_reg(vec[j])));
2017 }
2018 dst->reg_offset++;
2019 }
2020 return;
2021 }
2022
2023 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2024
2025 for (int i = 0; i < ir->type->vector_elements; i++) {
2026 if (!(remaining_writemask & (1 << i)))
2027 continue;
2028
2029 dst->writemask = 1 << i;
2030 dst->type = brw_type_for_base_type(ir->type);
2031
2032 /* Find other components that match the one we're about to
2033 * write. Emits fewer instructions for things like vec4(0.5,
2034 * 1.5, 1.5, 1.5).
2035 */
2036 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2037 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2038 if (ir->value.b[i] == ir->value.b[j])
2039 dst->writemask |= (1 << j);
2040 } else {
2041 /* u, i, and f storage all line up, so no need for a
2042 * switch case for comparing each type.
2043 */
2044 if (ir->value.u[i] == ir->value.u[j])
2045 dst->writemask |= (1 << j);
2046 }
2047 }
2048
2049 switch (ir->type->base_type) {
2050 case GLSL_TYPE_FLOAT:
2051 emit(MOV(*dst, src_reg(ir->value.f[i])));
2052 break;
2053 case GLSL_TYPE_INT:
2054 emit(MOV(*dst, src_reg(ir->value.i[i])));
2055 break;
2056 case GLSL_TYPE_UINT:
2057 emit(MOV(*dst, src_reg(ir->value.u[i])));
2058 break;
2059 case GLSL_TYPE_BOOL:
2060 emit(MOV(*dst, src_reg(ir->value.b[i])));
2061 break;
2062 default:
2063 assert(!"Non-float/uint/int/bool constant");
2064 break;
2065 }
2066
2067 remaining_writemask &= ~dst->writemask;
2068 }
2069 dst->reg_offset++;
2070 }
2071
2072 void
2073 vec4_visitor::visit(ir_constant *ir)
2074 {
2075 dst_reg dst = dst_reg(this, ir->type);
2076 this->result = src_reg(dst);
2077
2078 emit_constant_values(&dst, ir);
2079 }
2080
2081 void
2082 vec4_visitor::visit(ir_call *ir)
2083 {
2084 assert(!"not reached");
2085 }
2086
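/**
 * Visit a texture operation: evaluate the operands, build the sampler
 * message payload in MRFs starting at base_mrf, emit the sampler send, and
 * then apply the key's texture swizzle to the result (swizzle_result).
 */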
2087 void
2088 vec4_visitor::visit(ir_texture *ir)
2089 {
2090 int sampler =
2091 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2092
2093 /* Should be lowered by do_lower_texture_projection */
2094 assert(!ir->projector);
2095
2096 /* Generate code to compute all the subexpression trees. This has to be
2097 * done before loading any values into MRFs for the sampler message since
2098 * generating these values may involve SEND messages that need the MRFs.
2099 */
2100 src_reg coordinate;
2101 if (ir->coordinate) {
2102 ir->coordinate->accept(this);
2103 coordinate = this->result;
2104 }
2105
2106 src_reg shadow_comparitor;
2107 if (ir->shadow_comparitor) {
2108 ir->shadow_comparitor->accept(this);
2109 shadow_comparitor = this->result;
2110 }
2111
2112 const glsl_type *lod_type, *sample_index_type;
2113 src_reg lod, dPdx, dPdy, sample_index;
2114 switch (ir->op) {
2115 case ir_tex:
2116 lod = src_reg(0.0f);
2117 lod_type = glsl_type::float_type;
2118 break;
2119 case ir_txf:
2120 case ir_txl:
2121 case ir_txs:
2122 ir->lod_info.lod->accept(this);
2123 lod = this->result;
2124 lod_type = ir->lod_info.lod->type;
2125 break;
2126 case ir_txf_ms:
2127 ir->lod_info.sample_index->accept(this);
2128 sample_index = this->result;
2129 sample_index_type = ir->lod_info.sample_index->type;
2130 break;
2131 case ir_txd:
2132 ir->lod_info.grad.dPdx->accept(this);
2133 dPdx = this->result;
2134
2135 ir->lod_info.grad.dPdy->accept(this);
2136 dPdy = this->result;
2137
2138 lod_type = ir->lod_info.grad.dPdx->type;
2139 break;
2140 case ir_txb:
2141 case ir_lod:
2142 break;
2143 }
2144
2145 vec4_instruction *inst = NULL;
2146 switch (ir->op) {
2147 case ir_tex:
2148 case ir_txl:
2149 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXL);
2150 break;
2151 case ir_txd:
2152 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXD);
2153 break;
2154 case ir_txf:
2155 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF);
2156 break;
2157 case ir_txf_ms:
2158 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MS);
2159 break;
2160 case ir_txs:
2161 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXS);
2162 break;
2163 case ir_txb:
2164 assert(!"TXB is not valid for vertex shaders.");
2165 break;
2166 case ir_lod:
2167 assert(!"LOD is not valid for vertex shaders.");
2168 break;
2169 }
2170
2171 bool use_texture_offset = ir->offset != NULL && ir->op != ir_txf;
2172
2173 /* Texel offsets go in the message header; Gen4 also requires headers. */
2174 inst->header_present = use_texture_offset || intel->gen < 5;
2175 inst->base_mrf = 2;
2176 inst->mlen = inst->header_present + 1; /* always at least one */
2177 inst->sampler = sampler;
2178 inst->dst = dst_reg(this, ir->type);
2179 inst->dst.writemask = WRITEMASK_XYZW;
2180 inst->shadow_compare = ir->shadow_comparitor != NULL;
2181
2182 if (use_texture_offset)
2183 inst->texture_offset = brw_texture_offset(ir->offset->as_constant());
2184
2185 /* MRF for the first parameter */
2186 int param_base = inst->base_mrf + inst->header_present;
2187
2188 if (ir->op == ir_txs) {
2189 int writemask = intel->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2190 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2191 } else {
2192 int i, coord_mask = 0, zero_mask = 0;
2193 /* Load the coordinate */
2194 /* FINISHME: gl_clamp_mask and saturate */
2195 for (i = 0; i < ir->coordinate->type->vector_elements; i++)
2196 coord_mask |= (1 << i);
2197 for (; i < 4; i++)
2198 zero_mask |= (1 << i);
2199
2200 if (ir->offset && ir->op == ir_txf) {
2201 /* It appears that the ld instruction used for txf does its
2202 * address bounds check before adding in the offset. To work
2203 * around this, just add the integer offset to the integer
2204 * texel coordinate, and don't put the offset in the header.
2205 */
2206 ir_constant *offset = ir->offset->as_constant();
2207 assert(offset);
2208
2209 for (int j = 0; j < ir->coordinate->type->vector_elements; j++) {
2210 src_reg src = coordinate;
2211 src.swizzle = BRW_SWIZZLE4(BRW_GET_SWZ(src.swizzle, j),
2212 BRW_GET_SWZ(src.swizzle, j),
2213 BRW_GET_SWZ(src.swizzle, j),
2214 BRW_GET_SWZ(src.swizzle, j));
2215 emit(ADD(dst_reg(MRF, param_base, ir->coordinate->type, 1 << j),
2216 src, offset->value.i[j]));
2217 }
2218 } else {
2219 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2220 coordinate));
2221 }
2222 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2223 src_reg(0)));
2224 /* Load the shadow comparitor */
2225 if (ir->shadow_comparitor) {
2226 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2227 WRITEMASK_X),
2228 shadow_comparitor));
2229 inst->mlen++;
2230 }
2231
2232 /* Load the LOD info */
2233 if (ir->op == ir_tex || ir->op == ir_txl) {
2234 int mrf, writemask;
2235 if (intel->gen >= 5) {
2236 mrf = param_base + 1;
2237 if (ir->shadow_comparitor) {
2238 writemask = WRITEMASK_Y;
2239 /* mlen already incremented */
2240 } else {
2241 writemask = WRITEMASK_X;
2242 inst->mlen++;
2243 }
2244 } else /* intel->gen == 4 */ {
2245 mrf = param_base;
2246 writemask = WRITEMASK_Z;
2247 }
2248 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2249 } else if (ir->op == ir_txf) {
2250 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2251 } else if (ir->op == ir_txf_ms) {
2252 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2253 sample_index));
2254 inst->mlen++;
2255
2256      /* On Gen7 there is an additional MCS parameter here after SI, but
2257       * we don't bother to emit it since it's always zero.  If we start
2258       * supporting texturing from CMS surfaces, this will have to
2259       * change.
2260       */
2261 } else if (ir->op == ir_txd) {
2262 const glsl_type *type = lod_type;
2263
2264 if (intel->gen >= 5) {
2265 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2266 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2267 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2268 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2269 inst->mlen++;
2270
2271 if (ir->type->vector_elements == 3) {
2272 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2273 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2274 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2275 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2276 inst->mlen++;
2277 }
2278 } else /* intel->gen == 4 */ {
2279 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2280 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2281 inst->mlen += 2;
2282 }
2283 }
2284 }
2285
2286 emit(inst);
2287
2288    /* Fix up the number of layers (Z) for cube arrays: the hardware returns
2289     * faces * layers; the spec requires just layers.
2290     */
2291 if (ir->op == ir_txs) {
2292 glsl_type const *type = ir->sampler->type;
2293 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2294 type->sampler_array) {
2295 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2296 with_writemask(inst->dst, WRITEMASK_Z),
2297 src_reg(inst->dst), src_reg(6));
2298 }
2299 }
2300
2301 swizzle_result(ir, src_reg(inst->dst), sampler);
2302 }
2303
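/**
 * Apply the per-sampler texture swizzle from the program key to the raw
 * sampler result: copied channels get a swizzled MOV, and SWIZZLE_ZERO /
 * SWIZZLE_ONE channels are written as 0.0f / 1.0f.
 */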
2304 void
2305 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, int sampler)
2306 {
2307 int s = c->key.base.tex.swizzles[sampler];
2308
2309 this->result = src_reg(this, ir->type);
2310 dst_reg swizzled_result(this->result);
2311
2312 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2313 || s == SWIZZLE_NOOP) {
2314 emit(MOV(swizzled_result, orig_val));
2315 return;
2316 }
2317
2318 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2319 int swizzle[4];
2320
2321 for (int i = 0; i < 4; i++) {
2322 switch (GET_SWZ(s, i)) {
2323 case SWIZZLE_ZERO:
2324 zero_mask |= (1 << i);
2325 break;
2326 case SWIZZLE_ONE:
2327 one_mask |= (1 << i);
2328 break;
2329 default:
2330 copy_mask |= (1 << i);
2331 swizzle[i] = GET_SWZ(s, i);
2332 break;
2333 }
2334 }
2335
2336 if (copy_mask) {
2337 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2338 swizzled_result.writemask = copy_mask;
2339 emit(MOV(swizzled_result, orig_val));
2340 }
2341
2342 if (zero_mask) {
2343 swizzled_result.writemask = zero_mask;
2344 emit(MOV(swizzled_result, src_reg(0.0f)));
2345 }
2346
2347 if (one_mask) {
2348 swizzled_result.writemask = one_mask;
2349 emit(MOV(swizzled_result, src_reg(1.0f)));
2350 }
2351 }
2352
2353 void
2354 vec4_visitor::visit(ir_return *ir)
2355 {
2356 assert(!"not reached");
2357 }
2358
2359 void
2360 vec4_visitor::visit(ir_discard *ir)
2361 {
2362 assert(!"not reached");
2363 }
2364
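/**
 * Emit control flow for an if statement: Gen6 uses emit_if_gen6(), while
 * other generations compute a predicate from the condition and emit a
 * predicated IF.
 */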
2365 void
2366 vec4_visitor::visit(ir_if *ir)
2367 {
2368 /* Don't point the annotation at the if statement, because then it plus
2369 * the then and else blocks get printed.
2370 */
2371 this->base_ir = ir->condition;
2372
2373 if (intel->gen == 6) {
2374 emit_if_gen6(ir);
2375 } else {
2376 uint32_t predicate;
2377 emit_bool_to_cond_code(ir->condition, &predicate);
2378 emit(IF(predicate));
2379 }
2380
2381 visit_instructions(&ir->then_instructions);
2382
2383 if (!ir->else_instructions.is_empty()) {
2384 this->base_ir = ir->condition;
2385 emit(BRW_OPCODE_ELSE);
2386
2387 visit_instructions(&ir->else_instructions);
2388 }
2389
2390 this->base_ir = ir->condition;
2391 emit(BRW_OPCODE_ENDIF);
2392 }
2393
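/**
 * Compute the NDC output (x/w, y/w, z/w, 1/w) from gl_Position; only used
 * on Gen4/5 (see emit_thread_end, which calls this when intel->gen < 6).
 */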
2394 void
2395 vec4_visitor::emit_ndc_computation()
2396 {
2397 /* Get the position */
2398 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2399
2400 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2401 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2402 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2403
2404 current_annotation = "NDC";
2405 dst_reg ndc_w = ndc;
2406 ndc_w.writemask = WRITEMASK_W;
2407 src_reg pos_w = pos;
2408 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2409 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2410
2411 dst_reg ndc_xyz = ndc;
2412 ndc_xyz.writemask = WRITEMASK_XYZ;
2413
2414 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2415 }
2416
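/**
 * Write the VUE header slot holding the point size and clip flags.  On
 * pre-Gen6 this packs the point size, user clip plane flags, and the
 * negative-RHW workaround bit into one register; on Gen6+ only the point
 * size (if written) lands in the W channel.
 */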
2417 void
2418 vec4_visitor::emit_psiz_and_flags(struct brw_reg reg)
2419 {
2420 if (intel->gen < 6 &&
2421 ((prog_data->base.vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2422 c->key.base.userclip_active || brw->has_negative_rhw_bug)) {
2423 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2424 dst_reg header1_w = header1;
2425 header1_w.writemask = WRITEMASK_W;
2426 GLuint i;
2427
2428 emit(MOV(header1, 0u));
2429
2430 if (prog_data->base.vue_map.slots_valid & VARYING_BIT_PSIZ) {
2431 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2432
2433 current_annotation = "Point size";
2434 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2435 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2436 }
2437
2438 current_annotation = "Clipping flags";
2439 for (i = 0; i < c->key.base.nr_userclip_plane_consts; i++) {
2440 vec4_instruction *inst;
2441
2442 inst = emit(DP4(dst_null_f(), src_reg(output_reg[VARYING_SLOT_POS]),
2443 src_reg(this->userplane[i])));
2444 inst->conditional_mod = BRW_CONDITIONAL_L;
2445
2446 inst = emit(OR(header1_w, src_reg(header1_w), 1u << i));
2447 inst->predicate = BRW_PREDICATE_NORMAL;
2448 }
2449
2450       /* i965 clipping workaround:
2451        * 1) Test for a negative RHW (1/w).
2452        * 2) If it is negative:
2453        *      set ndc = (0, 0, 0, 0)
2454        *      set ucp[6] = 1
2455        *
2456        * Later, clipping will detect ucp[6] and ensure the primitive is
2457        * clipped against all fixed planes.
2458        */
2459 if (brw->has_negative_rhw_bug) {
2460 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2461 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2462 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2463 vec4_instruction *inst;
2464 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2465 inst->predicate = BRW_PREDICATE_NORMAL;
2466 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2467 inst->predicate = BRW_PREDICATE_NORMAL;
2468 }
2469
2470 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2471 } else if (intel->gen < 6) {
2472 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2473 } else {
2474 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2475 if (prog_data->base.vue_map.slots_valid & VARYING_BIT_PSIZ) {
2476 emit(MOV(brw_writemask(reg, WRITEMASK_W),
2477 src_reg(output_reg[VARYING_SLOT_PSIZ])));
2478 }
2479 }
2480 }
2481
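/**
 * Write up to four clip distances into @reg, starting at plane @offset, by
 * taking DP4s of the clip vertex (gl_ClipVertex if written, otherwise
 * gl_Position) with the user clip planes.
 */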
2482 void
2483 vec4_visitor::emit_clip_distances(struct brw_reg reg, int offset)
2484 {
2485 if (intel->gen < 6) {
2486 /* Clip distance slots are set aside in gen5, but they are not used. It
2487 * is not clear whether we actually need to set aside space for them,
2488 * but the performance cost is negligible.
2489 */
2490 return;
2491 }
2492
2493 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2494 *
2495 * "If a linked set of shaders forming the vertex stage contains no
2496 * static write to gl_ClipVertex or gl_ClipDistance, but the
2497 * application has requested clipping against user clip planes through
2498 * the API, then the coordinate written to gl_Position is used for
2499 * comparison against the user clip planes."
2500 *
2501 * This function is only called if the shader didn't write to
2502 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2503 * if the user wrote to it; otherwise we use gl_Position.
2504 */
2505 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2506 if (!(prog_data->base.vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2507 clip_vertex = VARYING_SLOT_POS;
2508 }
2509
2510 for (int i = 0; i + offset < c->key.base.nr_userclip_plane_consts && i < 4;
2511 ++i) {
2512 emit(DP4(dst_reg(brw_writemask(reg, 1 << i)),
2513 src_reg(output_reg[clip_vertex]),
2514 src_reg(this->userplane[i + offset])));
2515 }
2516 }
2517
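/**
 * Copy a generic varying from its output register into the URB slot,
 * saturating the front/back color varyings when vertex color clamping is
 * enabled.
 */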
2518 void
2519 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2520 {
2521    assert(varying < VARYING_SLOT_MAX);
2522 reg.type = output_reg[varying].type;
2523 current_annotation = output_reg_annotation[varying];
2524 /* Copy the register, saturating if necessary */
2525 vec4_instruction *inst = emit(MOV(reg,
2526 src_reg(output_reg[varying])));
2527 if ((varying == VARYING_SLOT_COL0 ||
2528 varying == VARYING_SLOT_COL1 ||
2529 varying == VARYING_SLOT_BFC0 ||
2530 varying == VARYING_SLOT_BFC1) &&
2531 c->key.base.clamp_vertex_color) {
2532 inst->saturate = true;
2533 }
2534 }
2535
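/**
 * Write one VUE slot to MRF @mrf, handling the slots with special packing
 * (PSIZ/flags, NDC, position, clip distances, edge flag) and falling back
 * to emit_generic_urb_slot() for everything else.
 */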
2536 void
2537 vec4_visitor::emit_urb_slot(int mrf, int varying)
2538 {
2539 struct brw_reg hw_reg = brw_message_reg(mrf);
2540 dst_reg reg = dst_reg(MRF, mrf);
2541 reg.type = BRW_REGISTER_TYPE_F;
2542
2543 switch (varying) {
2544 case VARYING_SLOT_PSIZ:
2545 /* PSIZ is always in slot 0, and is coupled with other flags. */
2546 current_annotation = "indices, point width, clip flags";
2547 emit_psiz_and_flags(hw_reg);
2548 break;
2549 case BRW_VARYING_SLOT_NDC:
2550 current_annotation = "NDC";
2551 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2552 break;
2553 case BRW_VARYING_SLOT_POS_DUPLICATE:
2554 case VARYING_SLOT_POS:
2555 current_annotation = "gl_Position";
2556 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2557 break;
2558 case VARYING_SLOT_CLIP_DIST0:
2559 case VARYING_SLOT_CLIP_DIST1:
2560 if (this->c->key.base.uses_clip_distance) {
2561 emit_generic_urb_slot(reg, varying);
2562 } else {
2563 current_annotation = "user clip distances";
2564 emit_clip_distances(hw_reg, (varying - VARYING_SLOT_CLIP_DIST0) * 4);
2565 }
2566 break;
2567 case VARYING_SLOT_EDGE:
2568       /* This is present when doing unfilled polygons.  We're supposed to copy
2569        * the edge flag from the user-provided vertex array
2570        * (glEdgeFlagPointer); otherwise we copy the current value of that
2571        * attribute (which starts as 1.0f).  Clipping then uses this value to
2572        * determine which edges should be drawn as wireframe.
2573        */
2574 current_annotation = "edge flag";
2575 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
2576 glsl_type::float_type, WRITEMASK_XYZW))));
2577 break;
2578 case BRW_VARYING_SLOT_PAD:
2579 /* No need to write to this slot */
2580 break;
2581 default:
2582 emit_generic_urb_slot(reg, varying);
2583 break;
2584 }
2585 }
2586
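/**
 * Pad an interleaved URB write's message length on Gen6+: mlen includes the
 * one header register, so the data length (mlen - 1) must stay a multiple
 * of two registers; bumping an even mlen to odd keeps it that way.
 */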
2587 static int
2588 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
2589 {
2590 struct intel_context *intel = &brw->intel;
2591
2592 if (intel->gen >= 6) {
2593 /* URB data written (does not include the message header reg) must
2594 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
2595 * section 5.4.3.2.2: URB_INTERLEAVED.
2596 *
2597        * URB entries are allocated in multiples of 1024 bits, so an
2598 * extra 128 bits written here to make the end align to 256 is
2599 * no problem.
2600 */
2601 if ((mlen % 2) != 1)
2602 mlen++;
2603 }
2604
2605 return mlen;
2606 }
2607
2608 /**
2609 * Generates the VUE payload plus the 1 or 2 URB write instructions to
2610 * complete the VS thread.
2611 *
2612 * The VUE layout is documented in Volume 2a.
2613 */
2614 void
2615 vec4_vs_visitor::emit_thread_end()
2616 {
2617 /* MRF 0 is reserved for the debugger, so start with message header
2618 * in MRF 1.
2619 */
2620 int base_mrf = 1;
2621 int mrf = base_mrf;
2622 /* In the process of generating our URB write message contents, we
2623 * may need to unspill a register or load from an array. Those
2624 * reads would use MRFs 14-15.
2625 */
2626 int max_usable_mrf = 13;
2627
2628    /* The following assertion verifies that max_usable_mrf causes an
2629     * even number of URB write data registers, which meets Gen6's
2630     * requirements for length alignment.
2631 */
2632    assert((max_usable_mrf - base_mrf) % 2 == 0);
2633
2634 /* First mrf is the g0-based message header containing URB handles and such,
2635 * which is implied in VS_OPCODE_URB_WRITE.
2636 */
2637 mrf++;
2638
2639 if (intel->gen < 6) {
2640 emit_ndc_computation();
2641 }
2642
2643 /* Set up the VUE data for the first URB write */
2644 int slot;
2645 for (slot = 0; slot < prog_data->base.vue_map.num_slots; ++slot) {
2646 emit_urb_slot(mrf++, prog_data->base.vue_map.slot_to_varying[slot]);
2647
2648 /* If this was max_usable_mrf, we can't fit anything more into this URB
2649 * WRITE.
2650 */
2651 if (mrf > max_usable_mrf) {
2652 slot++;
2653 break;
2654 }
2655 }
2656
2657 bool eot = slot >= prog_data->base.vue_map.num_slots;
2658 if (eot) {
2659 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2660 emit_shader_time_end();
2661 }
2662 current_annotation = "URB write";
2663 vec4_instruction *inst = emit(VS_OPCODE_URB_WRITE);
2664 inst->base_mrf = base_mrf;
2665 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2666 inst->eot = eot;
2667
2668 /* Optional second URB write */
2669 if (!inst->eot) {
2670 mrf = base_mrf + 1;
2671
2672 for (; slot < prog_data->base.vue_map.num_slots; ++slot) {
2673 assert(mrf < max_usable_mrf);
2674
2675 emit_urb_slot(mrf++, prog_data->base.vue_map.slot_to_varying[slot]);
2676 }
2677
2678 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2679 emit_shader_time_end();
2680
2681 current_annotation = "URB write";
2682 inst = emit(VS_OPCODE_URB_WRITE);
2683 inst->base_mrf = base_mrf;
2684 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
2685 inst->eot = true;
2686 /* URB destination offset. In the previous write, we got MRFs
2687 * 2-13 minus the one header MRF, so 12 regs. URB offset is in
2688 * URB row increments, and each of our MRFs is half of one of
2689 * those, since we're doing interleaved writes.
2690 */
2691 inst->offset = (max_usable_mrf - base_mrf) / 2;
2692 }
2693 }
2694
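/**
 * Compute the scratch-space offset for @reg_offset, emitting the address
 * math before @inst when the access uses a relative (reladdr) index, or
 * returning an immediate offset otherwise.
 */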
2695 src_reg
2696 vec4_visitor::get_scratch_offset(vec4_instruction *inst,
2697 src_reg *reladdr, int reg_offset)
2698 {
2699 /* Because we store the values to scratch interleaved like our
2700 * vertex data, we need to scale the vec4 index by 2.
2701 */
2702 int message_header_scale = 2;
2703
2704 /* Pre-gen6, the message header uses byte offsets instead of vec4
2705 * (16-byte) offset units.
2706 */
2707 if (intel->gen < 6)
2708 message_header_scale *= 16;
2709
2710 if (reladdr) {
2711 src_reg index = src_reg(this, glsl_type::int_type);
2712
2713 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2714 emit_before(inst, MUL(dst_reg(index),
2715 index, src_reg(message_header_scale)));
2716
2717 return index;
2718 } else {
2719 return src_reg(reg_offset * message_header_scale);
2720 }
2721 }
2722
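/**
 * Like get_scratch_offset(), but for pull constant loads: no interleaving
 * scale is applied, and pre-Gen6 the offset is in byte units rather than
 * vec4 units.
 */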
2723 src_reg
2724 vec4_visitor::get_pull_constant_offset(vec4_instruction *inst,
2725 src_reg *reladdr, int reg_offset)
2726 {
2727 if (reladdr) {
2728 src_reg index = src_reg(this, glsl_type::int_type);
2729
2730 emit_before(inst, ADD(dst_reg(index), *reladdr, src_reg(reg_offset)));
2731
2732 /* Pre-gen6, the message header uses byte offsets instead of vec4
2733 * (16-byte) offset units.
2734 */
2735 if (intel->gen < 6) {
2736 emit_before(inst, MUL(dst_reg(index), index, src_reg(16)));
2737 }
2738
2739 return index;
2740 } else {
2741 int message_header_scale = intel->gen < 6 ? 16 : 1;
2742 return src_reg(reg_offset * message_header_scale);
2743 }
2744 }
2745
2746 /**
2747 * Emits an instruction before @inst to load the value named by @orig_src
2748 * from scratch space at @base_offset to @temp.
2749 *
2750 * @base_offset is measured in 32-byte units (the size of a register).
2751 */
2752 void
2753 vec4_visitor::emit_scratch_read(vec4_instruction *inst,
2754 dst_reg temp, src_reg orig_src,
2755 int base_offset)
2756 {
2757 int reg_offset = base_offset + orig_src.reg_offset;
2758 src_reg index = get_scratch_offset(inst, orig_src.reladdr, reg_offset);
2759
2760 emit_before(inst, SCRATCH_READ(temp, index));
2761 }
2762
2763 /**
2764 * Emits an instruction after @inst to store the value to be written
2765 * to @orig_dst to scratch space at @base_offset, from @temp.
2766 *
2767 * @base_offset is measured in 32-byte units (the size of a register).
2768 */
2769 void
2770 vec4_visitor::emit_scratch_write(vec4_instruction *inst, int base_offset)
2771 {
2772 int reg_offset = base_offset + inst->dst.reg_offset;
2773 src_reg index = get_scratch_offset(inst, inst->dst.reladdr, reg_offset);
2774
2775 /* Create a temporary register to store *inst's result in.
2776 *
2777 * We have to be careful in MOVing from our temporary result register in
2778 * the scratch write. If we swizzle from channels of the temporary that
2779 * weren't initialized, it will confuse live interval analysis, which will
2780 * make spilling fail to make progress.
2781 */
2782 src_reg temp = src_reg(this, glsl_type::vec4_type);
2783 temp.type = inst->dst.type;
2784 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
2785 int swizzles[4];
2786 for (int i = 0; i < 4; i++)
2787 if (inst->dst.writemask & (1 << i))
2788 swizzles[i] = i;
2789 else
2790 swizzles[i] = first_writemask_chan;
2791 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2792 swizzles[2], swizzles[3]);
2793
2794 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
2795 inst->dst.writemask));
2796 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
2797 write->predicate = inst->predicate;
2798 write->ir = inst->ir;
2799 write->annotation = inst->annotation;
2800 inst->insert_after(write);
2801
2802 inst->dst.file = temp.file;
2803 inst->dst.reg = temp.reg;
2804 inst->dst.reg_offset = temp.reg_offset;
2805 inst->dst.reladdr = NULL;
2806 }
2807
2808 /**
2809 * We can't generally support array access in GRF space, because a
2810 * single instruction's destination can only span 2 contiguous
2811 * registers. So, we send all GRF arrays that get variable index
2812 * access to scratch space.
2813 */
2814 void
2815 vec4_visitor::move_grf_array_access_to_scratch()
2816 {
2817 int scratch_loc[this->virtual_grf_count];
2818
2819 for (int i = 0; i < this->virtual_grf_count; i++) {
2820 scratch_loc[i] = -1;
2821 }
2822
2823 /* First, calculate the set of virtual GRFs that need to be punted
2824 * to scratch due to having any array access on them, and where in
2825 * scratch.
2826 */
2827 foreach_list(node, &this->instructions) {
2828 vec4_instruction *inst = (vec4_instruction *)node;
2829
2830 if (inst->dst.file == GRF && inst->dst.reladdr &&
2831 scratch_loc[inst->dst.reg] == -1) {
2832 scratch_loc[inst->dst.reg] = c->base.last_scratch;
2833 c->base.last_scratch += this->virtual_grf_sizes[inst->dst.reg];
2834 }
2835
2836 for (int i = 0 ; i < 3; i++) {
2837 src_reg *src = &inst->src[i];
2838
2839 if (src->file == GRF && src->reladdr &&
2840 scratch_loc[src->reg] == -1) {
2841 scratch_loc[src->reg] = c->base.last_scratch;
2842 c->base.last_scratch += this->virtual_grf_sizes[src->reg];
2843 }
2844 }
2845 }
2846
2847 /* Now, for anything that will be accessed through scratch, rewrite
2848 * it to load/store. Note that this is a _safe list walk, because
2849 * we may generate a new scratch_write instruction after the one
2850 * we're processing.
2851 */
2852 foreach_list_safe(node, &this->instructions) {
2853 vec4_instruction *inst = (vec4_instruction *)node;
2854
2855       /* Set up the annotation tracking for newly generated instructions. */
2856 base_ir = inst->ir;
2857 current_annotation = inst->annotation;
2858
2859 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
2860 emit_scratch_write(inst, scratch_loc[inst->dst.reg]);
2861 }
2862
2863 for (int i = 0 ; i < 3; i++) {
2864 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
2865 continue;
2866
2867 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2868
2869 emit_scratch_read(inst, temp, inst->src[i],
2870 scratch_loc[inst->src[i].reg]);
2871
2872 inst->src[i].file = temp.file;
2873 inst->src[i].reg = temp.reg;
2874 inst->src[i].reg_offset = temp.reg_offset;
2875 inst->src[i].reladdr = NULL;
2876 }
2877 }
2878 }
2879
2880 /**
2881 * Emits an instruction before @inst to load the value named by @orig_src
2882 * from the pull constant buffer (surface) at @base_offset to @temp.
2883 */
2884 void
2885 vec4_visitor::emit_pull_constant_load(vec4_instruction *inst,
2886 dst_reg temp, src_reg orig_src,
2887 int base_offset)
2888 {
2889 int reg_offset = base_offset + orig_src.reg_offset;
2890 src_reg index = src_reg((unsigned)SURF_INDEX_VERT_CONST_BUFFER);
2891 src_reg offset = get_pull_constant_offset(inst, orig_src.reladdr, reg_offset);
2892 vec4_instruction *load;
2893
2894 if (intel->gen >= 7) {
2895 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
2896 grf_offset.type = offset.type;
2897 emit_before(inst, MOV(grf_offset, offset));
2898
2899 load = new(mem_ctx) vec4_instruction(this,
2900 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
2901 temp, index, src_reg(grf_offset));
2902 } else {
2903 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
2904 temp, index, offset);
2905 load->base_mrf = 14;
2906 load->mlen = 1;
2907 }
2908 emit_before(inst, load);
2909 }
2910
2911 /**
2912 * Implements array access of uniforms by inserting a
2913 * PULL_CONSTANT_LOAD instruction.
2914 *
2915  * Unlike temporary GRF array access (which we don't support, due to
2916  * the difficulty of doing relative addressing on instruction
2917 * destinations), we could potentially do array access of uniforms
2918 * that were loaded in GRF space as push constants. In real-world
2919 * usage we've seen, though, the arrays being used are always larger
2920 * than we could load as push constants, so just always move all
2921 * uniform array access out to a pull constant buffer.
2922 */
2923 void
2924 vec4_visitor::move_uniform_array_access_to_pull_constants()
2925 {
2926 int pull_constant_loc[this->uniforms];
2927
2928 for (int i = 0; i < this->uniforms; i++) {
2929 pull_constant_loc[i] = -1;
2930 }
2931
2932 /* Walk through and find array access of uniforms. Put a copy of that
2933 * uniform in the pull constant buffer.
2934 *
2935 * Note that we don't move constant-indexed accesses to arrays. No
2936 * testing has been done of the performance impact of this choice.
2937 */
2938 foreach_list_safe(node, &this->instructions) {
2939 vec4_instruction *inst = (vec4_instruction *)node;
2940
2941 for (int i = 0 ; i < 3; i++) {
2942 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2943 continue;
2944
2945 int uniform = inst->src[i].reg;
2946
2947 /* If this array isn't already present in the pull constant buffer,
2948 * add it.
2949 */
2950 if (pull_constant_loc[uniform] == -1) {
2951 const float **values = &prog_data->base.param[uniform * 4];
2952
2953 pull_constant_loc[uniform] = prog_data->base.nr_pull_params / 4;
2954
2955 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
2956 prog_data->base.pull_param[prog_data->base.nr_pull_params++]
2957 = values[j];
2958 }
2959 }
2960
2961          /* Set up the annotation tracking for newly generated instructions. */
2962 base_ir = inst->ir;
2963 current_annotation = inst->annotation;
2964
2965 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
2966
2967 emit_pull_constant_load(inst, temp, inst->src[i],
2968 pull_constant_loc[uniform]);
2969
2970 inst->src[i].file = temp.file;
2971 inst->src[i].reg = temp.reg;
2972 inst->src[i].reg_offset = temp.reg_offset;
2973 inst->src[i].reladdr = NULL;
2974 }
2975 }
2976
2977 /* Now there are no accesses of the UNIFORM file with a reladdr, so
2978 * no need to track them as larger-than-vec4 objects. This will be
2979 * relied on in cutting out unused uniform vectors from push
2980 * constants.
2981 */
2982 split_uniform_registers();
2983 }
2984
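/**
 * If @reg is an unsigned (UD) source with the negate modifier set, resolve
 * the negation through a MOV into a temporary so that later uses don't
 * carry a negate on a UD register.
 */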
2985 void
2986 vec4_visitor::resolve_ud_negate(src_reg *reg)
2987 {
2988 if (reg->type != BRW_REGISTER_TYPE_UD ||
2989 !reg->negate)
2990 return;
2991
2992 src_reg temp = src_reg(this, glsl_type::uvec4_type);
2993 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
2994 *reg = temp;
2995 }
2996
2997 vec4_visitor::vec4_visitor(struct brw_context *brw,
2998 struct brw_vs_compile *c,
2999 struct brw_vs_prog_data *prog_data,
3000 struct gl_shader_program *shader_prog,
3001 struct brw_shader *shader,
3002 void *mem_ctx)
3003 {
3004 this->c = c;
3005 this->brw = brw;
3006 this->intel = &brw->intel;
3007 this->ctx = &intel->ctx;
3008 this->shader_prog = shader_prog;
3009 this->shader = shader;
3010
3011 this->mem_ctx = mem_ctx;
3012 this->failed = false;
3013
3014 this->base_ir = NULL;
3015 this->current_annotation = NULL;
3016 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3017
3019 this->prog = &c->vp->program.Base;
3020 this->prog_data = prog_data;
3021
3022 this->variable_ht = hash_table_ctor(0,
3023 hash_table_pointer_hash,
3024 hash_table_pointer_compare);
3025
3026 this->virtual_grf_def = NULL;
3027 this->virtual_grf_use = NULL;
3028 this->virtual_grf_sizes = NULL;
3029 this->virtual_grf_count = 0;
3030 this->virtual_grf_reg_map = NULL;
3031 this->virtual_grf_reg_count = 0;
3032 this->virtual_grf_array_size = 0;
3033 this->live_intervals_valid = false;
3034
3035 this->max_grf = intel->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3036
3037 this->uniforms = 0;
3038 }
3039
3040 vec4_visitor::~vec4_visitor()
3041 {
3042 hash_table_dtor(this->variable_ht);
3043 }
3044
3045
3046 vec4_vs_visitor::vec4_vs_visitor(struct brw_context *brw,
3047 struct brw_vs_compile *c,
3048 struct brw_vs_prog_data *prog_data,
3049 struct gl_shader_program *prog,
3050 struct brw_shader *shader,
3051 void *mem_ctx)
3052 : vec4_visitor(brw, c, prog_data, prog, shader, mem_ctx)
3053 {
3054 }
3055
3056
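/**
 * Mark the compile as failed and record the (first) failure message,
 * printing it when DEBUG_VS is set in INTEL_DEBUG.
 */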
3057 void
3058 vec4_visitor::fail(const char *format, ...)
3059 {
3060 va_list va;
3061 char *msg;
3062
3063 if (failed)
3064 return;
3065
3066 failed = true;
3067
3068 va_start(va, format);
3069 msg = ralloc_vasprintf(mem_ctx, format, va);
3070 va_end(va);
3071 msg = ralloc_asprintf(mem_ctx, "VS compile failed: %s\n", msg);
3072
3073 this->fail_msg = msg;
3074
3075 if (INTEL_DEBUG & DEBUG_VS) {
3076 fprintf(stderr, "%s", msg);
3077 }
3078 }
3079
3080 } /* namespace brw */