i965/vec4: Combine all the math emitters.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_vec4_visitor.cpp
1 /*
2 * Copyright © 2011 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 #include "brw_vec4.h"
25 #include "brw_cfg.h"
26 #include "glsl/ir_uniform.h"
27 extern "C" {
28 #include "program/sampler.h"
29 }
30
31 namespace brw {
32
33 vec4_instruction::vec4_instruction(vec4_visitor *v,
34 enum opcode opcode, const dst_reg &dst,
35 const src_reg &src0, const src_reg &src1,
36 const src_reg &src2)
37 {
38 this->opcode = opcode;
39 this->dst = dst;
40 this->src[0] = src0;
41 this->src[1] = src1;
42 this->src[2] = src2;
43 this->saturate = false;
44 this->force_writemask_all = false;
45 this->no_dd_clear = false;
46 this->no_dd_check = false;
47 this->writes_accumulator = false;
48 this->conditional_mod = BRW_CONDITIONAL_NONE;
49 this->texture_offset = 0;
50 this->target = 0;
51 this->shadow_compare = false;
52 this->ir = v->base_ir;
53 this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
54 this->header_present = false;
55 this->mlen = 0;
56 this->base_mrf = 0;
57 this->offset = 0;
58 this->annotation = v->current_annotation;
59 }
60
61 vec4_instruction *
62 vec4_visitor::emit(vec4_instruction *inst)
63 {
64 this->instructions.push_tail(inst);
65
66 return inst;
67 }
68
69 vec4_instruction *
70 vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
71 vec4_instruction *new_inst)
72 {
73 new_inst->ir = inst->ir;
74 new_inst->annotation = inst->annotation;
75
76 inst->insert_before(block, new_inst);
77
78 return inst;
79 }
80
81 vec4_instruction *
82 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
83 const src_reg &src1, const src_reg &src2)
84 {
85 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst,
86 src0, src1, src2));
87 }
88
89
90 vec4_instruction *
91 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
92 const src_reg &src1)
93 {
94 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0, src1));
95 }
96
97 vec4_instruction *
98 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
99 {
100 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst, src0));
101 }
102
103 vec4_instruction *
104 vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
105 {
106 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst));
107 }
108
109 vec4_instruction *
110 vec4_visitor::emit(enum opcode opcode)
111 {
112 return emit(new(mem_ctx) vec4_instruction(this, opcode, dst_reg()));
113 }
114
115 #define ALU1(op) \
116 vec4_instruction * \
117 vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
118 { \
119 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
120 src0); \
121 }
122
123 #define ALU2(op) \
124 vec4_instruction * \
125 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
126 const src_reg &src1) \
127 { \
128 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
129 src0, src1); \
130 }
131
132 #define ALU2_ACC(op) \
133 vec4_instruction * \
134 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
135 const src_reg &src1) \
136 { \
137 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, \
138 BRW_OPCODE_##op, dst, src0, src1); \
139 inst->writes_accumulator = true; \
140 return inst; \
141 }
142
143 #define ALU3(op) \
144 vec4_instruction * \
145 vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
146 const src_reg &src1, const src_reg &src2) \
147 { \
148 assert(brw->gen >= 6); \
149 return new(mem_ctx) vec4_instruction(this, BRW_OPCODE_##op, dst, \
150 src0, src1, src2); \
151 }
152
153 ALU1(NOT)
154 ALU1(MOV)
155 ALU1(FRC)
156 ALU1(RNDD)
157 ALU1(RNDE)
158 ALU1(RNDZ)
159 ALU1(F32TO16)
160 ALU1(F16TO32)
161 ALU2(ADD)
162 ALU2(MUL)
163 ALU2_ACC(MACH)
164 ALU2(AND)
165 ALU2(OR)
166 ALU2(XOR)
167 ALU2(DP3)
168 ALU2(DP4)
169 ALU2(DPH)
170 ALU2(SHL)
171 ALU2(SHR)
172 ALU2(ASR)
173 ALU3(LRP)
174 ALU1(BFREV)
175 ALU3(BFE)
176 ALU2(BFI1)
177 ALU3(BFI2)
178 ALU1(FBH)
179 ALU1(FBL)
180 ALU1(CBIT)
181 ALU3(MAD)
182 ALU2_ACC(ADDC)
183 ALU2_ACC(SUBB)
184 ALU2(MAC)
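
/* These ALU helpers only construct a vec4_instruction; callers hand the
 * result to emit(), as seen throughout this file:
 *
 *    emit(ADD(result_dst, op[0], op[1]));
 *    emit(MOV(dst, src_reg(math->dst)));
 */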
185
186 /** Gen4 predicated IF. */
187 vec4_instruction *
188 vec4_visitor::IF(enum brw_predicate predicate)
189 {
190 vec4_instruction *inst;
191
192 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF);
193 inst->predicate = predicate;
194
195 return inst;
196 }
197
198 /** Gen6 IF with embedded comparison. */
199 vec4_instruction *
200 vec4_visitor::IF(src_reg src0, src_reg src1,
201 enum brw_conditional_mod condition)
202 {
203 assert(brw->gen == 6);
204
205 vec4_instruction *inst;
206
207 resolve_ud_negate(&src0);
208 resolve_ud_negate(&src1);
209
210 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_IF, dst_null_d(),
211 src0, src1);
212 inst->conditional_mod = condition;
213
214 return inst;
215 }
216
217 /**
218 * CMP: Sets the low bit of the destination channels with the result
219 * of the comparison, while the upper bits are undefined, and updates
220 * the flag register with the packed 16 bits of the result.
221 */
222 vec4_instruction *
223 vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
224 enum brw_conditional_mod condition)
225 {
226 vec4_instruction *inst;
227
228 /* original gen4 does type conversion to the destination type
229     * before comparison, producing garbage results for floating
230 * point comparisons.
231 */
232 if (brw->gen == 4) {
233 dst.type = src0.type;
234 if (dst.file == HW_REG)
235 dst.fixed_hw_reg.type = dst.type;
236 }
237
238 resolve_ud_negate(&src0);
239 resolve_ud_negate(&src1);
240
241 inst = new(mem_ctx) vec4_instruction(this, BRW_OPCODE_CMP, dst, src0, src1);
242 inst->conditional_mod = condition;
243
244 return inst;
245 }
246
247 vec4_instruction *
248 vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
249 {
250 vec4_instruction *inst;
251
252 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_READ,
253 dst, index);
254 inst->base_mrf = 14;
255 inst->mlen = 2;
256
257 return inst;
258 }
259
260 vec4_instruction *
261 vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
262 const src_reg &index)
263 {
264 vec4_instruction *inst;
265
266 inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_GEN4_SCRATCH_WRITE,
267 dst, src, index);
268 inst->base_mrf = 13;
269 inst->mlen = 3;
270
271 return inst;
272 }
273
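/**
 * Emit the dot product matching the operand width: an elements value of
 * 2, 3 or 4 selects DP2, DP3 or DP4 respectively.
 */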
274 void
275 vec4_visitor::emit_dp(dst_reg dst, src_reg src0, src_reg src1, unsigned elements)
276 {
277 static enum opcode dot_opcodes[] = {
278 BRW_OPCODE_DP2, BRW_OPCODE_DP3, BRW_OPCODE_DP4
279 };
280
281 emit(dot_opcodes[elements - 2], dst, src0, src1);
282 }
283
284 src_reg
285 vec4_visitor::fix_3src_operand(src_reg src)
286 {
287 /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
288 * able to use vertical stride of zero to replicate the vec4 uniform, like
289 *
290 * g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
291 *
292 * But you can't, since vertical stride is always four in three-source
293 * instructions. Instead, insert a MOV instruction to do the replication so
294 * that the three-source instruction can consume it.
295 */
296
297 /* The MOV is only needed if the source is a uniform or immediate. */
298 if (src.file != UNIFORM && src.file != IMM)
299 return src;
300
301 if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
302 return src;
303
304 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
305 expanded.type = src.type;
306 emit(MOV(expanded, src));
307 return src_reg(expanded);
308 }
309
310 src_reg
311 vec4_visitor::fix_math_operand(src_reg src)
312 {
313 if (brw->gen < 6 || brw->gen >= 8 || src.file == BAD_FILE)
314 return src;
315
316 /* The gen6 math instruction ignores the source modifiers --
317 * swizzle, abs, negate, and at least some parts of the register
318 * region description.
319 *
320 * Rather than trying to enumerate all these cases, *always* expand the
321 * operand to a temp GRF for gen6.
322 *
323 * For gen7, keep the operand as-is, except if immediate, which gen7 still
324 * can't use.
325 */
326
327 if (brw->gen == 7 && src.file != IMM)
328 return src;
329
330 dst_reg expanded = dst_reg(this, glsl_type::vec4_type);
331 expanded.type = src.type;
332 emit(MOV(expanded, src));
333 return src_reg(expanded);
334 }
335
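/**
 * Common entry point for the extended-math opcodes (RCP, RSQ, SQRT, EXP2,
 * LOG2, SIN, COS, POW, INT_QUOTIENT, INT_REMAINDER).  src1 is presumably
 * defaulted to an empty src_reg in brw_vec4.h, so unary callers pass only
 * three arguments, e.g.:
 *
 *    emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
 *    emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
 *
 * Operands are legalized with fix_math_operand().  On gen6, where MATH must
 * be align1, a writemasked destination is handled by computing into a
 * temporary and MOVing the result; pre-gen6 message-style math gets its
 * base_mrf/mlen set up here.
 */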
336 void
337 vec4_visitor::emit_math(enum opcode opcode,
338 const dst_reg &dst,
339 const src_reg &src0, const src_reg &src1)
340 {
341 vec4_instruction *math =
342 emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
343
344 if (brw->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
345 /* MATH on Gen6 must be align1, so we can't do writemasks. */
346 math->dst = dst_reg(this, glsl_type::vec4_type);
347 math->dst.type = dst.type;
348 emit(MOV(dst, src_reg(math->dst)));
349 } else if (brw->gen < 6) {
350 math->base_mrf = 1;
351 math->mlen = src1.file == BAD_FILE ? 1 : 2;
352 }
353 }
354
355 void
356 vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
357 {
358 if (brw->gen < 7) {
359 unreachable("ir_unop_pack_half_2x16 should be lowered");
360 }
361
362 assert(dst.type == BRW_REGISTER_TYPE_UD);
363 assert(src0.type == BRW_REGISTER_TYPE_F);
364
365 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
366 *
367 * Because this instruction does not have a 16-bit floating-point type,
368 * the destination data type must be Word (W).
369 *
370 * The destination must be DWord-aligned and specify a horizontal stride
371 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
372 * each destination channel and the upper word is not modified.
373 *
374 * The above restriction implies that the f32to16 instruction must use
375 * align1 mode, because only in align1 mode is it possible to specify
376 * horizontal stride. We choose here to defy the hardware docs and emit
377 * align16 instructions.
378 *
379 * (I [chadv] did attempt to emit align1 instructions for VS f32to16
380 * instructions. I was partially successful in that the code passed all
381 * tests. However, the code was dubiously correct and fragile, and the
382 * tests were not harsh enough to probe that frailty. Not trusting the
383 * code, I chose instead to remain in align16 mode in defiance of the hw
384 * docs).
385 *
386 * I've [chadv] experimentally confirmed that, on gen7 hardware and the
387 * simulator, emitting a f32to16 in align16 mode with UD as destination
388 * data type is safe. The behavior differs from that specified in the PRM
389 * in that the upper word of each destination channel is cleared to 0.
390 */
391
392 dst_reg tmp_dst(this, glsl_type::uvec2_type);
393 src_reg tmp_src(tmp_dst);
394
395 #if 0
396 /* Verify the undocumented behavior on which the following instructions
397 * rely. If f32to16 fails to clear the upper word of the X and Y channels,
398 * then the result of the bit-or instruction below will be incorrect.
399 *
400 * You should inspect the disasm output in order to verify that the MOV is
401 * not optimized away.
402 */
403 emit(MOV(tmp_dst, src_reg(0x12345678u)));
404 #endif
405
406 /* Give tmp the form below, where "." means untouched.
407 *
408 * w z y x w z y x
409 * |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
410 *
411 * That the upper word of each write-channel be 0 is required for the
412 * following bit-shift and bit-or instructions to work. Note that this
413 * relies on the undocumented hardware behavior mentioned above.
414 */
415 tmp_dst.writemask = WRITEMASK_XY;
416 emit(F32TO16(tmp_dst, src0));
417
418 /* Give the write-channels of dst the form:
419 * 0xhhhh0000
420 */
421 tmp_src.swizzle = BRW_SWIZZLE_YYYY;
422 emit(SHL(dst, tmp_src, src_reg(16u)));
423
424 /* Finally, give the write-channels of dst the form of packHalf2x16's
425 * output:
426 * 0xhhhhllll
427 */
428 tmp_src.swizzle = BRW_SWIZZLE_XXXX;
429 emit(OR(dst, src_reg(dst), tmp_src));
430 }
431
432 void
433 vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
434 {
435 if (brw->gen < 7) {
436 unreachable("ir_unop_unpack_half_2x16 should be lowered");
437 }
438
439 assert(dst.type == BRW_REGISTER_TYPE_F);
440 assert(src0.type == BRW_REGISTER_TYPE_UD);
441
442 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
443 *
444 * Because this instruction does not have a 16-bit floating-point type,
445 * the source data type must be Word (W). The destination type must be
446 * F (Float).
447 *
448 * To use W as the source data type, we must adjust horizontal strides,
449 * which is only possible in align1 mode. All my [chadv] attempts at
450 * emitting align1 instructions for unpackHalf2x16 failed to pass the
451 * Piglit tests, so I gave up.
452 *
453 * I've verified that, on gen7 hardware and the simulator, it is safe to
454 * emit f16to32 in align16 mode with UD as source data type.
455 */
456
457 dst_reg tmp_dst(this, glsl_type::uvec2_type);
458 src_reg tmp_src(tmp_dst);
459
460 tmp_dst.writemask = WRITEMASK_X;
461 emit(AND(tmp_dst, src0, src_reg(0xffffu)));
462
463 tmp_dst.writemask = WRITEMASK_Y;
464 emit(SHR(tmp_dst, src0, src_reg(16u)));
465
466 dst.writemask = WRITEMASK_XY;
467 emit(F16TO32(dst, tmp_src));
468 }
469
470 void
471 vec4_visitor::visit_instructions(const exec_list *list)
472 {
473 foreach_in_list(ir_instruction, ir, list) {
474 base_ir = ir;
475 ir->accept(this);
476 }
477 }
478
479
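/**
 * Measure a type's storage in units of vec4 registers.  Derived from the
 * cases below: any scalar or vector takes a full vec4 slot, a matrix takes
 * one slot per column, arrays and structs sum their elements/members, and
 * samplers and atomic counters take no space at all (e.g. float -> 1,
 * mat4 -> 4, float[8] -> 8).
 */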
480 static int
481 type_size(const struct glsl_type *type)
482 {
483 unsigned int i;
484 int size;
485
486 switch (type->base_type) {
487 case GLSL_TYPE_UINT:
488 case GLSL_TYPE_INT:
489 case GLSL_TYPE_FLOAT:
490 case GLSL_TYPE_BOOL:
491 if (type->is_matrix()) {
492 return type->matrix_columns;
493 } else {
494 /* Regardless of size of vector, it gets a vec4. This is bad
495 * packing for things like floats, but otherwise arrays become a
496 * mess. Hopefully a later pass over the code can pack scalars
497 * down if appropriate.
498 */
499 return 1;
500 }
501 case GLSL_TYPE_ARRAY:
502 assert(type->length > 0);
503 return type_size(type->fields.array) * type->length;
504 case GLSL_TYPE_STRUCT:
505 size = 0;
506 for (i = 0; i < type->length; i++) {
507 size += type_size(type->fields.structure[i].type);
508 }
509 return size;
510 case GLSL_TYPE_SAMPLER:
511 /* Samplers take up no register space, since they're baked in at
512 * link time.
513 */
514 return 0;
515 case GLSL_TYPE_ATOMIC_UINT:
516 return 0;
517 case GLSL_TYPE_IMAGE:
518 case GLSL_TYPE_VOID:
519 case GLSL_TYPE_ERROR:
520 case GLSL_TYPE_INTERFACE:
521 unreachable("not reached");
522 }
523
524 return 0;
525 }
526
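/**
 * Reserve a new virtual GRF of `size' vec4 registers.  The bookkeeping
 * arrays grow by doubling (starting at 16 entries); virtual_grf_reg_map
 * records each virtual GRF's offset into the flat register space and
 * virtual_grf_sizes its length.  Returns the new virtual GRF index.
 */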
527 int
528 vec4_visitor::virtual_grf_alloc(int size)
529 {
530 if (virtual_grf_array_size <= virtual_grf_count) {
531 if (virtual_grf_array_size == 0)
532 virtual_grf_array_size = 16;
533 else
534 virtual_grf_array_size *= 2;
535 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
536 virtual_grf_array_size);
537 virtual_grf_reg_map = reralloc(mem_ctx, virtual_grf_reg_map, int,
538 virtual_grf_array_size);
539 }
540 virtual_grf_reg_map[virtual_grf_count] = virtual_grf_reg_count;
541 virtual_grf_reg_count += size;
542 virtual_grf_sizes[virtual_grf_count] = size;
543 return virtual_grf_count++;
544 }
545
546 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
547 {
548 init();
549
550 this->file = GRF;
551 this->reg = v->virtual_grf_alloc(type_size(type));
552
553 if (type->is_array() || type->is_record()) {
554 this->swizzle = BRW_SWIZZLE_NOOP;
555 } else {
556 this->swizzle = swizzle_for_size(type->vector_elements);
557 }
558
559 this->type = brw_type_for_base_type(type);
560 }
561
562 src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
563 {
564 assert(size > 0);
565
566 init();
567
568 this->file = GRF;
569 this->reg = v->virtual_grf_alloc(type_size(type) * size);
570
571 this->swizzle = BRW_SWIZZLE_NOOP;
572
573 this->type = brw_type_for_base_type(type);
574 }
575
576 dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
577 {
578 init();
579
580 this->file = GRF;
581 this->reg = v->virtual_grf_alloc(type_size(type));
582
583 if (type->is_array() || type->is_record()) {
584 this->writemask = WRITEMASK_XYZW;
585 } else {
586 this->writemask = (1 << type->vector_elements) - 1;
587 }
588
589 this->type = brw_type_for_base_type(type);
590 }
591
592 /* Our support for uniforms is piggy-backed on the struct
593 * gl_fragment_program, because that's where the values actually
594 * get stored, rather than in some global gl_shader_program uniform
595 * store.
596 */
597 void
598 vec4_visitor::setup_uniform_values(ir_variable *ir)
599 {
600 int namelen = strlen(ir->name);
601
602 /* The data for our (non-builtin) uniforms is stored in a series of
603 * gl_uniform_driver_storage structs for each subcomponent that
604 * glGetUniformLocation() could name. We know it's been set up in the same
605 * order we'd walk the type, so walk the list of storage and find anything
606 * with our name, or the prefix of a component that starts with our name.
607 */
608 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
609 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
610
611 if (strncmp(ir->name, storage->name, namelen) != 0 ||
612 (storage->name[namelen] != 0 &&
613 storage->name[namelen] != '.' &&
614 storage->name[namelen] != '[')) {
615 continue;
616 }
617
618 gl_constant_value *components = storage->storage;
619 unsigned vector_count = (MAX2(storage->array_elements, 1) *
620 storage->type->matrix_columns);
621
622 for (unsigned s = 0; s < vector_count; s++) {
623 assert(uniforms < uniform_array_size);
624 uniform_vector_size[uniforms] = storage->type->vector_elements;
625
626 int i;
627 for (i = 0; i < uniform_vector_size[uniforms]; i++) {
628 stage_prog_data->param[uniforms * 4 + i] = components;
629 components++;
630 }
631 for (; i < 4; i++) {
632 static gl_constant_value zero = { 0.0 };
633 stage_prog_data->param[uniforms * 4 + i] = &zero;
634 }
635
636 uniforms++;
637 }
638 }
639 }
640
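/**
 * Upload the user clip plane equations from the GL context as vec4
 * uniforms; userplane[i] records which uniform slot each plane occupies so
 * later code can reference it.
 */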
641 void
642 vec4_visitor::setup_uniform_clipplane_values()
643 {
644 gl_clip_plane *clip_planes = brw_select_clip_planes(ctx);
645
646 for (int i = 0; i < key->nr_userclip_plane_consts; ++i) {
647 assert(this->uniforms < uniform_array_size);
648 this->uniform_vector_size[this->uniforms] = 4;
649 this->userplane[i] = dst_reg(UNIFORM, this->uniforms);
650 this->userplane[i].type = BRW_REGISTER_TYPE_F;
651 for (int j = 0; j < 4; ++j) {
652 stage_prog_data->param[this->uniforms * 4 + j] =
653 (gl_constant_value *) &clip_planes[i][j];
654 }
655 ++this->uniforms;
656 }
657 }
658
659 /* Our support for builtin uniforms is even scarier than non-builtin.
660 * It sits on top of the PROG_STATE_VAR parameters that are
661 * automatically updated from GL context state.
662 */
663 void
664 vec4_visitor::setup_builtin_uniform_values(ir_variable *ir)
665 {
666 const ir_state_slot *const slots = ir->get_state_slots();
667 assert(slots != NULL);
668
669 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
670 /* This state reference has already been setup by ir_to_mesa,
671 * but we'll get the same index back here. We can reference
672 * ParameterValues directly, since unlike brw_fs.cpp, we never
673 * add new state references during compile.
674 */
675 int index = _mesa_add_state_reference(this->prog->Parameters,
676 (gl_state_index *)slots[i].tokens);
677 gl_constant_value *values =
678 &this->prog->Parameters->ParameterValues[index][0];
679
680 assert(this->uniforms < uniform_array_size);
681 this->uniform_vector_size[this->uniforms] = 0;
682 /* Add each of the unique swizzled channels of the element.
683 * This will end up matching the size of the glsl_type of this field.
684 */
685 int last_swiz = -1;
686 for (unsigned int j = 0; j < 4; j++) {
687 int swiz = GET_SWZ(slots[i].swizzle, j);
688 last_swiz = swiz;
689
690 stage_prog_data->param[this->uniforms * 4 + j] = &values[swiz];
691 assert(this->uniforms < uniform_array_size);
692 if (swiz <= last_swiz)
693 this->uniform_vector_size[this->uniforms]++;
694 }
695 this->uniforms++;
696 }
697 }
698
699 dst_reg *
700 vec4_visitor::variable_storage(ir_variable *var)
701 {
702 return (dst_reg *)hash_table_find(this->variable_ht, var);
703 }
704
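/**
 * Emit whatever sets the flag register for a boolean rvalue used as a
 * condition.  Recognized logic and comparison expressions are folded
 * straight into a conditional mod; anything else is evaluated and then
 * AND'd with 1 (gen6+) or MOV'd (gen4/5) just to set the flag.  For the
 * vector comparisons, *predicate is set to ALL4H/ANY4H so the caller can
 * predicate on all or any channels matching.
 */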
705 void
706 vec4_visitor::emit_bool_to_cond_code(ir_rvalue *ir,
707 enum brw_predicate *predicate)
708 {
709 ir_expression *expr = ir->as_expression();
710
711 *predicate = BRW_PREDICATE_NORMAL;
712
713 if (expr && expr->operation != ir_binop_ubo_load) {
714 src_reg op[3];
715 vec4_instruction *inst;
716
717 assert(expr->get_num_operands() <= 3);
718 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
719 expr->operands[i]->accept(this);
720 op[i] = this->result;
721
722 resolve_ud_negate(&op[i]);
723 }
724
725 switch (expr->operation) {
726 case ir_unop_logic_not:
727 inst = emit(AND(dst_null_d(), op[0], src_reg(1)));
728 inst->conditional_mod = BRW_CONDITIONAL_Z;
729 break;
730
731 case ir_binop_logic_xor:
732 inst = emit(XOR(dst_null_d(), op[0], op[1]));
733 inst->conditional_mod = BRW_CONDITIONAL_NZ;
734 break;
735
736 case ir_binop_logic_or:
737 inst = emit(OR(dst_null_d(), op[0], op[1]));
738 inst->conditional_mod = BRW_CONDITIONAL_NZ;
739 break;
740
741 case ir_binop_logic_and:
742 inst = emit(AND(dst_null_d(), op[0], op[1]));
743 inst->conditional_mod = BRW_CONDITIONAL_NZ;
744 break;
745
746 case ir_unop_f2b:
747 if (brw->gen >= 6) {
748 emit(CMP(dst_null_d(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
749 } else {
750 inst = emit(MOV(dst_null_f(), op[0]));
751 inst->conditional_mod = BRW_CONDITIONAL_NZ;
752 }
753 break;
754
755 case ir_unop_i2b:
756 if (brw->gen >= 6) {
757 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
758 } else {
759 inst = emit(MOV(dst_null_d(), op[0]));
760 inst->conditional_mod = BRW_CONDITIONAL_NZ;
761 }
762 break;
763
764 case ir_binop_all_equal:
765 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
766 *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
767 break;
768
769 case ir_binop_any_nequal:
770 inst = emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
771 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
772 break;
773
774 case ir_unop_any:
775 inst = emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
776 *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
777 break;
778
779 case ir_binop_greater:
780 case ir_binop_gequal:
781 case ir_binop_less:
782 case ir_binop_lequal:
783 case ir_binop_equal:
784 case ir_binop_nequal:
785 emit(CMP(dst_null_d(), op[0], op[1],
786 brw_conditional_for_comparison(expr->operation)));
787 break;
788
789 case ir_triop_csel: {
790 /* Expand the boolean condition into the flag register. */
791 inst = emit(MOV(dst_null_d(), op[0]));
792 inst->conditional_mod = BRW_CONDITIONAL_NZ;
793
794 /* Select which boolean to return. */
795 dst_reg temp(this, expr->operands[1]->type);
796 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
797 inst->predicate = BRW_PREDICATE_NORMAL;
798
799 /* Expand the result to a condition code. */
800 inst = emit(MOV(dst_null_d(), src_reg(temp)));
801 inst->conditional_mod = BRW_CONDITIONAL_NZ;
802 break;
803 }
804
805 default:
806 unreachable("not reached");
807 }
808 return;
809 }
810
811 ir->accept(this);
812
813 resolve_ud_negate(&this->result);
814
815 if (brw->gen >= 6) {
816 vec4_instruction *inst = emit(AND(dst_null_d(),
817 this->result, src_reg(1)));
818 inst->conditional_mod = BRW_CONDITIONAL_NZ;
819 } else {
820 vec4_instruction *inst = emit(MOV(dst_null_d(), this->result));
821 inst->conditional_mod = BRW_CONDITIONAL_NZ;
822 }
823 }
824
825 /**
826 * Emit a gen6 IF statement with the comparison folded into the IF
827 * instruction.
828 */
829 void
830 vec4_visitor::emit_if_gen6(ir_if *ir)
831 {
832 ir_expression *expr = ir->condition->as_expression();
833
834 if (expr && expr->operation != ir_binop_ubo_load) {
835 src_reg op[3];
836 dst_reg temp;
837
838 assert(expr->get_num_operands() <= 3);
839 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
840 expr->operands[i]->accept(this);
841 op[i] = this->result;
842 }
843
844 switch (expr->operation) {
845 case ir_unop_logic_not:
846 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_Z));
847 return;
848
849 case ir_binop_logic_xor:
850 emit(IF(op[0], op[1], BRW_CONDITIONAL_NZ));
851 return;
852
853 case ir_binop_logic_or:
854 temp = dst_reg(this, glsl_type::bool_type);
855 emit(OR(temp, op[0], op[1]));
856 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
857 return;
858
859 case ir_binop_logic_and:
860 temp = dst_reg(this, glsl_type::bool_type);
861 emit(AND(temp, op[0], op[1]));
862 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
863 return;
864
865 case ir_unop_f2b:
866 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
867 return;
868
869 case ir_unop_i2b:
870 emit(IF(op[0], src_reg(0), BRW_CONDITIONAL_NZ));
871 return;
872
873 case ir_binop_greater:
874 case ir_binop_gequal:
875 case ir_binop_less:
876 case ir_binop_lequal:
877 case ir_binop_equal:
878 case ir_binop_nequal:
879 emit(IF(op[0], op[1],
880 brw_conditional_for_comparison(expr->operation)));
881 return;
882
883 case ir_binop_all_equal:
884 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
885 emit(IF(BRW_PREDICATE_ALIGN16_ALL4H));
886 return;
887
888 case ir_binop_any_nequal:
889 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
890 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
891 return;
892
893 case ir_unop_any:
894 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
895 emit(IF(BRW_PREDICATE_ALIGN16_ANY4H));
896 return;
897
898 case ir_triop_csel: {
899 /* Expand the boolean condition into the flag register. */
900 vec4_instruction *inst = emit(MOV(dst_null_d(), op[0]));
901 inst->conditional_mod = BRW_CONDITIONAL_NZ;
902
903 /* Select which boolean to return. */
904 dst_reg temp(this, expr->operands[1]->type);
905 inst = emit(BRW_OPCODE_SEL, temp, op[1], op[2]);
906 inst->predicate = BRW_PREDICATE_NORMAL;
907
908 emit(IF(src_reg(temp), src_reg(0), BRW_CONDITIONAL_NZ));
909 return;
910 }
911
912 default:
913 unreachable("not reached");
914 }
915 return;
916 }
917
918 ir->condition->accept(this);
919
920 emit(IF(this->result, src_reg(0), BRW_CONDITIONAL_NZ));
921 }
922
923 void
924 vec4_visitor::visit(ir_variable *ir)
925 {
926 dst_reg *reg = NULL;
927
928 if (variable_storage(ir))
929 return;
930
931 switch (ir->data.mode) {
932 case ir_var_shader_in:
933 reg = new(mem_ctx) dst_reg(ATTR, ir->data.location);
934 break;
935
936 case ir_var_shader_out:
937 reg = new(mem_ctx) dst_reg(this, ir->type);
938
939 for (int i = 0; i < type_size(ir->type); i++) {
940 output_reg[ir->data.location + i] = *reg;
941 output_reg[ir->data.location + i].reg_offset = i;
942 output_reg[ir->data.location + i].type =
943 brw_type_for_base_type(ir->type->get_scalar_type());
944 output_reg_annotation[ir->data.location + i] = ir->name;
945 }
946 break;
947
948 case ir_var_auto:
949 case ir_var_temporary:
950 reg = new(mem_ctx) dst_reg(this, ir->type);
951 break;
952
953 case ir_var_uniform:
954 reg = new(this->mem_ctx) dst_reg(UNIFORM, this->uniforms);
955
956 /* Thanks to the lower_ubo_reference pass, we will see only
957 * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
958 * variables, so no need for them to be in variable_ht.
959 *
960 * Some uniforms, such as samplers and atomic counters, have no actual
961 * storage, so we should ignore them.
962 */
963 if (ir->is_in_uniform_block() || type_size(ir->type) == 0)
964 return;
965
966 /* Track how big the whole uniform variable is, in case we need to put a
967 * copy of its data into pull constants for array access.
968 */
969 assert(this->uniforms < uniform_array_size);
970 this->uniform_size[this->uniforms] = type_size(ir->type);
971
972 if (!strncmp(ir->name, "gl_", 3)) {
973 setup_builtin_uniform_values(ir);
974 } else {
975 setup_uniform_values(ir);
976 }
977 break;
978
979 case ir_var_system_value:
980 reg = make_reg_for_system_value(ir);
981 break;
982
983 default:
984 unreachable("not reached");
985 }
986
987 reg->type = brw_type_for_base_type(ir->type);
988 hash_table_insert(this->variable_ht, reg, ir);
989 }
990
991 void
992 vec4_visitor::visit(ir_loop *ir)
993 {
994 /* We don't want debugging output to print the whole body of the
995 * loop as the annotation.
996 */
997 this->base_ir = NULL;
998
999 emit(BRW_OPCODE_DO);
1000
1001 visit_instructions(&ir->body_instructions);
1002
1003 emit(BRW_OPCODE_WHILE);
1004 }
1005
1006 void
1007 vec4_visitor::visit(ir_loop_jump *ir)
1008 {
1009 switch (ir->mode) {
1010 case ir_loop_jump::jump_break:
1011 emit(BRW_OPCODE_BREAK);
1012 break;
1013 case ir_loop_jump::jump_continue:
1014 emit(BRW_OPCODE_CONTINUE);
1015 break;
1016 }
1017 }
1018
1019
1020 void
1021 vec4_visitor::visit(ir_function_signature *)
1022 {
1023 unreachable("not reached");
1024 }
1025
1026 void
1027 vec4_visitor::visit(ir_function *ir)
1028 {
1029 /* Ignore function bodies other than main() -- we shouldn't see calls to
1030 * them since they should all be inlined.
1031 */
1032 if (strcmp(ir->name, "main") == 0) {
1033 const ir_function_signature *sig;
1034 exec_list empty;
1035
1036 sig = ir->matching_signature(NULL, &empty, false);
1037
1038 assert(sig);
1039
1040 visit_instructions(&sig->body);
1041 }
1042 }
1043
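/**
 * Try to fold an add that has a multiply as one operand into a single MAD.
 * The MAD instruction takes the addend in src0 (see the ir_triop_fma case
 * below), so the non-multiply operand goes first.  Returns false when the
 * pattern does not match or MAD is unusable (pre-gen6, or non-float data).
 */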
1044 bool
1045 vec4_visitor::try_emit_mad(ir_expression *ir)
1046 {
1047 /* 3-src instructions were introduced in gen6. */
1048 if (brw->gen < 6)
1049 return false;
1050
1051 /* MAD can only handle floating-point data. */
1052 if (ir->type->base_type != GLSL_TYPE_FLOAT)
1053 return false;
1054
1055 ir_rvalue *nonmul = ir->operands[1];
1056 ir_expression *mul = ir->operands[0]->as_expression();
1057
1058 if (!mul || mul->operation != ir_binop_mul) {
1059 nonmul = ir->operands[0];
1060 mul = ir->operands[1]->as_expression();
1061
1062 if (!mul || mul->operation != ir_binop_mul)
1063 return false;
1064 }
1065
1066 nonmul->accept(this);
1067 src_reg src0 = fix_3src_operand(this->result);
1068
1069 mul->operands[0]->accept(this);
1070 src_reg src1 = fix_3src_operand(this->result);
1071
1072 mul->operands[1]->accept(this);
1073 src_reg src2 = fix_3src_operand(this->result);
1074
1075 this->result = src_reg(this, ir->type);
1076 emit(BRW_OPCODE_MAD, dst_reg(this->result), src0, src1, src2);
1077
1078 return true;
1079 }
1080
1081 bool
1082 vec4_visitor::try_emit_b2f_of_compare(ir_expression *ir)
1083 {
1084 /* This optimization relies on CMP setting the destination to 0 when
1085 * false. Early hardware only sets the least significant bit, and
1086 * leaves the other bits undefined. So we can't use it.
1087 */
1088 if (brw->gen < 6)
1089 return false;
1090
1091 ir_expression *const cmp = ir->operands[0]->as_expression();
1092
1093 if (cmp == NULL)
1094 return false;
1095
1096 switch (cmp->operation) {
1097 case ir_binop_less:
1098 case ir_binop_greater:
1099 case ir_binop_lequal:
1100 case ir_binop_gequal:
1101 case ir_binop_equal:
1102 case ir_binop_nequal:
1103 break;
1104
1105 default:
1106 return false;
1107 }
1108
1109 cmp->operands[0]->accept(this);
1110 const src_reg cmp_src0 = this->result;
1111
1112 cmp->operands[1]->accept(this);
1113 const src_reg cmp_src1 = this->result;
1114
1115 this->result = src_reg(this, ir->type);
1116
1117 emit(CMP(dst_reg(this->result), cmp_src0, cmp_src1,
1118 brw_conditional_for_comparison(cmp->operation)));
1119
1120 /* If the comparison is false, this->result will just happen to be zero.
1121 */
1122 vec4_instruction *const inst = emit(BRW_OPCODE_SEL, dst_reg(this->result),
1123 this->result, src_reg(1.0f));
1124 inst->predicate = BRW_PREDICATE_NORMAL;
1125 inst->predicate_inverse = true;
1126
1127 return true;
1128 }
1129
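/**
 * Emit a min or max as a SEL.  On gen6+ SEL takes the conditional mod
 * directly (BRW_CONDITIONAL_L for min, _G for max); older hardware needs
 * an explicit CMP to set the flag, followed by a predicated SEL.
 */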
1130 void
1131 vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
1132 src_reg src0, src_reg src1)
1133 {
1134 vec4_instruction *inst;
1135
1136 if (brw->gen >= 6) {
1137 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1138 inst->conditional_mod = conditionalmod;
1139 } else {
1140 emit(CMP(dst, src0, src1, conditionalmod));
1141
1142 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
1143 inst->predicate = BRW_PREDICATE_NORMAL;
1144 }
1145 }
1146
1147 void
1148 vec4_visitor::emit_lrp(const dst_reg &dst,
1149 const src_reg &x, const src_reg &y, const src_reg &a)
1150 {
1151 if (brw->gen >= 6) {
1152 /* Note that the instruction's argument order is reversed from GLSL
1153 * and the IR.
1154 */
1155 emit(LRP(dst,
1156 fix_3src_operand(a), fix_3src_operand(y), fix_3src_operand(x)));
1157 } else {
1158 /* Earlier generations don't support three source operations, so we
1159 * need to emit x*(1-a) + y*a.
1160 */
1161 dst_reg y_times_a = dst_reg(this, glsl_type::vec4_type);
1162 dst_reg one_minus_a = dst_reg(this, glsl_type::vec4_type);
1163 dst_reg x_times_one_minus_a = dst_reg(this, glsl_type::vec4_type);
1164 y_times_a.writemask = dst.writemask;
1165 one_minus_a.writemask = dst.writemask;
1166 x_times_one_minus_a.writemask = dst.writemask;
1167
1168 emit(MUL(y_times_a, y, a));
1169 emit(ADD(one_minus_a, negate(a), src_reg(1.0f)));
1170 emit(MUL(x_times_one_minus_a, x, src_reg(one_minus_a)));
1171 emit(ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)));
1172 }
1173 }
1174
1175 void
1176 vec4_visitor::visit(ir_expression *ir)
1177 {
1178 unsigned int operand;
1179 src_reg op[Elements(ir->operands)];
1180 vec4_instruction *inst;
1181
1182 if (ir->operation == ir_binop_add) {
1183 if (try_emit_mad(ir))
1184 return;
1185 }
1186
1187 if (ir->operation == ir_unop_b2f) {
1188 if (try_emit_b2f_of_compare(ir))
1189 return;
1190 }
1191
1192 /* Storage for our result. Ideally for an assignment we'd be using
1193 * the actual storage for the result here, instead.
1194 */
1195 dst_reg result_dst(this, ir->type);
1196 src_reg result_src(result_dst);
1197
1198 if (ir->operation == ir_triop_csel) {
1199 ir->operands[1]->accept(this);
1200 op[1] = this->result;
1201 ir->operands[2]->accept(this);
1202 op[2] = this->result;
1203
1204 enum brw_predicate predicate;
1205 emit_bool_to_cond_code(ir->operands[0], &predicate);
1206 inst = emit(BRW_OPCODE_SEL, result_dst, op[1], op[2]);
1207 inst->predicate = predicate;
1208 this->result = result_src;
1209 return;
1210 }
1211
1212 for (operand = 0; operand < ir->get_num_operands(); operand++) {
1213 this->result.file = BAD_FILE;
1214 ir->operands[operand]->accept(this);
1215 if (this->result.file == BAD_FILE) {
1216 fprintf(stderr, "Failed to get tree for expression operand:\n");
1217 ir->operands[operand]->fprint(stderr);
1218 exit(1);
1219 }
1220 op[operand] = this->result;
1221
1222 /* Matrix expression operands should have been broken down to vector
1223 * operations already.
1224 */
1225 assert(!ir->operands[operand]->type->is_matrix());
1226 }
1227
1228 /* If nothing special happens, this is the result. */
1229 this->result = result_src;
1230
1231 switch (ir->operation) {
1232 case ir_unop_logic_not:
1233 if (ctx->Const.UniformBooleanTrue != 1) {
1234 emit(NOT(result_dst, op[0]));
1235 } else {
1236 emit(XOR(result_dst, op[0], src_reg(1u)));
1237 }
1238 break;
1239 case ir_unop_neg:
1240 op[0].negate = !op[0].negate;
1241 emit(MOV(result_dst, op[0]));
1242 break;
1243 case ir_unop_abs:
1244 op[0].abs = true;
1245 op[0].negate = false;
1246 emit(MOV(result_dst, op[0]));
1247 break;
1248
1249 case ir_unop_sign:
1250 if (ir->type->is_float()) {
1251 /* AND(val, 0x80000000) gives the sign bit.
1252 *
1253 * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
1254 * zero.
1255 */
1256 emit(CMP(dst_null_f(), op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1257
1258 op[0].type = BRW_REGISTER_TYPE_UD;
1259 result_dst.type = BRW_REGISTER_TYPE_UD;
1260 emit(AND(result_dst, op[0], src_reg(0x80000000u)));
1261
1262 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(0x3f800000u)));
1263 inst->predicate = BRW_PREDICATE_NORMAL;
1264
1265 this->result.type = BRW_REGISTER_TYPE_F;
1266 } else {
1267 /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
1268 * -> non-negative val generates 0x00000000.
1269 * Predicated OR sets 1 if val is positive.
1270 */
1271 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_G));
1272
1273 emit(ASR(result_dst, op[0], src_reg(31)));
1274
1275 inst = emit(OR(result_dst, src_reg(result_dst), src_reg(1)));
1276 inst->predicate = BRW_PREDICATE_NORMAL;
1277 }
1278 break;
1279
1280 case ir_unop_rcp:
1281 emit_math(SHADER_OPCODE_RCP, result_dst, op[0]);
1282 break;
1283
1284 case ir_unop_exp2:
1285 emit_math(SHADER_OPCODE_EXP2, result_dst, op[0]);
1286 break;
1287 case ir_unop_log2:
1288 emit_math(SHADER_OPCODE_LOG2, result_dst, op[0]);
1289 break;
1290 case ir_unop_exp:
1291 case ir_unop_log:
1292 unreachable("not reached: should be handled by ir_explog_to_explog2");
1293 case ir_unop_sin:
1294 case ir_unop_sin_reduced:
1295 emit_math(SHADER_OPCODE_SIN, result_dst, op[0]);
1296 break;
1297 case ir_unop_cos:
1298 case ir_unop_cos_reduced:
1299 emit_math(SHADER_OPCODE_COS, result_dst, op[0]);
1300 break;
1301
1302 case ir_unop_dFdx:
1303 case ir_unop_dFdx_coarse:
1304 case ir_unop_dFdx_fine:
1305 case ir_unop_dFdy:
1306 case ir_unop_dFdy_coarse:
1307 case ir_unop_dFdy_fine:
1308 unreachable("derivatives not valid in vertex shader");
1309
1310 case ir_unop_bitfield_reverse:
1311 emit(BFREV(result_dst, op[0]));
1312 break;
1313 case ir_unop_bit_count:
1314 emit(CBIT(result_dst, op[0]));
1315 break;
1316 case ir_unop_find_msb: {
1317 src_reg temp = src_reg(this, glsl_type::uint_type);
1318
1319 inst = emit(FBH(dst_reg(temp), op[0]));
1320 inst->dst.writemask = WRITEMASK_XYZW;
1321
1322 /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
1323 * from the LSB side. If FBH didn't return an error (0xFFFFFFFF), then
1324 * subtract the result from 31 to convert the MSB count into an LSB count.
1325 */
1326
1327 /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
1328 temp.swizzle = BRW_SWIZZLE_NOOP;
1329 emit(MOV(result_dst, temp));
1330
1331 src_reg src_tmp = src_reg(result_dst);
1332 emit(CMP(dst_null_d(), src_tmp, src_reg(-1), BRW_CONDITIONAL_NZ));
1333
1334 src_tmp.negate = true;
1335 inst = emit(ADD(result_dst, src_tmp, src_reg(31)));
1336 inst->predicate = BRW_PREDICATE_NORMAL;
1337 break;
1338 }
1339 case ir_unop_find_lsb:
1340 emit(FBL(result_dst, op[0]));
1341 break;
1342 case ir_unop_saturate:
1343 inst = emit(MOV(result_dst, op[0]));
1344 inst->saturate = true;
1345 break;
1346
1347 case ir_unop_noise:
1348 unreachable("not reached: should be handled by lower_noise");
1349
1350 case ir_binop_add:
1351 emit(ADD(result_dst, op[0], op[1]));
1352 break;
1353 case ir_binop_sub:
1354 unreachable("not reached: should be handled by ir_sub_to_add_neg");
1355
1356 case ir_binop_mul:
1357 if (brw->gen < 8 && ir->type->is_integer()) {
1358 /* For integer multiplication, the MUL uses the low 16 bits of one of
1359 * the operands (src0 through SNB, src1 on IVB and later). The MACH
1360          * accumulates the contribution of the upper 16 bits of that
1361 * operand. If we can determine that one of the args is in the low
1362 * 16 bits, though, we can just emit a single MUL.
1363 */
1364 if (ir->operands[0]->is_uint16_constant()) {
1365 if (brw->gen < 7)
1366 emit(MUL(result_dst, op[0], op[1]));
1367 else
1368 emit(MUL(result_dst, op[1], op[0]));
1369 } else if (ir->operands[1]->is_uint16_constant()) {
1370 if (brw->gen < 7)
1371 emit(MUL(result_dst, op[1], op[0]));
1372 else
1373 emit(MUL(result_dst, op[0], op[1]));
1374 } else {
1375 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1376
1377 emit(MUL(acc, op[0], op[1]));
1378 emit(MACH(dst_null_d(), op[0], op[1]));
1379 emit(MOV(result_dst, src_reg(acc)));
1380 }
1381 } else {
1382 emit(MUL(result_dst, op[0], op[1]));
1383 }
1384 break;
1385 case ir_binop_imul_high: {
1386 struct brw_reg acc = retype(brw_acc_reg(8), result_dst.type);
1387
1388 emit(MUL(acc, op[0], op[1]));
1389 emit(MACH(result_dst, op[0], op[1]));
1390 break;
1391 }
1392 case ir_binop_div:
1393 /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
1394 assert(ir->type->is_integer());
1395 emit_math(SHADER_OPCODE_INT_QUOTIENT, result_dst, op[0], op[1]);
1396 break;
1397 case ir_binop_carry: {
1398 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1399
1400 emit(ADDC(dst_null_ud(), op[0], op[1]));
1401 emit(MOV(result_dst, src_reg(acc)));
1402 break;
1403 }
1404 case ir_binop_borrow: {
1405 struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
1406
1407 emit(SUBB(dst_null_ud(), op[0], op[1]));
1408 emit(MOV(result_dst, src_reg(acc)));
1409 break;
1410 }
1411 case ir_binop_mod:
1412 /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
1413 assert(ir->type->is_integer());
1414 emit_math(SHADER_OPCODE_INT_REMAINDER, result_dst, op[0], op[1]);
1415 break;
1416
1417 case ir_binop_less:
1418 case ir_binop_greater:
1419 case ir_binop_lequal:
1420 case ir_binop_gequal:
1421 case ir_binop_equal:
1422 case ir_binop_nequal: {
1423 emit(CMP(result_dst, op[0], op[1],
1424 brw_conditional_for_comparison(ir->operation)));
1425 if (ctx->Const.UniformBooleanTrue == 1) {
1426 emit(AND(result_dst, result_src, src_reg(1u)));
1427 }
1428 break;
1429 }
1430
1431 case ir_binop_all_equal:
1432 /* "==" operator producing a scalar boolean. */
1433 if (ir->operands[0]->type->is_vector() ||
1434 ir->operands[1]->type->is_vector()) {
1435 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_Z));
1436 emit(MOV(result_dst, src_reg(0)));
1437 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1438 inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
1439 } else {
1440 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_Z));
1441 if (ctx->Const.UniformBooleanTrue == 1) {
1442 emit(AND(result_dst, result_src, src_reg(1u)));
1443 }
1444 }
1445 break;
1446 case ir_binop_any_nequal:
1447 /* "!=" operator producing a scalar boolean. */
1448 if (ir->operands[0]->type->is_vector() ||
1449 ir->operands[1]->type->is_vector()) {
1450 emit(CMP(dst_null_d(), op[0], op[1], BRW_CONDITIONAL_NZ));
1451
1452 emit(MOV(result_dst, src_reg(0)));
1453 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1454 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1455 } else {
1456 emit(CMP(result_dst, op[0], op[1], BRW_CONDITIONAL_NZ));
1457 if (ctx->Const.UniformBooleanTrue == 1) {
1458 emit(AND(result_dst, result_src, src_reg(1u)));
1459 }
1460 }
1461 break;
1462
1463 case ir_unop_any:
1464 emit(CMP(dst_null_d(), op[0], src_reg(0), BRW_CONDITIONAL_NZ));
1465 emit(MOV(result_dst, src_reg(0)));
1466
1467 inst = emit(MOV(result_dst, src_reg(ctx->Const.UniformBooleanTrue)));
1468 inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
1469 break;
1470
1471 case ir_binop_logic_xor:
1472 emit(XOR(result_dst, op[0], op[1]));
1473 break;
1474
1475 case ir_binop_logic_or:
1476 emit(OR(result_dst, op[0], op[1]));
1477 break;
1478
1479 case ir_binop_logic_and:
1480 emit(AND(result_dst, op[0], op[1]));
1481 break;
1482
1483 case ir_binop_dot:
1484 assert(ir->operands[0]->type->is_vector());
1485 assert(ir->operands[0]->type == ir->operands[1]->type);
1486 emit_dp(result_dst, op[0], op[1], ir->operands[0]->type->vector_elements);
1487 break;
1488
1489 case ir_unop_sqrt:
1490 emit_math(SHADER_OPCODE_SQRT, result_dst, op[0]);
1491 break;
1492 case ir_unop_rsq:
1493 emit_math(SHADER_OPCODE_RSQ, result_dst, op[0]);
1494 break;
1495
1496 case ir_unop_bitcast_i2f:
1497 case ir_unop_bitcast_u2f:
1498 this->result = op[0];
1499 this->result.type = BRW_REGISTER_TYPE_F;
1500 break;
1501
1502 case ir_unop_bitcast_f2i:
1503 this->result = op[0];
1504 this->result.type = BRW_REGISTER_TYPE_D;
1505 break;
1506
1507 case ir_unop_bitcast_f2u:
1508 this->result = op[0];
1509 this->result.type = BRW_REGISTER_TYPE_UD;
1510 break;
1511
1512 case ir_unop_i2f:
1513 case ir_unop_i2u:
1514 case ir_unop_u2i:
1515 case ir_unop_u2f:
1516 case ir_unop_f2i:
1517 case ir_unop_f2u:
1518 emit(MOV(result_dst, op[0]));
1519 break;
1520 case ir_unop_b2i:
1521 if (ctx->Const.UniformBooleanTrue != 1) {
1522 emit(AND(result_dst, op[0], src_reg(1u)));
1523 } else {
1524 emit(MOV(result_dst, op[0]));
1525 }
1526 break;
1527 case ir_unop_b2f:
1528 if (ctx->Const.UniformBooleanTrue != 1) {
1529 op[0].type = BRW_REGISTER_TYPE_UD;
1530 result_dst.type = BRW_REGISTER_TYPE_UD;
1531 emit(AND(result_dst, op[0], src_reg(0x3f800000u)));
1532 result_dst.type = BRW_REGISTER_TYPE_F;
1533 } else {
1534 emit(MOV(result_dst, op[0]));
1535 }
1536 break;
1537 case ir_unop_f2b:
1538 case ir_unop_i2b:
1539 emit(CMP(result_dst, op[0], src_reg(0.0f), BRW_CONDITIONAL_NZ));
1540 if (ctx->Const.UniformBooleanTrue == 1) {
1541 emit(AND(result_dst, result_src, src_reg(1u)));
1542 }
1543 break;
1544
1545 case ir_unop_trunc:
1546 emit(RNDZ(result_dst, op[0]));
1547 break;
1548 case ir_unop_ceil:
1549 op[0].negate = !op[0].negate;
1550 inst = emit(RNDD(result_dst, op[0]));
1551 this->result.negate = true;
1552 break;
1553 case ir_unop_floor:
1554 inst = emit(RNDD(result_dst, op[0]));
1555 break;
1556 case ir_unop_fract:
1557 inst = emit(FRC(result_dst, op[0]));
1558 break;
1559 case ir_unop_round_even:
1560 emit(RNDE(result_dst, op[0]));
1561 break;
1562
1563 case ir_binop_min:
1564 emit_minmax(BRW_CONDITIONAL_L, result_dst, op[0], op[1]);
1565 break;
1566 case ir_binop_max:
1567 emit_minmax(BRW_CONDITIONAL_G, result_dst, op[0], op[1]);
1568 break;
1569
1570 case ir_binop_pow:
1571 emit_math(SHADER_OPCODE_POW, result_dst, op[0], op[1]);
1572 break;
1573
1574 case ir_unop_bit_not:
1575 inst = emit(NOT(result_dst, op[0]));
1576 break;
1577 case ir_binop_bit_and:
1578 inst = emit(AND(result_dst, op[0], op[1]));
1579 break;
1580 case ir_binop_bit_xor:
1581 inst = emit(XOR(result_dst, op[0], op[1]));
1582 break;
1583 case ir_binop_bit_or:
1584 inst = emit(OR(result_dst, op[0], op[1]));
1585 break;
1586
1587 case ir_binop_lshift:
1588 inst = emit(SHL(result_dst, op[0], op[1]));
1589 break;
1590
1591 case ir_binop_rshift:
1592 if (ir->type->base_type == GLSL_TYPE_INT)
1593 inst = emit(ASR(result_dst, op[0], op[1]));
1594 else
1595 inst = emit(SHR(result_dst, op[0], op[1]));
1596 break;
1597
1598 case ir_binop_bfm:
1599 emit(BFI1(result_dst, op[0], op[1]));
1600 break;
1601
1602 case ir_binop_ubo_load: {
1603 ir_constant *const_uniform_block = ir->operands[0]->as_constant();
1604 ir_constant *const_offset_ir = ir->operands[1]->as_constant();
1605 unsigned const_offset = const_offset_ir ? const_offset_ir->value.u[0] : 0;
1606 src_reg offset;
1607
1608 /* Now, load the vector from that offset. */
1609 assert(ir->type->is_vector() || ir->type->is_scalar());
1610
1611 src_reg packed_consts = src_reg(this, glsl_type::vec4_type);
1612 packed_consts.type = result.type;
1613 src_reg surf_index;
1614
1615 if (const_uniform_block) {
1616 /* The block index is a constant, so just emit the binding table entry
1617 * as an immediate.
1618 */
1619 surf_index = src_reg(prog_data->base.binding_table.ubo_start +
1620 const_uniform_block->value.u[0]);
1621 } else {
1622 /* The block index is not a constant. Evaluate the index expression
1623 * per-channel and add the base UBO index; the generator will select
1624 * a value from any live channel.
1625 */
1626 surf_index = src_reg(this, glsl_type::uint_type);
1627 emit(ADD(dst_reg(surf_index), op[0],
1628 src_reg(prog_data->base.binding_table.ubo_start)));
1629
1630 /* Assume this may touch any UBO. It would be nice to provide
1631 * a tighter bound, but the array information is already lowered away.
1632 */
1633 brw_mark_surface_used(&prog_data->base,
1634 prog_data->base.binding_table.ubo_start +
1635 shader_prog->NumUniformBlocks - 1);
1636 }
1637
1638 if (const_offset_ir) {
1639 if (brw->gen >= 8) {
1640 /* Store the offset in a GRF so we can send-from-GRF. */
1641 offset = src_reg(this, glsl_type::int_type);
1642 emit(MOV(dst_reg(offset), src_reg(const_offset / 16)));
1643 } else {
1644 /* Immediates are fine on older generations since they'll be moved
1645 * to a (potentially fake) MRF at the generator level.
1646 */
1647 offset = src_reg(const_offset / 16);
1648 }
1649 } else {
1650 offset = src_reg(this, glsl_type::uint_type);
1651 emit(SHR(dst_reg(offset), op[1], src_reg(4)));
1652 }
1653
1654 if (brw->gen >= 7) {
1655 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
1656 grf_offset.type = offset.type;
1657
1658 emit(MOV(grf_offset, offset));
1659
1660 emit(new(mem_ctx) vec4_instruction(this,
1661 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
1662 dst_reg(packed_consts),
1663 surf_index,
1664 src_reg(grf_offset)));
1665 } else {
1666 vec4_instruction *pull =
1667 emit(new(mem_ctx) vec4_instruction(this,
1668 VS_OPCODE_PULL_CONSTANT_LOAD,
1669 dst_reg(packed_consts),
1670 surf_index,
1671 offset));
1672 pull->base_mrf = 14;
1673 pull->mlen = 1;
1674 }
1675
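      /* packed_consts holds a whole 16-byte pull-constant slot.  Narrow the
       * swizzle to the value's component count, then offset every channel by
       * the value's starting component within that slot, so reads that do
       * not begin at a 16-byte boundary pick up the right components.
       */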
1676 packed_consts.swizzle = swizzle_for_size(ir->type->vector_elements);
1677 packed_consts.swizzle += BRW_SWIZZLE4(const_offset % 16 / 4,
1678 const_offset % 16 / 4,
1679 const_offset % 16 / 4,
1680 const_offset % 16 / 4);
1681
1682 /* UBO bools are any nonzero int. We need to convert them to use the
1683 * value of true stored in ctx->Const.UniformBooleanTrue.
1684 */
1685 if (ir->type->base_type == GLSL_TYPE_BOOL) {
1686 emit(CMP(result_dst, packed_consts, src_reg(0u),
1687 BRW_CONDITIONAL_NZ));
1688 if (ctx->Const.UniformBooleanTrue == 1) {
1689 emit(AND(result_dst, result, src_reg(1u)));
1690 }
1691 } else {
1692 emit(MOV(result_dst, packed_consts));
1693 }
1694 break;
1695 }
1696
1697 case ir_binop_vector_extract:
1698 unreachable("should have been lowered by vec_index_to_cond_assign");
1699
1700 case ir_triop_fma:
1701 op[0] = fix_3src_operand(op[0]);
1702 op[1] = fix_3src_operand(op[1]);
1703 op[2] = fix_3src_operand(op[2]);
1704 /* Note that the instruction's argument order is reversed from GLSL
1705 * and the IR.
1706 */
1707 emit(MAD(result_dst, op[2], op[1], op[0]));
1708 break;
1709
1710 case ir_triop_lrp:
1711 emit_lrp(result_dst, op[0], op[1], op[2]);
1712 break;
1713
1714 case ir_triop_csel:
1715 unreachable("already handled above");
1716 break;
1717
1718 case ir_triop_bfi:
1719 op[0] = fix_3src_operand(op[0]);
1720 op[1] = fix_3src_operand(op[1]);
1721 op[2] = fix_3src_operand(op[2]);
1722 emit(BFI2(result_dst, op[0], op[1], op[2]));
1723 break;
1724
1725 case ir_triop_bitfield_extract:
1726 op[0] = fix_3src_operand(op[0]);
1727 op[1] = fix_3src_operand(op[1]);
1728 op[2] = fix_3src_operand(op[2]);
1729 /* Note that the instruction's argument order is reversed from GLSL
1730 * and the IR.
1731 */
1732 emit(BFE(result_dst, op[2], op[1], op[0]));
1733 break;
1734
1735 case ir_triop_vector_insert:
1736 unreachable("should have been lowered by lower_vector_insert");
1737
1738 case ir_quadop_bitfield_insert:
1739 unreachable("not reached: should be handled by "
1740 "bitfield_insert_to_bfm_bfi\n");
1741
1742 case ir_quadop_vector:
1743 unreachable("not reached: should be handled by lower_quadop_vector");
1744
1745 case ir_unop_pack_half_2x16:
1746 emit_pack_half_2x16(result_dst, op[0]);
1747 break;
1748 case ir_unop_unpack_half_2x16:
1749 emit_unpack_half_2x16(result_dst, op[0]);
1750 break;
1751 case ir_unop_pack_snorm_2x16:
1752 case ir_unop_pack_snorm_4x8:
1753 case ir_unop_pack_unorm_2x16:
1754 case ir_unop_pack_unorm_4x8:
1755 case ir_unop_unpack_snorm_2x16:
1756 case ir_unop_unpack_snorm_4x8:
1757 case ir_unop_unpack_unorm_2x16:
1758 case ir_unop_unpack_unorm_4x8:
1759 unreachable("not reached: should be handled by lower_packing_builtins");
1760 case ir_unop_unpack_half_2x16_split_x:
1761 case ir_unop_unpack_half_2x16_split_y:
1762 case ir_binop_pack_half_2x16_split:
1763 case ir_unop_interpolate_at_centroid:
1764 case ir_binop_interpolate_at_sample:
1765 case ir_binop_interpolate_at_offset:
1766 unreachable("not reached: should not occur in vertex shader");
1767 case ir_binop_ldexp:
1768 unreachable("not reached: should be handled by ldexp_to_arith()");
1769 }
1770 }
1771
1772
1773 void
1774 vec4_visitor::visit(ir_swizzle *ir)
1775 {
1776 src_reg src;
1777 int i = 0;
1778 int swizzle[4];
1779
1780 /* Note that this is only swizzles in expressions, not those on the left
1781 * hand side of an assignment, which do write masking. See ir_assignment
1782 * for that.
1783 */
1784
1785 ir->val->accept(this);
1786 src = this->result;
1787 assert(src.file != BAD_FILE);
1788
1789 for (i = 0; i < ir->type->vector_elements; i++) {
1790 switch (i) {
1791 case 0:
1792 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.x);
1793 break;
1794 case 1:
1795 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.y);
1796 break;
1797 case 2:
1798 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.z);
1799 break;
1800 case 3:
1801 swizzle[i] = BRW_GET_SWZ(src.swizzle, ir->mask.w);
1802 break;
1803 }
1804 }
1805 for (; i < 4; i++) {
1806 /* Replicate the last channel out. */
1807 swizzle[i] = swizzle[ir->type->vector_elements - 1];
1808 }
1809
1810 src.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
1811
1812 this->result = src;
1813 }
1814
1815 void
1816 vec4_visitor::visit(ir_dereference_variable *ir)
1817 {
1818 const struct glsl_type *type = ir->type;
1819 dst_reg *reg = variable_storage(ir->var);
1820
1821 if (!reg) {
1822 fail("Failed to find variable storage for %s\n", ir->var->name);
1823 this->result = src_reg(brw_null_reg());
1824 return;
1825 }
1826
1827 this->result = src_reg(*reg);
1828
1829 /* System values get their swizzle from the dst_reg writemask */
1830 if (ir->var->data.mode == ir_var_system_value)
1831 return;
1832
1833 if (type->is_scalar() || type->is_vector() || type->is_matrix())
1834 this->result.swizzle = swizzle_for_size(type->vector_elements);
1835 }
1836
1837
1838 int
1839 vec4_visitor::compute_array_stride(ir_dereference_array *ir)
1840 {
1841 /* Under normal circumstances array elements are stored consecutively, so
1842 * the stride is equal to the size of the array element.
1843 */
1844 return type_size(ir->type);
1845 }
1846
1847
1848 void
1849 vec4_visitor::visit(ir_dereference_array *ir)
1850 {
1851 ir_constant *constant_index;
1852 src_reg src;
1853 int array_stride = compute_array_stride(ir);
1854
1855 constant_index = ir->array_index->constant_expression_value();
1856
1857 ir->array->accept(this);
1858 src = this->result;
1859
1860 if (constant_index) {
1861 src.reg_offset += constant_index->value.i[0] * array_stride;
1862 } else {
1863 /* Variable index array dereference. It eats the "vec4" of the
1864 * base of the array and an index that offsets the Mesa register
1865 * index.
1866 */
1867 ir->array_index->accept(this);
1868
1869 src_reg index_reg;
1870
1871 if (array_stride == 1) {
1872 index_reg = this->result;
1873 } else {
1874 index_reg = src_reg(this, glsl_type::int_type);
1875
1876 emit(MUL(dst_reg(index_reg), this->result, src_reg(array_stride)));
1877 }
1878
1879 if (src.reladdr) {
1880 src_reg temp = src_reg(this, glsl_type::int_type);
1881
1882 emit(ADD(dst_reg(temp), *src.reladdr, index_reg));
1883
1884 index_reg = temp;
1885 }
1886
1887 src.reladdr = ralloc(mem_ctx, src_reg);
1888 memcpy(src.reladdr, &index_reg, sizeof(index_reg));
1889 }
1890
1891 /* If the type is smaller than a vec4, replicate the last channel out. */
1892 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1893 src.swizzle = swizzle_for_size(ir->type->vector_elements);
1894 else
1895 src.swizzle = BRW_SWIZZLE_NOOP;
1896 src.type = brw_type_for_base_type(ir->type);
1897
1898 this->result = src;
1899 }
1900
1901 void
1902 vec4_visitor::visit(ir_dereference_record *ir)
1903 {
1904 unsigned int i;
1905 const glsl_type *struct_type = ir->record->type;
1906 int offset = 0;
1907
1908 ir->record->accept(this);
1909
1910 for (i = 0; i < struct_type->length; i++) {
1911 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
1912 break;
1913 offset += type_size(struct_type->fields.structure[i].type);
1914 }
1915
1916 /* If the type is smaller than a vec4, replicate the last channel out. */
1917 if (ir->type->is_scalar() || ir->type->is_vector() || ir->type->is_matrix())
1918 this->result.swizzle = swizzle_for_size(ir->type->vector_elements);
1919 else
1920 this->result.swizzle = BRW_SWIZZLE_NOOP;
1921 this->result.type = brw_type_for_base_type(ir->type);
1922
1923 this->result.reg_offset += offset;
1924 }
1925
1926 /**
1927 * We want to be careful in assignment setup to hit the actual storage
1928 * instead of potentially using a temporary like we might with the
1929 * ir_dereference handler.
1930 */
1931 static dst_reg
1932 get_assignment_lhs(ir_dereference *ir, vec4_visitor *v)
1933 {
1934 /* The LHS must be a dereference. If the LHS is a variable indexed array
1935 * access of a vector, it must be separated into a series of conditional moves
1936 * before reaching this point (see ir_vec_index_to_cond_assign).
1937 */
1938 assert(ir->as_dereference());
1939 ir_dereference_array *deref_array = ir->as_dereference_array();
1940 if (deref_array) {
1941 assert(!deref_array->array->type->is_vector());
1942 }
1943
1944 /* Use the rvalue deref handler for the most part. We'll ignore
1945 * swizzles in it and write swizzles using writemask, though.
1946 */
1947 ir->accept(v);
1948 return dst_reg(v->result);
1949 }
1950
1951 void
1952 vec4_visitor::emit_block_move(dst_reg *dst, src_reg *src,
1953 const struct glsl_type *type,
1954 enum brw_predicate predicate)
1955 {
1956 if (type->base_type == GLSL_TYPE_STRUCT) {
1957 for (unsigned int i = 0; i < type->length; i++) {
1958 emit_block_move(dst, src, type->fields.structure[i].type, predicate);
1959 }
1960 return;
1961 }
1962
1963 if (type->is_array()) {
1964 for (unsigned int i = 0; i < type->length; i++) {
1965 emit_block_move(dst, src, type->fields.array, predicate);
1966 }
1967 return;
1968 }
1969
1970 if (type->is_matrix()) {
1971 const struct glsl_type *vec_type;
1972
1973 vec_type = glsl_type::get_instance(GLSL_TYPE_FLOAT,
1974 type->vector_elements, 1);
1975
1976 for (int i = 0; i < type->matrix_columns; i++) {
1977 emit_block_move(dst, src, vec_type, predicate);
1978 }
1979 return;
1980 }
1981
1982 assert(type->is_scalar() || type->is_vector());
1983
1984 dst->type = brw_type_for_base_type(type);
1985 src->type = dst->type;
1986
1987 dst->writemask = (1 << type->vector_elements) - 1;
1988
1989 src->swizzle = swizzle_for_size(type->vector_elements);
1990
1991 vec4_instruction *inst = emit(MOV(*dst, *src));
1992 inst->predicate = predicate;
1993
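/* Advance to the next vec4; the struct, array and matrix cases above
 * recurse into this base case once per element, stepping through storage
 * one register at a time.
 */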
1994 dst->reg_offset++;
1995 src->reg_offset++;
1996 }
1997
1998
1999 /* If the RHS processing resulted in an instruction generating a
2000 * temporary value, and it would be easy to rewrite the instruction to
2001 * generate its result right into the LHS instead, do so. This ends
2002 * up reliably removing instructions where it can be tricky to do so
2003 * later without real UD chain information.
2004 */
2005 bool
2006 vec4_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
2007 dst_reg dst,
2008 src_reg src,
2009 vec4_instruction *pre_rhs_inst,
2010 vec4_instruction *last_rhs_inst)
2011 {
2012 /* This could be supported, but it would take more smarts. */
2013 if (ir->condition)
2014 return false;
2015
2016 if (pre_rhs_inst == last_rhs_inst)
2017 return false; /* No instructions generated to work with. */
2018
2019 /* Make sure the last instruction generated our source reg. */
2020 if (src.file != GRF ||
2021 src.file != last_rhs_inst->dst.file ||
2022 src.reg != last_rhs_inst->dst.reg ||
2023 src.reg_offset != last_rhs_inst->dst.reg_offset ||
2024 src.reladdr ||
2025 src.abs ||
2026 src.negate ||
2027 last_rhs_inst->predicate != BRW_PREDICATE_NONE)
2028 return false;
2029
2030 /* Check that the last instruction fully initialized the channels
2031 * we want to use, in the order we want to use them. We could
2032 * potentially reswizzle the operands of many instructions so that
2033 * we could handle out of order channels, but don't yet.
2034 */
2035
2036 for (unsigned i = 0; i < 4; i++) {
2037 if (dst.writemask & (1 << i)) {
2038 if (!(last_rhs_inst->dst.writemask & (1 << i)))
2039 return false;
2040
2041 if (BRW_GET_SWZ(src.swizzle, i) != i)
2042 return false;
2043 }
2044 }
2045
2046 /* Success! Rewrite the instruction. */
2047 last_rhs_inst->dst.file = dst.file;
2048 last_rhs_inst->dst.reg = dst.reg;
2049 last_rhs_inst->dst.reg_offset = dst.reg_offset;
2050 last_rhs_inst->dst.reladdr = dst.reladdr;
2051 last_rhs_inst->dst.writemask &= dst.writemask;
2052
2053 return true;
2054 }
2055
2056 void
2057 vec4_visitor::visit(ir_assignment *ir)
2058 {
2059 dst_reg dst = get_assignment_lhs(ir->lhs, this);
2060 enum brw_predicate predicate = BRW_PREDICATE_NONE;
2061
2062 if (!ir->lhs->type->is_scalar() &&
2063 !ir->lhs->type->is_vector()) {
2064 ir->rhs->accept(this);
2065 src_reg src = this->result;
2066
2067 if (ir->condition) {
2068 emit_bool_to_cond_code(ir->condition, &predicate);
2069 }
2070
2071 /* emit_block_move doesn't account for swizzles in the source register.
2072 * This should be ok, since the source register is a structure or an
2073 * array, and those can't be swizzled. But double-check to be sure.
2074 */
2075 assert(src.swizzle ==
2076 (ir->rhs->type->is_matrix()
2077 ? swizzle_for_size(ir->rhs->type->vector_elements)
2078 : BRW_SWIZZLE_NOOP));
2079
2080 emit_block_move(&dst, &src, ir->rhs->type, predicate);
2081 return;
2082 }
2083
2084 /* Now we're down to just a scalar/vector with writemasks. */
2085 int i;
2086
2087 vec4_instruction *pre_rhs_inst, *last_rhs_inst;
2088 pre_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2089
2090 ir->rhs->accept(this);
2091
2092 last_rhs_inst = (vec4_instruction *)this->instructions.get_tail();
2093
2094 src_reg src = this->result;
2095
2096 int swizzles[4];
2097 int first_enabled_chan = 0;
2098 int src_chan = 0;
2099
2100 assert(ir->lhs->type->is_vector() ||
2101 ir->lhs->type->is_scalar());
2102 dst.writemask = ir->write_mask;
2103
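/* Find the source channel of the first enabled write channel; it is used
 * below to fill the swizzle slots of channels that are not written.
 */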
2104 for (int i = 0; i < 4; i++) {
2105 if (dst.writemask & (1 << i)) {
2106 first_enabled_chan = BRW_GET_SWZ(src.swizzle, i);
2107 break;
2108 }
2109 }
2110
2111 /* Swizzle a small RHS vector into the channels being written.
2112 *
2113 * GLSL IR treats write_mask as dictating how many channels are
2114 * present on the RHS, while in our instructions we need to make
2115 * those channels appear in the slots of the vec4 they're written to.
2116 */
2117 for (int i = 0; i < 4; i++) {
2118 if (dst.writemask & (1 << i))
2119 swizzles[i] = BRW_GET_SWZ(src.swizzle, src_chan++);
2120 else
2121 swizzles[i] = first_enabled_chan;
2122 }
2123 src.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
2124 swizzles[2], swizzles[3]);
2125
2126 if (try_rewrite_rhs_to_dst(ir, dst, src, pre_rhs_inst, last_rhs_inst)) {
2127 return;
2128 }
2129
2130 if (ir->condition) {
2131 emit_bool_to_cond_code(ir->condition, &predicate);
2132 }
2133
2134 for (i = 0; i < type_size(ir->lhs->type); i++) {
2135 vec4_instruction *inst = emit(MOV(dst, src));
2136 inst->predicate = predicate;
2137
2138 dst.reg_offset++;
2139 src.reg_offset++;
2140 }
2141 }
2142
2143 void
2144 vec4_visitor::emit_constant_values(dst_reg *dst, ir_constant *ir)
2145 {
2146 if (ir->type->base_type == GLSL_TYPE_STRUCT) {
2147 foreach_in_list(ir_constant, field_value, &ir->components) {
2148 emit_constant_values(dst, field_value);
2149 }
2150 return;
2151 }
2152
2153 if (ir->type->is_array()) {
2154 for (unsigned int i = 0; i < ir->type->length; i++) {
2155 emit_constant_values(dst, ir->array_elements[i]);
2156 }
2157 return;
2158 }
2159
2160 if (ir->type->is_matrix()) {
2161 for (int i = 0; i < ir->type->matrix_columns; i++) {
2162 float *vec = &ir->value.f[i * ir->type->vector_elements];
2163
2164 for (int j = 0; j < ir->type->vector_elements; j++) {
2165 dst->writemask = 1 << j;
2166 dst->type = BRW_REGISTER_TYPE_F;
2167
2168 emit(MOV(*dst, src_reg(vec[j])));
2169 }
2170 dst->reg_offset++;
2171 }
2172 return;
2173 }
2174
2175 int remaining_writemask = (1 << ir->type->vector_elements) - 1;
2176
2177 for (int i = 0; i < ir->type->vector_elements; i++) {
2178 if (!(remaining_writemask & (1 << i)))
2179 continue;
2180
2181 dst->writemask = 1 << i;
2182 dst->type = brw_type_for_base_type(ir->type);
2183
2184 /* Find other components that match the one we're about to
2185 * write. Emits fewer instructions for things like vec4(0.5,
2186 * 1.5, 1.5, 1.5).
2187 */
2188 for (int j = i + 1; j < ir->type->vector_elements; j++) {
2189 if (ir->type->base_type == GLSL_TYPE_BOOL) {
2190 if (ir->value.b[i] == ir->value.b[j])
2191 dst->writemask |= (1 << j);
2192 } else {
2193 /* u, i, and f storage all line up, so no need for a
2194 * switch case for comparing each type.
2195 */
2196 if (ir->value.u[i] == ir->value.u[j])
2197 dst->writemask |= (1 << j);
2198 }
2199 }
2200
2201 switch (ir->type->base_type) {
2202 case GLSL_TYPE_FLOAT:
2203 emit(MOV(*dst, src_reg(ir->value.f[i])));
2204 break;
2205 case GLSL_TYPE_INT:
2206 emit(MOV(*dst, src_reg(ir->value.i[i])));
2207 break;
2208 case GLSL_TYPE_UINT:
2209 emit(MOV(*dst, src_reg(ir->value.u[i])));
2210 break;
2211 case GLSL_TYPE_BOOL:
2212 emit(MOV(*dst,
2213 src_reg(ir->value.b[i] != 0 ? ctx->Const.UniformBooleanTrue
2214 : 0u)));
2215 break;
2216 default:
2217 unreachable("Non-float/uint/int/bool constant");
2218 }
2219
2220 remaining_writemask &= ~dst->writemask;
2221 }
2222 dst->reg_offset++;
2223 }
2224
2225 void
2226 vec4_visitor::visit(ir_constant *ir)
2227 {
2228 dst_reg dst = dst_reg(this, ir->type);
2229 this->result = src_reg(dst);
2230
2231 emit_constant_values(&dst, ir);
2232 }
2233
2234 void
2235 vec4_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
2236 {
2237 ir_dereference *deref = static_cast<ir_dereference *>(
2238 ir->actual_parameters.get_head());
2239 ir_variable *location = deref->variable_referenced();
2240 unsigned surf_index = (prog_data->base.binding_table.abo_start +
2241 location->data.binding);
2242
2243 /* Calculate the surface offset */
2244 src_reg offset(this, glsl_type::uint_type);
2245 ir_dereference_array *deref_array = deref->as_dereference_array();
2246 if (deref_array) {
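/* For an array of counters the offset is the base atomic offset plus
 * index * ATOMIC_COUNTER_SIZE.
 */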
2247 deref_array->array_index->accept(this);
2248
2249 src_reg tmp(this, glsl_type::uint_type);
2250 emit(MUL(dst_reg(tmp), this->result, ATOMIC_COUNTER_SIZE));
2251 emit(ADD(dst_reg(offset), tmp, location->data.atomic.offset));
2252 } else {
2253 offset = location->data.atomic.offset;
2254 }
2255
2256 /* Emit the appropriate machine instruction */
2257 const char *callee = ir->callee->function_name();
2258 dst_reg dst = get_assignment_lhs(ir->return_deref, this);
2259
2260 if (!strcmp("__intrinsic_atomic_read", callee)) {
2261 emit_untyped_surface_read(surf_index, dst, offset);
2262
2263 } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
2264 emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
2265 src_reg(), src_reg());
2266
2267 } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
2268 emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
2269 src_reg(), src_reg());
2270 }
2271 }
2272
2273 void
2274 vec4_visitor::visit(ir_call *ir)
2275 {
2276 const char *callee = ir->callee->function_name();
2277
2278 if (!strcmp("__intrinsic_atomic_read", callee) ||
2279 !strcmp("__intrinsic_atomic_increment", callee) ||
2280 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2281 visit_atomic_counter_intrinsic(ir);
2282 } else {
2283 unreachable("Unsupported intrinsic.");
2284 }
2285 }
2286
2287 src_reg
2288 vec4_visitor::emit_mcs_fetch(ir_texture *ir, src_reg coordinate, src_reg sampler)
2289 {
2290 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, SHADER_OPCODE_TXF_MCS);
2291 inst->base_mrf = 2;
2292 inst->mlen = 1;
2293 inst->dst = dst_reg(this, glsl_type::uvec4_type);
2294 inst->dst.writemask = WRITEMASK_XYZW;
2295
2296 inst->src[1] = sampler;
2297
2298 /* Parameters are: u, v, r, lod; lod will always be zero due to API restrictions. */
2299 int param_base = inst->base_mrf;
2300 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2301 int zero_mask = 0xf & ~coord_mask;
2302
2303 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2304 coordinate));
2305
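/* Zero the unused coordinate channels so the whole payload register is
 * defined.
 */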
2306 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2307 src_reg(0)));
2308
2309 emit(inst);
2310 return src_reg(inst->dst);
2311 }
2312
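/* Sampler indices that can't be encoded in the 4-bit sampler field of the
 * message descriptor (>= 16, or not known at compile time) have to go
 * through the message header instead; only Haswell and later support this.
 */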
2313 static bool
2314 is_high_sampler(struct brw_context *brw, src_reg sampler)
2315 {
2316 if (brw->gen < 8 && !brw->is_haswell)
2317 return false;
2318
2319 return sampler.file != IMM || sampler.fixed_hw_reg.dw1.ud >= 16;
2320 }
2321
2322 void
2323 vec4_visitor::visit(ir_texture *ir)
2324 {
2325 uint32_t sampler =
2326 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
2327
2328 ir_rvalue *nonconst_sampler_index =
2329 _mesa_get_sampler_array_nonconst_index(ir->sampler);
2330
2331 /* Handle non-constant sampler array indexing */
2332 src_reg sampler_reg;
2333 if (nonconst_sampler_index) {
2334 /* The highest sampler which may be used by this operation is
2335 * the last element of the array. Mark it here, because the generator
2336 * doesn't have enough information to determine the bound.
2337 */
2338 uint32_t array_size = ir->sampler->as_dereference_array()
2339 ->array->type->array_size();
2340
2341 uint32_t max_used = sampler + array_size - 1;
2342 if (ir->op == ir_tg4 && brw->gen < 8) {
2343 max_used += prog_data->base.binding_table.gather_texture_start;
2344 } else {
2345 max_used += prog_data->base.binding_table.texture_start;
2346 }
2347
2348 brw_mark_surface_used(&prog_data->base, max_used);
2349
2350 /* Emit code to evaluate the actual indexing expression */
2351 nonconst_sampler_index->accept(this);
2352 dst_reg temp(this, glsl_type::uint_type);
2353 emit(ADD(temp, this->result, src_reg(sampler)))
2354 ->force_writemask_all = true;
2355 sampler_reg = src_reg(temp);
2356 } else {
2357 /* Single sampler, or constant array index; the indexing expression
2358 * is just an immediate.
2359 */
2360 sampler_reg = src_reg(sampler);
2361 }
2362
2363 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
2364 * emitting anything other than setting up the constant result.
2365 */
2366 if (ir->op == ir_tg4) {
2367 ir_constant *chan = ir->lod_info.component->as_constant();
2368 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2369 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
2370 dst_reg result(this, ir->type);
2371 this->result = src_reg(result);
2372 emit(MOV(result, src_reg(swiz == SWIZZLE_ONE ? 1.0f : 0.0f)));
2373 return;
2374 }
2375 }
2376
2377 /* Should be lowered by do_lower_texture_projection */
2378 assert(!ir->projector);
2379
2380 /* Should be lowered */
2381 assert(!ir->offset || !ir->offset->type->is_array());
2382
2383 /* Generate code to compute all the subexpression trees. This has to be
2384 * done before loading any values into MRFs for the sampler message since
2385 * generating these values may involve SEND messages that need the MRFs.
2386 */
2387 src_reg coordinate;
2388 if (ir->coordinate) {
2389 ir->coordinate->accept(this);
2390 coordinate = this->result;
2391 }
2392
2393 src_reg shadow_comparitor;
2394 if (ir->shadow_comparitor) {
2395 ir->shadow_comparitor->accept(this);
2396 shadow_comparitor = this->result;
2397 }
2398
2399 bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
2400 src_reg offset_value;
2401 if (has_nonconstant_offset) {
2402 ir->offset->accept(this);
2403 offset_value = src_reg(this->result);
2404 }
2405
2406 const glsl_type *lod_type = NULL, *sample_index_type = NULL;
2407 src_reg lod, dPdx, dPdy, sample_index, mcs;
2408 switch (ir->op) {
2409 case ir_tex:
2410 lod = src_reg(0.0f);
2411 lod_type = glsl_type::float_type;
2412 break;
2413 case ir_txf:
2414 case ir_txl:
2415 case ir_txs:
2416 ir->lod_info.lod->accept(this);
2417 lod = this->result;
2418 lod_type = ir->lod_info.lod->type;
2419 break;
2420 case ir_query_levels:
2421 lod = src_reg(0);
2422 lod_type = glsl_type::int_type;
2423 break;
2424 case ir_txf_ms:
2425 ir->lod_info.sample_index->accept(this);
2426 sample_index = this->result;
2427 sample_index_type = ir->lod_info.sample_index->type;
2428
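/* On Gen7+, surfaces using the compressed multisample layout also need
 * their MCS data, fetched with a separate TXF_MCS message.
 */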
2429 if (brw->gen >= 7 && key->tex.compressed_multisample_layout_mask & (1<<sampler))
2430 mcs = emit_mcs_fetch(ir, coordinate, sampler_reg);
2431 else
2432 mcs = src_reg(0u);
2433 break;
2434 case ir_txd:
2435 ir->lod_info.grad.dPdx->accept(this);
2436 dPdx = this->result;
2437
2438 ir->lod_info.grad.dPdy->accept(this);
2439 dPdy = this->result;
2440
2441 lod_type = ir->lod_info.grad.dPdx->type;
2442 break;
2443 case ir_txb:
2444 case ir_lod:
2445 case ir_tg4:
2446 break;
2447 }
2448
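/* Map the GLSL IR texture opcode onto a sampler message. The VS has no
 * implicit derivatives, so plain ir_tex is sent as TXL with the zero LOD
 * set up above.
 */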
2449 enum opcode opcode;
2450 switch (ir->op) {
2451 case ir_tex: opcode = SHADER_OPCODE_TXL; break;
2452 case ir_txl: opcode = SHADER_OPCODE_TXL; break;
2453 case ir_txd: opcode = SHADER_OPCODE_TXD; break;
2454 case ir_txf: opcode = SHADER_OPCODE_TXF; break;
2455 case ir_txf_ms: opcode = SHADER_OPCODE_TXF_CMS; break;
2456 case ir_txs: opcode = SHADER_OPCODE_TXS; break;
2457 case ir_tg4: opcode = has_nonconstant_offset
2458 ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
2459 case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
2460 case ir_txb:
2461 unreachable("TXB is not valid for vertex shaders.");
2462 case ir_lod:
2463 unreachable("LOD is not valid for vertex shaders.");
2464 default:
2465 unreachable("Unrecognized tex op");
2466 }
2467
2468 vec4_instruction *inst = new(mem_ctx) vec4_instruction(this, opcode);
2469
2470 if (ir->offset != NULL && !has_nonconstant_offset) {
2471 inst->texture_offset =
2472 brw_texture_offset(ctx, ir->offset->as_constant()->value.i,
2473 ir->offset->type->vector_elements);
2474 }
2475
2476 /* Stuff the channel select bits in the top of the texture offset */
2477 if (ir->op == ir_tg4)
2478 inst->texture_offset |= gather_channel(ir, sampler) << 16;
2479
2480 /* The message header is necessary for:
2481 * - Gen4 (always)
2482 * - Texel offsets
2483 * - Gather channel selection
2484 * - Sampler indices too large to fit in a 4-bit value.
2485 */
2486 inst->header_present =
2487 brw->gen < 5 || inst->texture_offset != 0 || ir->op == ir_tg4 ||
2488 is_high_sampler(brw, sampler_reg);
2489 inst->base_mrf = 2;
2490 inst->mlen = inst->header_present + 1; /* always at least one */
2491 inst->dst = dst_reg(this, ir->type);
2492 inst->dst.writemask = WRITEMASK_XYZW;
2493 inst->shadow_compare = ir->shadow_comparitor != NULL;
2494
2495 inst->src[1] = sampler_reg;
2496
2497 /* MRF for the first parameter */
2498 int param_base = inst->base_mrf + inst->header_present;
2499
2500 if (ir->op == ir_txs || ir->op == ir_query_levels) {
2501 int writemask = brw->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
2502 emit(MOV(dst_reg(MRF, param_base, lod_type, writemask), lod));
2503 } else {
2504 /* Load the coordinate */
2505 /* FINISHME: gl_clamp_mask and saturate */
2506 int coord_mask = (1 << ir->coordinate->type->vector_elements) - 1;
2507 int zero_mask = 0xf & ~coord_mask;
2508
2509 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, coord_mask),
2510 coordinate));
2511
2512 if (zero_mask != 0) {
2513 emit(MOV(dst_reg(MRF, param_base, ir->coordinate->type, zero_mask),
2514 src_reg(0)));
2515 }
2516 /* Load the shadow comparitor */
2517 if (ir->shadow_comparitor && ir->op != ir_txd && (ir->op != ir_tg4 || !has_nonconstant_offset)) {
2518 emit(MOV(dst_reg(MRF, param_base + 1, ir->shadow_comparitor->type,
2519 WRITEMASK_X),
2520 shadow_comparitor));
2521 inst->mlen++;
2522 }
2523
2524 /* Load the LOD info */
2525 if (ir->op == ir_tex || ir->op == ir_txl) {
2526 int mrf, writemask;
2527 if (brw->gen >= 5) {
2528 mrf = param_base + 1;
2529 if (ir->shadow_comparitor) {
2530 writemask = WRITEMASK_Y;
2531 /* mlen already incremented */
2532 } else {
2533 writemask = WRITEMASK_X;
2534 inst->mlen++;
2535 }
2536 } else /* brw->gen == 4 */ {
2537 mrf = param_base;
2538 writemask = WRITEMASK_W;
2539 }
2540 emit(MOV(dst_reg(MRF, mrf, lod_type, writemask), lod));
2541 } else if (ir->op == ir_txf) {
2542 emit(MOV(dst_reg(MRF, param_base, lod_type, WRITEMASK_W), lod));
2543 } else if (ir->op == ir_txf_ms) {
2544 emit(MOV(dst_reg(MRF, param_base + 1, sample_index_type, WRITEMASK_X),
2545 sample_index));
2546 if (brw->gen >= 7) {
2547 /* MCS data is in the first channel of `mcs`, but we need to get it into
2548 * the .y channel of the second vec4 of params, so replicate .x across
2549 * the whole vec4 and then mask off everything except .y
2550 */
2551 mcs.swizzle = BRW_SWIZZLE_XXXX;
2552 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
2553 mcs));
2554 }
2555 inst->mlen++;
2556 } else if (ir->op == ir_txd) {
2557 const glsl_type *type = lod_type;
2558
2559 if (brw->gen >= 5) {
2560 dPdx.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2561 dPdy.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
2562 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), dPdx));
2563 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), dPdy));
2564 inst->mlen++;
2565
2566 if (ir->type->vector_elements == 3 || ir->shadow_comparitor) {
2567 dPdx.swizzle = BRW_SWIZZLE_ZZZZ;
2568 dPdy.swizzle = BRW_SWIZZLE_ZZZZ;
2569 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), dPdx));
2570 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), dPdy));
2571 inst->mlen++;
2572
2573 if (ir->shadow_comparitor) {
2574 emit(MOV(dst_reg(MRF, param_base + 2,
2575 ir->shadow_comparitor->type, WRITEMASK_Z),
2576 shadow_comparitor));
2577 }
2578 }
2579 } else /* brw->gen == 4 */ {
2580 emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), dPdx));
2581 emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), dPdy));
2582 inst->mlen += 2;
2583 }
2584 } else if (ir->op == ir_tg4 && has_nonconstant_offset) {
2585 if (ir->shadow_comparitor) {
2586 emit(MOV(dst_reg(MRF, param_base, ir->shadow_comparitor->type, WRITEMASK_W),
2587 shadow_comparitor));
2588 }
2589
2590 emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
2591 offset_value));
2592 inst->mlen++;
2593 }
2594 }
2595
2596 emit(inst);
2597
2598 /* Fix up the number of layers (z) for cube arrays: the hardware returns
2599 * faces * layers, but the spec requires just layers.
2600 */
2601 if (ir->op == ir_txs) {
2602 glsl_type const *type = ir->sampler->type;
2603 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
2604 type->sampler_array) {
2605 emit_math(SHADER_OPCODE_INT_QUOTIENT,
2606 writemask(inst->dst, WRITEMASK_Z),
2607 src_reg(inst->dst), src_reg(6));
2608 }
2609 }
2610
2611 if (brw->gen == 6 && ir->op == ir_tg4) {
2612 emit_gen6_gather_wa(key->tex.gen6_gather_wa[sampler], inst->dst);
2613 }
2614
2615 swizzle_result(ir, src_reg(inst->dst), sampler);
2616 }
2617
2618 /**
2619 * Apply workarounds for Gen6 gather with UINT/SINT
2620 */
2621 void
2622 vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
2623 {
2624 if (!wa)
2625 return;
2626
2627 int width = (wa & WA_8BIT) ? 8 : 16;
2628 dst_reg dst_f = dst;
2629 dst_f.type = BRW_REGISTER_TYPE_F;
2630
2631 /* Convert from UNORM to UINT */
2632 emit(MUL(dst_f, src_reg(dst_f), src_reg((float)((1 << width) - 1))));
2633 emit(MOV(dst, src_reg(dst_f)));
2634
2635 if (wa & WA_SIGN) {
2636 /* Reinterpret the UINT value as a signed INT value by
2637 * shifting the sign bit into place, then shifting back
2638 * preserving sign.
2639 */
2640 emit(SHL(dst, src_reg(dst), src_reg(32 - width)));
2641 emit(ASR(dst, src_reg(dst), src_reg(32 - width)));
2642 }
2643 }
2644
2645 /**
2646 * Set up the gather channel based on the swizzle, for gather4.
2647 */
2648 uint32_t
2649 vec4_visitor::gather_channel(ir_texture *ir, uint32_t sampler)
2650 {
2651 ir_constant *chan = ir->lod_info.component->as_constant();
2652 int swiz = GET_SWZ(key->tex.swizzles[sampler], chan->value.i[0]);
2653 switch (swiz) {
2654 case SWIZZLE_X: return 0;
2655 case SWIZZLE_Y:
2656 /* gather4 sampler is broken for green channel on RG32F --
2657 * we must ask for blue instead.
2658 */
2659 if (key->tex.gather_channel_quirk_mask & (1<<sampler))
2660 return 2;
2661 return 1;
2662 case SWIZZLE_Z: return 2;
2663 case SWIZZLE_W: return 3;
2664 default:
2665 unreachable("Not reached"); /* zero, one swizzles handled already */
2666 }
2667 }
2668
2669 void
2670 vec4_visitor::swizzle_result(ir_texture *ir, src_reg orig_val, uint32_t sampler)
2671 {
2672 int s = key->tex.swizzles[sampler];
2673
2674 this->result = src_reg(this, ir->type);
2675 dst_reg swizzled_result(this->result);
2676
2677 if (ir->op == ir_query_levels) {
2678 /* # levels is in .w */
2679 orig_val.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2680 emit(MOV(swizzled_result, orig_val));
2681 return;
2682 }
2683
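/* txs results, float results, identity swizzles, and gather (which applied
 * its channel select above) need no further swizzling.
 */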
2684 if (ir->op == ir_txs || ir->type == glsl_type::float_type
2685 || s == SWIZZLE_NOOP || ir->op == ir_tg4) {
2686 emit(MOV(swizzled_result, orig_val));
2687 return;
2688 }
2689
2690
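/* Split the texture swizzle into channels copied from the result and
 * channels forced to constant 0.0 or 1.0, and emit one MOV per group.
 */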
2691 int zero_mask = 0, one_mask = 0, copy_mask = 0;
2692 int swizzle[4] = {0};
2693
2694 for (int i = 0; i < 4; i++) {
2695 switch (GET_SWZ(s, i)) {
2696 case SWIZZLE_ZERO:
2697 zero_mask |= (1 << i);
2698 break;
2699 case SWIZZLE_ONE:
2700 one_mask |= (1 << i);
2701 break;
2702 default:
2703 copy_mask |= (1 << i);
2704 swizzle[i] = GET_SWZ(s, i);
2705 break;
2706 }
2707 }
2708
2709 if (copy_mask) {
2710 orig_val.swizzle = BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
2711 swizzled_result.writemask = copy_mask;
2712 emit(MOV(swizzled_result, orig_val));
2713 }
2714
2715 if (zero_mask) {
2716 swizzled_result.writemask = zero_mask;
2717 emit(MOV(swizzled_result, src_reg(0.0f)));
2718 }
2719
2720 if (one_mask) {
2721 swizzled_result.writemask = one_mask;
2722 emit(MOV(swizzled_result, src_reg(1.0f)));
2723 }
2724 }
2725
2726 void
2727 vec4_visitor::visit(ir_return *)
2728 {
2729 unreachable("not reached");
2730 }
2731
2732 void
2733 vec4_visitor::visit(ir_discard *)
2734 {
2735 unreachable("not reached");
2736 }
2737
2738 void
2739 vec4_visitor::visit(ir_if *ir)
2740 {
2741 /* Don't point the annotation at the if statement, because then it plus
2742 * the then and else blocks get printed.
2743 */
2744 this->base_ir = ir->condition;
2745
2746 if (brw->gen == 6) {
2747 emit_if_gen6(ir);
2748 } else {
2749 enum brw_predicate predicate;
2750 emit_bool_to_cond_code(ir->condition, &predicate);
2751 emit(IF(predicate));
2752 }
2753
2754 visit_instructions(&ir->then_instructions);
2755
2756 if (!ir->else_instructions.is_empty()) {
2757 this->base_ir = ir->condition;
2758 emit(BRW_OPCODE_ELSE);
2759
2760 visit_instructions(&ir->else_instructions);
2761 }
2762
2763 this->base_ir = ir->condition;
2764 emit(BRW_OPCODE_ENDIF);
2765 }
2766
2767 void
2768 vec4_visitor::visit(ir_emit_vertex *)
2769 {
2770 unreachable("not reached");
2771 }
2772
2773 void
2774 vec4_visitor::visit(ir_end_primitive *)
2775 {
2776 unreachable("not reached");
2777 }
2778
2779 void
2780 vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
2781 dst_reg dst, src_reg offset,
2782 src_reg src0, src_reg src1)
2783 {
2784 unsigned mlen = 0;
2785
2786 /* Set the atomic operation offset. */
2787 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), offset));
2788 mlen++;
2789
2790 /* Set the atomic operation arguments. */
2791 if (src0.file != BAD_FILE) {
2792 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src0));
2793 mlen++;
2794 }
2795
2796 if (src1.file != BAD_FILE) {
2797 emit(MOV(brw_writemask(brw_uvec_mrf(8, mlen, 0), WRITEMASK_X), src1));
2798 mlen++;
2799 }
2800
2801 /* Emit the instruction. Note that this maps to the normal SIMD8
2802 * untyped atomic message on Ivy Bridge, but that's OK because
2803 * unused channels will be masked out.
2804 */
2805 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst,
2806 src_reg(atomic_op), src_reg(surf_index));
2807 inst->base_mrf = 0;
2808 inst->mlen = mlen;
2809 }
2810
2811 void
2812 vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst,
2813 src_reg offset)
2814 {
2815 /* Set the surface read offset. */
2816 emit(MOV(brw_writemask(brw_uvec_mrf(8, 0, 0), WRITEMASK_X), offset));
2817
2818 /* Emit the instruction. Note that this maps to the normal SIMD8
2819 * untyped surface read message, but that's OK because unused
2820 * channels will be masked out.
2821 */
2822 vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ,
2823 dst, src_reg(surf_index));
2824 inst->base_mrf = 0;
2825 inst->mlen = 1;
2826 }
2827
2828 void
2829 vec4_visitor::emit_ndc_computation()
2830 {
2831 /* Get the position */
2832 src_reg pos = src_reg(output_reg[VARYING_SLOT_POS]);
2833
2834 /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
2835 dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
2836 output_reg[BRW_VARYING_SLOT_NDC] = ndc;
2837
2838 current_annotation = "NDC";
2839 dst_reg ndc_w = ndc;
2840 ndc_w.writemask = WRITEMASK_W;
2841 src_reg pos_w = pos;
2842 pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
2843 emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
2844
2845 dst_reg ndc_xyz = ndc;
2846 ndc_xyz.writemask = WRITEMASK_XYZ;
2847
2848 emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
2849 }
2850
2851 void
2852 vec4_visitor::emit_psiz_and_flags(dst_reg reg)
2853 {
2854 if (brw->gen < 6 &&
2855 ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
2856 key->userclip_active || brw->has_negative_rhw_bug)) {
2857 dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
2858 dst_reg header1_w = header1;
2859 header1_w.writemask = WRITEMASK_W;
2860
2861 emit(MOV(header1, 0u));
2862
2863 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2864 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ]);
2865
2866 current_annotation = "Point size";
2867 emit(MUL(header1_w, psiz, src_reg((float)(1 << 11))));
2868 emit(AND(header1_w, src_reg(header1_w), 0x7ff << 8));
2869 }
2870
2871 if (key->userclip_active) {
2872 current_annotation = "Clipping flags";
2873 dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
2874 dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
2875
2876 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0]), src_reg(0.0f), BRW_CONDITIONAL_L));
2877 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, src_reg(0));
2878 emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
2879
2880 emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1]), src_reg(0.0f), BRW_CONDITIONAL_L));
2881 emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, src_reg(0));
2882 emit(SHL(flags1, src_reg(flags1), src_reg(4)));
2883 emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
2884 }
2885
2886 /* i965 clipping workaround:
2887 * 1) Test for -ve rhw
2888 * 2) If set,
2889 * set ndc = (0,0,0,0)
2890 * set ucp[6] = 1
2891 *
2892 * Later, clipping will detect ucp[6] and ensure the primitive is
2893 * clipped against all fixed planes.
2894 */
2895 if (brw->has_negative_rhw_bug) {
2896 src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC]);
2897 ndc_w.swizzle = BRW_SWIZZLE_WWWW;
2898 emit(CMP(dst_null_f(), ndc_w, src_reg(0.0f), BRW_CONDITIONAL_L));
2899 vec4_instruction *inst;
2900 inst = emit(OR(header1_w, src_reg(header1_w), src_reg(1u << 6)));
2901 inst->predicate = BRW_PREDICATE_NORMAL;
2902 inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC], src_reg(0.0f)));
2903 inst->predicate = BRW_PREDICATE_NORMAL;
2904 }
2905
2906 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
2907 } else if (brw->gen < 6) {
2908 emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), 0u));
2909 } else {
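/* Gen6+ VUE header: clear the whole slot, then write point size in .w,
 * layer in .y and viewport index in .z when the shader provides them.
 */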
2910 emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), src_reg(0)));
2911 if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
2912 dst_reg reg_w = reg;
2913 reg_w.writemask = WRITEMASK_W;
2914 emit(MOV(reg_w, src_reg(output_reg[VARYING_SLOT_PSIZ])));
2915 }
2916 if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
2917 dst_reg reg_y = reg;
2918 reg_y.writemask = WRITEMASK_Y;
2919 reg_y.type = BRW_REGISTER_TYPE_D;
2920 emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER])));
2921 }
2922 if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
2923 dst_reg reg_z = reg;
2924 reg_z.writemask = WRITEMASK_Z;
2925 reg_z.type = BRW_REGISTER_TYPE_D;
2926 emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT])));
2927 }
2928 }
2929 }
2930
2931 void
2932 vec4_visitor::emit_clip_distances(dst_reg reg, int offset)
2933 {
2934 /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
2935 *
2936 * "If a linked set of shaders forming the vertex stage contains no
2937 * static write to gl_ClipVertex or gl_ClipDistance, but the
2938 * application has requested clipping against user clip planes through
2939 * the API, then the coordinate written to gl_Position is used for
2940 * comparison against the user clip planes."
2941 *
2942 * This function is only called if the shader didn't write to
2943 * gl_ClipDistance. Accordingly, we use gl_ClipVertex to perform clipping
2944 * if the user wrote to it; otherwise we use gl_Position.
2945 */
2946 gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
2947 if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
2948 clip_vertex = VARYING_SLOT_POS;
2949 }
2950
2951 for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
2952 ++i) {
2953 reg.writemask = 1 << i;
2954 emit(DP4(reg,
2955 src_reg(output_reg[clip_vertex]),
2956 src_reg(this->userplane[i + offset])));
2957 }
2958 }
2959
2960 void
2961 vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying)
2962 {
2963 assert (varying < VARYING_SLOT_MAX);
2964 reg.type = output_reg[varying].type;
2965 current_annotation = output_reg_annotation[varying];
2966 /* Copy the register, saturating if necessary */
2967 vec4_instruction *inst = emit(MOV(reg,
2968 src_reg(output_reg[varying])));
2969 if ((varying == VARYING_SLOT_COL0 ||
2970 varying == VARYING_SLOT_COL1 ||
2971 varying == VARYING_SLOT_BFC0 ||
2972 varying == VARYING_SLOT_BFC1) &&
2973 key->clamp_vertex_color) {
2974 inst->saturate = true;
2975 }
2976 }
2977
2978 void
2979 vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
2980 {
2981 reg.type = BRW_REGISTER_TYPE_F;
2982
2983 switch (varying) {
2984 case VARYING_SLOT_PSIZ:
2985 {
2986 /* PSIZ is always in slot 0, and is coupled with other flags. */
2987 current_annotation = "indices, point width, clip flags";
2988 emit_psiz_and_flags(reg);
2989 break;
2990 }
2991 case BRW_VARYING_SLOT_NDC:
2992 current_annotation = "NDC";
2993 emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC])));
2994 break;
2995 case VARYING_SLOT_POS:
2996 current_annotation = "gl_Position";
2997 emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS])));
2998 break;
2999 case VARYING_SLOT_EDGE:
3000 /* This is present when doing unfilled polygons. We're supposed to copy
3001 * the edge flag from the user-provided vertex array
3002 * (glEdgeFlagPointer), or otherwise we'll copy from the current value
3003 * of that attribute (starts as 1.0f). This is then used in clipping to
3004 * determine which edges should be drawn as wireframe.
3005 */
3006 current_annotation = "edge flag";
3007 emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
3008 glsl_type::float_type, WRITEMASK_XYZW))));
3009 break;
3010 case BRW_VARYING_SLOT_PAD:
3011 /* No need to write to this slot */
3012 break;
3013 default:
3014 emit_generic_urb_slot(reg, varying);
3015 break;
3016 }
3017 }
3018
3019 static int
3020 align_interleaved_urb_mlen(struct brw_context *brw, int mlen)
3021 {
3022 if (brw->gen >= 6) {
3023 /* URB data written (does not include the message header reg) must
3024 * be a multiple of 256 bits, or 2 VS registers. See vol5c.5,
3025 * section 5.4.3.2.2: URB_INTERLEAVED.
3026 *
3027 * URB entries are allocated on a multiple of 1024 bits, so an
3028 * extra 128 bits written here to make the end align to 256 is
3029 * no problem.
3030 */
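/* mlen includes the message header register, so the total length must be
 * odd for the data portion to be even.
 */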
3031 if ((mlen % 2) != 1)
3032 mlen++;
3033 }
3034
3035 return mlen;
3036 }
3037
3038
3039 /**
3040 * Generates the VUE payload plus the necessary URB write instructions to
3041 * output it.
3042 *
3043 * The VUE layout is documented in Volume 2a.
3044 */
3045 void
3046 vec4_visitor::emit_vertex()
3047 {
3048 /* MRF 0 is reserved for the debugger, so start with message header
3049 * in MRF 1.
3050 */
3051 int base_mrf = 1;
3052 int mrf = base_mrf;
3053 /* In the process of generating our URB write message contents, we
3054 * may need to unspill a register or load from an array. Those
3055 * reads would use MRFs 14-15.
3056 */
3057 int max_usable_mrf = 13;
3058
3059 /* The following assertion verifies that max_usable_mrf causes an
3060 * even-numbered amount of URB write data, which will meet gen6's
3061 * requirements for length alignment.
3062 */
3063 assert ((max_usable_mrf - base_mrf) % 2 == 0);
3064
3065 /* First mrf is the g0-based message header containing URB handles and
3066 * such.
3067 */
3068 emit_urb_write_header(mrf++);
3069
3070 if (brw->gen < 6) {
3071 emit_ndc_computation();
3072 }
3073
3074 /* Lower legacy ff and ClipVertex clipping to clip distances */
3075 if (key->userclip_active && !prog->UsesClipDistanceOut) {
3076 current_annotation = "user clip distances";
3077
3078 output_reg[VARYING_SLOT_CLIP_DIST0] = dst_reg(this, glsl_type::vec4_type);
3079 output_reg[VARYING_SLOT_CLIP_DIST1] = dst_reg(this, glsl_type::vec4_type);
3080
3081 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0], 0);
3082 emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1], 4);
3083 }
3084
3085 /* We may need to split this up into several URB writes, so do them in a
3086 * loop.
3087 */
3088 int slot = 0;
3089 bool complete = false;
3090 do {
3091 /* URB offset is in URB row increments, and each of our MRFs is half of
3092 * one of those, since we're doing interleaved writes.
3093 */
3094 int offset = slot / 2;
3095
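/* Data for this write starts right after the message header at base_mrf,
 * which was filled in once above and is reused by every URB write in the
 * loop.
 */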
3096 mrf = base_mrf + 1;
3097 for (; slot < prog_data->vue_map.num_slots; ++slot) {
3098 emit_urb_slot(dst_reg(MRF, mrf++),
3099 prog_data->vue_map.slot_to_varying[slot]);
3100
3101 /* If this was max_usable_mrf, we can't fit anything more into this
3102 * URB WRITE.
3103 */
3104 if (mrf > max_usable_mrf) {
3105 slot++;
3106 break;
3107 }
3108 }
3109
3110 complete = slot >= prog_data->vue_map.num_slots;
3111 current_annotation = "URB write";
3112 vec4_instruction *inst = emit_urb_write_opcode(complete);
3113 inst->base_mrf = base_mrf;
3114 inst->mlen = align_interleaved_urb_mlen(brw, mrf - base_mrf);
3115 inst->offset += offset;
3116 } while(!complete);
3117 }
3118
3119
3120 src_reg
3121 vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
3122 src_reg *reladdr, int reg_offset)
3123 {
3124 /* Because we store the values to scratch interleaved like our
3125 * vertex data, we need to scale the vec4 index by 2.
3126 */
3127 int message_header_scale = 2;
3128
3129 /* Pre-gen6, the message header uses byte offsets instead of vec4
3130 * (16-byte) offset units.
3131 */
3132 if (brw->gen < 6)
3133 message_header_scale *= 16;
3134
3135 if (reladdr) {
3136 src_reg index = src_reg(this, glsl_type::int_type);
3137
3138 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3139 src_reg(reg_offset)));
3140 emit_before(block, inst, MUL(dst_reg(index), index,
3141 src_reg(message_header_scale)));
3142
3143 return index;
3144 } else {
3145 return src_reg(reg_offset * message_header_scale);
3146 }
3147 }
3148
3149 src_reg
3150 vec4_visitor::get_pull_constant_offset(bblock_t * block, vec4_instruction *inst,
3151 src_reg *reladdr, int reg_offset)
3152 {
3153 if (reladdr) {
3154 src_reg index = src_reg(this, glsl_type::int_type);
3155
3156 emit_before(block, inst, ADD(dst_reg(index), *reladdr,
3157 src_reg(reg_offset)));
3158
3159 /* Pre-gen6, the message header uses byte offsets instead of vec4
3160 * (16-byte) offset units.
3161 */
3162 if (brw->gen < 6) {
3163 emit_before(block, inst, MUL(dst_reg(index), index, src_reg(16)));
3164 }
3165
3166 return index;
3167 } else if (brw->gen >= 8) {
3168 /* Store the offset in a GRF so we can send-from-GRF. */
3169 src_reg offset = src_reg(this, glsl_type::int_type);
3170 emit_before(block, inst, MOV(dst_reg(offset), src_reg(reg_offset)));
3171 return offset;
3172 } else {
3173 int message_header_scale = brw->gen < 6 ? 16 : 1;
3174 return src_reg(reg_offset * message_header_scale);
3175 }
3176 }
3177
3178 /**
3179 * Emits an instruction before @inst to load the value named by @orig_src
3180 * from scratch space at @base_offset to @temp.
3181 *
3182 * @base_offset is measured in 32-byte units (the size of a register).
3183 */
3184 void
3185 vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
3186 dst_reg temp, src_reg orig_src,
3187 int base_offset)
3188 {
3189 int reg_offset = base_offset + orig_src.reg_offset;
3190 src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
3191 reg_offset);
3192
3193 emit_before(block, inst, SCRATCH_READ(temp, index));
3194 }
3195
3196 /**
3197 * Emits an instruction after @inst to store the value to be written
3198 * to @orig_dst to scratch space at @base_offset, from @temp.
3199 *
3200 * @base_offset is measured in 32-byte units (the size of a register).
3201 */
3202 void
3203 vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
3204 int base_offset)
3205 {
3206 int reg_offset = base_offset + inst->dst.reg_offset;
3207 src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
3208 reg_offset);
3209
3210 /* Create a temporary register to store *inst's result in.
3211 *
3212 * We have to be careful in MOVing from our temporary result register in
3213 * the scratch write. If we swizzle from channels of the temporary that
3214 * weren't initialized, it will confuse live interval analysis, which will
3215 * make spilling fail to make progress.
3216 */
3217 src_reg temp = src_reg(this, glsl_type::vec4_type);
3218 temp.type = inst->dst.type;
3219 int first_writemask_chan = ffs(inst->dst.writemask) - 1;
3220 int swizzles[4];
3221 for (int i = 0; i < 4; i++)
3222 if (inst->dst.writemask & (1 << i))
3223 swizzles[i] = i;
3224 else
3225 swizzles[i] = first_writemask_chan;
3226 temp.swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
3227 swizzles[2], swizzles[3]);
3228
3229 dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
3230 inst->dst.writemask));
3231 vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
3232 write->predicate = inst->predicate;
3233 write->ir = inst->ir;
3234 write->annotation = inst->annotation;
3235 inst->insert_after(block, write);
3236
3237 inst->dst.file = temp.file;
3238 inst->dst.reg = temp.reg;
3239 inst->dst.reg_offset = temp.reg_offset;
3240 inst->dst.reladdr = NULL;
3241 }
3242
3243 /**
3244 * We can't generally support array access in GRF space, because a
3245 * single instruction's destination can only span 2 contiguous
3246 * registers. So, we send all GRF arrays that get variable index
3247 * access to scratch space.
3248 */
3249 void
3250 vec4_visitor::move_grf_array_access_to_scratch()
3251 {
3252 int scratch_loc[this->virtual_grf_count];
3253 memset(scratch_loc, -1, sizeof(scratch_loc));
3254
3255 /* First, calculate the set of virtual GRFs that need to be punted
3256 * to scratch due to having any array access on them, and where in
3257 * scratch.
3258 */
3259 foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
3260 if (inst->dst.file == GRF && inst->dst.reladdr &&
3261 scratch_loc[inst->dst.reg] == -1) {
3262 scratch_loc[inst->dst.reg] = c->last_scratch;
3263 c->last_scratch += this->virtual_grf_sizes[inst->dst.reg];
3264 }
3265
3266 for (int i = 0 ; i < 3; i++) {
3267 src_reg *src = &inst->src[i];
3268
3269 if (src->file == GRF && src->reladdr &&
3270 scratch_loc[src->reg] == -1) {
3271 scratch_loc[src->reg] = c->last_scratch;
3272 c->last_scratch += this->virtual_grf_sizes[src->reg];
3273 }
3274 }
3275 }
3276
3277 /* Now, for anything that will be accessed through scratch, rewrite
3278 * it to load/store. Note that this is a _safe list walk, because
3279 * we may generate a new scratch_write instruction after the one
3280 * we're processing.
3281 */
3282 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3283 /* Set up the annotation tracking for new generated instructions. */
3284 base_ir = inst->ir;
3285 current_annotation = inst->annotation;
3286
3287 if (inst->dst.file == GRF && scratch_loc[inst->dst.reg] != -1) {
3288 emit_scratch_write(block, inst, scratch_loc[inst->dst.reg]);
3289 }
3290
3291 for (int i = 0 ; i < 3; i++) {
3292 if (inst->src[i].file != GRF || scratch_loc[inst->src[i].reg] == -1)
3293 continue;
3294
3295 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3296
3297 emit_scratch_read(block, inst, temp, inst->src[i],
3298 scratch_loc[inst->src[i].reg]);
3299
3300 inst->src[i].file = temp.file;
3301 inst->src[i].reg = temp.reg;
3302 inst->src[i].reg_offset = temp.reg_offset;
3303 inst->src[i].reladdr = NULL;
3304 }
3305 }
3306 }
3307
3308 /**
3309 * Emits an instruction before @inst to load the value named by @orig_src
3310 * from the pull constant buffer (surface) at @base_offset to @temp.
3311 */
3312 void
3313 vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
3314 dst_reg temp, src_reg orig_src,
3315 int base_offset)
3316 {
3317 int reg_offset = base_offset + orig_src.reg_offset;
3318 src_reg index = src_reg(prog_data->base.binding_table.pull_constants_start);
3319 src_reg offset = get_pull_constant_offset(block, inst, orig_src.reladdr,
3320 reg_offset);
3321 vec4_instruction *load;
3322
3323 if (brw->gen >= 7) {
3324 dst_reg grf_offset = dst_reg(this, glsl_type::int_type);
3325 grf_offset.type = offset.type;
3326 emit_before(block, inst, MOV(grf_offset, offset));
3327
3328 load = new(mem_ctx) vec4_instruction(this,
3329 VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
3330 temp, index, src_reg(grf_offset));
3331 } else {
3332 load = new(mem_ctx) vec4_instruction(this, VS_OPCODE_PULL_CONSTANT_LOAD,
3333 temp, index, offset);
3334 load->base_mrf = 14;
3335 load->mlen = 1;
3336 }
3337 emit_before(block, inst, load);
3338 }
3339
3340 /**
3341 * Implements array access of uniforms by inserting a
3342 * PULL_CONSTANT_LOAD instruction.
3343 *
3344 * Unlike temporary GRF array access (where we don't support it due to
3345 * the difficulty of doing relative addressing on instruction
3346 * destinations), we could potentially do array access of uniforms
3347 * that were loaded in GRF space as push constants. In real-world
3348 * usage we've seen, though, the arrays being used are always larger
3349 * than we could load as push constants, so just always move all
3350 * uniform array access out to a pull constant buffer.
3351 */
3352 void
3353 vec4_visitor::move_uniform_array_access_to_pull_constants()
3354 {
3355 int pull_constant_loc[this->uniforms];
3356 memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
3357
3358 /* Walk through and find array access of uniforms. Put a copy of that
3359 * uniform in the pull constant buffer.
3360 *
3361 * Note that we don't move constant-indexed accesses to arrays. No
3362 * testing has been done of the performance impact of this choice.
3363 */
3364 foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
3365 for (int i = 0 ; i < 3; i++) {
3366 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
3367 continue;
3368
3369 int uniform = inst->src[i].reg;
3370
3371 /* If this array isn't already present in the pull constant buffer,
3372 * add it.
3373 */
3374 if (pull_constant_loc[uniform] == -1) {
3375 const gl_constant_value **values =
3376 &stage_prog_data->param[uniform * 4];
3377
3378 pull_constant_loc[uniform] = stage_prog_data->nr_pull_params / 4;
3379
3380 assert(uniform < uniform_array_size);
3381 for (int j = 0; j < uniform_size[uniform] * 4; j++) {
3382 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
3383 = values[j];
3384 }
3385 }
3386
3387 /* Set up the annotation tracking for new generated instructions. */
3388 base_ir = inst->ir;
3389 current_annotation = inst->annotation;
3390
3391 dst_reg temp = dst_reg(this, glsl_type::vec4_type);
3392
3393 emit_pull_constant_load(block, inst, temp, inst->src[i],
3394 pull_constant_loc[uniform]);
3395
3396 inst->src[i].file = temp.file;
3397 inst->src[i].reg = temp.reg;
3398 inst->src[i].reg_offset = temp.reg_offset;
3399 inst->src[i].reladdr = NULL;
3400 }
3401 }
3402
3403 /* Now there are no accesses of the UNIFORM file with a reladdr, so
3404 * no need to track them as larger-than-vec4 objects. This will be
3405 * relied on in cutting out unused uniform vectors from push
3406 * constants.
3407 */
3408 split_uniform_registers();
3409 }
3410
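/* Resolve a negation source modifier on a UD operand by materializing the
 * negated value into a temporary with a MOV and using that instead.
 */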
3411 void
3412 vec4_visitor::resolve_ud_negate(src_reg *reg)
3413 {
3414 if (reg->type != BRW_REGISTER_TYPE_UD ||
3415 !reg->negate)
3416 return;
3417
3418 src_reg temp = src_reg(this, glsl_type::uvec4_type);
3419 emit(BRW_OPCODE_MOV, dst_reg(temp), *reg);
3420 *reg = temp;
3421 }
3422
3423 vec4_visitor::vec4_visitor(struct brw_context *brw,
3424 struct brw_vec4_compile *c,
3425 struct gl_program *prog,
3426 const struct brw_vec4_prog_key *key,
3427 struct brw_vec4_prog_data *prog_data,
3428 struct gl_shader_program *shader_prog,
3429 gl_shader_stage stage,
3430 void *mem_ctx,
3431 bool debug_flag,
3432 bool no_spills,
3433 shader_time_shader_type st_base,
3434 shader_time_shader_type st_written,
3435 shader_time_shader_type st_reset)
3436 : backend_visitor(brw, shader_prog, prog, &prog_data->base, stage),
3437 c(c),
3438 key(key),
3439 prog_data(prog_data),
3440 sanity_param_count(0),
3441 fail_msg(NULL),
3442 first_non_payload_grf(0),
3443 need_all_constants_in_pull_buffer(false),
3444 debug_flag(debug_flag),
3445 no_spills(no_spills),
3446 st_base(st_base),
3447 st_written(st_written),
3448 st_reset(st_reset)
3449 {
3450 this->mem_ctx = mem_ctx;
3451 this->failed = false;
3452
3453 this->base_ir = NULL;
3454 this->current_annotation = NULL;
3455 memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
3456
3457 this->variable_ht = hash_table_ctor(0,
3458 hash_table_pointer_hash,
3459 hash_table_pointer_compare);
3460
3461 this->virtual_grf_start = NULL;
3462 this->virtual_grf_end = NULL;
3463 this->virtual_grf_sizes = NULL;
3464 this->virtual_grf_count = 0;
3465 this->virtual_grf_reg_map = NULL;
3466 this->virtual_grf_reg_count = 0;
3467 this->virtual_grf_array_size = 0;
3468 this->live_intervals_valid = false;
3469
3470 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
3471
3472 this->uniforms = 0;
3473
3474 /* Initialize uniform_array_size to at least 1 because pre-gen6 VS requires
3475 * at least one. See setup_uniforms() in brw_vec4.cpp.
3476 */
3477 this->uniform_array_size = 1;
3478 if (prog_data) {
3479 this->uniform_array_size = MAX2(stage_prog_data->nr_params, 1);
3480 }
3481
3482 this->uniform_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3483 this->uniform_vector_size = rzalloc_array(mem_ctx, int, this->uniform_array_size);
3484 }
3485
3486 vec4_visitor::~vec4_visitor()
3487 {
3488 hash_table_dtor(this->variable_ht);
3489 }
3490
3491
3492 void
3493 vec4_visitor::fail(const char *format, ...)
3494 {
3495 va_list va;
3496 char *msg;
3497
3498 if (failed)
3499 return;
3500
3501 failed = true;
3502
3503 va_start(va, format);
3504 msg = ralloc_vasprintf(mem_ctx, format, va);
3505 va_end(va);
3506 msg = ralloc_asprintf(mem_ctx, "vec4 compile failed: %s\n", msg);
3507
3508 this->fail_msg = msg;
3509
3510 if (debug_flag) {
3511 fprintf(stderr, "%s", msg);
3512 }
3513 }
3514
3515 } /* namespace brw */