47cf71e3c53ee62edaa27926c4a2b3ecce5afc90
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_visitor.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_visitor.cpp
25 *
26 * This file supports generating the FS LIR from the GLSL IR. The LIR
27 * makes it easier to do backend-specific optimizations than doing so
28 * in the GLSL IR or in the native code.
29 */
30 extern "C" {
31
32 #include <sys/types.h>
33
34 #include "main/macros.h"
35 #include "main/shaderobj.h"
36 #include "program/prog_parameter.h"
37 #include "program/prog_print.h"
38 #include "program/prog_optimize.h"
39 #include "program/register_allocate.h"
40 #include "program/sampler.h"
41 #include "program/hash_table.h"
42 #include "brw_context.h"
43 #include "brw_eu.h"
44 #include "brw_wm.h"
45 }
46 #include "brw_fs.h"
47 #include "main/uniforms.h"
48 #include "glsl/glsl_types.h"
49 #include "glsl/ir_optimization.h"
50
/**
 * Allocate (or locate) backend storage for a GLSL IR variable and record it
 * in variable_ht so later dereferences can find the register.
 *
 * Inputs get interpolation setup, outputs get bound to the color/depth/mask
 * output slots, uniforms get uniform-file registers, and system values get
 * their dedicated setup emitters.
 */
void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   /* Already processed (e.g. seen via an earlier dereference). */
   if (variable_storage(ir))
      return;

   if (ir->data.mode == ir_var_shader_in) {
      /* Built-in FS inputs have special interpolation setup; everything else
       * goes through the general barycentric path.
       */
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   } else if (ir->data.mode == ir_var_shader_out) {
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

      if (ir->data.index > 0) {
         /* Index 1 means the second source of dual-source blending. */
         assert(ir->data.location == FRAG_RESULT_DATA0);
         assert(ir->data.index == 1);
         this->dual_src_output = *reg;
      } else if (ir->data.location == FRAG_RESULT_COLOR) {
         /* Writing gl_FragColor outputs to all color regions. */
         for (unsigned int i = 0; i < MAX2(c->key.nr_color_regions, 1); i++) {
            this->outputs[i] = *reg;
            this->output_components[i] = 4;
         }
      } else if (ir->data.location == FRAG_RESULT_DEPTH) {
         this->frag_depth = *reg;
      } else if (ir->data.location == FRAG_RESULT_SAMPLE_MASK) {
         this->sample_mask = *reg;
      } else {
         /* gl_FragData or a user-defined FS output */
         assert(ir->data.location >= FRAG_RESULT_DATA0 &&
                ir->data.location < FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS);

         int vector_elements =
            ir->type->is_array() ? ir->type->fields.array->vector_elements
                                 : ir->type->vector_elements;

         /* General color output: arrays of outputs occupy consecutive
          * output slots, each offset into the same backing register.
          */
         for (unsigned int i = 0; i < MAX2(1, ir->type->length); i++) {
            int output = ir->data.location - FRAG_RESULT_DATA0 + i;
            this->outputs[output] = *reg;
            this->outputs[output].reg_offset += vector_elements * i;
            this->output_components[output] = vector_elements;
         }
      }
   } else if (ir->data.mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      /* Thanks to the lower_ubo_reference pass, we will see only
       * ir_binop_ubo_load expressions and not ir_dereference_variable for UBO
       * variables, so no need for them to be in variable_ht.
       *
       * Atomic counters take no uniform storage, no need to do
       * anything here.
       */
      if (ir->is_in_uniform_block() || ir->type->contains_atomic())
         return;

      /* The SIMD16 compile reuses the uniform layout set up by the SIMD8
       * compile; it must already exist by now.
       */
      if (dispatch_width == 16) {
         if (!variable_storage(ir)) {
            fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
         }
         return;
      }

      param_size[param_index] = type_size(ir->type);
      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);

   } else if (ir->data.mode == ir_var_system_value) {
      if (ir->data.location == SYSTEM_VALUE_SAMPLE_POS) {
         reg = emit_samplepos_setup(ir);
      } else if (ir->data.location == SYSTEM_VALUE_SAMPLE_ID) {
         reg = emit_sampleid_setup(ir);
      }
   }

   /* Anything else (e.g. temporaries) just gets a fresh GRF. */
   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}
147
148 void
149 fs_visitor::visit(ir_dereference_variable *ir)
150 {
151 fs_reg *reg = variable_storage(ir->var);
152 this->result = *reg;
153 }
154
155 void
156 fs_visitor::visit(ir_dereference_record *ir)
157 {
158 const glsl_type *struct_type = ir->record->type;
159
160 ir->record->accept(this);
161
162 unsigned int offset = 0;
163 for (unsigned int i = 0; i < struct_type->length; i++) {
164 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
165 break;
166 offset += type_size(struct_type->fields.structure[i].type);
167 }
168 this->result.reg_offset += offset;
169 this->result.type = brw_type_for_base_type(ir->type);
170 }
171
/**
 * Resolve an array dereference into a register offset.
 *
 * Constant indices are folded directly into reg_offset.  Variable indices
 * are recorded via the reladdr pointer on the fs_reg; per the comment below,
 * only uniform arrays reach this path with a variable index.
 */
void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *constant_index;
   fs_reg src;
   int element_size = type_size(ir->type);

   constant_index = ir->array_index->as_constant();

   ir->array->accept(this);
   src = this->result;
   src.type = brw_type_for_base_type(ir->type);

   if (constant_index) {
      assert(src.file == UNIFORM || src.file == GRF);
      /* Fold the constant index straight into the register offset. */
      src.reg_offset += constant_index->value.i[0] * element_size;
   } else {
      /* Variable index array dereference. We attach the variable index
       * component to the reg as a pointer to a register containing the
       * offset. Currently only uniform arrays are supported in this patch,
       * and that reladdr pointer is resolved by
       * move_uniform_array_access_to_pull_constants(). All other array types
       * are lowered by lower_variable_index_to_cond_assign().
       */
      ir->array_index->accept(this);

      /* Scale the element index by the element's register size. */
      fs_reg index_reg;
      index_reg = fs_reg(this, glsl_type::int_type);
      emit(BRW_OPCODE_MUL, index_reg, this->result, fs_reg(element_size));

      /* Nested variable indexing: accumulate onto the outer reladdr. */
      if (src.reladdr) {
         emit(BRW_OPCODE_ADD, index_reg, *src.reladdr, index_reg);
      }

      /* Stash a heap copy of index_reg since src outlives this scope. */
      src.reladdr = ralloc(mem_ctx, fs_reg);
      memcpy(src.reladdr, &index_reg, sizeof(index_reg));
   }
   this->result = src;
}
211
212 void
213 fs_visitor::emit_lrp(fs_reg dst, fs_reg x, fs_reg y, fs_reg a)
214 {
215 if (brw->gen < 6 ||
216 !x.is_valid_3src() ||
217 !y.is_valid_3src() ||
218 !a.is_valid_3src()) {
219 /* We can't use the LRP instruction. Emit x*(1-a) + y*a. */
220 fs_reg y_times_a = fs_reg(this, glsl_type::float_type);
221 fs_reg one_minus_a = fs_reg(this, glsl_type::float_type);
222 fs_reg x_times_one_minus_a = fs_reg(this, glsl_type::float_type);
223
224 emit(MUL(y_times_a, y, a));
225
226 a.negate = !a.negate;
227 emit(ADD(one_minus_a, a, fs_reg(1.0f)));
228 emit(MUL(x_times_one_minus_a, x, one_minus_a));
229
230 emit(ADD(dst, x_times_one_minus_a, y_times_a));
231 } else {
232 /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
233 * we need to reorder the operands.
234 */
235 emit(LRP(dst, a, y, x));
236 }
237 }
238
239 void
240 fs_visitor::emit_minmax(uint32_t conditionalmod, fs_reg dst,
241 fs_reg src0, fs_reg src1)
242 {
243 fs_inst *inst;
244
245 if (brw->gen >= 6) {
246 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
247 inst->conditional_mod = conditionalmod;
248 } else {
249 emit(CMP(reg_null_d, src0, src1, conditionalmod));
250
251 inst = emit(BRW_OPCODE_SEL, dst, src0, src1);
252 inst->predicate = BRW_PREDICATE_NORMAL;
253 }
254 }
255
/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 *
 * Returns true (and sets this->result) if the expression was a saturate
 * that we folded into the generating instruction or a saturated MOV;
 * false if the caller should emit the expression normally.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   /* Remember the instruction-list tail so we can tell what accept() added. */
   fs_inst *pre_inst = (fs_inst *) this->instructions.get_tail();

   sat_val->accept(this);
   fs_reg src = this->result;

   fs_inst *last_inst = (fs_inst *) this->instructions.get_tail();

   /* If the last instruction from our accept() didn't generate our
    * src, generate a saturated MOV
    */
   fs_inst *modify = get_instruction_generating_reg(pre_inst, last_inst, src);
   if (!modify || modify->regs_written != 1) {
      this->result = fs_reg(this, ir->type);
      fs_inst *inst = emit(MOV(this->result, src));
      inst->saturate = true;
   } else {
      /* Fold saturation into the instruction that produced src. */
      modify->saturate = true;
      this->result = src;
   }


   return true;
}
290
291 bool
292 fs_visitor::try_emit_mad(ir_expression *ir, int mul_arg)
293 {
294 /* 3-src instructions were introduced in gen6. */
295 if (brw->gen < 6)
296 return false;
297
298 /* MAD can only handle floating-point data. */
299 if (ir->type != glsl_type::float_type)
300 return false;
301
302 ir_rvalue *nonmul = ir->operands[1 - mul_arg];
303 ir_expression *mul = ir->operands[mul_arg]->as_expression();
304
305 if (!mul || mul->operation != ir_binop_mul)
306 return false;
307
308 if (nonmul->as_constant() ||
309 mul->operands[0]->as_constant() ||
310 mul->operands[1]->as_constant())
311 return false;
312
313 nonmul->accept(this);
314 fs_reg src0 = this->result;
315
316 mul->operands[0]->accept(this);
317 fs_reg src1 = this->result;
318
319 mul->operands[1]->accept(this);
320 fs_reg src2 = this->result;
321
322 this->result = fs_reg(this, ir->type);
323 emit(BRW_OPCODE_MAD, this->result, src0, src1, src2);
324
325 return true;
326 }
327
/**
 * Emit FS LIR for a scalar GLSL IR expression, leaving the result register
 * in this->result.
 *
 * Matrix and vector operands must already have been scalarized by earlier
 * lowering passes (asserted below); several operations are likewise expected
 * to have been lowered away and only assert here.
 */
void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[3], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 3);

   /* Saturate folding and MAD fusion consume the whole expression tree. */
   if (try_emit_saturate(ir))
      return;
   if (ir->operation == ir_binop_add) {
      if (try_emit_mad(ir, 0) || try_emit_mad(ir, 1))
         return;
   }

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         fail("Failed to get tree for expression operand:\n");
         ir->operands[operand]->print();
         printf("\n");
      }
      assert(this->result.is_valid_3src());
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * ones complement of the whole register, not just bit 0.
       */
      emit(XOR(this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_neg:
      /* Negation/abs are free via source modifier bits — no instruction
       * beyond the MOV is needed.
       */
      op[0].negate = !op[0].negate;
      emit(MOV(this->result, op[0]));
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      emit(MOV(this->result, op[0]));
      break;
   case ir_unop_sign:
      if (ir->type->is_float()) {
         /* AND(val, 0x80000000) gives the sign bit.
          *
          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
          * zero.
          */
         emit(CMP(reg_null_f, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));

         op[0].type = BRW_REGISTER_TYPE_UD;
         this->result.type = BRW_REGISTER_TYPE_UD;
         emit(AND(this->result, op[0], fs_reg(0x80000000u)));

         inst = emit(OR(this->result, this->result, fs_reg(0x3f800000u)));
         inst->predicate = BRW_PREDICATE_NORMAL;

         this->result.type = BRW_REGISTER_TYPE_F;
      } else {
         /* ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
          *               -> non-negative val generates 0x00000000.
          * Predicated OR sets 1 if val is positive.
          */
         emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_G));

         emit(ASR(this->result, op[0], fs_reg(31)));

         inst = emit(OR(this->result, this->result, fs_reg(1)));
         inst->predicate = BRW_PREDICATE_NORMAL;
      }
      break;
   case ir_unop_rcp:
      emit_math(SHADER_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(SHADER_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(SHADER_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(SHADER_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(SHADER_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(ADD(this->result, op[0], op[1]));
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      if (brw->gen < 8 && ir->type->is_integer()) {
         /* For integer multiplication, the MUL uses the low 16 bits
          * of one of the operands (src0 on gen6, src1 on gen7).  The
          * MACH accumulates in the contribution of the upper 16 bits
          * of that operand.
          *
          * FINISHME: Emit just the MUL if we know an operand is small
          * enough.
          */
         if (brw->gen >= 7 && dispatch_width == 16)
            fail("16-wide explicit accumulator operands unsupported\n");

         struct brw_reg acc = retype(brw_acc_reg(), this->result.type);

         emit(MUL(acc, op[0], op[1]));
         emit(MACH(reg_null_d, op[0], op[1]));
         emit(MOV(this->result, fs_reg(acc)));
      } else {
         emit(MUL(this->result, op[0], op[1]));
      }
      break;
   case ir_binop_imul_high: {
      /* MUL/MACH pair: MACH leaves the high 32 bits in the accumulator. */
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(), this->result.type);

      emit(MUL(acc, op[0], op[1]));
      emit(MACH(this->result, op[0], op[1]));
      break;
   }
   case ir_binop_div:
      /* Floating point should be lowered by DIV_TO_MUL_RCP in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_QUOTIENT, this->result, op[0], op[1]);
      break;
   case ir_binop_carry: {
      /* ADDC writes the carry bits to the accumulator. */
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(ADDC(reg_null_ud, op[0], op[1]));
      emit(MOV(this->result, fs_reg(acc)));
      break;
   }
   case ir_binop_borrow: {
      /* SUBB writes the borrow bits to the accumulator. */
      if (brw->gen >= 7 && dispatch_width == 16)
         fail("16-wide explicit accumulator operands unsupported\n");

      struct brw_reg acc = retype(brw_acc_reg(), BRW_REGISTER_TYPE_UD);

      emit(SUBB(reg_null_ud, op[0], op[1]));
      emit(MOV(this->result, fs_reg(acc)));
      break;
   }
   case ir_binop_mod:
      /* Floating point should be lowered by MOD_TO_FRACT in the compiler. */
      assert(ir->type->is_integer());
      emit_math(SHADER_OPCODE_INT_REMAINDER, this->result, op[0], op[1]);
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      /* Bool operands may be condition-code values; normalize first. */
      resolve_bool_comparison(ir->operands[0], &op[0]);
      resolve_bool_comparison(ir->operands[1], &op[1]);

      emit(CMP(this->result, op[0], op[1],
               brw_conditional_for_comparison(ir->operation)));
      break;

   case ir_binop_logic_xor:
      emit(XOR(this->result, op[0], op[1]));
      break;

   case ir_binop_logic_or:
      emit(OR(this->result, op[0], op[1]));
      break;

   case ir_binop_logic_and:
      emit(AND(this->result, op[0], op[1]));
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_binop_vector_extract:
      assert(!"not reached: should be handled by lower_vec_index_to_cond_assign()");
      break;

   case ir_triop_vector_insert:
      assert(!"not reached: should be handled by lower_vector_insert()");
      break;

   case ir_binop_ldexp:
      assert(!"not reached: should be handled by ldexp_to_arith()");
      break;

   case ir_unop_sqrt:
      emit_math(SHADER_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(SHADER_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_bitcast_i2f:
   case ir_unop_bitcast_u2f:
      /* Bitcasts are free: just reinterpret the register's type. */
      op[0].type = BRW_REGISTER_TYPE_F;
      this->result = op[0];
      break;
   case ir_unop_i2u:
   case ir_unop_bitcast_f2u:
      op[0].type = BRW_REGISTER_TYPE_UD;
      this->result = op[0];
      break;
   case ir_unop_u2i:
   case ir_unop_bitcast_f2i:
      op[0].type = BRW_REGISTER_TYPE_D;
      this->result = op[0];
      break;
   case ir_unop_i2f:
   case ir_unop_u2f:
   case ir_unop_f2i:
   case ir_unop_f2u:
      /* Value conversions happen implicitly via the typed MOV. */
      emit(MOV(this->result, op[0]));
      break;

   case ir_unop_b2i:
      emit(AND(this->result, op[0], fs_reg(1)));
      break;
   case ir_unop_b2f:
      /* Mask to 0/1 in an integer temp, then convert to 0.0f/1.0f. */
      temp = fs_reg(this, glsl_type::int_type);
      emit(AND(temp, op[0], fs_reg(1)));
      emit(MOV(this->result, temp));
      break;

   case ir_unop_f2b:
      emit(CMP(this->result, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
      break;
   case ir_unop_i2b:
      emit(CMP(this->result, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
      break;

   case ir_unop_trunc:
      emit(RNDZ(this->result, op[0]));
      break;
   case ir_unop_ceil:
      /* ceil(x) = -floor(-x), using the source/dest negate bits. */
      op[0].negate = !op[0].negate;
      emit(RNDD(this->result, op[0]));
      this->result.negate = true;
      break;
   case ir_unop_floor:
      emit(RNDD(this->result, op[0]));
      break;
   case ir_unop_fract:
      emit(FRC(this->result, op[0]));
      break;
   case ir_unop_round_even:
      emit(RNDE(this->result, op[0]));
      break;

   case ir_binop_min:
   case ir_binop_max:
      resolve_ud_negate(&op[0]);
      resolve_ud_negate(&op[1]);
      emit_minmax(ir->operation == ir_binop_min ?
                  BRW_CONDITIONAL_L : BRW_CONDITIONAL_GE,
                  this->result, op[0], op[1]);
      break;
   case ir_unop_pack_snorm_2x16:
   case ir_unop_pack_snorm_4x8:
   case ir_unop_pack_unorm_2x16:
   case ir_unop_pack_unorm_4x8:
   case ir_unop_unpack_snorm_2x16:
   case ir_unop_unpack_snorm_4x8:
   case ir_unop_unpack_unorm_2x16:
   case ir_unop_unpack_unorm_4x8:
   case ir_unop_unpack_half_2x16:
   case ir_unop_pack_half_2x16:
      assert(!"not reached: should be handled by lower_packing_builtins");
      break;
   case ir_unop_unpack_half_2x16_split_x:
      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, this->result, op[0]);
      break;
   case ir_unop_unpack_half_2x16_split_y:
      emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, this->result, op[0]);
      break;
   case ir_binop_pow:
      emit_math(SHADER_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bitfield_reverse:
      emit(BFREV(this->result, op[0]));
      break;
   case ir_unop_bit_count:
      emit(CBIT(this->result, op[0]));
      break;
   case ir_unop_find_msb:
      temp = fs_reg(this, glsl_type::uint_type);
      emit(FBH(temp, op[0]));

      /* FBH counts from the MSB side, while GLSL's findMSB() wants the count
       * from the LSB side.  If FBH didn't return an error (0xFFFFFFFF), then
       * subtract the result from 31 to convert the MSB count into an LSB count.
       */

      /* FBH only supports UD type for dst, so use a MOV to convert UD to D. */
      emit(MOV(this->result, temp));
      emit(CMP(reg_null_d, this->result, fs_reg(-1), BRW_CONDITIONAL_NZ));

      /* (31 - temp) computed as (-temp) + 31 via the negate bit. */
      temp.negate = true;
      inst = emit(ADD(this->result, temp, fs_reg(31)));
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   case ir_unop_find_lsb:
      emit(FBL(this->result, op[0]));
      break;
   case ir_triop_bitfield_extract:
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(BFE(this->result, op[2], op[1], op[0]));
      break;
   case ir_binop_bfm:
      emit(BFI1(this->result, op[0], op[1]));
      break;
   case ir_triop_bfi:
      emit(BFI2(this->result, op[0], op[1], op[2]));
      break;
   case ir_quadop_bitfield_insert:
      assert(!"not reached: should be handled by "
              "lower_instructions::bitfield_insert_to_bfm_bfi");
      break;

   case ir_unop_bit_not:
      emit(NOT(this->result, op[0]));
      break;
   case ir_binop_bit_and:
      emit(AND(this->result, op[0], op[1]));
      break;
   case ir_binop_bit_xor:
      emit(XOR(this->result, op[0], op[1]));
      break;
   case ir_binop_bit_or:
      emit(OR(this->result, op[0], op[1]));
      break;

   case ir_binop_lshift:
      emit(SHL(this->result, op[0], op[1]));
      break;

   case ir_binop_rshift:
      /* Arithmetic shift for signed, logical for unsigned. */
      if (ir->type->base_type == GLSL_TYPE_INT)
         emit(ASR(this->result, op[0], op[1]));
      else
         emit(SHR(this->result, op[0], op[1]));
      break;
   case ir_binop_pack_half_2x16_split:
      emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, this->result, op[0], op[1]);
      break;
   case ir_binop_ubo_load: {
      /* This IR node takes a constant uniform block and a constant or
       * variable byte offset within the block and loads a vector from that.
       */
      ir_constant *uniform_block = ir->operands[0]->as_constant();
      ir_constant *const_offset = ir->operands[1]->as_constant();
      fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.ubo_start +
                                 uniform_block->value.u[0]);
      if (const_offset) {
         fs_reg packed_consts = fs_reg(this, glsl_type::float_type);
         packed_consts.type = result.type;

         /* Pull a 16-byte-aligned chunk; smear selects the dword within. */
         fs_reg const_offset_reg = fs_reg(const_offset->value.u[0] & ~15);
         emit(fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                      packed_consts, surf_index, const_offset_reg));

         packed_consts.smear = const_offset->value.u[0] % 16 / 4;
         for (int i = 0; i < ir->type->vector_elements; i++) {
            /* UBO bools are any nonzero value.  We consider bools to be
             * values with the low bit set to 1.  Convert them using CMP.
             */
            if (ir->type->base_type == GLSL_TYPE_BOOL) {
               emit(CMP(result, packed_consts, fs_reg(0u), BRW_CONDITIONAL_NZ));
            } else {
               emit(MOV(result, packed_consts));
            }

            packed_consts.smear++;
            result.reg_offset++;

            /* The std140 packing rules don't allow vectors to cross 16-byte
             * boundaries, and a reg is 32 bytes.
             */
            assert(packed_consts.smear < 8);
         }
      } else {
         /* Turn the byte offset into a dword offset. */
         fs_reg base_offset = fs_reg(this, glsl_type::int_type);
         emit(SHR(base_offset, op[1], fs_reg(2)));

         for (int i = 0; i < ir->type->vector_elements; i++) {
            emit(VARYING_PULL_CONSTANT_LOAD(result, surf_index,
                                            base_offset, i));

            if (ir->type->base_type == GLSL_TYPE_BOOL)
               emit(CMP(result, result, fs_reg(0), BRW_CONDITIONAL_NZ));

            result.reg_offset++;
         }
      }

      /* Rewind so result points at the first component again. */
      result.reg_offset = 0;
      break;
   }

   case ir_triop_fma:
      /* Note that the instruction's argument order is reversed from GLSL
       * and the IR.
       */
      emit(MAD(this->result, op[2], op[1], op[0]));
      break;

   case ir_triop_lrp:
      emit_lrp(this->result, op[0], op[1], op[2]);
      break;

   case ir_triop_csel:
      emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
      inst = emit(BRW_OPCODE_SEL, this->result, op[1], op[2]);
      inst->predicate = BRW_PREDICATE_NORMAL;
      break;
   }
}
802
/**
 * Emit component-by-component MOVs copying r into l for an aggregate type,
 * recursing through arrays and structs.
 *
 * Both l and r are advanced in place (by reference) as components are
 * written, so recursive calls continue from the current offsets.  When
 * predicated is set, every MOV is predicated on the current flag value.
 */
void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         /* Skip no-op self-moves unless predication makes them meaningful. */
         if (predicated || !l.equals(r)) {
            fs_inst *inst = emit(MOV(l, r));
            inst->predicate = predicated ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE;
         }

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
   case GLSL_TYPE_ATOMIC_UINT:
      /* Opaque types occupy no register storage; nothing to copy. */
      break;

   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }
}
849
850 /* If the RHS processing resulted in an instruction generating a
851 * temporary value, and it would be easy to rewrite the instruction to
852 * generate its result right into the LHS instead, do so. This ends
853 * up reliably removing instructions where it can be tricky to do so
854 * later without real UD chain information.
855 */
856 bool
857 fs_visitor::try_rewrite_rhs_to_dst(ir_assignment *ir,
858 fs_reg dst,
859 fs_reg src,
860 fs_inst *pre_rhs_inst,
861 fs_inst *last_rhs_inst)
862 {
863 /* Only attempt if we're doing a direct assignment. */
864 if (ir->condition ||
865 !(ir->lhs->type->is_scalar() ||
866 (ir->lhs->type->is_vector() &&
867 ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1)))
868 return false;
869
870 /* Make sure the last instruction generated our source reg. */
871 fs_inst *modify = get_instruction_generating_reg(pre_rhs_inst,
872 last_rhs_inst,
873 src);
874 if (!modify)
875 return false;
876
877 /* If last_rhs_inst wrote a different number of components than our LHS,
878 * we can't safely rewrite it.
879 */
880 if (virtual_grf_sizes[dst.reg] != modify->regs_written)
881 return false;
882
883 /* Success! Rewrite the instruction. */
884 modify->dst = dst;
885
886 return true;
887 }
888
/**
 * Emit code for an assignment: evaluate LHS and RHS, try to retarget the
 * RHS-generating instruction straight at the LHS, and otherwise emit
 * (possibly predicated) component MOVs honoring the write mask.
 */
void
fs_visitor::visit(ir_assignment *ir)
{
   fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   /* Bracket the RHS evaluation so try_rewrite_rhs_to_dst can identify
    * which instruction produced the RHS value.
    */
   fs_inst *pre_rhs_inst = (fs_inst *) this->instructions.get_tail();

   ir->rhs->accept(this);
   r = this->result;

   fs_inst *last_rhs_inst = (fs_inst *) this->instructions.get_tail();

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   /* Fast path: point the RHS instruction's dst at the LHS directly. */
   if (try_rewrite_rhs_to_dst(ir, l, r, pre_rhs_inst, last_rhs_inst))
      return;

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      /* Copy only the masked components; note l advances every iteration
       * while r advances only for components actually written.
       */
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(MOV(l, r));
            if (ir->condition)
               inst->predicate = BRW_PREDICATE_NORMAL;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}
931
/**
 * Build the MRF message payload and emit the sampler instruction for Gen4.
 *
 * The payload layout depends on the operation: shadow compares append a
 * reference value, txd appends gradients, and txs/txb/txl/txf without
 * shadow must use a SIMD16 message (gen4 has no SIMD8 form for them),
 * whose interleaved return is then repacked into the expected layout.
 *
 * Returns the emitted sampler instruction so the caller can fill in
 * sampler/texture indices and related fields.
 */
fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg dPdy)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
         coordinate.reg_offset++;
      }

      /* gen4's SIMD8 sampler always has the slots for u,v,r present.
       * the unused slots must be zeroed.
       */
      for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
      }
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(MOV(fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f)));
         mlen++;
      } else if (ir->op == ir_txb || ir->op == ir_txl) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
         mlen++;
      } else {
         assert(!"Should not get here.");
      }

      /* The shadow reference value goes last. */
      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
         coordinate.reg_offset++;
      }
      /* zero the others. */
      for (int i = ir->coordinate->type->vector_elements; i<3; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), fs_reg(0.0f)));
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      /* For txd, the lod parameter actually carries dPdx. */
      fs_reg &dPdx = lod;

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i), coordinate));
         coordinate.reg_offset++;
      }
      /* the slots for u and v are always present, but r is optional */
      mlen += MAX2(ir->coordinate->type->vector_elements, 2);

      /* P = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * 1-arg: Does not exist.
       *
       * 2-arg: dudx   dvdx   dudy   dvdy
       *        dPdx.x dPdx.y dPdy.x dPdy.y
       *        m4     m5     m6     m7
       *
       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
       *        m5     m6     m7     m8     m9     m10
       */
      for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdx));
         dPdx.reg_offset++;
      }
      mlen += MAX2(ir->lod_info.grad.dPdx->type->vector_elements, 2);

      for (int i = 0; i < ir->lod_info.grad.dPdy->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), dPdy));
         dPdy.reg_offset++;
      }
      mlen += MAX2(ir->lod_info.grad.dPdy->type->vector_elements, 2);
   } else if (ir->op == ir_txs) {
      /* There's no SIMD8 resinfo message on Gen4.  Use SIMD16 instead. */
      simd16 = true;
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
      mlen += 2;
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      simd16 = true;
      assert(ir->op == ir_txb || ir->op == ir_txl || ir->op == ir_txf);

      /* SIMD16 payload interleaves the two halves, so each parameter
       * occupies two MRF slots; only the even one is written here.
       */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2, coordinate.type),
                  coordinate));
         coordinate.reg_offset++;
      }

      /* Initialize the rest of u/v/r with 0.0.  Empirically, this seems to
       * be necessary for TXF (ld), but seems wise to do for all messages.
       */
      for (int i = ir->coordinate->type->vector_elements; i < 3; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen + i * 2), fs_reg(0.0f)));
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      emit(MOV(fs_reg(MRF, base_mrf + mlen, lod.type), lod));
      mlen++;

      /* The unused upper half. */
      mlen++;
   }

   if (simd16) {
      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk. We'll need to move
       * this weirdness around to the expected layout.
       */
      orig_dst = dst;
      dst = fs_reg(GRF, virtual_grf_alloc(8),
                   (brw->is_g4x ?
                    brw_type_for_base_type(ir->type) :
                    BRW_REGISTER_TYPE_F));
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(SHADER_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(SHADER_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(SHADER_OPCODE_TXD, dst);
      break;
   case ir_txs:
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_txf:
      inst = emit(SHADER_OPCODE_TXF, dst);
      break;
   default:
      fail("unrecognized texture opcode");
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;
   inst->header_present = true;
   inst->regs_written = simd16 ? 8 : 4;

   if (simd16) {
      /* Compact the interleaved SIMD16 return (stride-2) into the
       * caller's expected contiguous vec4 layout.
       */
      for (int i = 0; i < 4; i++) {
         emit(MOV(orig_dst, dst));
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}
1104
/* gen5's sampler has slots for u, v, r, array index, then optional
 * parameters like shadow comparitor or LOD bias.  If optional
 * parameters aren't present, those base slots are optional and don't
 * need to be included in the message.
 *
 * We don't fill in the unnecessary slots regardless, which may look
 * surprising in the disassembly.
 */
fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg lod2,
                              fs_reg sample_index)
{
   int mlen = 0;
   int base_mrf = 2;
   /* MRF registers consumed per logical payload parameter:
    * 1 in SIMD8 mode, 2 in SIMD16 mode.
    */
   int reg_width = dispatch_width / 8;
   bool header_present = false;
   const int vector_elements =
      ir->coordinate ? ir->coordinate->type->vector_elements : 0;

   if (ir->offset) {
      /* The offsets set up by the ir_texture visitor are in the
       * m1 header, so we can't go headerless.
       */
      header_present = true;
      mlen++;
      base_mrf--;
   }

   /* Load the coordinate components into consecutive payload slots. */
   for (int i = 0; i < vector_elements; i++) {
      emit(MOV(fs_reg(MRF, base_mrf + mlen + i * reg_width, coordinate.type),
               coordinate));
      coordinate.reg_offset++;
   }
   mlen += vector_elements * reg_width;

   if (ir->shadow_comparitor) {
      /* The comparitor goes after the four base slots (u/v/r/ai), so skip
       * past any of those we didn't write.
       */
      mlen = MAX2(mlen, header_present + 4 * reg_width);

      emit(MOV(fs_reg(MRF, base_mrf + mlen), shadow_c));
      mlen += reg_width;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(SHADER_OPCODE_TEX, dst);
      break;
   case ir_txb:
      /* The bias parameter goes after the base u/v/r/ai slots. */
      mlen = MAX2(mlen, header_present + 4 * reg_width);
      emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
      mlen += reg_width;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      /* The explicit LOD likewise goes after the base u/v/r/ai slots. */
      mlen = MAX2(mlen, header_present + 4 * reg_width);
      emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
      mlen += reg_width;

      inst = emit(SHADER_OPCODE_TXL, dst);
      break;
   case ir_txd: {
      mlen = MAX2(mlen, header_present + 4 * reg_width); /* skip over 'ai' */

      /**
       * P = u, v, r
       * dPdx = dudx, dvdx, drdx
       * dPdy = dudy, dvdy, drdy
       *
       * Load up these values:
       * - dudx dudy dvdx dvdy drdx drdy
       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
       */
      for (int i = 0; i < ir->lod_info.grad.dPdx->type->vector_elements; i++) {
         emit(MOV(fs_reg(MRF, base_mrf + mlen), lod));
         lod.reg_offset++;
         mlen += reg_width;

         emit(MOV(fs_reg(MRF, base_mrf + mlen), lod2));
         lod2.reg_offset++;
         mlen += reg_width;
      }

      inst = emit(SHADER_OPCODE_TXD, dst);
      break;
   }
   case ir_txs:
      /* resinfo: LOD goes in as an unsigned integer. */
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), lod));
      mlen += reg_width;
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_query_levels:
      /* Level count query: issue a resinfo at LOD 0. */
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), fs_reg(0u)));
      mlen += reg_width;
      inst = emit(SHADER_OPCODE_TXS, dst);
      break;
   case ir_txf:
      /* ld's LOD occupies the fourth slot, after u/v/r. */
      mlen = header_present + 4 * reg_width;
      emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD), lod));
      inst = emit(SHADER_OPCODE_TXF, dst);
      break;
   case ir_txf_ms:
      mlen = header_present + 4 * reg_width;

      /* lod */
      emit(MOV(fs_reg(MRF, base_mrf + mlen - reg_width, BRW_REGISTER_TYPE_UD), fs_reg(0)));
      /* sample index */
      emit(MOV(fs_reg(MRF, base_mrf + mlen, BRW_REGISTER_TYPE_UD), sample_index));
      mlen += reg_width;
      inst = emit(SHADER_OPCODE_TXF_MS, dst);
      break;
   case ir_lod:
      inst = emit(SHADER_OPCODE_LOD, dst);
      break;
   case ir_tg4:
      inst = emit(SHADER_OPCODE_TG4, dst);
      break;
   default:
      /* NOTE(review): if this were ever reached, inst would stay NULL and
       * the stores below would crash; every ir_texture opcode is expected
       * to be handled above.
       */
      fail("unrecognized texture opcode");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;
   inst->header_present = header_present;
   inst->regs_written = 4;

   if (mlen > 11) {
      fail("Message length >11 disallowed by hardware\n");
   }

   return inst;
}
1238
/* Emit a gen7-style sampler message.  The payload is assembled in a run of
 * virtual GRFs ("payload"/"next") rather than MRFs, and the LOD, derivative,
 * and sample-index parameters are intermixed with the coordinate components
 * in a per-opcode layout.  Returns the SEND instruction so the caller can
 * fill in the remaining sampler state.
 */
fs_inst *
fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
                              fs_reg shadow_c, fs_reg lod, fs_reg lod2,
                              fs_reg sample_index, fs_reg mcs)
{
   int reg_width = dispatch_width / 8;
   bool header_present = false;

   /* "next" tracks the first unwritten register of the payload run. */
   fs_reg payload = fs_reg(this, glsl_type::float_type);
   fs_reg next = payload;

   if (ir->op == ir_tg4 || (ir->offset && ir->op != ir_txf)) {
      /* For general texture offsets (no txf workaround), we need a header to
       * put them in.  Note that for 16-wide we're making space for two actual
       * hardware registers here, so the emit will have to fix up for this.
       *
       * * ir4_tg4 needs to place its channel select in the header,
       * for interaction with ARB_texture_swizzle
       */
      header_present = true;
      next.reg_offset++;
   }

   if (ir->shadow_comparitor) {
      emit(MOV(next, shadow_c));
      next.reg_offset++;
   }

   bool has_nonconstant_offset = ir->offset && !ir->offset->as_constant();
   bool coordinate_done = false;

   /* Set up the LOD info */
   switch (ir->op) {
   case ir_tex:
   case ir_lod:
      break;
   case ir_txb:
      emit(MOV(next, lod));
      next.reg_offset++;
      break;
   case ir_txl:
      emit(MOV(next, lod));
      next.reg_offset++;
      break;
   case ir_txd: {
      if (dispatch_width == 16)
         fail("Gen7 does not support sample_d/sample_d_c in SIMD16 mode.");

      /* Load dPdx and the coordinate together:
       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
       */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(next, coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;

         /* For cube map array, the coordinate is (u,v,r,ai) but there are
          * only derivatives for (u, v, r).
          */
         if (i < ir->lod_info.grad.dPdx->type->vector_elements) {
            emit(MOV(next, lod));
            lod.reg_offset++;
            next.reg_offset++;

            emit(MOV(next, lod2));
            lod2.reg_offset++;
            next.reg_offset++;
         }
      }

      coordinate_done = true;
      break;
   }
   case ir_txs:
      /* resinfo takes the LOD as an unsigned integer. */
      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), lod));
      next.reg_offset++;
      break;
   case ir_query_levels:
      /* Level count query: resinfo at LOD 0. */
      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), fs_reg(0u)));
      next.reg_offset++;
      break;
   case ir_txf:
      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r. */
      emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
      coordinate.reg_offset++;
      next.reg_offset++;

      emit(MOV(next.retype(BRW_REGISTER_TYPE_D), lod));
      next.reg_offset++;

      /* The remaining coordinate components follow the LOD. */
      for (int i = 1; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;
      }

      coordinate_done = true;
      break;
   case ir_txf_ms:
      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), sample_index));
      next.reg_offset++;

      /* data from the multisample control surface */
      emit(MOV(next.retype(BRW_REGISTER_TYPE_UD), mcs));
      next.reg_offset++;

      /* there is no offsetting for this message; just copy in the integer
       * texture coordinates
       */
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;
      }

      coordinate_done = true;
      break;
   case ir_tg4:
      if (has_nonconstant_offset) {
         if (ir->shadow_comparitor && dispatch_width == 16)
            fail("Gen7 does not support gather4_po_c in SIMD16 mode.");

         /* More crazy intermixing */
         ir->offset->accept(this);
         fs_reg offset_value = this->result;

         for (int i = 0; i < 2; i++) { /* u, v */
            emit(MOV(next, coordinate));
            coordinate.reg_offset++;
            next.reg_offset++;
         }

         for (int i = 0; i < 2; i++) { /* offu, offv */
            emit(MOV(next.retype(BRW_REGISTER_TYPE_D), offset_value));
            offset_value.reg_offset++;
            next.reg_offset++;
         }

         if (ir->coordinate->type->vector_elements == 3) { /* r if present */
            emit(MOV(next, coordinate));
            coordinate.reg_offset++;
            next.reg_offset++;
         }

         coordinate_done = true;
      }
      break;
   }

   /* Set up the coordinate (except for cases where it was done above) */
   if (ir->coordinate && !coordinate_done) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(MOV(next, coordinate));
         coordinate.reg_offset++;
         next.reg_offset++;
      }
   }

   /* Generate the SEND */
   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex: inst = emit(SHADER_OPCODE_TEX, dst, payload); break;
   case ir_txb: inst = emit(FS_OPCODE_TXB, dst, payload); break;
   case ir_txl: inst = emit(SHADER_OPCODE_TXL, dst, payload); break;
   case ir_txd: inst = emit(SHADER_OPCODE_TXD, dst, payload); break;
   case ir_txf: inst = emit(SHADER_OPCODE_TXF, dst, payload); break;
   case ir_txf_ms: inst = emit(SHADER_OPCODE_TXF_MS, dst, payload); break;
   case ir_txs: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
   case ir_query_levels: inst = emit(SHADER_OPCODE_TXS, dst, payload); break;
   case ir_lod: inst = emit(SHADER_OPCODE_LOD, dst, payload); break;
   case ir_tg4:
      if (has_nonconstant_offset)
         inst = emit(SHADER_OPCODE_TG4_OFFSET, dst, payload);
      else
         inst = emit(SHADER_OPCODE_TG4, dst, payload);
      break;
   }
   inst->base_mrf = -1;
   /* In SIMD16 each payload slot is two hardware registers, but the header
    * (when present) only takes one, so subtract it back out.
    */
   if (reg_width == 2)
      inst->mlen = next.reg_offset * reg_width - header_present;
   else
      inst->mlen = next.reg_offset * reg_width;
   inst->header_present = header_present;
   inst->regs_written = 4;

   /* Size the payload's virtual GRF to what was actually written. */
   virtual_grf_sizes[payload.reg] = next.reg_offset;
   if (inst->mlen > 11) {
      fail("Message length >11 disallowed by hardware\n");
   }

   return inst;
}
1431
/* Rescale and/or clamp GL_TEXTURE_RECTANGLE coordinates as the hardware
 * generation requires, returning the (possibly new) coordinate register.
 *
 * Pre-gen6 hardware needs the EU to normalize rectangle coordinates; gen6+
 * samples rectangles natively but then GL_CLAMP must be emulated at texture
 * dimensions rather than [0, 1].
 */
fs_reg
fs_visitor::rescale_texcoord(ir_texture *ir, fs_reg coordinate,
                             bool is_rect, int sampler, int texunit)
{
   fs_inst *inst = NULL;
   bool needs_gl_clamp = true;
   fs_reg scale_x, scale_y;

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (is_rect &&
       (brw->gen < 6 ||
        (brw->gen >= 6 && (c->key.tex.gl_clamp_mask[0] & (1 << sampler) ||
                           c->key.tex.gl_clamp_mask[1] & (1 << sampler))))) {
      struct gl_program_parameter_list *params = prog->Parameters;
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         texunit,
         0,
         0
      };

      if (dispatch_width == 16) {
         fail("rectangle scale uniform setup not supported on 16-wide\n");
         return coordinate;
      }

      /* Reserve two uniform slots for the 1/width and 1/height factors. */
      scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);

      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);
      c->prog_data.param[c->prog_data.nr_params++] =
         &prog->Parameters->ParameterValues[index][0].f;
      c->prog_data.param[c->prog_data.nr_params++] =
         &prog->Parameters->ParameterValues[index][1].f;
   }

   /* On pre-gen6, apply the scale factors set up above to normalize the
    * rectangle coordinates.
    */
   if (brw->gen < 6 && is_rect) {
      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(MUL(dst, src, scale_x));
      dst.reg_offset++;
      src.reg_offset++;
      emit(MUL(dst, src, scale_y));
   } else if (is_rect) {
      /* On gen6+, the sampler handles the rectangle coordinates
       * natively, without needing rescaling.  But that means we have
       * to do GL_CLAMP clamping at the [0, width], [0, height] scale,
       * not [0, 1] like the default case below.
       */
      needs_gl_clamp = false;

      for (int i = 0; i < 2; i++) {
         if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan.reg_offset += i;

            /* Clamp the low end: chan = max(chan, 0.0). */
            inst = emit(BRW_OPCODE_SEL, chan, chan, brw_imm_f(0.0));
            inst->conditional_mod = BRW_CONDITIONAL_G;

            /* Our parameter comes in as 1.0/width or 1.0/height,
             * because that's what people normally want for doing
             * texture rectangle handling.  We need width or height
             * for clamping, but we don't care enough to make a new
             * parameter type, so just invert back.
             */
            fs_reg limit = fs_reg(this, glsl_type::float_type);
            emit(MOV(limit, i == 0 ? scale_x : scale_y));
            emit(SHADER_OPCODE_RCP, limit, limit);

            /* Clamp the high end: chan = min(chan, dimension). */
            inst = emit(BRW_OPCODE_SEL, chan, chan, limit);
            inst->conditional_mod = BRW_CONDITIONAL_L;
         }
      }
   }

   if (ir->coordinate && needs_gl_clamp) {
      /* Emulate GL_CLAMP on [0, 1] with a saturated MOV per channel. */
      for (unsigned int i = 0;
           i < MIN2(ir->coordinate->type->vector_elements, 3); i++) {
         if (c->key.tex.gl_clamp_mask[i] & (1 << sampler)) {
            fs_reg chan = coordinate;
            chan.reg_offset += i;

            fs_inst *inst = emit(MOV(chan, chan));
            inst->saturate = true;
         }
      }
   }
   return coordinate;
}
1532
1533 /* Sample from the MCS surface attached to this multisample texture. */
1534 fs_reg
1535 fs_visitor::emit_mcs_fetch(ir_texture *ir, fs_reg coordinate, int sampler)
1536 {
1537 int reg_width = dispatch_width / 8;
1538 fs_reg payload = fs_reg(this, glsl_type::float_type);
1539 fs_reg dest = fs_reg(this, glsl_type::uvec4_type);
1540 fs_reg next = payload;
1541
1542 /* parameters are: u, v, r, lod; missing parameters are treated as zero */
1543 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1544 emit(MOV(next.retype(BRW_REGISTER_TYPE_D), coordinate));
1545 coordinate.reg_offset++;
1546 next.reg_offset++;
1547 }
1548
1549 fs_inst *inst = emit(SHADER_OPCODE_TXF_MCS, dest, payload);
1550 inst->base_mrf = -1;
1551 inst->mlen = next.reg_offset * reg_width;
1552 inst->header_present = false;
1553 inst->regs_written = 4 * reg_width; /* we only care about one reg of response,
1554 * but the sampler always writes 4/8
1555 */
1556 inst->sampler = sampler;
1557
1558 return dest;
1559 }
1560
1561 void
1562 fs_visitor::visit(ir_texture *ir)
1563 {
1564 fs_inst *inst = NULL;
1565
1566 int sampler =
1567 _mesa_get_sampler_uniform_value(ir->sampler, shader_prog, prog);
1568 /* FINISHME: We're failing to recompile our programs when the sampler is
1569 * updated. This only matters for the texture rectangle scale parameters
1570 * (pre-gen6, or gen6+ with GL_CLAMP).
1571 */
1572 int texunit = prog->SamplerUnits[sampler];
1573
1574 if (ir->op == ir_tg4) {
1575 /* When tg4 is used with the degenerate ZERO/ONE swizzles, don't bother
1576 * emitting anything other than setting up the constant result.
1577 */
1578 ir_constant *chan = ir->lod_info.component->as_constant();
1579 int swiz = GET_SWZ(c->key.tex.swizzles[sampler], chan->value.i[0]);
1580 if (swiz == SWIZZLE_ZERO || swiz == SWIZZLE_ONE) {
1581
1582 fs_reg res = fs_reg(this, glsl_type::vec4_type);
1583 this->result = res;
1584
1585 for (int i=0; i<4; i++) {
1586 emit(MOV(res, fs_reg(swiz == SWIZZLE_ZERO ? 0.0f : 1.0f)));
1587 res.reg_offset++;
1588 }
1589 return;
1590 }
1591 }
1592
1593 /* Should be lowered by do_lower_texture_projection */
1594 assert(!ir->projector);
1595
1596 /* Should be lowered */
1597 assert(!ir->offset || !ir->offset->type->is_array());
1598
1599 /* Generate code to compute all the subexpression trees. This has to be
1600 * done before loading any values into MRFs for the sampler message since
1601 * generating these values may involve SEND messages that need the MRFs.
1602 */
1603 fs_reg coordinate;
1604 if (ir->coordinate) {
1605 ir->coordinate->accept(this);
1606
1607 coordinate = rescale_texcoord(ir, this->result,
1608 ir->sampler->type->sampler_dimensionality ==
1609 GLSL_SAMPLER_DIM_RECT,
1610 sampler, texunit);
1611 }
1612
1613 fs_reg shadow_comparitor;
1614 if (ir->shadow_comparitor) {
1615 ir->shadow_comparitor->accept(this);
1616 shadow_comparitor = this->result;
1617 }
1618
1619 fs_reg lod, lod2, sample_index, mcs;
1620 switch (ir->op) {
1621 case ir_tex:
1622 case ir_lod:
1623 case ir_tg4:
1624 case ir_query_levels:
1625 break;
1626 case ir_txb:
1627 ir->lod_info.bias->accept(this);
1628 lod = this->result;
1629 break;
1630 case ir_txd:
1631 ir->lod_info.grad.dPdx->accept(this);
1632 lod = this->result;
1633
1634 ir->lod_info.grad.dPdy->accept(this);
1635 lod2 = this->result;
1636 break;
1637 case ir_txf:
1638 case ir_txl:
1639 case ir_txs:
1640 ir->lod_info.lod->accept(this);
1641 lod = this->result;
1642 break;
1643 case ir_txf_ms:
1644 ir->lod_info.sample_index->accept(this);
1645 sample_index = this->result;
1646
1647 if (brw->gen >= 7 && c->key.tex.compressed_multisample_layout_mask & (1<<sampler))
1648 mcs = emit_mcs_fetch(ir, coordinate, sampler);
1649 else
1650 mcs = fs_reg(0u);
1651 break;
1652 default:
1653 assert(!"Unrecognized texture opcode");
1654 };
1655
1656 /* Writemasking doesn't eliminate channels on SIMD8 texture
1657 * samples, so don't worry about them.
1658 */
1659 fs_reg dst = fs_reg(this, glsl_type::get_instance(ir->type->base_type, 4, 1));
1660
1661 if (brw->gen >= 7) {
1662 inst = emit_texture_gen7(ir, dst, coordinate, shadow_comparitor,
1663 lod, lod2, sample_index, mcs);
1664 } else if (brw->gen >= 5) {
1665 inst = emit_texture_gen5(ir, dst, coordinate, shadow_comparitor,
1666 lod, lod2, sample_index);
1667 } else {
1668 inst = emit_texture_gen4(ir, dst, coordinate, shadow_comparitor,
1669 lod, lod2);
1670 }
1671
1672 if (ir->offset != NULL && ir->op != ir_txf)
1673 inst->texture_offset = brw_texture_offset(ctx, ir->offset->as_constant());
1674
1675 if (ir->op == ir_tg4)
1676 inst->texture_offset |= gather_channel(ir, sampler) << 16; // M0.2:16-17
1677
1678 inst->sampler = sampler;
1679
1680 if (ir->shadow_comparitor)
1681 inst->shadow_compare = true;
1682
1683 /* fixup #layers for cube map arrays */
1684 if (ir->op == ir_txs) {
1685 glsl_type const *type = ir->sampler->type;
1686 if (type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
1687 type->sampler_array) {
1688 fs_reg depth = dst;
1689 depth.reg_offset = 2;
1690 emit_math(SHADER_OPCODE_INT_QUOTIENT, depth, depth, fs_reg(6));
1691 }
1692 }
1693
1694 swizzle_result(ir, dst, sampler);
1695 }
1696
1697 /**
1698 * Set up the gather channel based on the swizzle, for gather4.
1699 */
1700 uint32_t
1701 fs_visitor::gather_channel(ir_texture *ir, int sampler)
1702 {
1703 ir_constant *chan = ir->lod_info.component->as_constant();
1704 int swiz = GET_SWZ(c->key.tex.swizzles[sampler], chan->value.i[0]);
1705 switch (swiz) {
1706 case SWIZZLE_X: return 0;
1707 case SWIZZLE_Y:
1708 /* gather4 sampler is broken for green channel on RG32F --
1709 * we must ask for blue instead.
1710 */
1711 if (c->key.tex.gather_channel_quirk_mask & (1<<sampler))
1712 return 2;
1713 return 1;
1714 case SWIZZLE_Z: return 2;
1715 case SWIZZLE_W: return 3;
1716 default:
1717 assert(!"Not reached"); /* zero, one swizzles handled already */
1718 return 0;
1719 }
1720 }
1721
1722 /**
1723 * Swizzle the result of a texture result. This is necessary for
1724 * EXT_texture_swizzle as well as DEPTH_TEXTURE_MODE for shadow comparisons.
1725 */
1726 void
1727 fs_visitor::swizzle_result(ir_texture *ir, fs_reg orig_val, int sampler)
1728 {
1729 if (ir->op == ir_query_levels) {
1730 /* # levels is in .w */
1731 orig_val.reg_offset += 3;
1732 this->result = orig_val;
1733 return;
1734 }
1735
1736 this->result = orig_val;
1737
1738 /* txs,lod don't actually sample the texture, so swizzling the result
1739 * makes no sense.
1740 */
1741 if (ir->op == ir_txs || ir->op == ir_lod || ir->op == ir_tg4)
1742 return;
1743
1744 if (ir->type == glsl_type::float_type) {
1745 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1746 assert(ir->sampler->type->sampler_shadow);
1747 } else if (c->key.tex.swizzles[sampler] != SWIZZLE_NOOP) {
1748 fs_reg swizzled_result = fs_reg(this, glsl_type::vec4_type);
1749
1750 for (int i = 0; i < 4; i++) {
1751 int swiz = GET_SWZ(c->key.tex.swizzles[sampler], i);
1752 fs_reg l = swizzled_result;
1753 l.reg_offset += i;
1754
1755 if (swiz == SWIZZLE_ZERO) {
1756 emit(MOV(l, fs_reg(0.0f)));
1757 } else if (swiz == SWIZZLE_ONE) {
1758 emit(MOV(l, fs_reg(1.0f)));
1759 } else {
1760 fs_reg r = orig_val;
1761 r.reg_offset += GET_SWZ(c->key.tex.swizzles[sampler], i);
1762 emit(MOV(l, r));
1763 }
1764 }
1765 this->result = swizzled_result;
1766 }
1767 }
1768
1769 void
1770 fs_visitor::visit(ir_swizzle *ir)
1771 {
1772 ir->val->accept(this);
1773 fs_reg val = this->result;
1774
1775 if (ir->type->vector_elements == 1) {
1776 this->result.reg_offset += ir->mask.x;
1777 return;
1778 }
1779
1780 fs_reg result = fs_reg(this, ir->type);
1781 this->result = result;
1782
1783 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1784 fs_reg channel = val;
1785 int swiz = 0;
1786
1787 switch (i) {
1788 case 0:
1789 swiz = ir->mask.x;
1790 break;
1791 case 1:
1792 swiz = ir->mask.y;
1793 break;
1794 case 2:
1795 swiz = ir->mask.z;
1796 break;
1797 case 3:
1798 swiz = ir->mask.w;
1799 break;
1800 }
1801
1802 channel.reg_offset += swiz;
1803 emit(MOV(result, channel));
1804 result.reg_offset++;
1805 }
1806 }
1807
1808 void
1809 fs_visitor::visit(ir_discard *ir)
1810 {
1811 assert(ir->condition == NULL); /* FINISHME */
1812
1813 /* We track our discarded pixels in f0.1. By predicating on it, we can
1814 * update just the flag bits that aren't yet discarded. By emitting a
1815 * CMP of g0 != g0, all our currently executing channels will get turned
1816 * off.
1817 */
1818 fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
1819 BRW_REGISTER_TYPE_UW));
1820 fs_inst *cmp = emit(CMP(reg_null_f, some_reg, some_reg,
1821 BRW_CONDITIONAL_NZ));
1822 cmp->predicate = BRW_PREDICATE_NORMAL;
1823 cmp->flag_subreg = 1;
1824
1825 if (brw->gen >= 6) {
1826 /* For performance, after a discard, jump to the end of the shader.
1827 * However, many people will do foliage by discarding based on a
1828 * texture's alpha mask, and then continue on to texture with the
1829 * remaining pixels. To avoid trashing the derivatives for those
1830 * texture samples, we'll only jump if all of the pixels in the subspan
1831 * have been discarded.
1832 */
1833 fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
1834 discard_jump->flag_subreg = 1;
1835 discard_jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
1836 discard_jump->predicate_inverse = true;
1837 }
1838 }
1839
1840 void
1841 fs_visitor::visit(ir_constant *ir)
1842 {
1843 /* Set this->result to reg at the bottom of the function because some code
1844 * paths will cause this visitor to be applied to other fields. This will
1845 * cause the value stored in this->result to be modified.
1846 *
1847 * Make reg constant so that it doesn't get accidentally modified along the
1848 * way. Yes, I actually had this problem. :(
1849 */
1850 const fs_reg reg(this, ir->type);
1851 fs_reg dst_reg = reg;
1852
1853 if (ir->type->is_array()) {
1854 const unsigned size = type_size(ir->type->fields.array);
1855
1856 for (unsigned i = 0; i < ir->type->length; i++) {
1857 ir->array_elements[i]->accept(this);
1858 fs_reg src_reg = this->result;
1859
1860 dst_reg.type = src_reg.type;
1861 for (unsigned j = 0; j < size; j++) {
1862 emit(MOV(dst_reg, src_reg));
1863 src_reg.reg_offset++;
1864 dst_reg.reg_offset++;
1865 }
1866 }
1867 } else if (ir->type->is_record()) {
1868 foreach_list(node, &ir->components) {
1869 ir_constant *const field = (ir_constant *) node;
1870 const unsigned size = type_size(field->type);
1871
1872 field->accept(this);
1873 fs_reg src_reg = this->result;
1874
1875 dst_reg.type = src_reg.type;
1876 for (unsigned j = 0; j < size; j++) {
1877 emit(MOV(dst_reg, src_reg));
1878 src_reg.reg_offset++;
1879 dst_reg.reg_offset++;
1880 }
1881 }
1882 } else {
1883 const unsigned size = type_size(ir->type);
1884
1885 for (unsigned i = 0; i < size; i++) {
1886 switch (ir->type->base_type) {
1887 case GLSL_TYPE_FLOAT:
1888 emit(MOV(dst_reg, fs_reg(ir->value.f[i])));
1889 break;
1890 case GLSL_TYPE_UINT:
1891 emit(MOV(dst_reg, fs_reg(ir->value.u[i])));
1892 break;
1893 case GLSL_TYPE_INT:
1894 emit(MOV(dst_reg, fs_reg(ir->value.i[i])));
1895 break;
1896 case GLSL_TYPE_BOOL:
1897 emit(MOV(dst_reg, fs_reg((int)ir->value.b[i])));
1898 break;
1899 default:
1900 assert(!"Non-float/uint/int/bool constant");
1901 }
1902 dst_reg.reg_offset++;
1903 }
1904 }
1905
1906 this->result = reg;
1907 }
1908
/* Set the flag register from a boolean rvalue so a following predicated
 * instruction (e.g. IF) can consume it.
 *
 * When the rvalue is a comparison-like expression, the comparison is folded
 * directly into a CMP with the matching conditional mod; otherwise the value
 * is evaluated and its low bit tested.
 */
void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr &&
       expr->operation != ir_binop_logic_and &&
       expr->operation != ir_binop_logic_or &&
       expr->operation != ir_binop_logic_xor) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;

         resolve_ud_negate(&op[i]);
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         /* Only the low bit of a bool is defined; test it for zero. */
         inst = emit(AND(reg_null_d, op[0], fs_reg(1)));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_unop_f2b:
         if (brw->gen >= 6) {
            emit(CMP(reg_null_d, op[0], fs_reg(0.0f), BRW_CONDITIONAL_NZ));
         } else {
            /* Pre-gen6: set the flag via the MOV's conditional mod. */
            inst = emit(MOV(reg_null_f, op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_unop_i2b:
         if (brw->gen >= 6) {
            emit(CMP(reg_null_d, op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
         } else {
            inst = emit(MOV(reg_null_d, op[0]));
            inst->conditional_mod = BRW_CONDITIONAL_NZ;
         }
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         resolve_bool_comparison(expr->operands[0], &op[0]);
         resolve_bool_comparison(expr->operands[1], &op[1]);

         /* Fold the comparison into a CMP that sets the flag directly. */
         emit(CMP(reg_null_d, op[0], op[1],
                  brw_conditional_for_comparison(expr->operation)));
         break;

      default:
         assert(!"not reached");
         fail("bad cond code\n");
         break;
      }
      return;
   }

   /* Fallback: evaluate the expression as a value and test its low bit. */
   ir->accept(this);

   fs_inst *inst = emit(AND(reg_null_d, this->result, fs_reg(1)));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}
1983
/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 *
 * When the condition expression can't be folded (logic ops on bools, or a
 * non-expression condition), falls back to setting the flag register and
 * emitting a predicated IF.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
      case ir_binop_logic_xor:
      case ir_binop_logic_or:
      case ir_binop_logic_and:
         /* For operations on bool arguments, only the low bit of the bool is
          * valid, and the others are undefined.  Fall back to the condition
          * code path.
          */
         break;

      case ir_unop_f2b:
         /* Fold the float != 0 test into the IF itself. */
         inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_i2b:
         emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         resolve_bool_comparison(expr->operands[0], &op[0]);
         resolve_bool_comparison(expr->operands[1], &op[1]);

         /* Fold the comparison into the IF's conditional mod. */
         emit(IF(op[0], op[1],
                 brw_conditional_for_comparison(expr->operation)));
         return;
      default:
         assert(!"not reached");
         emit(IF(op[0], fs_reg(0), BRW_CONDITIONAL_NZ));
         fail("bad condition\n");
         return;
      }
   }

   /* Couldn't fold: set the flag register and emit a predicated IF. */
   emit_bool_to_cond_code(ir->condition);
   fs_inst *inst = emit(BRW_OPCODE_IF);
   inst->predicate = BRW_PREDICATE_NORMAL;
}
2052
/**
 * Try to replace IF/MOV/ELSE/MOV/ENDIF with SEL.
 *
 * Many GLSL shaders contain the following pattern:
 *
 *   x = condition ? foo : bar
 *
 * The compiler emits an ir_if tree for this, since each subexpression might be
 * a complex tree that could have side-effects or short-circuit logic.
 *
 * However, the common case is to simply select one of two constants or
 * variable values---which is exactly what SEL is for.  In this case, the
 * assembly looks like:
 *
 *   (+f0) IF
 *   MOV dst src0
 *   ELSE
 *   MOV dst src1
 *   ENDIF
 *
 * which can be easily translated into:
 *
 *   (+f0) SEL dst src0 src1
 *
 * If src0 is an immediate value, we promote it to a temporary GRF.
 */
void
fs_visitor::try_replace_with_sel()
{
   /* Called right after an ENDIF is emitted, so it must be the tail. */
   fs_inst *endif_inst = (fs_inst *) instructions.get_tail();
   assert(endif_inst->opcode == BRW_OPCODE_ENDIF);

   /* Pattern match in reverse: IF, MOV, ELSE, MOV, ENDIF. */
   int opcodes[] = {
      BRW_OPCODE_IF, BRW_OPCODE_MOV, BRW_OPCODE_ELSE, BRW_OPCODE_MOV,
   };

   /* Walk backwards from the ENDIF, bailing on the first mismatch. */
   fs_inst *match = (fs_inst *) endif_inst->prev;
   for (int i = 0; i < 4; i++) {
      if (match->is_head_sentinel() || match->opcode != opcodes[4-i-1])
         return;
      match = (fs_inst *) match->prev;
   }

   /* The opcodes match; it looks like the right sequence of instructions. */
   fs_inst *else_mov = (fs_inst *) endif_inst->prev;
   fs_inst *then_mov = (fs_inst *) else_mov->prev->prev;
   fs_inst *if_inst = (fs_inst *) then_mov->prev;

   /* Check that the MOVs are the right form: same destination, and each
    * writes it completely.
    */
   if (then_mov->dst.equals(else_mov->dst) &&
       !then_mov->is_partial_write() &&
       !else_mov->is_partial_write()) {

      /* Remove the matched instructions; we'll emit a SEL to replace them. */
      while (!if_inst->next->is_tail_sentinel())
         if_inst->next->remove();
      if_inst->remove();

      /* Only the last source register can be a constant, so if the MOV in
       * the "then" clause uses a constant, we need to put it in a temporary.
       */
      fs_reg src0(then_mov->src[0]);
      if (src0.file == IMM) {
         src0 = fs_reg(this, glsl_type::float_type);
         src0.type = then_mov->src[0].type;
         emit(MOV(src0, then_mov->src[0]));
      }

      fs_inst *sel;
      if (if_inst->conditional_mod) {
         /* Sandybridge-specific IF with embedded comparison */
         emit(CMP(reg_null_d, if_inst->src[0], if_inst->src[1],
                  if_inst->conditional_mod));
         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
         sel->predicate = BRW_PREDICATE_NORMAL;
      } else {
         /* Separate CMP and IF instructions */
         sel = emit(BRW_OPCODE_SEL, then_mov->dst, src0, else_mov->src[0]);
         sel->predicate = if_inst->predicate;
         sel->predicate_inverse = if_inst->predicate_inverse;
      }
   }
}
2137
2138 void
2139 fs_visitor::visit(ir_if *ir)
2140 {
2141 if (brw->gen < 6 && dispatch_width == 16) {
2142 fail("Can't support (non-uniform) control flow on 16-wide\n");
2143 }
2144
2145 /* Don't point the annotation at the if statement, because then it plus
2146 * the then and else blocks get printed.
2147 */
2148 this->base_ir = ir->condition;
2149
2150 if (brw->gen == 6) {
2151 emit_if_gen6(ir);
2152 } else {
2153 emit_bool_to_cond_code(ir->condition);
2154
2155 emit(IF(BRW_PREDICATE_NORMAL));
2156 }
2157
2158 foreach_list(node, &ir->then_instructions) {
2159 ir_instruction *ir = (ir_instruction *)node;
2160 this->base_ir = ir;
2161
2162 ir->accept(this);
2163 }
2164
2165 if (!ir->else_instructions.is_empty()) {
2166 emit(BRW_OPCODE_ELSE);
2167
2168 foreach_list(node, &ir->else_instructions) {
2169 ir_instruction *ir = (ir_instruction *)node;
2170 this->base_ir = ir;
2171
2172 ir->accept(this);
2173 }
2174 }
2175
2176 emit(BRW_OPCODE_ENDIF);
2177
2178 try_replace_with_sel();
2179 }
2180
2181 void
2182 fs_visitor::visit(ir_loop *ir)
2183 {
2184 if (brw->gen < 6 && dispatch_width == 16) {
2185 fail("Can't support (non-uniform) control flow on 16-wide\n");
2186 }
2187
2188 this->base_ir = NULL;
2189 emit(BRW_OPCODE_DO);
2190
2191 foreach_list(node, &ir->body_instructions) {
2192 ir_instruction *ir = (ir_instruction *)node;
2193
2194 this->base_ir = ir;
2195 ir->accept(this);
2196 }
2197
2198 this->base_ir = NULL;
2199 emit(BRW_OPCODE_WHILE);
2200 }
2201
2202 void
2203 fs_visitor::visit(ir_loop_jump *ir)
2204 {
2205 switch (ir->mode) {
2206 case ir_loop_jump::jump_break:
2207 emit(BRW_OPCODE_BREAK);
2208 break;
2209 case ir_loop_jump::jump_continue:
2210 emit(BRW_OPCODE_CONTINUE);
2211 break;
2212 }
2213 }
2214
void
fs_visitor::visit_atomic_counter_intrinsic(ir_call *ir)
{
   /* The first actual parameter is a dereference of the atomic counter
    * variable itself.
    */
   ir_dereference *deref = static_cast<ir_dereference *>(
      ir->actual_parameters.get_head());
   ir_variable *location = deref->variable_referenced();
   /* Binding-table slot of the counter's backing buffer. */
   unsigned surf_index = (c->prog_data.base.binding_table.abo_start +
                          location->data.atomic.buffer_index);

   /* Calculate the surface offset */
   fs_reg offset(this, glsl_type::uint_type);
   ir_dereference_array *deref_array = deref->as_dereference_array();

   if (deref_array) {
      /* Array of counters: offset = index * ATOMIC_COUNTER_SIZE + base. */
      deref_array->array_index->accept(this);

      fs_reg tmp(this, glsl_type::uint_type);
      emit(MUL(tmp, this->result, ATOMIC_COUNTER_SIZE));
      emit(ADD(offset, tmp, location->data.atomic.offset));
   } else {
      /* Single counter: the byte offset is known at compile time. */
      offset = location->data.atomic.offset;
   }

   /* Emit the appropriate machine instruction */
   const char *callee = ir->callee->function_name();
   ir->return_deref->accept(this);
   fs_reg dst = this->result;

   if (!strcmp("__intrinsic_atomic_read", callee)) {
      emit_untyped_surface_read(surf_index, dst, offset);

   } else if (!strcmp("__intrinsic_atomic_increment", callee)) {
      emit_untyped_atomic(BRW_AOP_INC, surf_index, dst, offset,
                          fs_reg(), fs_reg());

   } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) {
      emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dst, offset,
                          fs_reg(), fs_reg());
   }
}
2255
2256 void
2257 fs_visitor::visit(ir_call *ir)
2258 {
2259 const char *callee = ir->callee->function_name();
2260
2261 if (!strcmp("__intrinsic_atomic_read", callee) ||
2262 !strcmp("__intrinsic_atomic_increment", callee) ||
2263 !strcmp("__intrinsic_atomic_predecrement", callee)) {
2264 visit_atomic_counter_intrinsic(ir);
2265 } else {
2266 assert(!"Unsupported intrinsic.");
2267 }
2268 }
2269
2270 void
2271 fs_visitor::visit(ir_return *ir)
2272 {
2273 assert(!"FINISHME");
2274 }
2275
2276 void
2277 fs_visitor::visit(ir_function *ir)
2278 {
2279 /* Ignore function bodies other than main() -- we shouldn't see calls to
2280 * them since they should all be inlined before we get to ir_to_mesa.
2281 */
2282 if (strcmp(ir->name, "main") == 0) {
2283 const ir_function_signature *sig;
2284 exec_list empty;
2285
2286 sig = ir->matching_signature(NULL, &empty);
2287
2288 assert(sig);
2289
2290 foreach_list(node, &sig->body) {
2291 ir_instruction *ir = (ir_instruction *)node;
2292 this->base_ir = ir;
2293
2294 ir->accept(this);
2295 }
2296 }
2297 }
2298
2299 void
2300 fs_visitor::visit(ir_function_signature *ir)
2301 {
2302 assert(!"not reached");
2303 (void)ir;
2304 }
2305
void
fs_visitor::visit(ir_emit_vertex *)
{
   /* EmitVertex() is geometry-shader IR; the fragment backend should never
    * see it.
    */
   assert(!"not reached");
}
2311
void
fs_visitor::visit(ir_end_primitive *)
{
   /* EndPrimitive() is geometry-shader IR; the fragment backend should
    * never see it.
    */
   assert(!"not reached");
}
2317
void
fs_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index,
                                fs_reg dst, fs_reg offset, fs_reg src0,
                                fs_reg src1)
{
   /* Each full-width operand occupies one MRF per 8 channels. */
   const unsigned operand_len = dispatch_width / 8;
   unsigned mlen = 0;

   /* Initialize the sample mask in the message header. */
   emit(MOV(brw_uvec_mrf(8, mlen, 0), brw_imm_ud(0)))
      ->force_writemask_all = true;

   if (fp->UsesKill) {
      /* NOTE(review): f0.1 presumably holds the still-live pixel mask after
       * discards — confirm against the discard lowering.
       */
      emit(MOV(brw_uvec_mrf(1, mlen, 7), brw_flag_reg(0, 1)))
         ->force_writemask_all = true;
   } else {
      /* No discards: use the dispatched-pixel mask from the thread
       * payload (g1.7).
       */
      emit(MOV(brw_uvec_mrf(1, mlen, 7),
               retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
         ->force_writemask_all = true;
   }

   mlen++;

   /* Set the atomic operation offset. */
   emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), offset));
   mlen += operand_len;

   /* Set the atomic operation arguments.  BAD_FILE sources (e.g. for INC,
    * PREDEC) are simply omitted from the payload.
    */
   if (src0.file != BAD_FILE) {
      emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), src0));
      mlen += operand_len;
   }

   if (src1.file != BAD_FILE) {
      emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), src1));
      mlen += operand_len;
   }

   /* Emit the instruction. */
   fs_inst inst(SHADER_OPCODE_UNTYPED_ATOMIC, dst, atomic_op, surf_index);
   inst.base_mrf = 0;
   inst.mlen = mlen;
   emit(inst);
}
2362
void
fs_visitor::emit_untyped_surface_read(unsigned surf_index, fs_reg dst,
                                      fs_reg offset)
{
   /* Each full-width operand occupies one MRF per 8 channels. */
   const unsigned operand_len = dispatch_width / 8;
   unsigned mlen = 0;

   /* Initialize the sample mask in the message header. */
   emit(MOV(brw_uvec_mrf(8, mlen, 0), brw_imm_ud(0)))
      ->force_writemask_all = true;

   if (fp->UsesKill) {
      /* NOTE(review): f0.1 presumably holds the still-live pixel mask after
       * discards — confirm against the discard lowering.
       */
      emit(MOV(brw_uvec_mrf(1, mlen, 7), brw_flag_reg(0, 1)))
         ->force_writemask_all = true;
   } else {
      /* No discards: use the dispatched-pixel mask from the thread
       * payload (g1.7).
       */
      emit(MOV(brw_uvec_mrf(1, mlen, 7),
               retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD)))
         ->force_writemask_all = true;
   }

   mlen++;

   /* Set the surface read offset. */
   emit(MOV(brw_uvec_mrf(dispatch_width, mlen, 0), offset));
   mlen += operand_len;

   /* Emit the instruction. */
   fs_inst inst(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, surf_index);
   inst.base_mrf = 0;
   inst.mlen = mlen;
   emit(inst);
}
2395
2396 fs_inst *
2397 fs_visitor::emit(fs_inst inst)
2398 {
2399 fs_inst *list_inst = new(mem_ctx) fs_inst;
2400 *list_inst = inst;
2401 emit(list_inst);
2402 return list_inst;
2403 }
2404
2405 fs_inst *
2406 fs_visitor::emit(fs_inst *inst)
2407 {
2408 if (force_uncompressed_stack > 0)
2409 inst->force_uncompressed = true;
2410
2411 inst->annotation = this->current_annotation;
2412 inst->ir = this->base_ir;
2413
2414 this->instructions.push_tail(inst);
2415
2416 return inst;
2417 }
2418
void
fs_visitor::emit(exec_list list)
{
   /* NOTE(review): the list is taken by value, so we iterate a shallow copy
    * of the list head; remove() unlinks each node through its own prev/next
    * pointers, so the nodes themselves migrate correctly into
    * this->instructions.  Consider passing by reference.
    */
   foreach_list_safe(node, &list) {
      fs_inst *inst = (fs_inst *)node;
      /* Unlink from the source list before appending to ours. */
      inst->remove();
      emit(inst);
   }
}
2428
2429 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
2430 void
2431 fs_visitor::emit_dummy_fs()
2432 {
2433 int reg_width = dispatch_width / 8;
2434
2435 /* Everyone's favorite color. */
2436 emit(MOV(fs_reg(MRF, 2 + 0 * reg_width), fs_reg(1.0f)));
2437 emit(MOV(fs_reg(MRF, 2 + 1 * reg_width), fs_reg(0.0f)));
2438 emit(MOV(fs_reg(MRF, 2 + 2 * reg_width), fs_reg(1.0f)));
2439 emit(MOV(fs_reg(MRF, 2 + 3 * reg_width), fs_reg(0.0f)));
2440
2441 fs_inst *write;
2442 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
2443 write->base_mrf = 2;
2444 write->mlen = 4 * reg_width;
2445 write->eot = true;
2446 }
2447
2448 /* The register location here is relative to the start of the URB
2449 * data. It will get adjusted to be a real location before
2450 * generate_code() time.
2451 */
2452 struct brw_reg
2453 fs_visitor::interp_reg(int location, int channel)
2454 {
2455 int regnr = c->prog_data.urb_setup[location] * 2 + channel / 2;
2456 int stride = (channel & 1) * 4;
2457
2458 assert(c->prog_data.urb_setup[location] != -1);
2459
2460 return brw_vec1_grf(regnr, stride);
2461 }
2462
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   this->current_annotation = "compute pixel centers";
   /* The PIXEL_X/PIXEL_Y virtual opcodes produce unsigned-word results. */
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;

   emit(FS_OPCODE_PIXEL_X, this->pixel_x);
   emit(FS_OPCODE_PIXEL_Y, this->pixel_y);

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      /* PLN consumes delta_x/delta_y as one adjacent register pair, so
       * allocate them as the two halves of a single vec2.
       */
      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
         fs_reg(this, glsl_type::vec2_type);
      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
         this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC];
      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg_offset++;
   } else {
      /* Without PLN the two deltas can live in independent registers. */
      this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
         fs_reg(this, glsl_type::float_type);
      this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] =
         fs_reg(this, glsl_type::float_type);
   }
   /* delta = pixel center - start position.  NOTE(review): g1.0/g1.1 are
    * assumed to hold the X/Y start coordinates in the thread payload —
    * confirm against the gen4 PRM payload layout.
    */
   emit(ADD(this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0)))));
   emit(ADD(this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
            this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1)))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w. It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(FS_OPCODE_LINTERP, wpos_w,
        this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
        this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
        interp_reg(VARYING_SLOT_POS, 3));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}
2508
/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
   /* NOTE(review): g1.4/g1.5 presumably hold the subspan X/Y origins; the
    * immediate vectors add the per-pixel offsets within each 2x2 subspan —
    * confirm against the gen6 PRM payload layout.
    */
   emit(ADD(int_pixel_x,
            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
            fs_reg(brw_imm_v(0x10101010))));
   emit(ADD(int_pixel_y,
            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
            fs_reg(brw_imm_v(0x11001100))));

   /* As of gen6, we can no longer mix float and int sources. We have
    * to turn the integer pixel centers into floats for their actual
    * use.
    */
   this->pixel_x = fs_reg(this, glsl_type::float_type);
   this->pixel_y = fs_reg(this, glsl_type::float_type);
   emit(MOV(this->pixel_x, int_pixel_x));
   emit(MOV(this->pixel_y, int_pixel_y));

   this->current_annotation = "compute pos.w";
   /* W is delivered directly in the payload register recorded in
    * source_w_reg; only 1/W needs to be computed.
    */
   this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit_math(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);

   /* The barycentric coordinates already sit in payload GRF pairs; just
    * record their locations for later LINTERP/PLN use.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      uint8_t reg = c->barycentric_coord_reg[i];
      this->delta_x[i] = fs_reg(brw_vec8_grf(reg, 0));
      this->delta_y[i] = fs_reg(brw_vec8_grf(reg + 1, 0));
   }

   this->current_annotation = NULL;
}
2550
/**
 * Emits the MOV(s) that place one component of a render-target color into
 * the FB-write message payload.
 *
 * \param target          index into outputs[] for the render target
 * \param index           color component being written (0..3)
 * \param first_color_mrf first MRF of the color block in the payload
 */
void
fs_visitor::emit_color_write(int target, int index, int first_color_mrf)
{
   int reg_width = dispatch_width / 8;
   fs_inst *inst;
   fs_reg color = outputs[target];
   fs_reg mrf; /* NOTE(review): unused local — candidate for removal. */

   /* If there's no color data to be written, skip it. */
   if (color.file == BAD_FILE)
      return;

   color.reg_offset += index;

   if (dispatch_width == 8 || brw->gen >= 6) {
      /* SIMD8 write looks like:
       * m + 0: r0
       * m + 1: r1
       * m + 2: g0
       * m + 3: g1
       *
       * gen6 SIMD16 DP write looks like:
       * m + 0: r0
       * m + 1: r1
       * m + 2: g0
       * m + 3: g1
       * m + 4: b0
       * m + 5: b1
       * m + 6: a0
       * m + 7: a1
       */
      inst = emit(MOV(fs_reg(MRF, first_color_mrf + index * reg_width,
                             color.type),
                      color));
      inst->saturate = c->key.clamp_fragment_color;
   } else {
      /* pre-gen6 SIMD16 single source DP write looks like:
       * m + 0: r0
       * m + 1: g0
       * m + 2: b0
       * m + 3: a0
       * m + 4: r1
       * m + 5: g1
       * m + 6: b1
       * m + 7: a1
       */
      if (brw->has_compr4) {
         /* By setting the high bit of the MRF register number, we
          * indicate that we want COMPR4 mode - instead of doing the
          * usual destination + 1 for the second half we get
          * destination + 4.
          */
         inst = emit(MOV(fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index,
                                color.type),
                         color));
         inst->saturate = c->key.clamp_fragment_color;
      } else {
         /* No COMPR4 support: emit each SIMD8 half separately. */
         push_force_uncompressed();
         inst = emit(MOV(fs_reg(MRF, first_color_mrf + index, color.type),
                         color));
         inst->saturate = c->key.clamp_fragment_color;
         pop_force_uncompressed();

         /* Second half lands 4 MRFs later (the r1/g1/b1/a1 block). */
         color.sechalf = true;
         inst = emit(MOV(fs_reg(MRF, first_color_mrf + index + 4, color.type),
                         color));
         inst->force_sechalf = true;
         inst->saturate = c->key.clamp_fragment_color;
         color.sechalf = false;
      }
   }
}
2623
2624 static int
2625 cond_for_alpha_func(GLenum func)
2626 {
2627 switch(func) {
2628 case GL_GREATER:
2629 return BRW_CONDITIONAL_G;
2630 case GL_GEQUAL:
2631 return BRW_CONDITIONAL_GE;
2632 case GL_LESS:
2633 return BRW_CONDITIONAL_L;
2634 case GL_LEQUAL:
2635 return BRW_CONDITIONAL_LE;
2636 case GL_EQUAL:
2637 return BRW_CONDITIONAL_EQ;
2638 case GL_NOTEQUAL:
2639 return BRW_CONDITIONAL_NEQ;
2640 default:
2641 assert(!"Not reached");
2642 return 0;
2643 }
2644 }
2645
2646 /**
2647 * Alpha test support for when we compile it into the shader instead
2648 * of using the normal fixed-function alpha test.
2649 */
void
fs_visitor::emit_alpha_test()
{
   this->current_annotation = "Alpha test";

   fs_inst *cmp;
   /* GL_ALWAYS means no test at all. */
   if (c->key.alpha_test_func == GL_ALWAYS)
      return;

   if (c->key.alpha_test_func == GL_NEVER) {
      /* f0.1 = 0 — compare a register against itself with NEQ, which can
       * never pass.
       */
      fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
                                      BRW_REGISTER_TYPE_UW));
      cmp = emit(CMP(reg_null_f, some_reg, some_reg,
                     BRW_CONDITIONAL_NEQ));
   } else {
      /* RT0 alpha */
      fs_reg color = outputs[0];
      color.reg_offset += 3;

      /* f0.1 &= func(color, ref) */
      cmp = emit(CMP(reg_null_f, color, fs_reg(c->key.alpha_test_ref),
                     cond_for_alpha_func(c->key.alpha_test_func)));
   }
   /* Write into flag subregister 1 (f0.1); per the "&=" comments above the
    * predication is what makes the CMP combine with, rather than replace,
    * the existing flag contents — NOTE(review): confirm against the EU ISA.
    */
   cmp->predicate = BRW_PREDICATE_NORMAL;
   cmp->flag_subreg = 1;
}
2677
/**
 * Assembles the FB-write message payload(s) and emits one
 * FS_OPCODE_FB_WRITE per enabled render target, handling the dual-source
 * and zero-color-buffer special cases.
 */
void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
   bool header_present = true;
   /* We can potentially have a message length of up to 15, so we have to set
    * base_mrf to either 0 or 1 in order to fit in m0..m15.
    */
   int base_mrf = 1;
   int nr = base_mrf;
   int reg_width = dispatch_width / 8;
   bool do_dual_src = this->dual_src_output.file != BAD_FILE;
   bool src0_alpha_to_render_target = false;

   if (dispatch_width == 16 && do_dual_src) {
      fail("GL_ARB_blend_func_extended not yet supported in 16-wide.");
      do_dual_src = false;
   }

   /* From the Sandy Bridge PRM, volume 4, page 198:
    *
    *     "Dispatched Pixel Enables. One bit per pixel indicating
    *      which pixels were originally enabled when the thread was
    *      dispatched. This field is only required for the end-of-
    *      thread message and on all dual-source messages."
    */
   if (brw->gen >= 6 &&
       !this->fp->UsesKill &&
       !do_dual_src &&
       c->key.nr_color_regions == 1) {
      header_present = false;
   }

   if (header_present) {
      /* Replicating src0 alpha needs the header, and only applies to
       * single-source gen6+ writes.
       */
      src0_alpha_to_render_target = brw->gen >= 6 &&
                                    !do_dual_src &&
                                    c->key.replicate_alpha;
      /* m2, m3 header */
      nr += 2;
   }

   if (c->aa_dest_stencil_reg) {
      /* Antialiased-line stencil data: one SIMD8 register. */
      push_force_uncompressed();
      emit(MOV(fs_reg(MRF, nr++),
               fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0))));
      pop_force_uncompressed();
   }

   c->prog_data.uses_omask =
      fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
   if(c->prog_data.uses_omask) {
      this->current_annotation = "FB write oMask";
      assert(this->sample_mask.file != BAD_FILE);
      /* Hand over gl_SampleMask. Only lower 16 bits are relevant. */
      emit(FS_OPCODE_SET_OMASK, fs_reg(MRF, nr, BRW_REGISTER_TYPE_UW), this->sample_mask);
      nr += 1;
   }

   /* Reserve space for color. It'll be filled in per MRT below. */
   int color_mrf = nr;
   nr += 4 * reg_width;
   if (do_dual_src)
      nr += 4;
   if (src0_alpha_to_render_target)
      nr += reg_width;

   if (c->source_depth_to_render_target) {
      if (brw->gen == 6 && dispatch_width == 16) {
         /* For outputting oDepth on gen6, SIMD8 writes have to be
          * used. This would require 8-wide moves of each half to
          * message regs, kind of like pre-gen5 SIMD16 FB writes.
          * Just bail on doing so for now.
          */
         fail("Missing support for simd16 depth writes on gen6\n");
      }

      if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
         /* Hand over gl_FragDepth. */
         assert(this->frag_depth.file != BAD_FILE);
         emit(MOV(fs_reg(MRF, nr), this->frag_depth));
      } else {
         /* Pass through the payload depth. */
         emit(MOV(fs_reg(MRF, nr),
                  fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
      }
      nr += reg_width;
   }

   if (c->dest_depth_reg) {
      /* Pass the destination-depth payload back out unchanged. */
      emit(MOV(fs_reg(MRF, nr),
               fs_reg(brw_vec8_grf(c->dest_depth_reg, 0))));
      nr += reg_width;
   }

   if (do_dual_src) {
      /* Dual-source blending: pack src0 and src1 into one message for RT0
       * and terminate the thread.
       */
      fs_reg src0 = this->outputs[0];
      fs_reg src1 = this->dual_src_output;

      this->current_annotation = ralloc_asprintf(this->mem_ctx,
                                                 "FB write src0");
      for (int i = 0; i < 4; i++) {
         fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + i, src0.type), src0));
         src0.reg_offset++;
         inst->saturate = c->key.clamp_fragment_color;
      }

      this->current_annotation = ralloc_asprintf(this->mem_ctx,
                                                 "FB write src1");
      for (int i = 0; i < 4; i++) {
         fs_inst *inst = emit(MOV(fs_reg(MRF, color_mrf + 4 + i, src1.type),
                                  src1));
         src1.reg_offset++;
         inst->saturate = c->key.clamp_fragment_color;
      }

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->target = 0;
      inst->base_mrf = base_mrf;
      inst->mlen = nr - base_mrf;
      inst->eot = true;
      inst->header_present = header_present;

      c->prog_data.dual_src_blend = true;
      this->current_annotation = NULL;
      return;
   }

   for (int target = 0; target < c->key.nr_color_regions; target++) {
      this->current_annotation = ralloc_asprintf(this->mem_ctx,
                                                 "FB write target %d",
                                                 target);
      /* If src0_alpha_to_render_target is true, include source zero alpha
       * data in RenderTargetWrite message for targets > 0.
       */
      int write_color_mrf = color_mrf;
      if (src0_alpha_to_render_target && target != 0) {
         fs_inst *inst;
         fs_reg color = outputs[0];
         color.reg_offset += 3;

         inst = emit(MOV(fs_reg(MRF, write_color_mrf, color.type),
                         color));
         inst->saturate = c->key.clamp_fragment_color;
         write_color_mrf = color_mrf + reg_width;
      }

      for (unsigned i = 0; i < this->output_components[target]; i++)
         emit_color_write(target, i, write_color_mrf);

      /* Only the last render-target write ends the thread. */
      bool eot = false;
      if (target == c->key.nr_color_regions - 1) {
         eot = true;

         if (INTEL_DEBUG & DEBUG_SHADER_TIME)
            emit_shader_time_end();
      }

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->target = target;
      inst->base_mrf = base_mrf;
      if (src0_alpha_to_render_target && target == 0)
         inst->mlen = nr - base_mrf - reg_width;
      else
         inst->mlen = nr - base_mrf;
      inst->eot = eot;
      inst->header_present = header_present;
   }

   if (c->key.nr_color_regions == 0) {
      /* Even if there's no color buffers enabled, we still need to send
       * alpha out the pipeline to our null renderbuffer to support
       * alpha-testing, alpha-to-coverage, and so on.
       */
      emit_color_write(0, 3, color_mrf);

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
      inst->base_mrf = base_mrf;
      inst->mlen = nr - base_mrf;
      inst->eot = true;
      inst->header_present = header_present;
   }

   this->current_annotation = NULL;
}
2868
2869 void
2870 fs_visitor::resolve_ud_negate(fs_reg *reg)
2871 {
2872 if (reg->type != BRW_REGISTER_TYPE_UD ||
2873 !reg->negate)
2874 return;
2875
2876 fs_reg temp = fs_reg(this, glsl_type::uint_type);
2877 emit(MOV(temp, *reg));
2878 *reg = temp;
2879 }
2880
2881 void
2882 fs_visitor::resolve_bool_comparison(ir_rvalue *rvalue, fs_reg *reg)
2883 {
2884 if (rvalue->type != glsl_type::bool_type)
2885 return;
2886
2887 fs_reg temp = fs_reg(this, glsl_type::bool_type);
2888 emit(AND(temp, *reg, fs_reg(1)));
2889 *reg = temp;
2890 }
2891
2892 fs_visitor::fs_visitor(struct brw_context *brw,
2893 struct brw_wm_compile *c,
2894 struct gl_shader_program *shader_prog,
2895 struct gl_fragment_program *fp,
2896 unsigned dispatch_width)
2897 : dispatch_width(dispatch_width)
2898 {
2899 this->c = c;
2900 this->brw = brw;
2901 this->fp = fp;
2902 this->prog = &fp->Base;
2903 this->shader_prog = shader_prog;
2904 this->prog = &fp->Base;
2905 this->stage_prog_data = &c->prog_data.base;
2906 this->ctx = &brw->ctx;
2907 this->mem_ctx = ralloc_context(NULL);
2908 if (shader_prog)
2909 shader = (struct brw_shader *)
2910 shader_prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2911 else
2912 shader = NULL;
2913 this->failed = false;
2914 this->variable_ht = hash_table_ctor(0,
2915 hash_table_pointer_hash,
2916 hash_table_pointer_compare);
2917
2918 memset(this->outputs, 0, sizeof(this->outputs));
2919 memset(this->output_components, 0, sizeof(this->output_components));
2920 this->first_non_payload_grf = 0;
2921 this->max_grf = brw->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
2922
2923 this->current_annotation = NULL;
2924 this->base_ir = NULL;
2925
2926 this->virtual_grf_sizes = NULL;
2927 this->virtual_grf_count = 0;
2928 this->virtual_grf_array_size = 0;
2929 this->virtual_grf_start = NULL;
2930 this->virtual_grf_end = NULL;
2931 this->live_intervals = NULL;
2932
2933 this->params_remap = NULL;
2934 this->nr_params_remap = 0;
2935
2936 this->force_uncompressed_stack = 0;
2937
2938 this->spilled_any_registers = false;
2939
2940 memset(&this->param_size, 0, sizeof(this->param_size));
2941 }
2942
fs_visitor::~fs_visitor()
{
   /* Frees everything ralloc'd under the visitor's context (instructions,
    * annotations, virtual-GRF bookkeeping).
    */
   ralloc_free(this->mem_ctx);
   /* variable_ht was created with hash_table_ctor, outside mem_ctx, so it
    * needs its own destructor call.
    */
   hash_table_dtor(this->variable_ht);
}