i965/fs: Split the BRW native code emit to brw_fs_emit.cpp
src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 extern "C" {
29
30 #include <sys/types.h>
31
32 #include "main/macros.h"
33 #include "main/shaderobj.h"
34 #include "main/uniforms.h"
35 #include "program/prog_parameter.h"
36 #include "program/prog_print.h"
37 #include "program/register_allocate.h"
38 #include "program/sampler.h"
39 #include "program/hash_table.h"
40 #include "brw_context.h"
41 #include "brw_eu.h"
42 #include "brw_wm.h"
43 }
44 #include "brw_shader.h"
45 #include "brw_fs.h"
46 #include "../glsl/glsl_types.h"
47 #include "../glsl/ir_print_visitor.h"
48
49 #define MAX_INSTRUCTION (1 << 30)
50
51 static int
52 type_size(const struct glsl_type *type)
53 {
54 unsigned int size, i;
55
56 switch (type->base_type) {
57 case GLSL_TYPE_UINT:
58 case GLSL_TYPE_INT:
59 case GLSL_TYPE_FLOAT:
60 case GLSL_TYPE_BOOL:
61 return type->components();
62 case GLSL_TYPE_ARRAY:
63 return type_size(type->fields.array) * type->length;
64 case GLSL_TYPE_STRUCT:
65 size = 0;
66 for (i = 0; i < type->length; i++) {
67 size += type_size(type->fields.structure[i].type);
68 }
69 return size;
70 case GLSL_TYPE_SAMPLER:
71 /* Samplers take up no register space, since they're baked in at
72 * link time.
73 */
74 return 0;
75 default:
76 assert(!"not reached");
77 return 0;
78 }
79 }
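/* A few worked examples of the slot counting above (illustrative, not
 * from the original source): a float or bool costs 1 slot, a vec4
 * costs 4, a mat3 costs components() = 3 * 3 = 9, a float[4] costs
 * 4 * 1 = 4, and struct { vec2 a; float b; } costs 2 + 1 = 3.
 */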
80
81 void
82 fs_visitor::fail(const char *format, ...)
83 {
84 if (!failed) {
85 failed = true;
86
87 if (INTEL_DEBUG & DEBUG_WM) {
88 fprintf(stderr, "FS compile failed: ");
89
90 va_list va;
91 va_start(va, format);
92 vfprintf(stderr, format, va);
93 va_end(va);
94 }
95 }
96 }
97
98 void
99 fs_visitor::push_force_uncompressed()
100 {
101 force_uncompressed_stack++;
102 }
103
104 void
105 fs_visitor::pop_force_uncompressed()
106 {
107 force_uncompressed_stack--;
108 assert(force_uncompressed_stack >= 0);
109 }
110
111 void
112 fs_visitor::push_force_sechalf()
113 {
114 force_sechalf_stack++;
115 }
116
117 void
118 fs_visitor::pop_force_sechalf()
119 {
120 force_sechalf_stack--;
121 assert(force_sechalf_stack >= 0);
122 }
123
124 /**
125 * Returns how many MRFs an FS opcode will write over.
126 *
127 * Note that this is not the 0 or 1 implied writes in an actual gen
128 * instruction -- the FS opcodes often generate MOVs in addition.
129 */
130 int
131 fs_visitor::implied_mrf_writes(fs_inst *inst)
132 {
133 if (inst->mlen == 0)
134 return 0;
135
136 switch (inst->opcode) {
137 case FS_OPCODE_RCP:
138 case FS_OPCODE_RSQ:
139 case FS_OPCODE_SQRT:
140 case FS_OPCODE_EXP2:
141 case FS_OPCODE_LOG2:
142 case FS_OPCODE_SIN:
143 case FS_OPCODE_COS:
144 return 1 * c->dispatch_width / 8;
145 case FS_OPCODE_POW:
146 return 2 * c->dispatch_width / 8;
147 case FS_OPCODE_TEX:
148 case FS_OPCODE_TXB:
149 case FS_OPCODE_TXD:
150 case FS_OPCODE_TXL:
151 return 1;
152 case FS_OPCODE_FB_WRITE:
153 return 2;
154 case FS_OPCODE_PULL_CONSTANT_LOAD:
155 case FS_OPCODE_UNSPILL:
156 return 1;
157 case FS_OPCODE_SPILL:
158 return 2;
159 default:
160 assert(!"not reached");
161 return inst->mlen;
162 }
163 }
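/* Illustrative examples of the accounting above: a SIN implies 1 MRF
 * write at 8-wide dispatch and 2 at 16-wide (1 * 16 / 8); a POW
 * implies 2 and 4 respectively; TEX and FB_WRITE report a fixed 1 and
 * 2 regardless of dispatch width.
 */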
164
165 int
166 fs_visitor::virtual_grf_alloc(int size)
167 {
168 if (virtual_grf_array_size <= virtual_grf_next) {
169 if (virtual_grf_array_size == 0)
170 virtual_grf_array_size = 16;
171 else
172 virtual_grf_array_size *= 2;
173 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
174 virtual_grf_array_size);
175
176 /* This slot is always unused. */
177 virtual_grf_sizes[0] = 0;
178 }
179 virtual_grf_sizes[virtual_grf_next] = size;
180 return virtual_grf_next++;
181 }
182
183 /** Fixed HW reg constructor. */
184 fs_reg::fs_reg(enum register_file file, int hw_reg)
185 {
186 init();
187 this->file = file;
188 this->hw_reg = hw_reg;
189 this->type = BRW_REGISTER_TYPE_F;
190 }
191
192 /** Fixed HW reg constructor. */
193 fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
194 {
195 init();
196 this->file = file;
197 this->hw_reg = hw_reg;
198 this->type = type;
199 }
200
201 /** Automatic reg constructor. */
202 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
203 {
204 init();
205
206 this->file = GRF;
207 this->reg = v->virtual_grf_alloc(type_size(type));
208 this->reg_offset = 0;
209 this->type = brw_type_for_base_type(type);
210 }
211
212 fs_reg *
213 fs_visitor::variable_storage(ir_variable *var)
214 {
215 return (fs_reg *)hash_table_find(this->variable_ht, var);
216 }
217
218 void
219 import_uniforms_callback(const void *key,
220 void *data,
221 void *closure)
222 {
223 struct hash_table *dst_ht = (struct hash_table *)closure;
224 const fs_reg *reg = (const fs_reg *)data;
225
226 if (reg->file != UNIFORM)
227 return;
228
229 hash_table_insert(dst_ht, data, key);
230 }
231
232 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
233 * This brings in those uniform definitions.
234 */
235 void
236 fs_visitor::import_uniforms(struct hash_table *src_variable_ht)
237 {
238 hash_table_call_foreach(src_variable_ht,
239 import_uniforms_callback,
240 variable_ht);
241 }
242
243 /* Our support for uniforms is piggy-backed on the struct
244 * gl_fragment_program, because that's where the values actually
245 * get stored, rather than in some global gl_shader_program uniform
246 * store.
247 */
248 int
249 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
250 {
251 unsigned int offset = 0;
252
253 if (type->is_matrix()) {
254 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
255 type->vector_elements,
256 1);
257
258 for (unsigned int i = 0; i < type->matrix_columns; i++) {
259 offset += setup_uniform_values(loc + offset, column);
260 }
261
262 return offset;
263 }
264
265 switch (type->base_type) {
266 case GLSL_TYPE_FLOAT:
267 case GLSL_TYPE_UINT:
268 case GLSL_TYPE_INT:
269 case GLSL_TYPE_BOOL:
270 for (unsigned int i = 0; i < type->vector_elements; i++) {
271 unsigned int param = c->prog_data.nr_params++;
272
273 assert(param < ARRAY_SIZE(c->prog_data.param));
274
275 switch (type->base_type) {
276 case GLSL_TYPE_FLOAT:
277 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
278 break;
279 case GLSL_TYPE_UINT:
280 c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
281 break;
282 case GLSL_TYPE_INT:
283 c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
284 break;
285 case GLSL_TYPE_BOOL:
286 c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
287 break;
288 default:
289 assert(!"not reached");
290 c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
291 break;
292 }
293 this->param_index[param] = loc;
294 this->param_offset[param] = i;
295 }
296 return 1;
297
298 case GLSL_TYPE_STRUCT:
299 for (unsigned int i = 0; i < type->length; i++) {
300 offset += setup_uniform_values(loc + offset,
301 type->fields.structure[i].type);
302 }
303 return offset;
304
305 case GLSL_TYPE_ARRAY:
306 for (unsigned int i = 0; i < type->length; i++) {
307 offset += setup_uniform_values(loc + offset, type->fields.array);
308 }
309 return offset;
310
311 case GLSL_TYPE_SAMPLER:
312 /* The sampler takes up a slot, but we don't use any values from it. */
313 return 1;
314
315 default:
316 assert(!"not reached");
317 return 0;
318 }
319 }
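/* Example of the recursion above (hypothetical uniform): a mat2 is
 * split into two vec2 columns; each column fills 2 entries of
 * c->prog_data.param_convert / param_index / param_offset and returns
 * 1, so the mat2 consumes 4 param slots and returns an offset of 2.
 */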
320
321
322 /* Our support for builtin uniforms is even scarier than non-builtin.
323 * It sits on top of the PROG_STATE_VAR parameters that are
324 * automatically updated from GL context state.
325 */
326 void
327 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
328 {
329 const ir_state_slot *const slots = ir->state_slots;
330 assert(ir->state_slots != NULL);
331
332 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
333 /* This state reference has already been setup by ir_to_mesa, but we'll
334 * get the same index back here.
335 */
336 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
337 (gl_state_index *)slots[i].tokens);
338
339 /* Add each of the unique swizzles of the element as a parameter.
340 * This'll end up matching the expected layout of the
341 * array/matrix/structure we're trying to fill in.
342 */
343 int last_swiz = -1;
344 for (unsigned int j = 0; j < 4; j++) {
345 int swiz = GET_SWZ(slots[i].swizzle, j);
346 if (swiz == last_swiz)
347 break;
348 last_swiz = swiz;
349
350 c->prog_data.param_convert[c->prog_data.nr_params] =
351 PARAM_NO_CONVERT;
352 this->param_index[c->prog_data.nr_params] = index;
353 this->param_offset[c->prog_data.nr_params] = swiz;
354 c->prog_data.nr_params++;
355 }
356 }
357 }
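/* Swizzle-dedup example (illustrative): a state slot swizzled .xxxx
 * stops after j == 0, since the second GET_SWZ repeats the last
 * swizzle, adding one param; a .xyzw slot adds four, one per unique
 * component.
 */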
358
359 fs_reg *
360 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
361 {
362 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
363 fs_reg wpos = *reg;
364 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
365
366 /* gl_FragCoord.x */
367 if (ir->pixel_center_integer) {
368 emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
369 } else {
370 emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
371 }
372 wpos.reg_offset++;
373
374 /* gl_FragCoord.y */
375 if (!flip && ir->pixel_center_integer) {
376 emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
377 } else {
378 fs_reg pixel_y = this->pixel_y;
379 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
380
381 if (flip) {
382 pixel_y.negate = true;
383 offset += c->key.drawable_height - 1.0;
384 }
385
386 emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
387 }
388 wpos.reg_offset++;
389
390 /* gl_FragCoord.z */
391 if (intel->gen >= 6) {
392 emit(BRW_OPCODE_MOV, wpos,
393 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
394 } else {
395 emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
396 interp_reg(FRAG_ATTRIB_WPOS, 2));
397 }
398 wpos.reg_offset++;
399
400 /* gl_FragCoord.w: Already set up in emit_interpolation_setup_*() */
401 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
402
403 return reg;
404 }
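/* Y-flip arithmetic, sketched under assumed state: rendering to the
 * window system (render_to_fbo == false) with origin_upper_left unset
 * gives flip == true, so for a drawable of height H with half-integer
 * pixel centers the code above emits
 * wpos.y = -pixel_y + (0.5 + H - 1), i.e. (H - 1) - pixel_y + 0.5.
 */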
405
406 fs_reg *
407 fs_visitor::emit_general_interpolation(ir_variable *ir)
408 {
409 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
410 /* Interpolation is always in floating point regs. */
411 reg->type = BRW_REGISTER_TYPE_F;
412 fs_reg attr = *reg;
413
414 unsigned int array_elements;
415 const glsl_type *type;
416
417 if (ir->type->is_array()) {
418 array_elements = ir->type->length;
419 if (array_elements == 0) {
420 fail("dereferenced array '%s' has length 0\n", ir->name);
421 }
422 type = ir->type->fields.array;
423 } else {
424 array_elements = 1;
425 type = ir->type;
426 }
427
428 int location = ir->location;
429 for (unsigned int i = 0; i < array_elements; i++) {
430 for (unsigned int j = 0; j < type->matrix_columns; j++) {
431 if (urb_setup[location] == -1) {
432 /* If there's no incoming setup data for this slot, don't
433 * emit interpolation for it.
434 */
435 attr.reg_offset += type->vector_elements;
436 location++;
437 continue;
438 }
439
440 bool is_gl_Color =
441 location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;
442
443 if (c->key.flat_shade && is_gl_Color) {
444 /* Constant interpolation (flat shading) case. The SF has
445 * handed us defined values in only the constant offset
446 * field of the setup reg.
447 */
448 for (unsigned int k = 0; k < type->vector_elements; k++) {
449 struct brw_reg interp = interp_reg(location, k);
450 interp = suboffset(interp, 3);
451 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
452 attr.reg_offset++;
453 }
454 } else {
455 /* Perspective interpolation case. */
456 for (unsigned int k = 0; k < type->vector_elements; k++) {
457 struct brw_reg interp = interp_reg(location, k);
458 emit(FS_OPCODE_LINTERP, attr,
459 this->delta_x, this->delta_y, fs_reg(interp));
460 attr.reg_offset++;
461 }
462
463 if (intel->gen < 6) {
464 attr.reg_offset -= type->vector_elements;
465 for (unsigned int k = 0; k < type->vector_elements; k++) {
466 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
467 attr.reg_offset++;
468 }
469 }
470 }
471 location++;
472 }
473 }
474
475 return reg;
476 }
477
478 fs_reg *
479 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
480 {
481 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
482
483 /* The frontfacing comes in as a bit in the thread payload. */
484 if (intel->gen >= 6) {
485 emit(BRW_OPCODE_ASR, *reg,
486 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
487 fs_reg(15));
488 emit(BRW_OPCODE_NOT, *reg, *reg);
489 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
490 } else {
491 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
492 /* Bit 31 is "primitive is back-facing", so checking < (1 << 31)
493 * gives us front-facing.
494 */
495 fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
496 fs_reg(r1_6ud),
497 fs_reg(1u << 31));
498 inst->conditional_mod = BRW_CONDITIONAL_L;
499 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
500 }
501
502 return reg;
503 }
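/* What the gen6 path above computes, spelled out (assuming the
 * front/back-facing bit sits at bit 15 of g0.0, as the ASR shift
 * implies): reg = (~(g0.0:D >> 15)) & 1, so a set bit (back-facing)
 * yields 0 and a clear bit yields 1. The pre-gen6 CMP instead writes
 * all-ones per channel when r1.6:UD < 0x80000000, which the AND masks
 * down to 1.
 */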
504
505 fs_inst *
506 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
507 {
508 switch (opcode) {
509 case FS_OPCODE_RCP:
510 case FS_OPCODE_RSQ:
511 case FS_OPCODE_SQRT:
512 case FS_OPCODE_EXP2:
513 case FS_OPCODE_LOG2:
514 case FS_OPCODE_SIN:
515 case FS_OPCODE_COS:
516 break;
517 default:
518 assert(!"not reached: bad math opcode");
519 return NULL;
520 }
521
522 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
523 * might be able to do better by doing execsize = 1 math and then
524 * expanding that result out, but we would need to be careful with
525 * masking.
526 *
527 * The hardware ignores source modifiers (negate and abs) on math
528 * instructions, so we also move to a temp to set those up.
529 */
530 if (intel->gen >= 6 && (src.file == UNIFORM ||
531 src.abs ||
532 src.negate)) {
533 fs_reg expanded = fs_reg(this, glsl_type::float_type);
534 emit(BRW_OPCODE_MOV, expanded, src);
535 src = expanded;
536 }
537
538 fs_inst *inst = emit(opcode, dst, src);
539
540 if (intel->gen < 6) {
541 inst->base_mrf = 2;
542 inst->mlen = c->dispatch_width / 8;
543 }
544
545 return inst;
546 }
547
548 fs_inst *
549 fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
550 {
551 int base_mrf = 2;
552 fs_inst *inst;
553
554 assert(opcode == FS_OPCODE_POW);
555
556 if (intel->gen >= 6) {
557 /* Can't do hstride == 0 args to gen6 math, so expand it out.
558 *
559 * The hardware ignores source modifiers (negate and abs) on math
560 * instructions, so we also move to a temp to set those up.
561 */
562 if (src0.file == UNIFORM || src0.abs || src0.negate) {
563 fs_reg expanded = fs_reg(this, glsl_type::float_type);
564 emit(BRW_OPCODE_MOV, expanded, src0);
565 src0 = expanded;
566 }
567
568 if (src1.file == UNIFORM || src1.abs || src1.negate) {
569 fs_reg expanded = fs_reg(this, glsl_type::float_type);
570 emit(BRW_OPCODE_MOV, expanded, src1);
571 src1 = expanded;
572 }
573
574 inst = emit(opcode, dst, src0, src1);
575 } else {
576 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
577 inst = emit(opcode, dst, src0, reg_null_f);
578
579 inst->base_mrf = base_mrf;
580 inst->mlen = 2 * c->dispatch_width / 8;
581 }
582 return inst;
583 }
584
585 void
586 fs_visitor::visit(ir_variable *ir)
587 {
588 fs_reg *reg = NULL;
589
590 if (variable_storage(ir))
591 return;
592
593 if (strcmp(ir->name, "gl_FragColor") == 0) {
594 this->frag_color = ir;
595 } else if (strcmp(ir->name, "gl_FragData") == 0) {
596 this->frag_data = ir;
597 } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
598 this->frag_depth = ir;
599 }
600
601 if (ir->mode == ir_var_in) {
602 if (!strcmp(ir->name, "gl_FragCoord")) {
603 reg = emit_fragcoord_interpolation(ir);
604 } else if (!strcmp(ir->name, "gl_FrontFacing")) {
605 reg = emit_frontfacing_interpolation(ir);
606 } else {
607 reg = emit_general_interpolation(ir);
608 }
609 assert(reg);
610 hash_table_insert(this->variable_ht, reg, ir);
611 return;
612 }
613
614 if (ir->mode == ir_var_uniform) {
615 int param_index = c->prog_data.nr_params;
616
617 if (c->dispatch_width == 16) {
618 if (!variable_storage(ir)) {
619 fail("Failed to find uniform '%s' in 16-wide\n", ir->name);
620 }
621 return;
622 }
623
624 if (!strncmp(ir->name, "gl_", 3)) {
625 setup_builtin_uniform_values(ir);
626 } else {
627 setup_uniform_values(ir->location, ir->type);
628 }
629
630 reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
631 reg->type = brw_type_for_base_type(ir->type);
632 }
633
634 if (!reg)
635 reg = new(this->mem_ctx) fs_reg(this, ir->type);
636
637 hash_table_insert(this->variable_ht, reg, ir);
638 }
639
640 void
641 fs_visitor::visit(ir_dereference_variable *ir)
642 {
643 fs_reg *reg = variable_storage(ir->var);
644 this->result = *reg;
645 }
646
647 void
648 fs_visitor::visit(ir_dereference_record *ir)
649 {
650 const glsl_type *struct_type = ir->record->type;
651
652 ir->record->accept(this);
653
654 unsigned int offset = 0;
655 for (unsigned int i = 0; i < struct_type->length; i++) {
656 if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
657 break;
658 offset += type_size(struct_type->fields.structure[i].type);
659 }
660 this->result.reg_offset += offset;
661 this->result.type = brw_type_for_base_type(ir->type);
662 }
663
664 void
665 fs_visitor::visit(ir_dereference_array *ir)
666 {
667 ir_constant *index;
668 int element_size;
669
670 ir->array->accept(this);
671 index = ir->array_index->as_constant();
672
673 element_size = type_size(ir->type);
674 this->result.type = brw_type_for_base_type(ir->type);
675
676 if (index) {
677 assert(this->result.file == UNIFORM ||
678 (this->result.file == GRF &&
679 this->result.reg != 0));
680 this->result.reg_offset += index->value.i[0] * element_size;
681 } else {
682 assert(!"FINISHME: non-constant array element");
683 }
684 }
685
686 /* Instruction selection: Produce a MOV.sat instead of
687 * MIN(MAX(val, 0), 1) when possible.
688 */
689 bool
690 fs_visitor::try_emit_saturate(ir_expression *ir)
691 {
692 ir_rvalue *sat_val = ir->as_rvalue_to_saturate();
693
694 if (!sat_val)
695 return false;
696
697 this->result = reg_undef;
698 sat_val->accept(this);
699 fs_reg src = this->result;
700
701 this->result = fs_reg(this, ir->type);
702 fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
703 inst->saturate = true;
704
705 return true;
706 }
707
708 void
709 fs_visitor::visit(ir_expression *ir)
710 {
711 unsigned int operand;
712 fs_reg op[2], temp;
713 fs_inst *inst;
714
715 assert(ir->get_num_operands() <= 2);
716
717 if (try_emit_saturate(ir))
718 return;
719
720 /* This is where our caller would like us to put the result, if possible. */
721 fs_reg saved_result_storage = this->result;
722
723 for (operand = 0; operand < ir->get_num_operands(); operand++) {
724 this->result = reg_undef;
725 ir->operands[operand]->accept(this);
726 if (this->result.file == BAD_FILE) {
727 ir_print_visitor v;
728 fail("Failed to get tree for expression operand:\n");
729 ir->operands[operand]->accept(&v);
730 }
731 op[operand] = this->result;
732
733 /* Matrix expression operands should have been broken down to vector
734 * operations already.
735 */
736 assert(!ir->operands[operand]->type->is_matrix());
737 /* And then those vector operands should have been broken down to scalar.
738 */
739 assert(!ir->operands[operand]->type->is_vector());
740 }
741
742 /* Inherit storage from our parent if possible, and otherwise we
743 * alloc a temporary.
744 */
745 if (saved_result_storage.file == BAD_FILE) {
746 this->result = fs_reg(this, ir->type);
747 } else {
748 this->result = saved_result_storage;
749 }
750
751 switch (ir->operation) {
752 case ir_unop_logic_not:
753 /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
754 * one's complement of the whole register, not just bit 0.
755 */
756 emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
757 break;
758 case ir_unop_neg:
759 op[0].negate = !op[0].negate;
760 this->result = op[0];
761 break;
762 case ir_unop_abs:
763 op[0].abs = true;
764 op[0].negate = false;
765 this->result = op[0];
766 break;
767 case ir_unop_sign:
768 temp = fs_reg(this, ir->type);
769
770 /* Unalias the destination. (imagine a = sign(a)) */
771 this->result = fs_reg(this, ir->type);
772
773 emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));
774
775 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
776 inst->conditional_mod = BRW_CONDITIONAL_G;
777 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
778 inst->predicated = true;
779
780 inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
781 inst->conditional_mod = BRW_CONDITIONAL_L;
782 inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
783 inst->predicated = true;
784
785 break;
786 case ir_unop_rcp:
787 emit_math(FS_OPCODE_RCP, this->result, op[0]);
788 break;
789
790 case ir_unop_exp2:
791 emit_math(FS_OPCODE_EXP2, this->result, op[0]);
792 break;
793 case ir_unop_log2:
794 emit_math(FS_OPCODE_LOG2, this->result, op[0]);
795 break;
796 case ir_unop_exp:
797 case ir_unop_log:
798 assert(!"not reached: should be handled by ir_explog_to_explog2");
799 break;
800 case ir_unop_sin:
801 case ir_unop_sin_reduced:
802 emit_math(FS_OPCODE_SIN, this->result, op[0]);
803 break;
804 case ir_unop_cos:
805 case ir_unop_cos_reduced:
806 emit_math(FS_OPCODE_COS, this->result, op[0]);
807 break;
808
809 case ir_unop_dFdx:
810 emit(FS_OPCODE_DDX, this->result, op[0]);
811 break;
812 case ir_unop_dFdy:
813 emit(FS_OPCODE_DDY, this->result, op[0]);
814 break;
815
816 case ir_binop_add:
817 emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
818 break;
819 case ir_binop_sub:
820 assert(!"not reached: should be handled by ir_sub_to_add_neg");
821 break;
822
823 case ir_binop_mul:
824 emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
825 break;
826 case ir_binop_div:
827 assert(!"not reached: should be handled by ir_div_to_mul_rcp");
828 break;
829 case ir_binop_mod:
830 assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
831 break;
832
833 case ir_binop_less:
834 case ir_binop_greater:
835 case ir_binop_lequal:
836 case ir_binop_gequal:
837 case ir_binop_equal:
838 case ir_binop_all_equal:
839 case ir_binop_nequal:
840 case ir_binop_any_nequal:
841 temp = this->result;
842 /* original gen4 does implicit conversion before comparison. */
843 if (intel->gen < 5)
844 temp.type = op[0].type;
845
846 inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
847 inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
848 emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
849 break;
850
851 case ir_binop_logic_xor:
852 emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
853 break;
854
855 case ir_binop_logic_or:
856 emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
857 break;
858
859 case ir_binop_logic_and:
860 emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
861 break;
862
863 case ir_binop_dot:
864 case ir_unop_any:
865 assert(!"not reached: should be handled by brw_fs_channel_expressions");
866 break;
867
868 case ir_unop_noise:
869 assert(!"not reached: should be handled by lower_noise");
870 break;
871
872 case ir_quadop_vector:
873 assert(!"not reached: should be handled by lower_quadop_vector");
874 break;
875
876 case ir_unop_sqrt:
877 emit_math(FS_OPCODE_SQRT, this->result, op[0]);
878 break;
879
880 case ir_unop_rsq:
881 emit_math(FS_OPCODE_RSQ, this->result, op[0]);
882 break;
883
884 case ir_unop_i2f:
885 case ir_unop_b2f:
886 case ir_unop_b2i:
887 case ir_unop_f2i:
888 emit(BRW_OPCODE_MOV, this->result, op[0]);
889 break;
890 case ir_unop_f2b:
891 case ir_unop_i2b:
892 temp = this->result;
893 /* original gen4 does implicit conversion before comparison. */
894 if (intel->gen < 5)
895 temp.type = op[0].type;
896
897 inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
898 inst->conditional_mod = BRW_CONDITIONAL_NZ;
899 inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
900 break;
901
902 case ir_unop_trunc:
903 emit(BRW_OPCODE_RNDZ, this->result, op[0]);
904 break;
905 case ir_unop_ceil:
906 op[0].negate = !op[0].negate;
907 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
908 this->result.negate = true;
909 break;
910 case ir_unop_floor:
911 inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
912 break;
913 case ir_unop_fract:
914 inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
915 break;
916 case ir_unop_round_even:
917 emit(BRW_OPCODE_RNDE, this->result, op[0]);
918 break;
919
920 case ir_binop_min:
921 /* Unalias the destination */
922 this->result = fs_reg(this, ir->type);
923
924 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
925 inst->conditional_mod = BRW_CONDITIONAL_L;
926
927 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
928 inst->predicated = true;
929 break;
930 case ir_binop_max:
931 /* Unalias the destination */
932 this->result = fs_reg(this, ir->type);
933
934 inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
935 inst->conditional_mod = BRW_CONDITIONAL_G;
936
937 inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
938 inst->predicated = true;
939 break;
940
941 case ir_binop_pow:
942 emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
943 break;
944
945 case ir_unop_bit_not:
946 inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
947 break;
948 case ir_binop_bit_and:
949 inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
950 break;
951 case ir_binop_bit_xor:
952 inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
953 break;
954 case ir_binop_bit_or:
955 inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
956 break;
957
958 case ir_unop_u2f:
959 case ir_binop_lshift:
960 case ir_binop_rshift:
961 assert(!"GLSL 1.30 features unsupported");
962 break;
963 }
964 }
965
966 void
967 fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
968 const glsl_type *type, bool predicated)
969 {
970 switch (type->base_type) {
971 case GLSL_TYPE_FLOAT:
972 case GLSL_TYPE_UINT:
973 case GLSL_TYPE_INT:
974 case GLSL_TYPE_BOOL:
975 for (unsigned int i = 0; i < type->components(); i++) {
976 l.type = brw_type_for_base_type(type);
977 r.type = brw_type_for_base_type(type);
978
979 if (predicated || !l.equals(&r)) {
980 fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
981 inst->predicated = predicated;
982 }
983
984 l.reg_offset++;
985 r.reg_offset++;
986 }
987 break;
988 case GLSL_TYPE_ARRAY:
989 for (unsigned int i = 0; i < type->length; i++) {
990 emit_assignment_writes(l, r, type->fields.array, predicated);
991 }
992 break;
993
994 case GLSL_TYPE_STRUCT:
995 for (unsigned int i = 0; i < type->length; i++) {
996 emit_assignment_writes(l, r, type->fields.structure[i].type,
997 predicated);
998 }
999 break;
1000
1001 case GLSL_TYPE_SAMPLER:
1002 break;
1003
1004 default:
1005 assert(!"not reached");
1006 break;
1007 }
1008 }
1009
1010 void
1011 fs_visitor::visit(ir_assignment *ir)
1012 {
1013 struct fs_reg l, r;
1014 fs_inst *inst;
1015
1016 /* FINISHME: arrays on the lhs */
1017 this->result = reg_undef;
1018 ir->lhs->accept(this);
1019 l = this->result;
1020
1021 /* If we're doing a direct assignment, an RHS expression could
1022 * drop its result right into our destination. Otherwise, tell it
1023 * not to.
1024 */
1025 if (ir->condition ||
1026 !(ir->lhs->type->is_scalar() ||
1027 (ir->lhs->type->is_vector() &&
1028 ir->write_mask == (1 << ir->lhs->type->vector_elements) - 1))) {
1029 this->result = reg_undef;
1030 }
1031
1032 ir->rhs->accept(this);
1033 r = this->result;
1034
1035 assert(l.file != BAD_FILE);
1036 assert(r.file != BAD_FILE);
1037
1038 if (ir->condition) {
1039 emit_bool_to_cond_code(ir->condition);
1040 }
1041
1042 if (ir->lhs->type->is_scalar() ||
1043 ir->lhs->type->is_vector()) {
1044 for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
1045 if (ir->write_mask & (1 << i)) {
1046 if (ir->condition) {
1047 inst = emit(BRW_OPCODE_MOV, l, r);
1048 inst->predicated = true;
1049 } else if (!l.equals(&r)) {
1050 inst = emit(BRW_OPCODE_MOV, l, r);
1051 }
1052
1053 r.reg_offset++;
1054 }
1055 l.reg_offset++;
1056 }
1057 } else {
1058 emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
1059 }
1060 }
1061
1062 fs_inst *
1063 fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1064 int sampler)
1065 {
1066 int mlen;
1067 int base_mrf = 1;
1068 bool simd16 = false;
1069 fs_reg orig_dst;
1070
1071 /* g0 header. */
1072 mlen = 1;
1073
1074 if (ir->shadow_comparitor) {
1075 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1076 fs_inst *inst = emit(BRW_OPCODE_MOV,
1077 fs_reg(MRF, base_mrf + mlen + i), coordinate);
1078 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1079 inst->saturate = true;
1080
1081 coordinate.reg_offset++;
1082 }
1083 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1084 mlen += 3;
1085
1086 if (ir->op == ir_tex) {
1087 /* There's no plain shadow compare message, so we use shadow
1088 * compare with a bias of 0.0.
1089 */
1090 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
1091 mlen++;
1092 } else if (ir->op == ir_txb) {
1093 this->result = reg_undef;
1094 ir->lod_info.bias->accept(this);
1095 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1096 mlen++;
1097 } else {
1098 assert(ir->op == ir_txl);
1099 this->result = reg_undef;
1100 ir->lod_info.lod->accept(this);
1101 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1102 mlen++;
1103 }
1104
1105 this->result = reg_undef;
1106 ir->shadow_comparitor->accept(this);
1107 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1108 mlen++;
1109 } else if (ir->op == ir_tex) {
1110 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1111 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i),
1112 coordinate);
1113 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1114 inst->saturate = true;
1115 coordinate.reg_offset++;
1116 }
1117 /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
1118 mlen += 3;
1119 } else if (ir->op == ir_txd) {
1120 assert(!"TXD isn't supported on gen4 yet.");
1121 } else {
1122 /* Oh joy. gen4 doesn't have SIMD8 non-shadow-compare bias/lod
1123 * instructions. We'll need to do SIMD16 here.
1124 */
1125 assert(ir->op == ir_txb || ir->op == ir_txl);
1126
1127 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1128 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF,
1129 base_mrf + mlen + i * 2),
1130 coordinate);
1131 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1132 inst->saturate = true;
1133 coordinate.reg_offset++;
1134 }
1135
1136 /* lod/bias appears after u/v/r. */
1137 mlen += 6;
1138
1139 if (ir->op == ir_txb) {
1140 this->result = reg_undef;
1141 ir->lod_info.bias->accept(this);
1142 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1143 mlen++;
1144 } else {
1145 this->result = reg_undef;
1146 ir->lod_info.lod->accept(this);
1147 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1148 mlen++;
1149 }
1150
1151 /* The unused upper half. */
1152 mlen++;
1153
1154 /* Now, since we're doing simd16, the return is 2 interleaved
1155 * vec4s where the odd-indexed ones are junk. We'll need to move
1156 * this weirdness around to the expected layout.
1157 */
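/* Assumed writeback layout, for illustration: the eight result
 * registers arrive as x0 x1 y0 y1 z0 z1 w0 w1, where the second
 * register of each pair is the junk half; the fixup loop after the
 * send copies orig_dst[i] = dst[2 * i] for i = 0..3.
 */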
1158 simd16 = true;
1159 orig_dst = dst;
1160 dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
1161 2));
1162 dst.type = BRW_REGISTER_TYPE_F;
1163 }
1164
1165 fs_inst *inst = NULL;
1166 switch (ir->op) {
1167 case ir_tex:
1168 inst = emit(FS_OPCODE_TEX, dst);
1169 break;
1170 case ir_txb:
1171 inst = emit(FS_OPCODE_TXB, dst);
1172 break;
1173 case ir_txl:
1174 inst = emit(FS_OPCODE_TXL, dst);
1175 break;
1176 case ir_txd:
1177 inst = emit(FS_OPCODE_TXD, dst);
1178 break;
1179 case ir_txf:
1180 assert(!"GLSL 1.30 features unsupported");
1181 break;
1182 }
1183 inst->base_mrf = base_mrf;
1184 inst->mlen = mlen;
1185 inst->header_present = true;
1186
1187 if (simd16) {
1188 for (int i = 0; i < 4; i++) {
1189 emit(BRW_OPCODE_MOV, orig_dst, dst);
1190 orig_dst.reg_offset++;
1191 dst.reg_offset += 2;
1192 }
1193 }
1194
1195 return inst;
1196 }
1197
1198 /* gen5's sampler has slots for u, v, r, array index, then optional
1199 * parameters like shadow comparitor or LOD bias. If optional
1200 * parameters aren't present, those base slots are optional and don't
1201 * need to be included in the message.
1202 *
1203 * Regardless, we don't fill in the unused base slots, which may
1204 * look surprising in the disassembly.
1205 */
1206 fs_inst *
1207 fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1208 int sampler)
1209 {
1210 int mlen = 0;
1211 int base_mrf = 2;
1212 int reg_width = c->dispatch_width / 8;
1213 bool header_present = false;
1214
1215 if (ir->offset) {
1216 /* The offsets set up by the ir_texture visitor are in the
1217 * m1 header, so we can't go headerless.
1218 */
1219 header_present = true;
1220 mlen++;
1221 base_mrf--;
1222 }
1223
1224 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1225 fs_inst *inst = emit(BRW_OPCODE_MOV,
1226 fs_reg(MRF, base_mrf + mlen + i * reg_width),
1227 coordinate);
1228 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1229 inst->saturate = true;
1230 coordinate.reg_offset++;
1231 }
1232 mlen += ir->coordinate->type->vector_elements * reg_width;
1233
1234 if (ir->shadow_comparitor) {
1235 mlen = MAX2(mlen, header_present + 4 * reg_width);
1236
1237 this->result = reg_undef;
1238 ir->shadow_comparitor->accept(this);
1239 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1240 mlen += reg_width;
1241 }
1242
1243 fs_inst *inst = NULL;
1244 switch (ir->op) {
1245 case ir_tex:
1246 inst = emit(FS_OPCODE_TEX, dst);
1247 break;
1248 case ir_txb:
1249 this->result = reg_undef;
1250 ir->lod_info.bias->accept(this);
1251 mlen = MAX2(mlen, header_present + 4 * reg_width);
1252 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1253 mlen += reg_width;
1254
1255 inst = emit(FS_OPCODE_TXB, dst);
1256
1257 break;
1258 case ir_txl:
1259 this->result = reg_undef;
1260 ir->lod_info.lod->accept(this);
1261 mlen = MAX2(mlen, header_present + 4 * reg_width);
1262 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1263 mlen += reg_width;
1264
1265 inst = emit(FS_OPCODE_TXL, dst);
1266 break;
1267 case ir_txd:
1268 case ir_txf:
1269 assert(!"GLSL 1.30 features unsupported");
1270 break;
1271 }
1272 inst->base_mrf = base_mrf;
1273 inst->mlen = mlen;
1274 inst->header_present = header_present;
1275
1276 if (mlen > 11) {
1277 fail("Message length >11 disallowed by hardware\n");
1278 }
1279
1280 return inst;
1281 }
1282
1283 fs_inst *
1284 fs_visitor::emit_texture_gen7(ir_texture *ir, fs_reg dst, fs_reg coordinate,
1285 int sampler)
1286 {
1287 int mlen = 0;
1288 int base_mrf = 2;
1289 int reg_width = c->dispatch_width / 8;
1290 bool header_present = false;
1291
1292 if (ir->offset) {
1293 /* The offsets set up by the ir_texture visitor are in the
1294 * m1 header, so we can't go headerless.
1295 */
1296 header_present = true;
1297 mlen++;
1298 base_mrf--;
1299 }
1300
1301 if (ir->shadow_comparitor) {
1302 ir->shadow_comparitor->accept(this);
1303 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1304 mlen += reg_width;
1305 }
1306
1307 /* Set up the LOD info */
1308 switch (ir->op) {
1309 case ir_tex:
1310 break;
1311 case ir_txb:
1312 ir->lod_info.bias->accept(this);
1313 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1314 mlen += reg_width;
1315 break;
1316 case ir_txl:
1317 ir->lod_info.lod->accept(this);
1318 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
1319 mlen += reg_width;
1320 break;
1321 case ir_txd:
1322 case ir_txf:
1323 assert(!"GLSL 1.30 features unsupported");
1324 break;
1325 }
1326
1327 /* Set up the coordinate */
1328 for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
1329 fs_inst *inst = emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen),
1330 coordinate);
1331 if (i < 3 && c->key.gl_clamp_mask[i] & (1 << sampler))
1332 inst->saturate = true;
1333 coordinate.reg_offset++;
1334 mlen += reg_width;
1335 }
1336
1337 /* Generate the SEND */
1338 fs_inst *inst = NULL;
1339 switch (ir->op) {
1340 case ir_tex: inst = emit(FS_OPCODE_TEX, dst); break;
1341 case ir_txb: inst = emit(FS_OPCODE_TXB, dst); break;
1342 case ir_txl: inst = emit(FS_OPCODE_TXL, dst); break;
1343 case ir_txd: inst = emit(FS_OPCODE_TXD, dst); break;
1344 case ir_txf: assert(!"TXF unsupported.");
1345 }
1346 inst->base_mrf = base_mrf;
1347 inst->mlen = mlen;
1348 inst->header_present = header_present;
1349
1350 if (mlen > 11) {
1351 fail("Message length >11 disallowed by hardware\n");
1352 }
1353
1354 return inst;
1355 }
1356
1357 void
1358 fs_visitor::visit(ir_texture *ir)
1359 {
1360 int sampler;
1361 fs_inst *inst = NULL;
1362
1363 this->result = reg_undef;
1364 ir->coordinate->accept(this);
1365 fs_reg coordinate = this->result;
1366
1367 if (ir->offset != NULL) {
1368 ir_constant *offset = ir->offset->as_constant();
1369 assert(offset != NULL);
1370
1371 signed char offsets[3];
1372 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
1373 offsets[i] = (signed char) offset->value.i[i];
1374
1375 /* Combine all three offsets into a single unsigned dword:
1376 *
1377 * bits 11:8 - U Offset (X component)
1378 * bits 7:4 - V Offset (Y component)
1379 * bits 3:0 - R Offset (Z component)
1380 */
1381 unsigned offset_bits = 0;
1382 for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
1383 const unsigned shift = 4 * (2 - i);
1384 offset_bits |= (offsets[i] << shift) & (0xF << shift);
1385 }
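/* Worked example with made-up offsets (1, -2, 0): -2 is 0xE in 4-bit
 * two's complement, so offset_bits = (0x1 << 8) | (0xE << 4) | 0x0
 * = 0x1E0.
 */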
1386
1387 /* Explicitly set up the message header by copying g0 to msg reg m1. */
1388 emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
1389 fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));
1390
1391 /* Then set the offset bits in DWord 2 of the message header. */
1392 emit(BRW_OPCODE_MOV,
1393 fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
1394 BRW_REGISTER_TYPE_UD)),
1395 fs_reg(brw_imm_uw(offset_bits)));
1396 }
1397
1398 /* Should be lowered by do_lower_texture_projection */
1399 assert(!ir->projector);
1400
1401 sampler = _mesa_get_sampler_uniform_value(ir->sampler,
1402 ctx->Shader.CurrentFragmentProgram,
1403 &brw->fragment_program->Base);
1404 sampler = c->fp->program.Base.SamplerUnits[sampler];
1405
1406 /* The 965 requires the EU to do the normalization of GL rectangle
1407 * texture coordinates. We use the program parameter state
1408 * tracking to get the scaling factor.
1409 */
1410 if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
1411 struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
1412 int tokens[STATE_LENGTH] = {
1413 STATE_INTERNAL,
1414 STATE_TEXRECT_SCALE,
1415 sampler,
1416 0,
1417 0
1418 };
1419
1420 if (c->dispatch_width == 16) {
1421 fail("rectangle scale uniform setup not supported on 16-wide\n");
1422 this->result = fs_reg(this, ir->type);
1423 return;
1424 }
1425
1426 c->prog_data.param_convert[c->prog_data.nr_params] =
1427 PARAM_NO_CONVERT;
1428 c->prog_data.param_convert[c->prog_data.nr_params + 1] =
1429 PARAM_NO_CONVERT;
1430
1431 fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
1432 fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
1433 GLuint index = _mesa_add_state_reference(params,
1434 (gl_state_index *)tokens);
1435
1436 this->param_index[c->prog_data.nr_params] = index;
1437 this->param_offset[c->prog_data.nr_params] = 0;
1438 c->prog_data.nr_params++;
1439 this->param_index[c->prog_data.nr_params] = index;
1440 this->param_offset[c->prog_data.nr_params] = 1;
1441 c->prog_data.nr_params++;
1442
1443 fs_reg dst = fs_reg(this, ir->coordinate->type);
1444 fs_reg src = coordinate;
1445 coordinate = dst;
1446
1447 emit(BRW_OPCODE_MUL, dst, src, scale_x);
1448 dst.reg_offset++;
1449 src.reg_offset++;
1450 emit(BRW_OPCODE_MUL, dst, src, scale_y);
1451 }
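/* Sketch of the effect, assuming STATE_TEXRECT_SCALE supplies
 * (1/width, 1/height): for a 256x128 rectangle texture, a coordinate
 * of (128, 64) is scaled to (0.5, 0.5) before the sample message, so
 * the sampler sees normalized coordinates.
 */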
1452
1453 /* Writemasking doesn't eliminate channels on SIMD8 texture
1454 * samples, so don't worry about them.
1455 */
1456 fs_reg dst = fs_reg(this, glsl_type::vec4_type);
1457
1458 if (intel->gen >= 7) {
1459 inst = emit_texture_gen7(ir, dst, coordinate, sampler);
1460 } else if (intel->gen >= 5) {
1461 inst = emit_texture_gen5(ir, dst, coordinate, sampler);
1462 } else {
1463 inst = emit_texture_gen4(ir, dst, coordinate, sampler);
1464 }
1465
1466 /* If there's an offset, we already set up m1. To avoid the implied move,
1467 * use the null register. Otherwise, we want an implied move from g0.
1468 */
1469 if (ir->offset != NULL || !inst->header_present)
1470 inst->src[0] = reg_undef;
1471 else
1472 inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
1473
1474 inst->sampler = sampler;
1475
1476 this->result = dst;
1477
1478 if (ir->shadow_comparitor)
1479 inst->shadow_compare = true;
1480
1481 if (ir->type == glsl_type::float_type) {
1482 /* Ignore DEPTH_TEXTURE_MODE swizzling. */
1483 assert(ir->sampler->type->sampler_shadow);
1484 } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
1485 fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);
1486
1487 for (int i = 0; i < 4; i++) {
1488 int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1489 fs_reg l = swizzle_dst;
1490 l.reg_offset += i;
1491
1492 if (swiz == SWIZZLE_ZERO) {
1493 emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
1494 } else if (swiz == SWIZZLE_ONE) {
1495 emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
1496 } else {
1497 fs_reg r = dst;
1498 r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
1499 emit(BRW_OPCODE_MOV, l, r);
1500 }
1501 }
1502 this->result = swizzle_dst;
1503 }
1504 }
1505
1506 void
1507 fs_visitor::visit(ir_swizzle *ir)
1508 {
1509 this->result = reg_undef;
1510 ir->val->accept(this);
1511 fs_reg val = this->result;
1512
1513 if (ir->type->vector_elements == 1) {
1514 this->result.reg_offset += ir->mask.x;
1515 return;
1516 }
1517
1518 fs_reg result = fs_reg(this, ir->type);
1519 this->result = result;
1520
1521 for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
1522 fs_reg channel = val;
1523 int swiz = 0;
1524
1525 switch (i) {
1526 case 0:
1527 swiz = ir->mask.x;
1528 break;
1529 case 1:
1530 swiz = ir->mask.y;
1531 break;
1532 case 2:
1533 swiz = ir->mask.z;
1534 break;
1535 case 3:
1536 swiz = ir->mask.w;
1537 break;
1538 }
1539
1540 channel.reg_offset += swiz;
1541 emit(BRW_OPCODE_MOV, result, channel);
1542 result.reg_offset++;
1543 }
1544 }
1545
1546 void
1547 fs_visitor::visit(ir_discard *ir)
1548 {
1549 assert(ir->condition == NULL); /* FINISHME */
1550
1551 emit(FS_OPCODE_DISCARD);
1552 kill_emitted = true;
1553 }
1554
1555 void
1556 fs_visitor::visit(ir_constant *ir)
1557 {
1558 /* Set this->result to reg at the bottom of the function because some code
1559 * paths will cause this visitor to be applied to other fields. This will
1560 * cause the value stored in this->result to be modified.
1561 *
1562 * Make reg constant so that it doesn't get accidentally modified along the
1563 * way. Yes, I actually had this problem. :(
1564 */
1565 const fs_reg reg(this, ir->type);
1566 fs_reg dst_reg = reg;
1567
1568 if (ir->type->is_array()) {
1569 const unsigned size = type_size(ir->type->fields.array);
1570
1571 for (unsigned i = 0; i < ir->type->length; i++) {
1572 this->result = reg_undef;
1573 ir->array_elements[i]->accept(this);
1574 fs_reg src_reg = this->result;
1575
1576 dst_reg.type = src_reg.type;
1577 for (unsigned j = 0; j < size; j++) {
1578 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1579 src_reg.reg_offset++;
1580 dst_reg.reg_offset++;
1581 }
1582 }
1583 } else if (ir->type->is_record()) {
1584 foreach_list(node, &ir->components) {
1585 ir_instruction *const field = (ir_instruction *) node;
1586 const unsigned size = type_size(field->type);
1587
1588 this->result = reg_undef;
1589 field->accept(this);
1590 fs_reg src_reg = this->result;
1591
1592 dst_reg.type = src_reg.type;
1593 for (unsigned j = 0; j < size; j++) {
1594 emit(BRW_OPCODE_MOV, dst_reg, src_reg);
1595 src_reg.reg_offset++;
1596 dst_reg.reg_offset++;
1597 }
1598 }
1599 } else {
1600 const unsigned size = type_size(ir->type);
1601
1602 for (unsigned i = 0; i < size; i++) {
1603 switch (ir->type->base_type) {
1604 case GLSL_TYPE_FLOAT:
1605 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
1606 break;
1607 case GLSL_TYPE_UINT:
1608 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
1609 break;
1610 case GLSL_TYPE_INT:
1611 emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
1612 break;
1613 case GLSL_TYPE_BOOL:
1614 emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
1615 break;
1616 default:
1617 assert(!"Non-float/uint/int/bool constant");
1618 }
1619 dst_reg.reg_offset++;
1620 }
1621 }
1622
1623 this->result = reg;
1624 }
1625
1626 void
1627 fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
1628 {
1629 ir_expression *expr = ir->as_expression();
1630
1631 if (expr) {
1632 fs_reg op[2];
1633 fs_inst *inst;
1634
1635 assert(expr->get_num_operands() <= 2);
1636 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1637 assert(expr->operands[i]->type->is_scalar());
1638
1639 this->result = reg_undef;
1640 expr->operands[i]->accept(this);
1641 op[i] = this->result;
1642 }
1643
1644 switch (expr->operation) {
1645 case ir_unop_logic_not:
1646 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
1647 inst->conditional_mod = BRW_CONDITIONAL_Z;
1648 break;
1649
1650 case ir_binop_logic_xor:
1651 inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
1652 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1653 break;
1654
1655 case ir_binop_logic_or:
1656 inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
1657 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1658 break;
1659
1660 case ir_binop_logic_and:
1661 inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
1662 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1663 break;
1664
1665 case ir_unop_f2b:
1666 if (intel->gen >= 6) {
1667 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
1668 } else {
1669 inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
1670 }
1671 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1672 break;
1673
1674 case ir_unop_i2b:
1675 if (intel->gen >= 6) {
1676 inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
1677 } else {
1678 inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
1679 }
1680 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1681 break;
1682
1683 case ir_binop_greater:
1684 case ir_binop_gequal:
1685 case ir_binop_less:
1686 case ir_binop_lequal:
1687 case ir_binop_equal:
1688 case ir_binop_all_equal:
1689 case ir_binop_nequal:
1690 case ir_binop_any_nequal:
1691 inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
1692 inst->conditional_mod =
1693 brw_conditional_for_comparison(expr->operation);
1694 break;
1695
1696 default:
1697 assert(!"not reached");
1698 fail("bad cond code\n");
1699 break;
1700 }
1701 return;
1702 }
1703
1704 this->result = reg_undef;
1705 ir->accept(this);
1706
1707 if (intel->gen >= 6) {
1708 fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
1709 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1710 } else {
1711 fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
1712 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1713 }
1714 }
1715
1716 /**
1717 * Emit a gen6 IF statement with the comparison folded into the IF
1718 * instruction.
1719 */
1720 void
1721 fs_visitor::emit_if_gen6(ir_if *ir)
1722 {
1723 ir_expression *expr = ir->condition->as_expression();
1724
1725 if (expr) {
1726 fs_reg op[2];
1727 fs_inst *inst;
1728 fs_reg temp;
1729
1730 assert(expr->get_num_operands() <= 2);
1731 for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
1732 assert(expr->operands[i]->type->is_scalar());
1733
1734 this->result = reg_undef;
1735 expr->operands[i]->accept(this);
1736 op[i] = this->result;
1737 }
1738
1739 switch (expr->operation) {
1740 case ir_unop_logic_not:
1741 inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
1742 inst->conditional_mod = BRW_CONDITIONAL_Z;
1743 return;
1744
1745 case ir_binop_logic_xor:
1746 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1747 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1748 return;
1749
1750 case ir_binop_logic_or:
1751 temp = fs_reg(this, glsl_type::bool_type);
1752 emit(BRW_OPCODE_OR, temp, op[0], op[1]);
1753 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1754 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1755 return;
1756
1757 case ir_binop_logic_and:
1758 temp = fs_reg(this, glsl_type::bool_type);
1759 emit(BRW_OPCODE_AND, temp, op[0], op[1]);
1760 inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
1761 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1762 return;
1763
1764 case ir_unop_f2b:
1765 inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
1766 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1767 return;
1768
1769 case ir_unop_i2b:
1770 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1771 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1772 return;
1773
1774 case ir_binop_greater:
1775 case ir_binop_gequal:
1776 case ir_binop_less:
1777 case ir_binop_lequal:
1778 case ir_binop_equal:
1779 case ir_binop_all_equal:
1780 case ir_binop_nequal:
1781 case ir_binop_any_nequal:
1782 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
1783 inst->conditional_mod =
1784 brw_conditional_for_comparison(expr->operation);
1785 return;
1786 default:
1787 assert(!"not reached");
1788 inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
1789 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1790 fail("bad condition\n");
1791 return;
1792 }
1793 return;
1794 }
1795
1796 this->result = reg_undef;
1797 ir->condition->accept(this);
1798
1799 fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
1800 inst->conditional_mod = BRW_CONDITIONAL_NZ;
1801 }
1802
1803 void
1804 fs_visitor::visit(ir_if *ir)
1805 {
1806 fs_inst *inst;
1807
1808 if (intel->gen != 6 && c->dispatch_width == 16) {
1809 fail("Can't support (non-uniform) control flow on 16-wide\n");
1810 }
1811
1812 /* Don't point the annotation at the if statement, because then it plus
1813 * the then and else blocks get printed.
1814 */
1815 this->base_ir = ir->condition;
1816
1817 if (intel->gen == 6) {
1818 emit_if_gen6(ir);
1819 } else {
1820 emit_bool_to_cond_code(ir->condition);
1821
1822 inst = emit(BRW_OPCODE_IF);
1823 inst->predicated = true;
1824 }
1825
1826 foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
1827 ir_instruction *ir = (ir_instruction *)iter.get();
1828 this->base_ir = ir;
1829 this->result = reg_undef;
1830 ir->accept(this);
1831 }
1832
1833 if (!ir->else_instructions.is_empty()) {
1834 emit(BRW_OPCODE_ELSE);
1835
1836 foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
1837 ir_instruction *ir = (ir_instruction *)iter.get();
1838 this->base_ir = ir;
1839 this->result = reg_undef;
1840 ir->accept(this);
1841 }
1842 }
1843
1844 emit(BRW_OPCODE_ENDIF);
1845 }
1846
1847 void
1848 fs_visitor::visit(ir_loop *ir)
1849 {
1850 fs_reg counter = reg_undef;
1851
1852 if (c->dispatch_width == 16) {
1853 fail("Can't support (non-uniform) control flow on 16-wide\n");
1854 }
1855
1856 if (ir->counter) {
1857 this->base_ir = ir->counter;
1858 ir->counter->accept(this);
1859 counter = *(variable_storage(ir->counter));
1860
1861 if (ir->from) {
1864 this->base_ir = ir->from;
1865 this->result = counter;
1866 ir->from->accept(this);
1867
1868 if (!this->result.equals(&counter))
1869 emit(BRW_OPCODE_MOV, counter, this->result);
1870 }
1871 }
1872
1873 emit(BRW_OPCODE_DO);
1874
1875 if (ir->to) {
1876 this->base_ir = ir->to;
1877 this->result = reg_undef;
1878 ir->to->accept(this);
1879
1880 fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
1881 inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);
1882
1883 inst = emit(BRW_OPCODE_BREAK);
1884 inst->predicated = true;
1885 }
1886
1887 foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
1888 ir_instruction *ir = (ir_instruction *)iter.get();
1889
1890 this->base_ir = ir;
1891 this->result = reg_undef;
1892 ir->accept(this);
1893 }
1894
1895 if (ir->increment) {
1896 this->base_ir = ir->increment;
1897 this->result = reg_undef;
1898 ir->increment->accept(this);
1899 emit(BRW_OPCODE_ADD, counter, counter, this->result);
1900 }
1901
1902 emit(BRW_OPCODE_WHILE);
1903 }
1904
1905 void
1906 fs_visitor::visit(ir_loop_jump *ir)
1907 {
1908 switch (ir->mode) {
1909 case ir_loop_jump::jump_break:
1910 emit(BRW_OPCODE_BREAK);
1911 break;
1912 case ir_loop_jump::jump_continue:
1913 emit(BRW_OPCODE_CONTINUE);
1914 break;
1915 }
1916 }
1917
1918 void
1919 fs_visitor::visit(ir_call *ir)
1920 {
1921 assert(!"FINISHME");
1922 }
1923
1924 void
1925 fs_visitor::visit(ir_return *ir)
1926 {
1927 assert(!"FINISHME");
1928 }
1929
1930 void
1931 fs_visitor::visit(ir_function *ir)
1932 {
1933 /* Ignore function bodies other than main() -- we shouldn't see calls to
1934 * them since they should all be inlined before we get to ir_to_mesa.
1935 */
1936 if (strcmp(ir->name, "main") == 0) {
1937 const ir_function_signature *sig;
1938 exec_list empty;
1939
1940 sig = ir->matching_signature(&empty);
1941
1942 assert(sig);
1943
1944 foreach_iter(exec_list_iterator, iter, sig->body) {
1945 ir_instruction *ir = (ir_instruction *)iter.get();
1946 this->base_ir = ir;
1947 this->result = reg_undef;
1948 ir->accept(this);
1949 }
1950 }
1951 }
1952
1953 void
1954 fs_visitor::visit(ir_function_signature *ir)
1955 {
1956 assert(!"not reached");
1957 (void)ir;
1958 }
1959
1960 fs_inst *
1961 fs_visitor::emit(fs_inst inst)
1962 {
1963 fs_inst *list_inst = new(mem_ctx) fs_inst;
1964 *list_inst = inst;
1965
1966 if (force_uncompressed_stack > 0)
1967 list_inst->force_uncompressed = true;
1968 else if (force_sechalf_stack > 0)
1969 list_inst->force_sechalf = true;
1970
1971 list_inst->annotation = this->current_annotation;
1972 list_inst->ir = this->base_ir;
1973
1974 this->instructions.push_tail(list_inst);
1975
1976 return list_inst;
1977 }
1978
1979 /** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
1980 void
1981 fs_visitor::emit_dummy_fs()
1982 {
1983 /* Everyone's favorite color. */
1984 emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
1985 emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
1986 emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
1987 emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));
1988
1989 fs_inst *write;
1990 write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
1991 write->base_mrf = 0;
1992 }
1993
1994 /* The register location here is relative to the start of the URB
1995 * data. It will get adjusted to be a real location before
1996 * generate_code() time.
1997 */
1998 struct brw_reg
1999 fs_visitor::interp_reg(int location, int channel)
2000 {
2001 int regnr = urb_setup[location] * 2 + channel / 2;
2002 int stride = (channel & 1) * 4;
2003
2004 assert(urb_setup[location] != -1);
2005
2006 return brw_vec1_grf(regnr, stride);
2007 }
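/* Mapping example (illustrative): with urb_setup[location] == 3,
 * channels 0..3 land at (regnr, suboffset) = (6, 0), (6, 4), (7, 0),
 * (7, 4) -- two channels' setup coefficients per register, 4 floats
 * apart.
 */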
2008
2009 /** Emits the interpolation for the varying inputs. */
2010 void
2011 fs_visitor::emit_interpolation_setup_gen4()
2012 {
2013 this->current_annotation = "compute pixel centers";
2014 this->pixel_x = fs_reg(this, glsl_type::uint_type);
2015 this->pixel_y = fs_reg(this, glsl_type::uint_type);
2016 this->pixel_x.type = BRW_REGISTER_TYPE_UW;
2017 this->pixel_y.type = BRW_REGISTER_TYPE_UW;
2018
2019 emit(FS_OPCODE_PIXEL_X, this->pixel_x);
2020 emit(FS_OPCODE_PIXEL_Y, this->pixel_y);
2021
2022 this->current_annotation = "compute pixel deltas from v0";
2023 if (brw->has_pln) {
2024 this->delta_x = fs_reg(this, glsl_type::vec2_type);
2025 this->delta_y = this->delta_x;
2026 this->delta_y.reg_offset++;
2027 } else {
2028 this->delta_x = fs_reg(this, glsl_type::float_type);
2029 this->delta_y = fs_reg(this, glsl_type::float_type);
2030 }
2031 emit(BRW_OPCODE_ADD, this->delta_x,
2032 this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
2033 emit(BRW_OPCODE_ADD, this->delta_y,
2034 this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));
2035
2036 this->current_annotation = "compute pos.w and 1/pos.w";
2037 /* Compute wpos.w. It's always in our setup, since it's needed to
2038 * interpolate the other attributes.
2039 */
2040 this->wpos_w = fs_reg(this, glsl_type::float_type);
2041 emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
2042 interp_reg(FRAG_ATTRIB_WPOS, 3));
2043 /* Compute the pixel 1/W value from wpos.w. */
2044 this->pixel_w = fs_reg(this, glsl_type::float_type);
2045 emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
2046 this->current_annotation = NULL;
2047 }
2048
2049 /** Emits the interpolation for the varying inputs. */
2050 void
2051 fs_visitor::emit_interpolation_setup_gen6()
2052 {
2053 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
2054
2055 /* If the pixel centers end up used, the setup is the same as for gen4. */
2056 this->current_annotation = "compute pixel centers";
2057 fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
2058 fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
2059 int_pixel_x.type = BRW_REGISTER_TYPE_UW;
2060 int_pixel_y.type = BRW_REGISTER_TYPE_UW;
2061 emit(BRW_OPCODE_ADD,
2062 int_pixel_x,
2063 fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
2064 fs_reg(brw_imm_v(0x10101010)));
2065 emit(BRW_OPCODE_ADD,
2066 int_pixel_y,
2067 fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
2068 fs_reg(brw_imm_v(0x11001100)));
2069
2070 /* As of gen6, we can no longer mix float and int sources. We have
2071 * to turn the integer pixel centers into floats for their actual
2072 * use.
2073 */
2074 this->pixel_x = fs_reg(this, glsl_type::float_type);
2075 this->pixel_y = fs_reg(this, glsl_type::float_type);
2076 emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
2077 emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);
2078
2079 this->current_annotation = "compute pos.w";
2080 this->pixel_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
2081 this->wpos_w = fs_reg(this, glsl_type::float_type);
2082 emit_math(FS_OPCODE_RCP, this->wpos_w, this->pixel_w);
2083
2084 this->delta_x = fs_reg(brw_vec8_grf(2, 0));
2085 this->delta_y = fs_reg(brw_vec8_grf(3, 0));
2086
2087 this->current_annotation = NULL;
2088 }
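/* A sketch of what the two ADDs above compute, assuming the usual payload
 * layout where g1 holds the two subspans' screen-space origins as UW
 * (x, y) pairs starting at element 4. The <2,4,0> region replicates each
 * origin four times (vstride 2 skips over the interleaved y or x), and
 * brw_imm_v packs eight 4-bit immediates, low nibble first:
 *
 *    0x10101010 -> 0,1,0,1,0,1,0,1  (per-pixel x offsets within a 2x2 subspan)
 *    0x11001100 -> 0,0,1,1,0,0,1,1  (per-pixel y offsets)
 *
 * so int_pixel_x/int_pixel_y end up holding the integer pixel coordinates
 * of all eight pixels.
 */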
2089
2090 void
2091 fs_visitor::emit_color_write(int index, int first_color_mrf, fs_reg color)
2092 {
2093 int reg_width = c->dispatch_width / 8;
2094
2095 if (c->dispatch_width == 8 || intel->gen == 6) {
2096 /* SIMD8 write looks like:
2097 * m + 0: r0
2098 * m + 1: r1
2099 * m + 2: g0
2100 * m + 3: g1
2101 *
2102 * gen6 SIMD16 DP write looks like:
2103 * m + 0: r0
2104 * m + 1: r1
2105 * m + 2: g0
2106 * m + 3: g1
2107 * m + 4: b0
2108 * m + 5: b1
2109 * m + 6: a0
2110 * m + 7: a1
2111 */
2112 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index * reg_width),
2113 color);
2114 } else {
2115 /* pre-gen6 SIMD16 single source DP write looks like:
2116 * m + 0: r0
2117 * m + 1: g0
2118 * m + 2: b0
2119 * m + 3: a0
2120 * m + 4: r1
2121 * m + 5: g1
2122 * m + 6: b1
2123 * m + 7: a1
2124 */
2125 if (brw->has_compr4) {
2126 /* By setting the high bit of the MRF register number, we
2127 * indicate that we want COMPR4 mode - instead of doing the
2128 * usual destination + 1 for the second half we get
2129 * destination + 4.
2130 */
2131 emit(BRW_OPCODE_MOV,
2132 fs_reg(MRF, BRW_MRF_COMPR4 + first_color_mrf + index), color);
2133 } else {
2134 push_force_uncompressed();
2135 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index), color);
2136 pop_force_uncompressed();
2137
2138 push_force_sechalf();
2139 color.sechalf = true;
2140 emit(BRW_OPCODE_MOV, fs_reg(MRF, first_color_mrf + index + 4), color);
2141 pop_force_sechalf();
2142 color.sechalf = false;
2143 }
2144 }
2145 }
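/* A worked example of the COMPR4 trick above, with hypothetical values:
 * for index == 1 (green) and first_color_mrf == 2, a compressed MOV to
 * fs_reg(MRF, BRW_MRF_COMPR4 + 3) writes its first half to m3 and its
 * second half to m7 (destination + 4 instead of the usual + 1), which is
 * exactly the g0/g1 placement the pre-gen6 SIMD16 layout wants, without
 * the two explicit force_uncompressed/force_sechalf MOVs.
 */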
2146
2147 void
2148 fs_visitor::emit_fb_writes()
2149 {
2150 this->current_annotation = "FB write header";
2151 bool header_present = true;
2152 int nr = 0;
2153 int reg_width = c->dispatch_width / 8;
2154
2155 if (intel->gen >= 6 &&
2156 !this->kill_emitted &&
2157 c->key.nr_color_regions == 1) {
2158 header_present = false;
2159 }
2160
2161 if (header_present) {
2162 /* m0, m1 header */
2163 nr += 2;
2164 }
2165
2166 if (c->aa_dest_stencil_reg) {
2167 push_force_uncompressed();
2168 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2169 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2170 pop_force_uncompressed();
2171 }
2172
2173 /* Reserve space for color. It'll be filled in per MRT below. */
2174 int color_mrf = nr;
2175 nr += 4 * reg_width;
2176
2177 if (c->source_depth_to_render_target) {
2178 if (intel->gen == 6 && c->dispatch_width == 16) {
2179 /* For outputting oDepth on gen6, SIMD8 writes have to be
2180 * used. This would require 8-wide moves of each half to
2181 * message regs, kind of like pre-gen5 SIMD16 FB writes.
2182 * Just bail on doing so for now.
2183 */
2184 fail("Missing support for simd16 depth writes on gen6\n");
2185 }
2186
2187 if (c->computes_depth) {
2188 /* Hand over gl_FragDepth. */
2189 assert(this->frag_depth);
2190 fs_reg depth = *(variable_storage(this->frag_depth));
2191
2192 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr), depth);
2193 } else {
2194 /* Pass through the payload depth. */
2195 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2196 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2197 }
2198 nr += reg_width;
2199 }
2200
2201 if (c->dest_depth_reg) {
2202 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr),
2203 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2204 nr += reg_width;
2205 }
2206
2207 fs_reg color = reg_undef;
2208 if (this->frag_color)
2209 color = *(variable_storage(this->frag_color));
2210 else if (this->frag_data) {
2211 color = *(variable_storage(this->frag_data));
2212 color.type = BRW_REGISTER_TYPE_F;
2213 }
2214
2215 for (int target = 0; target < c->key.nr_color_regions; target++) {
2216 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2217 "FB write target %d",
2218 target);
2219 if (this->frag_color || this->frag_data) {
2220 for (int i = 0; i < 4; i++) {
2221 emit_color_write(i, color_mrf, color);
2222 color.reg_offset++;
2223 }
2224 }
2225
2226 if (this->frag_color)
2227 color.reg_offset -= 4;
2228
2229 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2230 inst->target = target;
2231 inst->base_mrf = 0;
2232 inst->mlen = nr;
2233 if (target == c->key.nr_color_regions - 1)
2234 inst->eot = true;
2235 inst->header_present = header_present;
2236 }
2237
2238 if (c->key.nr_color_regions == 0) {
2239 if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2240 /* If the alpha test is enabled but there's no color buffer,
2241 * we still need to send alpha out the pipeline to our null
2242 * renderbuffer.
2243 */
2244 color.reg_offset += 3;
2245 emit_color_write(3, color_mrf, color);
2246 }
2247
2248 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2249 inst->base_mrf = 0;
2250 inst->mlen = nr;
2251 inst->eot = true;
2252 inst->header_present = header_present;
2253 }
2254
2255 this->current_annotation = NULL;
2256 }
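/* For illustration, the message layout for a SIMD8 shader with a header,
 * no AA dest stencil and no depth output works out to:
 *
 *    m0..m1  header
 *    m2..m5  color (4 channels * reg_width of 1)
 *
 * so nr == 6 and each FB write goes out with base_mrf 0 and mlen 6.
 */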
2257
2258 /**
2259 * To be called after the last _mesa_add_state_reference() call, to
2260 * set up prog_data.param[] for assign_curb_setup() and
2261 * setup_pull_constants().
2262 */
2263 void
2264 fs_visitor::setup_paramvalues_refs()
2265 {
2266 if (c->dispatch_width != 8)
2267 return;
2268
2269 /* Set up the pointers to ParamValues now that that array is finalized. */
2270 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2271 c->prog_data.param[i] =
2272 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2273 this->param_offset[i];
2274 }
2275 }
2276
2277 void
2278 fs_visitor::assign_curb_setup()
2279 {
2280 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2281 if (c->dispatch_width == 8) {
2282 c->prog_data.first_curbe_grf = c->nr_payload_regs;
2283 } else {
2284 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
2285 }
2286
2287 /* Map the offsets in the UNIFORM file to fixed HW regs. */
2288 foreach_iter(exec_list_iterator, iter, this->instructions) {
2289 fs_inst *inst = (fs_inst *)iter.get();
2290
2291 for (unsigned int i = 0; i < 3; i++) {
2292 if (inst->src[i].file == UNIFORM) {
2293 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2294 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
2295 constant_nr / 8,
2296 constant_nr % 8);
2297
2298 inst->src[i].file = FIXED_HW_REG;
2299 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2300 }
2301 }
2302 }
2303 }
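/* A worked example of the mapping above, with hypothetical values:
 * nr_payload_regs == 2 and constant_nr == 11 yields
 * brw_vec1_grf(2 + 11 / 8, 11 % 8) == g3.3, i.e. the twelfth push constant
 * lands in the fourth dword of the second CURBE register. Likewise,
 * nr_params == 20 gives curb_read_length == ALIGN(20, 8) / 8 == 3 registers.
 */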
2304
2305 void
2306 fs_visitor::calculate_urb_setup()
2307 {
2308 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2309 urb_setup[i] = -1;
2310 }
2311
2312 int urb_next = 0;
2313 /* Figure out where each of the incoming setup attributes lands. */
2314 if (intel->gen >= 6) {
2315 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2316 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2317 urb_setup[i] = urb_next++;
2318 }
2319 }
2320 } else {
2321 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2322 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2323 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2324 int fp_index;
2325
2326 if (i >= VERT_RESULT_VAR0)
2327 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2328 else if (i <= VERT_RESULT_TEX7)
2329 fp_index = i;
2330 else
2331 fp_index = -1;
2332
2333 if (fp_index >= 0)
2334 urb_setup[fp_index] = urb_next++;
2335 }
2336 }
2337 }
2338
2339 /* Each attribute is 4 setup channels, each of which is half a reg. */
2340 c->prog_data.urb_read_length = urb_next * 2;
2341 }
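/* For example, a shader reading three varyings gets urb_setup entries 0, 1
 * and 2 and urb_read_length == 6: four setup channels per attribute at
 * half a register each is two registers per attribute.
 */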
2342
2343 void
2344 fs_visitor::assign_urb_setup()
2345 {
2346 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
2347
2348 /* Offset all the urb_setup[] indices by the actual position of the
2349 * setup regs, now that the location of the constants has been chosen.
2350 */
2351 foreach_iter(exec_list_iterator, iter, this->instructions) {
2352 fs_inst *inst = (fs_inst *)iter.get();
2353
2354 if (inst->opcode == FS_OPCODE_LINTERP) {
2355 assert(inst->src[2].file == FIXED_HW_REG);
2356 inst->src[2].fixed_hw_reg.nr += urb_start;
2357 }
2358
2359 if (inst->opcode == FS_OPCODE_CINTERP) {
2360 assert(inst->src[0].file == FIXED_HW_REG);
2361 inst->src[0].fixed_hw_reg.nr += urb_start;
2362 }
2363 }
2364
2365 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2366 }
2367
2368 /**
2369 * Split large virtual GRFs into separate components if we can.
2370 *
2371 * This is mostly duplicated with what brw_fs_vector_splitting does,
2372 * but that's really conservative because it's afraid of doing
2373 * splitting that doesn't result in real progress after the rest of
2374 * the optimization phases, which would cause infinite looping in
2375 * optimization. We can do it once here, safely. This also has the
2376 * opportunity to split interpolated values, or maybe even uniforms,
2377 * which we don't have at the IR level.
2378 *
2379 * We want to split, because virtual GRFs are what we register
2380 * allocate and spill (due to contiguousness requirements for some
2381 * instructions), and they're what we naturally generate in the
2382 * codegen process, but most virtual GRFs don't actually need to be
2383 * contiguous sets of GRFs. If we split, we'll end up with reduced
2384 * live intervals and better dead code elimination and coalescing.
2385 */
2386 void
2387 fs_visitor::split_virtual_grfs()
2388 {
2389 int num_vars = this->virtual_grf_next;
2390 bool split_grf[num_vars];
2391 int new_virtual_grf[num_vars];
2392
2393 /* Try to split anything larger than one register. */
2394 for (int i = 0; i < num_vars; i++) {
2395 if (this->virtual_grf_sizes[i] != 1)
2396 split_grf[i] = true;
2397 else
2398 split_grf[i] = false;
2399 }
2400
2401 if (brw->has_pln) {
2402 /* PLN opcodes rely on the delta_xy being contiguous. */
2403 split_grf[this->delta_x.reg] = false;
2404 }
2405
2406 foreach_iter(exec_list_iterator, iter, this->instructions) {
2407 fs_inst *inst = (fs_inst *)iter.get();
2408
2409 /* Texturing produces 4 contiguous registers, so no splitting. */
2410 if (inst->is_tex()) {
2411 split_grf[inst->dst.reg] = false;
2412 }
2413 }
2414
2415 /* Allocate new space for split regs. Note that the virtual
2416 * numbers will be contiguous.
2417 */
2418 for (int i = 0; i < num_vars; i++) {
2419 if (split_grf[i]) {
2420 new_virtual_grf[i] = virtual_grf_alloc(1);
2421 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2422 int reg = virtual_grf_alloc(1);
2423 assert(reg == new_virtual_grf[i] + j - 1);
2424 (void) reg;
2425 }
2426 this->virtual_grf_sizes[i] = 1;
2427 }
2428 }
2429
2430 foreach_iter(exec_list_iterator, iter, this->instructions) {
2431 fs_inst *inst = (fs_inst *)iter.get();
2432
2433 if (inst->dst.file == GRF &&
2434 split_grf[inst->dst.reg] &&
2435 inst->dst.reg_offset != 0) {
2436 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2437 inst->dst.reg_offset - 1);
2438 inst->dst.reg_offset = 0;
2439 }
2440 for (int i = 0; i < 3; i++) {
2441 if (inst->src[i].file == GRF &&
2442 split_grf[inst->src[i].reg] &&
2443 inst->src[i].reg_offset != 0) {
2444 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2445 inst->src[i].reg_offset - 1);
2446 inst->src[i].reg_offset = 0;
2447 }
2448 }
2449 }
2450 this->live_intervals_valid = false;
2451 }
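/* A sketch of the renaming above: given a virtual GRF vgrf5 of size 3
 * that gets split, accesses are rewritten as
 *
 *    vgrf5, reg_offset 0 -> vgrf5      (size now 1)
 *    vgrf5, reg_offset 1 -> vgrfN      (N == new_virtual_grf[5])
 *    vgrf5, reg_offset 2 -> vgrfN+1
 *
 * leaving every access with reg_offset 0 on its own one-register vgrf.
 */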
2452
2453 /**
2454 * Choose accesses from the UNIFORM file to demote to using the pull
2455 * constant buffer.
2456 *
2457 * We allow a fragment shader to have more than the specified minimum
2458 * maximum number of fragment shader uniform components (64). If
2459 * there are too many of these, they'd fill up all of the register space.
2460 * So, this will push some of them out to the pull constant buffer and
2461 * update the program to load them.
2462 */
2463 void
2464 fs_visitor::setup_pull_constants()
2465 {
2466 /* Only allow 16 registers (128 uniform components) as push constants. */
2467 unsigned int max_uniform_components = 16 * 8;
2468 if (c->prog_data.nr_params <= max_uniform_components)
2469 return;
2470
2471 if (c->dispatch_width == 16) {
2472 fail("Pull constants not supported in 16-wide\n");
2473 return;
2474 }
2475
2476 /* Just demote the end of the list. We could probably do better
2477 * here, demoting things that are rarely used in the program first.
2478 */
2479 int pull_uniform_base = max_uniform_components;
2480 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2481
2482 foreach_iter(exec_list_iterator, iter, this->instructions) {
2483 fs_inst *inst = (fs_inst *)iter.get();
2484
2485 for (int i = 0; i < 3; i++) {
2486 if (inst->src[i].file != UNIFORM)
2487 continue;
2488
2489 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2490 if (uniform_nr < pull_uniform_base)
2491 continue;
2492
2493 fs_reg dst = fs_reg(this, glsl_type::float_type);
2494 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2495 dst);
2496 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2497 pull->ir = inst->ir;
2498 pull->annotation = inst->annotation;
2499 pull->base_mrf = 14;
2500 pull->mlen = 1;
2501
2502 inst->insert_before(pull);
2503
2504 inst->src[i].file = GRF;
2505 inst->src[i].reg = dst.reg;
2506 inst->src[i].reg_offset = 0;
2507 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2508 }
2509 }
2510
2511 for (int i = 0; i < pull_uniform_count; i++) {
2512 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2513 c->prog_data.pull_param_convert[i] =
2514 c->prog_data.param_convert[pull_uniform_base + i];
2515 }
2516 c->prog_data.nr_params -= pull_uniform_count;
2517 c->prog_data.nr_pull_params = pull_uniform_count;
2518 }
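/* A worked example of the demotion above: with pull_uniform_base == 128,
 * a use of uniform 134 becomes a FS_OPCODE_PULL_CONSTANT_LOAD with
 * offset ((134 - 128) * 4) & ~15 == 16 (the second 16-byte-aligned block),
 * and the source is rewritten to read the loaded GRF with
 * smear == (134 - 128) & 3 == 2, picking the third component out of that
 * block.
 */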
2519
2520 void
2521 fs_visitor::calculate_live_intervals()
2522 {
2523 int num_vars = this->virtual_grf_next;
2524 int *def = ralloc_array(mem_ctx, int, num_vars);
2525 int *use = ralloc_array(mem_ctx, int, num_vars);
2526 int loop_depth = 0;
2527 int loop_start = 0;
2528
2529 if (this->live_intervals_valid)
2530 return;
2531
2532 for (int i = 0; i < num_vars; i++) {
2533 def[i] = MAX_INSTRUCTION;
2534 use[i] = -1;
2535 }
2536
2537 int ip = 0;
2538 foreach_iter(exec_list_iterator, iter, this->instructions) {
2539 fs_inst *inst = (fs_inst *)iter.get();
2540
2541 if (inst->opcode == BRW_OPCODE_DO) {
2542 if (loop_depth++ == 0)
2543 loop_start = ip;
2544 } else if (inst->opcode == BRW_OPCODE_WHILE) {
2545 loop_depth--;
2546
2547 if (loop_depth == 0) {
2548 /* Patches up the use of vars marked for being live across
2549 * the whole loop.
2550 */
2551 for (int i = 0; i < num_vars; i++) {
2552 if (use[i] == loop_start) {
2553 use[i] = ip;
2554 }
2555 }
2556 }
2557 } else {
2558 for (unsigned int i = 0; i < 3; i++) {
2559 if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2560 int reg = inst->src[i].reg;
2561
2562 if (!loop_depth) {
2563 use[reg] = ip;
2564 } else {
2565 def[reg] = MIN2(loop_start, def[reg]);
2566 use[reg] = loop_start;
2567
2568 /* Nobody else is going to go smash our start to
2569 * later in the loop now, because def[reg] now
2570 * points before the bb header.
2571 */
2572 }
2573 }
2574 }
2575 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2576 int reg = inst->dst.reg;
2577
2578 if (!loop_depth) {
2579 def[reg] = MIN2(def[reg], ip);
2580 } else {
2581 def[reg] = MIN2(def[reg], loop_start);
2582 }
2583 }
2584 }
2585
2586 ip++;
2587 }
2588
2589 ralloc_free(this->virtual_grf_def);
2590 ralloc_free(this->virtual_grf_use);
2591 this->virtual_grf_def = def;
2592 this->virtual_grf_use = use;
2593
2594 this->live_intervals_valid = true;
2595 }
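/* To illustrate the loop handling above with hypothetical ips: for a reg
 * defined at ip 10, then read inside a loop whose DO is at ip 15 and
 * WHILE at ip 25, the read records use[reg] = loop_start (15), and the
 * WHILE patch-up then extends it to 25 -- so the live interval is
 * [10, 25], covering the whole loop rather than just the one read.
 */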
2596
2597 /**
2598 * Attempts to move immediate constants into the immediate
2599 * constant slot of following instructions.
2600 *
2601 * Immediate constants are a bit tricky -- they have to be in the last
2602 * operand slot, and you can't do abs/negate on them.
2603 */
2604
2605 bool
2606 fs_visitor::propagate_constants()
2607 {
2608 bool progress = false;
2609
2610 calculate_live_intervals();
2611
2612 foreach_iter(exec_list_iterator, iter, this->instructions) {
2613 fs_inst *inst = (fs_inst *)iter.get();
2614
2615 if (inst->opcode != BRW_OPCODE_MOV ||
2616 inst->predicated ||
2617 inst->dst.file != GRF || inst->src[0].file != IMM ||
2618 inst->dst.type != inst->src[0].type ||
2619 (c->dispatch_width == 16 &&
2620 (inst->force_uncompressed || inst->force_sechalf)))
2621 continue;
2622
2623 /* Don't bother with cases where we should have had the
2624 * operation on the constant folded in GLSL already.
2625 */
2626 if (inst->saturate)
2627 continue;
2628
2629 /* Found a move of a constant to a GRF. Find anything else using the GRF
2630 * before it's written, and replace it with the constant if we can.
2631 */
2632 exec_list_iterator scan_iter = iter;
2633 scan_iter.next();
2634 for (; scan_iter.has_next(); scan_iter.next()) {
2635 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2636
2637 if (scan_inst->opcode == BRW_OPCODE_DO ||
2638 scan_inst->opcode == BRW_OPCODE_WHILE ||
2639 scan_inst->opcode == BRW_OPCODE_ELSE ||
2640 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2641 break;
2642 }
2643
2644 for (int i = 2; i >= 0; i--) {
2645 if (scan_inst->src[i].file != GRF ||
2646 scan_inst->src[i].reg != inst->dst.reg ||
2647 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2648 continue;
2649
2650 /* Don't bother with cases where we should have had the
2651 * operation on the constant folded in GLSL already.
2652 */
2653 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2654 continue;
2655
2656 switch (scan_inst->opcode) {
2657 case BRW_OPCODE_MOV:
2658 scan_inst->src[i] = inst->src[0];
2659 progress = true;
2660 break;
2661
2662 case BRW_OPCODE_MUL:
2663 case BRW_OPCODE_ADD:
2664 if (i == 1) {
2665 scan_inst->src[i] = inst->src[0];
2666 progress = true;
2667 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2668 /* Fit this constant in by commuting the operands */
2669 scan_inst->src[0] = scan_inst->src[1];
2670 scan_inst->src[1] = inst->src[0];
2671 progress = true;
2672 }
2673 break;
2674
2675 case BRW_OPCODE_CMP:
2676 if (i == 1) {
2677 scan_inst->src[i] = inst->src[0];
2678 progress = true;
2679 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2680 uint32_t new_cmod;
2681
2682 new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
2683 if (new_cmod != ~0u) {
2684 /* Fit this constant in by swapping the operands and
2685 * flipping the test
2686 */
2687 scan_inst->src[0] = scan_inst->src[1];
2688 scan_inst->src[1] = inst->src[0];
2689 scan_inst->conditional_mod = new_cmod;
2690 progress = true;
2691 }
2692 }
2693 break;
2694
2695 case BRW_OPCODE_SEL:
2696 if (i == 1) {
2697 scan_inst->src[i] = inst->src[0];
2698 progress = true;
2699 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2700 /* Fit this constant in by swapping the operands and
2701 * flipping the predicate
2702 */
2703 scan_inst->src[0] = scan_inst->src[1];
2704 scan_inst->src[1] = inst->src[0];
2705 scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
2706 progress = true;
2707 }
2708 break;
2709 }
2710 }
2711
2712 if (scan_inst->dst.file == GRF &&
2713 scan_inst->dst.reg == inst->dst.reg &&
2714 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2715 scan_inst->is_tex())) {
2716 break;
2717 }
2718 }
2719 }
2720
2721 if (progress)
2722 this->live_intervals_valid = false;
2723
2724 return progress;
2725 }
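/* A sketch of the commuting cases above (IR notation is illustrative):
 *
 *    mov   vgrf5, 2.0F
 *    add   vgrf7, vgrf5, vgrf6    ->  add   vgrf7, vgrf6, 2.0F
 *    cmp.l null,  vgrf5, vgrf6    ->  cmp.g null,  vgrf6, 2.0F
 *
 * The immediate can only be the last operand, so for src[0] we swap the
 * operands, and for CMP we additionally flip the conditional mod via
 * brw_swap_cmod().
 */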
2726 /**
2727 * Must be called after calculate_live_intervals() to remove unused
2728 * writes to registers -- register allocation will fail otherwise
2729 * because something deffed but not used won't be considered to
2730 * interfere with other regs.
2731 */
2732 bool
2733 fs_visitor::dead_code_eliminate()
2734 {
2735 bool progress = false;
2736 int pc = 0;
2737
2738 calculate_live_intervals();
2739
2740 foreach_iter(exec_list_iterator, iter, this->instructions) {
2741 fs_inst *inst = (fs_inst *)iter.get();
2742
2743 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2744 inst->remove();
2745 progress = true;
2746 }
2747
2748 pc++;
2749 }
2750
2751 if (progress)
2752 live_intervals_valid = false;
2753
2754 return progress;
2755 }
2756
2757 bool
2758 fs_visitor::register_coalesce()
2759 {
2760 bool progress = false;
2761 int if_depth = 0;
2762 int loop_depth = 0;
2763
2764 foreach_iter(exec_list_iterator, iter, this->instructions) {
2765 fs_inst *inst = (fs_inst *)iter.get();
2766
2767 /* Make sure that we dominate the instructions we're going to
2768 * scan for interfering with our coalescing, or we won't have
2769 * scanned enough to see if anything interferes with our
2770 * coalescing. We don't dominate the following instructions if
2771 * we're in a loop or an if block.
2772 */
2773 switch (inst->opcode) {
2774 case BRW_OPCODE_DO:
2775 loop_depth++;
2776 break;
2777 case BRW_OPCODE_WHILE:
2778 loop_depth--;
2779 break;
2780 case BRW_OPCODE_IF:
2781 if_depth++;
2782 break;
2783 case BRW_OPCODE_ENDIF:
2784 if_depth--;
2785 break;
2786 }
2787 if (loop_depth || if_depth)
2788 continue;
2789
2790 if (inst->opcode != BRW_OPCODE_MOV ||
2791 inst->predicated ||
2792 inst->saturate ||
2793 inst->dst.file != GRF || inst->src[0].file != GRF ||
2794 inst->dst.type != inst->src[0].type)
2795 continue;
2796
2797 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
2798
2799 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2800 * them: check for no writes to either one until the exit of the
2801 * program.
2802 */
2803 bool interfered = false;
2804 exec_list_iterator scan_iter = iter;
2805 scan_iter.next();
2806 for (; scan_iter.has_next(); scan_iter.next()) {
2807 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2808
2809 if (scan_inst->dst.file == GRF) {
2810 if (scan_inst->dst.reg == inst->dst.reg &&
2811 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2812 scan_inst->is_tex())) {
2813 interfered = true;
2814 break;
2815 }
2816 if (scan_inst->dst.reg == inst->src[0].reg &&
2817 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
2818 scan_inst->is_tex())) {
2819 interfered = true;
2820 break;
2821 }
2822 }
2823
2824 /* The gen6 MATH instruction can't handle source modifiers, so avoid
2825 * coalescing those for now. We should do something more specific.
2826 */
2827 if (intel->gen >= 6 && scan_inst->is_math() && has_source_modifiers) {
2828 interfered = true;
2829 break;
2830 }
2831 }
2832 if (interfered) {
2833 continue;
2834 }
2835
2836 /* Rewrite the later usage to point at the source of the move to
2837 * be removed.
2838 */
2839 for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
2840 scan_iter.next()) {
2841 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2842
2843 for (int i = 0; i < 3; i++) {
2844 if (scan_inst->src[i].file == GRF &&
2845 scan_inst->src[i].reg == inst->dst.reg &&
2846 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2847 scan_inst->src[i].reg = inst->src[0].reg;
2848 scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
2849 scan_inst->src[i].abs |= inst->src[0].abs;
2850 scan_inst->src[i].negate ^= inst->src[0].negate;
2851 scan_inst->src[i].smear = inst->src[0].smear;
2852 }
2853 }
2854 }
2855
2856 inst->remove();
2857 progress = true;
2858 }
2859
2860 if (progress)
2861 live_intervals_valid = false;
2862
2863 return progress;
2864 }
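/* A sketch of a successful coalesce (IR notation is illustrative):
 *
 *    mov vgrf8, vgrf5
 *    mul vgrf9, vgrf8, vgrf3    ->  mul vgrf9, vgrf5, vgrf3
 *
 * The MOV is removed once every later read of vgrf8 is redirected to
 * vgrf5, with any abs/negate/smear on the MOV's source folded into the
 * rewritten uses.
 */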
2865
2866
2867 bool
2868 fs_visitor::compute_to_mrf()
2869 {
2870 bool progress = false;
2871 int next_ip = 0;
2872
2873 calculate_live_intervals();
2874
2875 foreach_iter(exec_list_iterator, iter, this->instructions) {
2876 fs_inst *inst = (fs_inst *)iter.get();
2877
2878 int ip = next_ip;
2879 next_ip++;
2880
2881 if (inst->opcode != BRW_OPCODE_MOV ||
2882 inst->predicated ||
2883 inst->dst.file != MRF || inst->src[0].file != GRF ||
2884 inst->dst.type != inst->src[0].type ||
2885 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2886 continue;
2887
2888 /* Work out which hardware MRF registers are written by this
2889 * instruction.
2890 */
2891 int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
2892 int mrf_high;
2893 if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
2894 mrf_high = mrf_low + 4;
2895 } else if (c->dispatch_width == 16 &&
2896 (!inst->force_uncompressed && !inst->force_sechalf)) {
2897 mrf_high = mrf_low + 1;
2898 } else {
2899 mrf_high = mrf_low;
2900 }
2901
2902 /* Can't compute-to-MRF this GRF if someone else was going to
2903 * read it later.
2904 */
2905 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2906 continue;
2907
2908 /* Found a move of a GRF to a MRF. Let's see if we can rewrite
2909 * the instruction that generated this GRF to write into the MRF instead.
2910 */
2911 fs_inst *scan_inst;
2912 for (scan_inst = (fs_inst *)inst->prev;
2913 scan_inst->prev != NULL;
2914 scan_inst = (fs_inst *)scan_inst->prev) {
2915 if (scan_inst->dst.file == GRF &&
2916 scan_inst->dst.reg == inst->src[0].reg) {
2917 /* Found the last thing to write our reg we want to turn
2918 * into a compute-to-MRF.
2919 */
2920
2921 if (scan_inst->is_tex()) {
2922 /* texturing writes several contiguous regs, so we can't
2923 * compute-to-mrf that.
2924 */
2925 break;
2926 }
2927
2928 /* If it's predicated, it (probably) didn't populate all
2929 * the channels. We might be able to rewrite everything
2930 * that writes that reg, but it would require smarter
2931 * tracking to delay the rewriting until complete success.
2932 */
2933 if (scan_inst->predicated)
2934 break;
2935
2936 /* If it covers a different half of the register than the
2937 * MOV we're trying to remove, bail for now.
2938 */
2939 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2940 scan_inst->force_sechalf != inst->force_sechalf) {
2941 break;
2942 }
2943
2944 /* SEND instructions can't have MRF as a destination. */
2945 if (scan_inst->mlen)
2946 break;
2947
2948 if (intel->gen >= 6) {
2949 /* gen6 math instructions must have the destination be
2950 * GRF, so no compute-to-MRF for them.
2951 */
2952 if (scan_inst->is_math()) {
2953 break;
2954 }
2955 }
2956
2957 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2958 /* Found the creator of our MRF's source value. */
2959 scan_inst->dst.file = MRF;
2960 scan_inst->dst.hw_reg = inst->dst.hw_reg;
2961 scan_inst->saturate |= inst->saturate;
2962 inst->remove();
2963 progress = true;
2964 }
2965 break;
2966 }
2967
2968 /* We don't handle flow control here. Most computation of
2969 * values that end up in MRFs are shortly before the MRF
2970 * write anyway.
2971 */
2972 if (scan_inst->opcode == BRW_OPCODE_DO ||
2973 scan_inst->opcode == BRW_OPCODE_WHILE ||
2974 scan_inst->opcode == BRW_OPCODE_ELSE ||
2975 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2976 break;
2977 }
2978
2979 /* You can't read from an MRF, so if someone else reads our
2980 * MRF's source GRF that we wanted to rewrite, that stops us.
2981 */
2982 bool interfered = false;
2983 for (int i = 0; i < 3; i++) {
2984 if (scan_inst->src[i].file == GRF &&
2985 scan_inst->src[i].reg == inst->src[0].reg &&
2986 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2987 interfered = true;
2988 }
2989 }
2990 if (interfered)
2991 break;
2992
2993 if (scan_inst->dst.file == MRF) {
2994 /* If somebody else writes our MRF here, we can't
2995 * compute-to-MRF before that.
2996 */
2997 int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
2998 int scan_mrf_high;
2999
3000 if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
3001 scan_mrf_high = scan_mrf_low + 4;
3002 } else if (c->dispatch_width == 16 &&
3003 (!scan_inst->force_uncompressed &&
3004 !scan_inst->force_sechalf)) {
3005 scan_mrf_high = scan_mrf_low + 1;
3006 } else {
3007 scan_mrf_high = scan_mrf_low;
3008 }
3009
3010 if (mrf_low == scan_mrf_low ||
3011 mrf_low == scan_mrf_high ||
3012 mrf_high == scan_mrf_low ||
3013 mrf_high == scan_mrf_high) {
3014 break;
3015 }
3016 }
3017
3018 if (scan_inst->mlen > 0) {
3019 /* Found a SEND instruction, which means that there are
3020 * live values in MRFs from base_mrf to base_mrf +
3021 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3022 * above it.
3023 */
3024 if (mrf_low >= scan_inst->base_mrf &&
3025 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
3026 break;
3027 }
3028 if (mrf_high >= scan_inst->base_mrf &&
3029 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
3030 break;
3031 }
3032 }
3033 }
3034 }
3035
3036 return progress;
3037 }
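/* A sketch of a successful compute-to-MRF (IR notation is illustrative):
 *
 *    add vgrf7, vgrf1, vgrf2
 *    mov m4, vgrf7              (last read of vgrf7)
 *
 * becomes
 *
 *    add m4, vgrf1, vgrf2
 *
 * with the MOV removed, as long as no flow control, MRF clobber, or other
 * read of vgrf7 sits between the two instructions.
 */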
3038
3039 /**
3040 * Walks through basic blocks, looking for repeated MRF writes and
3041 * removing the later ones.
3042 */
3043 bool
3044 fs_visitor::remove_duplicate_mrf_writes()
3045 {
3046 fs_inst *last_mrf_move[16];
3047 bool progress = false;
3048
3049 /* Need to update the MRF tracking for compressed instructions. */
3050 if (c->dispatch_width == 16)
3051 return false;
3052
3053 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3054
3055 foreach_iter(exec_list_iterator, iter, this->instructions) {
3056 fs_inst *inst = (fs_inst *)iter.get();
3057
3058 switch (inst->opcode) {
3059 case BRW_OPCODE_DO:
3060 case BRW_OPCODE_WHILE:
3061 case BRW_OPCODE_IF:
3062 case BRW_OPCODE_ELSE:
3063 case BRW_OPCODE_ENDIF:
3064 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3065 continue;
3066 default:
3067 break;
3068 }
3069
3070 if (inst->opcode == BRW_OPCODE_MOV &&
3071 inst->dst.file == MRF) {
3072 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3073 if (prev_inst && inst->equals(prev_inst)) {
3074 inst->remove();
3075 progress = true;
3076 continue;
3077 }
3078 }
3079
3080 /* Clear out the last-write records for MRFs that were overwritten. */
3081 if (inst->dst.file == MRF) {
3082 last_mrf_move[inst->dst.hw_reg] = NULL;
3083 }
3084
3085 if (inst->mlen > 0) {
3086 /* Found a SEND instruction, which will include two or fewer
3087 * implied MRF writes. We could do better here.
3088 */
3089 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3090 last_mrf_move[inst->base_mrf + i] = NULL;
3091 }
3092 }
3093
3094 /* Clear out any MRF move records whose sources got overwritten. */
3095 if (inst->dst.file == GRF) {
3096 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3097 if (last_mrf_move[i] &&
3098 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3099 last_mrf_move[i] = NULL;
3100 }
3101 }
3102 }
3103
3104 if (inst->opcode == BRW_OPCODE_MOV &&
3105 inst->dst.file == MRF &&
3106 inst->src[0].file == GRF &&
3107 !inst->predicated) {
3108 last_mrf_move[inst->dst.hw_reg] = inst;
3109 }
3110 }
3111
3112 return progress;
3113 }
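/* For example (IR notation is illustrative), the second of
 *
 *    mov m2, vgrf5
 *    mov m2, vgrf5
 *
 * is removed, provided no flow control intervenes and neither m2 nor
 * vgrf5 is written in between.
 */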
3114
3115 bool
3116 fs_visitor::virtual_grf_interferes(int a, int b)
3117 {
3118 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3119 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
3120
3121 /* We can't handle dead register writes here, without iterating
3122 * over the whole instruction stream to find every single dead
3123 * write to that register to compare to the live interval of the
3124 * other register. Just assert that dead_code_eliminate() has been
3125 * called.
3126 */
3127 assert((this->virtual_grf_use[a] != -1 ||
3128 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3129 (this->virtual_grf_use[b] != -1 ||
3130 this->virtual_grf_def[b] == MAX_INSTRUCTION));
3131
3132 /* If the register is used to store 16 values of less than float
3133 * size (only the case for pixel_[xy]), then we can't allocate
3134 * another dword-sized thing to that register that would be used in
3135 * the same instruction. This is because when the GPU decodes (for
3136 * example):
3137 *
3138 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
3139 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
3140 *
3141 * it's actually processed as:
3142 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
3143 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
3144 *
3145 * so our second half values in g6 got overwritten in the first
3146 * half.
3147 */
3148 if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
3149 this->pixel_x.reg == b ||
3150 this->pixel_y.reg == a ||
3151 this->pixel_y.reg == b)) {
3152 return start <= end;
3153 }
3154
3155 return start < end;
3156 }
3157
3158 bool
3159 fs_visitor::run()
3160 {
3161 uint32_t prog_offset_16 = 0;
3162 uint32_t orig_nr_params = c->prog_data.nr_params;
3163
3164 brw_wm_payload_setup(brw, c);
3165
3166 if (c->dispatch_width == 16) {
3167 /* Align to a 64-byte boundary. */
3168 while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
3169 brw_NOP(p);
3170 }
3171
3172 /* Save off the start of this 16-wide program in case we succeed. */
3173 prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);
3174
3175 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
3176 }
3177
3178 if (0) {
3179 emit_dummy_fs();
3180 } else {
3181 calculate_urb_setup();
3182 if (intel->gen < 6)
3183 emit_interpolation_setup_gen4();
3184 else
3185 emit_interpolation_setup_gen6();
3186
3187 /* Generate FS IR for main(). (the visitor only descends into
3188 * functions called "main").
3189 */
3190 foreach_iter(exec_list_iterator, iter, *shader->ir) {
3191 ir_instruction *ir = (ir_instruction *)iter.get();
3192 base_ir = ir;
3193 this->result = reg_undef;
3194 ir->accept(this);
3195 }
3196
3197 emit_fb_writes();
3198
3199 split_virtual_grfs();
3200
3201 setup_paramvalues_refs();
3202 setup_pull_constants();
3203
3204 bool progress;
3205 do {
3206 progress = false;
3207
3208 progress = remove_duplicate_mrf_writes() || progress;
3209
3210 progress = propagate_constants() || progress;
3211 progress = register_coalesce() || progress;
3212 progress = compute_to_mrf() || progress;
3213 progress = dead_code_eliminate() || progress;
3214 } while (progress);
3215
3216 schedule_instructions();
3217
3218 assign_curb_setup();
3219 assign_urb_setup();
3220
3221 if (0) {
3222 /* Debug of register spilling: Go spill everything. */
3223 int virtual_grf_count = virtual_grf_next;
3224 for (int i = 1; i < virtual_grf_count; i++) {
3225 spill_reg(i);
3226 }
3227 }
3228
3229 if (0)
3230 assign_regs_trivial();
3231 else {
3232 while (!assign_regs()) {
3233 if (failed)
3234 break;
3235 }
3236 }
3237 }
3238 assert(force_uncompressed_stack == 0);
3239 assert(force_sechalf_stack == 0);
3240
3241 if (failed)
3242 return false;
3243
3244 generate_code();
3245
3246 if (c->dispatch_width == 8) {
3247 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3248 } else {
3249 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3250 c->prog_data.prog_offset_16 = prog_offset_16;
3251
3252 /* Make sure we didn't try to sneak in an extra uniform */
3253 assert(orig_nr_params == c->prog_data.nr_params);
3254 }
3255
3256 return !failed;
3257 }
3258
3259 bool
3260 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3261 {
3262 struct intel_context *intel = &brw->intel;
3263 struct gl_context *ctx = &intel->ctx;
3264 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3265
3266 if (!prog)
3267 return false;
3268
3269 struct brw_shader *shader =
3270 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3271 if (!shader)
3272 return false;
3273
3274 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3275 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3276 _mesa_print_ir(shader->ir, NULL);
3277 printf("\n\n");
3278 }
3279
3280 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3281 */
3282 c->dispatch_width = 8;
3283
3284 fs_visitor v(c, shader);
3285 if (!v.run()) {
3286 /* FINISHME: Cleanly fail, test at link time, etc. */
3287 assert(!"not reached");
3288 return false;
3289 }
3290
3291 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
3292 c->dispatch_width = 16;
3293 fs_visitor v2(c, shader);
3294 v2.import_uniforms(v.variable_ht);
3295 v2.run();
3296 }
3297
3298 c->prog_data.dispatch_width = 8;
3299
3300 return true;
3301 }