glsl: Calculate Mesa state slots in front-end instead of back-end
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt <eric@anholt.net>
 *
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/prog_optimize.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "../glsl/glsl_types.h"
#include "../glsl/ir_optimization.h"
#include "../glsl/ir_print_visitor.h"

#define MAX_INSTRUCTION (1 << 30)
static struct brw_reg brw_reg_from_fs_reg(class fs_reg *reg);

struct gl_shader *
brw_new_shader(struct gl_context *ctx, GLuint name, GLuint type)
{
   struct brw_shader *shader;

   shader = rzalloc(NULL, struct brw_shader);
   if (shader) {
      shader->base.Type = type;
      shader->base.Name = name;
      _mesa_init_shader(ctx, &shader->base);
   }

   return &shader->base;
}

struct gl_shader_program *
brw_new_shader_program(struct gl_context *ctx, GLuint name)
{
   struct brw_shader_program *prog;
   prog = rzalloc(NULL, struct brw_shader_program);
   if (prog) {
      prog->base.Name = name;
      _mesa_init_shader_program(ctx, &prog->base);
   }
   return &prog->base;
}

GLboolean
brw_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;

   struct brw_shader *shader =
      (struct brw_shader *)prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   if (shader != NULL) {
      void *mem_ctx = ralloc_context(NULL);
      bool progress;

      if (shader->ir)
         ralloc_free(shader->ir);
      shader->ir = new(shader) exec_list;
      clone_ir_list(mem_ctx, shader->ir, shader->base.ir);

      do_mat_op_to_vec(shader->ir);
      lower_instructions(shader->ir,
                         MOD_TO_FRACT |
                         DIV_TO_MUL_RCP |
                         SUB_TO_ADD_NEG |
                         EXP_TO_EXP2 |
                         LOG_TO_LOG2);

      /* Pre-gen6 HW can only nest if-statements 16 deep.  Beyond this,
       * if-statements need to be flattened.
       */
      if (intel->gen < 6)
         lower_if_to_cond_assign(shader->ir, 16);

      do_lower_texture_projection(shader->ir);
      do_vec_index_to_cond_assign(shader->ir);
      brw_do_cubemap_normalize(shader->ir);
      lower_noise(shader->ir);
      lower_quadop_vector(shader->ir, false);
      lower_variable_index_to_cond_assign(shader->ir,
                                          GL_TRUE, /* input */
                                          GL_TRUE, /* output */
                                          GL_TRUE, /* temp */
                                          GL_TRUE /* uniform */
                                          );

      do {
         progress = false;

         brw_do_channel_expressions(shader->ir);
         brw_do_vector_splitting(shader->ir);

         progress = do_lower_jumps(shader->ir, true, true,
                                   true, /* main return */
                                   false, /* continue */
                                   false /* loops */
                                   ) || progress;

         progress = do_common_optimization(shader->ir, true, 32) || progress;
      } while (progress);

      validate_ir_tree(shader->ir);

      reparent_ir(shader->ir, shader->ir);
      ralloc_free(mem_ctx);
   }

   if (!_mesa_ir_link_shader(ctx, prog))
      return GL_FALSE;

   return GL_TRUE;
}

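/* Returns the size of a GLSL type in scalar components (one register slot
 * per component); this is the unit used below for virtual GRF allocation
 * and uniform layout.
 */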
static int
type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   if (!failed) {
      failed = true;

      if (INTEL_DEBUG & DEBUG_WM) {
         fprintf(stderr, "FS compile failed: ");

         va_list va;
         va_start(va, format);
         vfprintf(stderr, format, va);
         va_end(va);
      }
   }
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      return 1;
   case FS_OPCODE_POW:
      return 2;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

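/* Allocates a new virtual GRF of the given size and returns its register
 * number, doubling the size-tracking array whenever it fills up.
 */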
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}

int
brw_type_for_base_type(const struct glsl_type *type)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
      return BRW_REGISTER_TYPE_F;
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      return BRW_REGISTER_TYPE_D;
   case GLSL_TYPE_UINT:
      return BRW_REGISTER_TYPE_UD;
   case GLSL_TYPE_ARRAY:
   case GLSL_TYPE_STRUCT:
   case GLSL_TYPE_SAMPLER:
      /* These should be overridden with the type of the member when
       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
       * way to trip up if we don't.
       */
      return BRW_REGISTER_TYPE_UD;
   default:
      assert(!"not reached");
      return BRW_REGISTER_TYPE_F;
   }
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
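      /* Matrices are laid out as consecutive float column vectors in the
       * parameter list, so recurse once per column.
       */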
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         assert(param < ARRAY_SIZE(c->prog_data.param));

         switch (type->base_type) {
         case GLSL_TYPE_FLOAT:
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         case GLSL_TYPE_UINT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
            break;
         case GLSL_TYPE_INT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
            break;
         case GLSL_TYPE_BOOL:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
            break;
         default:
            assert(!"not reached");
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         }
         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa,
       * but we'll get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a
       * parameter.  This'll end up matching the expected layout of
       * the array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param_convert[c->prog_data.nr_params] =
            PARAM_NO_CONVERT;
         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   fs_reg neg_y = this->pixel_y;
   neg_y.negate = true;
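   /* Flip gl_FragCoord.y when the origin requested by the shader doesn't
    * match the orientation we're actually rendering with: window system
    * buffers have an upper-left origin, while FBOs keep GL's lower-left
    * origin.
    */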
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_MOV, wpos,
           fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in the interpolation setup pass. */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         bool is_gl_Color =
            location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

         if (c->key.flat_shade && is_gl_Color) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Perspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               emit(FS_OPCODE_LINTERP, attr,
                    this->delta_x, this->delta_y, fs_reg(interp));
               attr.reg_offset++;
            }

            if (intel->gen < 6 && !(is_gl_Color && c->key.linear_color)) {
               attr.reg_offset -= type->vector_elements;
               for (unsigned int k = 0; k < type->vector_elements; k++) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  attr.reg_offset++;
               }
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
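      /* Bit 15 of g0.0 is set when the primitive is back-facing: the ASR
       * brings that bit down to bit 0, the NOT inverts it, and the AND
       * with 1 masks off everything else, leaving a 0/1 front-facing
       * boolean.
       */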
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
                           fs_reg(r1_6ud),
                           fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = 1;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
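      /* Pre-gen6 math is a send to the shared math unit: the second
       * operand has to be loaded into the following message register,
       * and the instruction itself carries a null src1.
       */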
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2;
   }
   return inst;
}

void
fs_visitor::visit(ir_variable *ir)
{
   fs_reg *reg = NULL;

   if (variable_storage(ir))
      return;

   if (strcmp(ir->name, "gl_FragColor") == 0) {
      this->frag_color = ir;
   } else if (strcmp(ir->name, "gl_FragData") == 0) {
      this->frag_data = ir;
   } else if (strcmp(ir->name, "gl_FragDepth") == 0) {
      this->frag_depth = ir;
   }

   if (ir->mode == ir_var_in) {
      if (!strcmp(ir->name, "gl_FragCoord")) {
         reg = emit_fragcoord_interpolation(ir);
      } else if (!strcmp(ir->name, "gl_FrontFacing")) {
         reg = emit_frontfacing_interpolation(ir);
      } else {
         reg = emit_general_interpolation(ir);
      }
      assert(reg);
      hash_table_insert(this->variable_ht, reg, ir);
      return;
   }

   if (ir->mode == ir_var_uniform) {
      int param_index = c->prog_data.nr_params;

      if (!strncmp(ir->name, "gl_", 3)) {
         setup_builtin_uniform_values(ir);
      } else {
         setup_uniform_values(ir->location, ir->type);
      }

      reg = new(this->mem_ctx) fs_reg(UNIFORM, param_index);
      reg->type = brw_type_for_base_type(ir->type);
   }

   if (!reg)
      reg = new(this->mem_ctx) fs_reg(this, ir->type);

   hash_table_insert(this->variable_ht, reg, ir);
}

void
fs_visitor::visit(ir_dereference_variable *ir)
{
   fs_reg *reg = variable_storage(ir->var);
   this->result = *reg;
}

void
fs_visitor::visit(ir_dereference_record *ir)
{
   const glsl_type *struct_type = ir->record->type;

   ir->record->accept(this);

   unsigned int offset = 0;
   for (unsigned int i = 0; i < struct_type->length; i++) {
      if (strcmp(struct_type->fields.structure[i].name, ir->field) == 0)
         break;
      offset += type_size(struct_type->fields.structure[i].type);
   }
   this->result.reg_offset += offset;
   this->result.type = brw_type_for_base_type(ir->type);
}

void
fs_visitor::visit(ir_dereference_array *ir)
{
   ir_constant *index;
   int element_size;

   ir->array->accept(this);
   index = ir->array_index->as_constant();

   element_size = type_size(ir->type);
   this->result.type = brw_type_for_base_type(ir->type);

   if (index) {
      assert(this->result.file == UNIFORM ||
             (this->result.file == GRF &&
              this->result.reg != 0));
      this->result.reg_offset += index->value.i[0] * element_size;
   } else {
      assert(!"FINISHME: non-constant array element");
   }
}

/* Instruction selection: Produce a MOV.sat instead of
 * MIN(MAX(val, 0), 1) when possible.
 */
bool
fs_visitor::try_emit_saturate(ir_expression *ir)
{
   ir_rvalue *sat_val = ir->as_rvalue_to_saturate();

   if (!sat_val)
      return false;

   sat_val->accept(this);
   fs_reg src = this->result;

   this->result = fs_reg(this, ir->type);
   fs_inst *inst = emit(BRW_OPCODE_MOV, this->result, src);
   inst->saturate = true;

   return true;
}

static uint32_t
brw_conditional_for_comparison(unsigned int op)
{
   switch (op) {
   case ir_binop_less:
      return BRW_CONDITIONAL_L;
   case ir_binop_greater:
      return BRW_CONDITIONAL_G;
   case ir_binop_lequal:
      return BRW_CONDITIONAL_LE;
   case ir_binop_gequal:
      return BRW_CONDITIONAL_GE;
   case ir_binop_equal:
   case ir_binop_all_equal: /* same as equal for scalars */
      return BRW_CONDITIONAL_Z;
   case ir_binop_nequal:
   case ir_binop_any_nequal: /* same as nequal for scalars */
      return BRW_CONDITIONAL_NZ;
   default:
      assert(!"not reached: bad operation for comparison");
      return BRW_CONDITIONAL_NZ;
   }
}

void
fs_visitor::visit(ir_expression *ir)
{
   unsigned int operand;
   fs_reg op[2], temp;
   fs_inst *inst;

   assert(ir->get_num_operands() <= 2);

   if (try_emit_saturate(ir))
      return;

   for (operand = 0; operand < ir->get_num_operands(); operand++) {
      ir->operands[operand]->accept(this);
      if (this->result.file == BAD_FILE) {
         ir_print_visitor v;
         fail("Failed to get tree for expression operand:\n");
         ir->operands[operand]->accept(&v);
      }
      op[operand] = this->result;

      /* Matrix expression operands should have been broken down to vector
       * operations already.
       */
      assert(!ir->operands[operand]->type->is_matrix());
      /* And then those vector operands should have been broken down to scalar.
       */
      assert(!ir->operands[operand]->type->is_vector());
   }

   /* Storage for our result.  If our result goes into an assignment, it will
    * just get copy-propagated out, so no worries.
    */
   this->result = fs_reg(this, ir->type);

   switch (ir->operation) {
   case ir_unop_logic_not:
      /* Note that BRW_OPCODE_NOT is not appropriate here, since it is
       * the one's complement of the whole register, not just bit 0.
       */
      emit(BRW_OPCODE_XOR, this->result, op[0], fs_reg(1));
      break;
   case ir_unop_neg:
      op[0].negate = !op[0].negate;
      this->result = op[0];
      break;
   case ir_unop_abs:
      op[0].abs = true;
      op[0].negate = false;
      this->result = op[0];
      break;
   case ir_unop_sign:
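      /* sign(x): start the result at 0.0, then use predicated MOVs to
       * overwrite it with 1.0 where x > 0 and -1.0 where x < 0.
       */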
      temp = fs_reg(this, ir->type);

      emit(BRW_OPCODE_MOV, this->result, fs_reg(0.0f));

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_G;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(1.0f));
      inst->predicated = true;

      inst = emit(BRW_OPCODE_CMP, reg_null_f, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      inst = emit(BRW_OPCODE_MOV, this->result, fs_reg(-1.0f));
      inst->predicated = true;

      break;
   case ir_unop_rcp:
      emit_math(FS_OPCODE_RCP, this->result, op[0]);
      break;

   case ir_unop_exp2:
      emit_math(FS_OPCODE_EXP2, this->result, op[0]);
      break;
   case ir_unop_log2:
      emit_math(FS_OPCODE_LOG2, this->result, op[0]);
      break;
   case ir_unop_exp:
   case ir_unop_log:
      assert(!"not reached: should be handled by ir_explog_to_explog2");
      break;
   case ir_unop_sin:
   case ir_unop_sin_reduced:
      emit_math(FS_OPCODE_SIN, this->result, op[0]);
      break;
   case ir_unop_cos:
   case ir_unop_cos_reduced:
      emit_math(FS_OPCODE_COS, this->result, op[0]);
      break;

   case ir_unop_dFdx:
      emit(FS_OPCODE_DDX, this->result, op[0]);
      break;
   case ir_unop_dFdy:
      emit(FS_OPCODE_DDY, this->result, op[0]);
      break;

   case ir_binop_add:
      emit(BRW_OPCODE_ADD, this->result, op[0], op[1]);
      break;
   case ir_binop_sub:
      assert(!"not reached: should be handled by ir_sub_to_add_neg");
      break;

   case ir_binop_mul:
      emit(BRW_OPCODE_MUL, this->result, op[0], op[1]);
      break;
   case ir_binop_div:
      assert(!"not reached: should be handled by ir_div_to_mul_rcp");
      break;
   case ir_binop_mod:
      assert(!"ir_binop_mod should have been converted to b * fract(a/b)");
      break;

   case ir_binop_less:
   case ir_binop_greater:
   case ir_binop_lequal:
   case ir_binop_gequal:
   case ir_binop_equal:
   case ir_binop_all_equal:
   case ir_binop_nequal:
   case ir_binop_any_nequal:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
         temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], op[1]);
      inst->conditional_mod = brw_conditional_for_comparison(ir->operation);
      emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(0x1));
      break;

   case ir_binop_logic_xor:
      emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_or:
      emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_binop_logic_and:
      emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;

   case ir_binop_dot:
   case ir_unop_any:
      assert(!"not reached: should be handled by brw_fs_channel_expressions");
      break;

   case ir_unop_noise:
      assert(!"not reached: should be handled by lower_noise");
      break;

   case ir_quadop_vector:
      assert(!"not reached: should be handled by lower_quadop_vector");
      break;

   case ir_unop_sqrt:
      emit_math(FS_OPCODE_SQRT, this->result, op[0]);
      break;

   case ir_unop_rsq:
      emit_math(FS_OPCODE_RSQ, this->result, op[0]);
      break;

   case ir_unop_i2f:
   case ir_unop_b2f:
   case ir_unop_b2i:
   case ir_unop_f2i:
      emit(BRW_OPCODE_MOV, this->result, op[0]);
      break;
   case ir_unop_f2b:
   case ir_unop_i2b:
      temp = this->result;
      /* original gen4 does implicit conversion before comparison. */
      if (intel->gen < 5)
         temp.type = op[0].type;

      inst = emit(BRW_OPCODE_CMP, temp, op[0], fs_reg(0.0f));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
      inst = emit(BRW_OPCODE_AND, this->result, this->result, fs_reg(1));
      break;

   case ir_unop_trunc:
      emit(BRW_OPCODE_RNDZ, this->result, op[0]);
      break;
   case ir_unop_ceil:
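      /* The hardware has round-down (RNDD) but no round-up, so emit
       * ceil(x) as -floor(-x).
       */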
      op[0].negate = !op[0].negate;
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      this->result.negate = true;
      break;
   case ir_unop_floor:
      inst = emit(BRW_OPCODE_RNDD, this->result, op[0]);
      break;
   case ir_unop_fract:
      inst = emit(BRW_OPCODE_FRC, this->result, op[0]);
      break;
   case ir_unop_round_even:
      emit(BRW_OPCODE_RNDE, this->result, op[0]);
      break;

   case ir_binop_min:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_L;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;
   case ir_binop_max:
      inst = emit(BRW_OPCODE_CMP, this->result, op[0], op[1]);
      inst->conditional_mod = BRW_CONDITIONAL_G;

      inst = emit(BRW_OPCODE_SEL, this->result, op[0], op[1]);
      inst->predicated = true;
      break;

   case ir_binop_pow:
      emit_math(FS_OPCODE_POW, this->result, op[0], op[1]);
      break;

   case ir_unop_bit_not:
      inst = emit(BRW_OPCODE_NOT, this->result, op[0]);
      break;
   case ir_binop_bit_and:
      inst = emit(BRW_OPCODE_AND, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_xor:
      inst = emit(BRW_OPCODE_XOR, this->result, op[0], op[1]);
      break;
   case ir_binop_bit_or:
      inst = emit(BRW_OPCODE_OR, this->result, op[0], op[1]);
      break;

   case ir_unop_u2f:
   case ir_binop_lshift:
   case ir_binop_rshift:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
}

void
fs_visitor::emit_assignment_writes(fs_reg &l, fs_reg &r,
                                   const glsl_type *type, bool predicated)
{
   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->components(); i++) {
         l.type = brw_type_for_base_type(type);
         r.type = brw_type_for_base_type(type);

         fs_inst *inst = emit(BRW_OPCODE_MOV, l, r);
         inst->predicated = predicated;

         l.reg_offset++;
         r.reg_offset++;
      }
      break;
   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.array, predicated);
      }
      break;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         emit_assignment_writes(l, r, type->fields.structure[i].type,
                                predicated);
      }
      break;

   case GLSL_TYPE_SAMPLER:
      break;

   default:
      assert(!"not reached");
      break;
   }
}

void
fs_visitor::visit(ir_assignment *ir)
{
   struct fs_reg l, r;
   fs_inst *inst;

   /* FINISHME: arrays on the lhs */
   ir->lhs->accept(this);
   l = this->result;

   ir->rhs->accept(this);
   r = this->result;

   assert(l.file != BAD_FILE);
   assert(r.file != BAD_FILE);

   if (ir->condition) {
      emit_bool_to_cond_code(ir->condition);
   }

   if (ir->lhs->type->is_scalar() ||
       ir->lhs->type->is_vector()) {
      for (int i = 0; i < ir->lhs->type->vector_elements; i++) {
         if (ir->write_mask & (1 << i)) {
            inst = emit(BRW_OPCODE_MOV, l, r);
            if (ir->condition)
               inst->predicated = true;
            r.reg_offset++;
         }
         l.reg_offset++;
      }
   } else {
      emit_assignment_writes(l, r, ir->lhs->type, ir->condition != NULL);
   }
}

fs_inst *
fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   int mlen;
   int base_mrf = 1;
   bool simd16 = false;
   fs_reg orig_dst;

   /* g0 header. */
   mlen = 1;

   if (ir->shadow_comparitor) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;

      if (ir->op == ir_tex) {
         /* There's no plain shadow compare message, so we use shadow
          * compare with a bias of 0.0.
          */
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), fs_reg(0.0f));
         mlen++;
      } else if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      } else {
         assert(ir->op == ir_txl);
         ir->lod_info.lod->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      }

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   } else if (ir->op == ir_tex) {
      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
         coordinate.reg_offset++;
      }
      /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
      mlen += 3;
   } else if (ir->op == ir_txd) {
      assert(!"TXD isn't supported on gen4 yet.");
   } else {
      /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
       * instructions.  We'll need to do SIMD16 here.
       */
      assert(ir->op == ir_txb || ir->op == ir_txl);

      for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i * 2), coordinate);
         coordinate.reg_offset++;
      }

      /* lod/bias appears after u/v/r. */
      mlen += 6;

      if (ir->op == ir_txb) {
         ir->lod_info.bias->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      } else {
         ir->lod_info.lod->accept(this);
         emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
         mlen++;
      }

      /* The unused upper half. */
      mlen++;

      /* Now, since we're doing simd16, the return is 2 interleaved
       * vec4s where the odd-indexed ones are junk.  We'll need to move
       * this weirdness around to the expected layout.
       */
      simd16 = true;
      orig_dst = dst;
      dst = fs_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type,
                                                       2));
      dst.type = BRW_REGISTER_TYPE_F;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
      inst = emit(FS_OPCODE_TXD, dst);
      break;
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   if (simd16) {
      for (int i = 0; i < 4; i++) {
         emit(BRW_OPCODE_MOV, orig_dst, dst);
         orig_dst.reg_offset++;
         dst.reg_offset += 2;
      }
   }

   return inst;
}

fs_inst *
fs_visitor::emit_texture_gen5(ir_texture *ir, fs_reg dst, fs_reg coordinate)
{
   /* gen5's SIMD8 sampler has slots for u, v, r, array index, then
    * optional parameters like the shadow comparator or LOD bias.  If
    * those optional parameters aren't present, the base slots are
    * optional too and don't need to be included in the message.
    *
    * We don't fill in the unnecessary slots regardless, which may
    * look surprising in the disassembly.
    */
   int mlen = 1; /* g0 header always present. */
   int base_mrf = 1;

   for (int i = 0; i < ir->coordinate->type->vector_elements; i++) {
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen + i), coordinate);
      coordinate.reg_offset++;
   }
   mlen += ir->coordinate->type->vector_elements;

   if (ir->shadow_comparitor) {
      mlen = MAX2(mlen, 5);

      ir->shadow_comparitor->accept(this);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;
   }

   fs_inst *inst = NULL;
   switch (ir->op) {
   case ir_tex:
      inst = emit(FS_OPCODE_TEX, dst);
      break;
   case ir_txb:
      ir->lod_info.bias->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXB, dst);
      break;
   case ir_txl:
      ir->lod_info.lod->accept(this);
      mlen = MAX2(mlen, 5);
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + mlen), this->result);
      mlen++;

      inst = emit(FS_OPCODE_TXL, dst);
      break;
   case ir_txd:
   case ir_txf:
      assert(!"GLSL 1.30 features unsupported");
      break;
   }
   inst->base_mrf = base_mrf;
   inst->mlen = mlen;

   return inst;
}

void
fs_visitor::visit(ir_texture *ir)
{
   int sampler;
   fs_inst *inst = NULL;

   ir->coordinate->accept(this);
   fs_reg coordinate = this->result;

   if (ir->offset != NULL) {
      ir_constant *offset = ir->offset->as_constant();
      assert(offset != NULL);

      signed char offsets[3];
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++)
         offsets[i] = (signed char) offset->value.i[i];

      /* Combine all three offsets into a single unsigned dword:
       *
       *    bits 11:8 - U Offset (X component)
       *    bits  7:4 - V Offset (Y component)
       *    bits  3:0 - R Offset (Z component)
       */
      unsigned offset_bits = 0;
      for (unsigned i = 0; i < ir->offset->type->vector_elements; i++) {
         const unsigned shift = 4 * (2 - i);
         offset_bits |= (offsets[i] << shift) & (0xF << shift);
      }

      /* Explicitly set up the message header by copying g0 to msg reg m1. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, 1, BRW_REGISTER_TYPE_UD),
           fs_reg(GRF, 0, BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2 of the message header. */
      emit(BRW_OPCODE_MOV,
           fs_reg(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 1, 2),
                         BRW_REGISTER_TYPE_UD)),
           fs_reg(brw_imm_uw(offset_bits)));
   }

   /* Should be lowered by do_lower_texture_projection */
   assert(!ir->projector);

   sampler = _mesa_get_sampler_uniform_value(ir->sampler,
                                             ctx->Shader.CurrentFragmentProgram,
                                             &brw->fragment_program->Base);
   sampler = c->fp->program.Base.SamplerUnits[sampler];

   /* The 965 requires the EU to do the normalization of GL rectangle
    * texture coordinates.  We use the program parameter state
    * tracking to get the scaling factor.
    */
   if (ir->sampler->type->sampler_dimensionality == GLSL_SAMPLER_DIM_RECT) {
      struct gl_program_parameter_list *params = c->fp->program.Base.Parameters;
      int tokens[STATE_LENGTH] = {
         STATE_INTERNAL,
         STATE_TEXRECT_SCALE,
         sampler,
         0,
         0
      };

      c->prog_data.param_convert[c->prog_data.nr_params] =
         PARAM_NO_CONVERT;
      c->prog_data.param_convert[c->prog_data.nr_params + 1] =
         PARAM_NO_CONVERT;

      fs_reg scale_x = fs_reg(UNIFORM, c->prog_data.nr_params);
      fs_reg scale_y = fs_reg(UNIFORM, c->prog_data.nr_params + 1);
      GLuint index = _mesa_add_state_reference(params,
                                               (gl_state_index *)tokens);

      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 0;
      c->prog_data.nr_params++;
      this->param_index[c->prog_data.nr_params] = index;
      this->param_offset[c->prog_data.nr_params] = 1;
      c->prog_data.nr_params++;

      fs_reg dst = fs_reg(this, ir->coordinate->type);
      fs_reg src = coordinate;
      coordinate = dst;

      emit(BRW_OPCODE_MUL, dst, src, scale_x);
      dst.reg_offset++;
      src.reg_offset++;
      emit(BRW_OPCODE_MUL, dst, src, scale_y);
   }

   /* Writemasking doesn't eliminate channels on SIMD8 texture
    * samples, so don't worry about them.
    */
   fs_reg dst = fs_reg(this, glsl_type::vec4_type);

   if (intel->gen < 5) {
      inst = emit_texture_gen4(ir, dst, coordinate);
   } else {
      inst = emit_texture_gen5(ir, dst, coordinate);
   }

   /* If there's an offset, we already set up m1.  To avoid the implied move,
    * use the null register.  Otherwise, we want an implied move from g0.
    */
   if (ir->offset != NULL)
      inst->src[0] = fs_reg(brw_null_reg());
   else
      inst->src[0] = fs_reg(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));

   inst->sampler = sampler;

   this->result = dst;

   if (ir->shadow_comparitor)
      inst->shadow_compare = true;

   if (ir->type == glsl_type::float_type) {
      /* Ignore DEPTH_TEXTURE_MODE swizzling. */
      assert(ir->sampler->type->sampler_shadow);
   } else if (c->key.tex_swizzles[inst->sampler] != SWIZZLE_NOOP) {
      fs_reg swizzle_dst = fs_reg(this, glsl_type::vec4_type);

      for (int i = 0; i < 4; i++) {
         int swiz = GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
         fs_reg l = swizzle_dst;
         l.reg_offset += i;

         if (swiz == SWIZZLE_ZERO) {
            emit(BRW_OPCODE_MOV, l, fs_reg(0.0f));
         } else if (swiz == SWIZZLE_ONE) {
            emit(BRW_OPCODE_MOV, l, fs_reg(1.0f));
         } else {
            fs_reg r = dst;
            r.reg_offset += GET_SWZ(c->key.tex_swizzles[inst->sampler], i);
            emit(BRW_OPCODE_MOV, l, r);
         }
      }
      this->result = swizzle_dst;
   }
}

void
fs_visitor::visit(ir_swizzle *ir)
{
   ir->val->accept(this);
   fs_reg val = this->result;

   if (ir->type->vector_elements == 1) {
      this->result.reg_offset += ir->mask.x;
      return;
   }

   fs_reg result = fs_reg(this, ir->type);
   this->result = result;

   for (unsigned int i = 0; i < ir->type->vector_elements; i++) {
      fs_reg channel = val;
      int swiz = 0;

      switch (i) {
      case 0:
         swiz = ir->mask.x;
         break;
      case 1:
         swiz = ir->mask.y;
         break;
      case 2:
         swiz = ir->mask.z;
         break;
      case 3:
         swiz = ir->mask.w;
         break;
      }

      channel.reg_offset += swiz;
      emit(BRW_OPCODE_MOV, result, channel);
      result.reg_offset++;
   }
}

void
fs_visitor::visit(ir_discard *ir)
{
   fs_reg temp = fs_reg(this, glsl_type::uint_type);

   assert(ir->condition == NULL); /* FINISHME */

   emit(FS_OPCODE_DISCARD_NOT, temp, reg_null_d);
   emit(FS_OPCODE_DISCARD_AND, reg_null_d, temp);
   kill_emitted = true;
}

void
fs_visitor::visit(ir_constant *ir)
{
   /* Set this->result to reg at the bottom of the function because some code
    * paths will cause this visitor to be applied to other fields.  This will
    * cause the value stored in this->result to be modified.
    *
    * Make reg constant so that it doesn't get accidentally modified along the
    * way.  Yes, I actually had this problem. :(
    */
   const fs_reg reg(this, ir->type);
   fs_reg dst_reg = reg;

   if (ir->type->is_array()) {
      const unsigned size = type_size(ir->type->fields.array);

      for (unsigned i = 0; i < ir->type->length; i++) {
         ir->array_elements[i]->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else if (ir->type->is_record()) {
      foreach_list(node, &ir->components) {
         ir_instruction *const field = (ir_instruction *) node;
         const unsigned size = type_size(field->type);

         field->accept(this);
         fs_reg src_reg = this->result;

         dst_reg.type = src_reg.type;
         for (unsigned j = 0; j < size; j++) {
            emit(BRW_OPCODE_MOV, dst_reg, src_reg);
            src_reg.reg_offset++;
            dst_reg.reg_offset++;
         }
      }
   } else {
      const unsigned size = type_size(ir->type);

      for (unsigned i = 0; i < size; i++) {
         switch (ir->type->base_type) {
         case GLSL_TYPE_FLOAT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.f[i]));
            break;
         case GLSL_TYPE_UINT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.u[i]));
            break;
         case GLSL_TYPE_INT:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg(ir->value.i[i]));
            break;
         case GLSL_TYPE_BOOL:
            emit(BRW_OPCODE_MOV, dst_reg, fs_reg((int)ir->value.b[i]));
            break;
         default:
            assert(!"Non-float/uint/int/bool constant");
         }
         dst_reg.reg_offset++;
      }
   }

   this->result = reg;
}

void
fs_visitor::emit_bool_to_cond_code(ir_rvalue *ir)
{
   ir_expression *expr = ir->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], fs_reg(1));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         break;

      case ir_binop_logic_xor:
         inst = emit(BRW_OPCODE_XOR, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_or:
         inst = emit(BRW_OPCODE_OR, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_logic_and:
         inst = emit(BRW_OPCODE_AND, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_f2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0.0f));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_f, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_unop_i2b:
         if (intel->gen >= 6) {
            inst = emit(BRW_OPCODE_CMP, reg_null_d, op[0], fs_reg(0));
         } else {
            inst = emit(BRW_OPCODE_MOV, reg_null_d, op[0]);
         }
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         break;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         inst = emit(BRW_OPCODE_CMP, reg_null_cmp, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         break;

      default:
         assert(!"not reached");
         fail("bad cond code\n");
         break;
      }
      return;
   }

   ir->accept(this);

   if (intel->gen >= 6) {
      fs_inst *inst = emit(BRW_OPCODE_AND, reg_null_d, this->result, fs_reg(1));
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   } else {
      fs_inst *inst = emit(BRW_OPCODE_MOV, reg_null_d, this->result);
      inst->conditional_mod = BRW_CONDITIONAL_NZ;
   }
}

/**
 * Emit a gen6 IF statement with the comparison folded into the IF
 * instruction.
 */
void
fs_visitor::emit_if_gen6(ir_if *ir)
{
   ir_expression *expr = ir->condition->as_expression();

   if (expr) {
      fs_reg op[2];
      fs_inst *inst;
      fs_reg temp;

      assert(expr->get_num_operands() <= 2);
      for (unsigned int i = 0; i < expr->get_num_operands(); i++) {
         assert(expr->operands[i]->type->is_scalar());

         expr->operands[i]->accept(this);
         op[i] = this->result;
      }

      switch (expr->operation) {
      case ir_unop_logic_not:
         inst = emit(BRW_OPCODE_IF, temp, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_Z;
         return;

      case ir_binop_logic_xor:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_or:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_OR, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_logic_and:
         temp = fs_reg(this, glsl_type::bool_type);
         emit(BRW_OPCODE_AND, temp, op[0], op[1]);
         inst = emit(BRW_OPCODE_IF, reg_null_d, temp, fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_f2b:
         inst = emit(BRW_OPCODE_IF, reg_null_f, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_unop_i2b:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         return;

      case ir_binop_greater:
      case ir_binop_gequal:
      case ir_binop_less:
      case ir_binop_lequal:
      case ir_binop_equal:
      case ir_binop_all_equal:
      case ir_binop_nequal:
      case ir_binop_any_nequal:
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], op[1]);
         inst->conditional_mod =
            brw_conditional_for_comparison(expr->operation);
         return;
      default:
         assert(!"not reached");
         inst = emit(BRW_OPCODE_IF, reg_null_d, op[0], fs_reg(0));
         inst->conditional_mod = BRW_CONDITIONAL_NZ;
         fail("bad condition\n");
         return;
      }
      return;
   }

   ir->condition->accept(this);

   fs_inst *inst = emit(BRW_OPCODE_IF, reg_null_d, this->result, fs_reg(0));
   inst->conditional_mod = BRW_CONDITIONAL_NZ;
}

void
fs_visitor::visit(ir_if *ir)
{
   fs_inst *inst;

   /* Don't point the annotation at the if statement, because then it plus
    * the then and else blocks get printed.
    */
   this->base_ir = ir->condition;

   if (intel->gen >= 6) {
      emit_if_gen6(ir);
   } else {
      emit_bool_to_cond_code(ir->condition);

      inst = emit(BRW_OPCODE_IF);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->then_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();
      this->base_ir = ir;

      ir->accept(this);
   }

   if (!ir->else_instructions.is_empty()) {
      emit(BRW_OPCODE_ELSE);

      foreach_iter(exec_list_iterator, iter, ir->else_instructions) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }

   emit(BRW_OPCODE_ENDIF);
}

void
fs_visitor::visit(ir_loop *ir)
{
   fs_reg counter = reg_undef;

   if (ir->counter) {
      this->base_ir = ir->counter;
      ir->counter->accept(this);
      counter = *(variable_storage(ir->counter));

      if (ir->from) {
         this->base_ir = ir->from;
         ir->from->accept(this);

         emit(BRW_OPCODE_MOV, counter, this->result);
      }
   }

   emit(BRW_OPCODE_DO);

   if (ir->to) {
      this->base_ir = ir->to;
      ir->to->accept(this);

      fs_inst *inst = emit(BRW_OPCODE_CMP, reg_null_cmp, counter, this->result);
      inst->conditional_mod = brw_conditional_for_comparison(ir->cmp);

      inst = emit(BRW_OPCODE_BREAK);
      inst->predicated = true;
   }

   foreach_iter(exec_list_iterator, iter, ir->body_instructions) {
      ir_instruction *ir = (ir_instruction *)iter.get();

      this->base_ir = ir;
      ir->accept(this);
   }

   if (ir->increment) {
      this->base_ir = ir->increment;
      ir->increment->accept(this);
      emit(BRW_OPCODE_ADD, counter, counter, this->result);
   }

   emit(BRW_OPCODE_WHILE);
}

void
fs_visitor::visit(ir_loop_jump *ir)
{
   switch (ir->mode) {
   case ir_loop_jump::jump_break:
      emit(BRW_OPCODE_BREAK);
      break;
   case ir_loop_jump::jump_continue:
      emit(BRW_OPCODE_CONTINUE);
      break;
   }
}

void
fs_visitor::visit(ir_call *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_return *ir)
{
   assert(!"FINISHME");
}

void
fs_visitor::visit(ir_function *ir)
{
   /* Ignore function bodies other than main() -- we shouldn't see calls to
    * them since they should all be inlined before we get to ir_to_mesa.
    */
   if (strcmp(ir->name, "main") == 0) {
      const ir_function_signature *sig;
      exec_list empty;

      sig = ir->matching_signature(&empty);

      assert(sig);

      foreach_iter(exec_list_iterator, iter, sig->body) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         this->base_ir = ir;

         ir->accept(this);
      }
   }
}

void
fs_visitor::visit(ir_function_signature *ir)
{
   assert(!"not reached");
   (void)ir;
}

fs_inst *
fs_visitor::emit(fs_inst inst)
{
   fs_inst *list_inst = new(mem_ctx) fs_inst;
   *list_inst = inst;

   list_inst->annotation = this->current_annotation;
   list_inst->ir = this->base_ir;

   this->instructions.push_tail(list_inst);

   return list_inst;
}

/** Emits a dummy fragment shader consisting of magenta for bringup purposes. */
void
fs_visitor::emit_dummy_fs()
{
   /* Everyone's favorite color. */
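   /* Magenta (1, 0, 1, 0) goes in m2..m5, leaving m0/m1 for the FB write
    * header when one is present.
    */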
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 2), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 3), fs_reg(0.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 4), fs_reg(1.0f));
   emit(BRW_OPCODE_MOV, fs_reg(MRF, 5), fs_reg(0.0f));

   fs_inst *write;
   write = emit(FS_OPCODE_FB_WRITE, fs_reg(0), fs_reg(0));
   write->base_mrf = 0;
}

/* The register location here is relative to the start of the URB
 * data.  It will get adjusted to be a real location before
 * generate_code() time.
 */
struct brw_reg
fs_visitor::interp_reg(int location, int channel)
{
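   /* Each attribute's four setup channels are spread across two registers,
    * two channels per register at sub-register offsets 0 and 4.
    */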
   int regnr = urb_setup[location] * 2 + channel / 2;
   int stride = (channel & 1) * 4;

   assert(urb_setup[location] != -1);

   return brw_vec1_grf(regnr, stride);
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen4()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   this->current_annotation = "compute pixel centers";
   this->pixel_x = fs_reg(this, glsl_type::uint_type);
   this->pixel_y = fs_reg(this, glsl_type::uint_type);
   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
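   /* g1 holds the 16-bit X/Y coordinates of the subspan origins; the
    * brw_imm_v vectors add the per-pixel offsets (0,1,0,1,... in X and
    * 0,0,1,1,... in Y) to turn them into per-pixel centers.
    */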
   emit(BRW_OPCODE_ADD,
        this->pixel_x,
        fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
        fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
        this->pixel_y,
        fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
        fs_reg(brw_imm_v(0x11001100)));

   this->current_annotation = "compute pixel deltas from v0";
   if (brw->has_pln) {
      this->delta_x = fs_reg(this, glsl_type::vec2_type);
      this->delta_y = this->delta_x;
      this->delta_y.reg_offset++;
   } else {
      this->delta_x = fs_reg(this, glsl_type::float_type);
      this->delta_y = fs_reg(this, glsl_type::float_type);
   }
   emit(BRW_OPCODE_ADD, this->delta_x,
        this->pixel_x, fs_reg(negate(brw_vec1_grf(1, 0))));
   emit(BRW_OPCODE_ADD, this->delta_y,
        this->pixel_y, fs_reg(negate(brw_vec1_grf(1, 1))));

   this->current_annotation = "compute pos.w and 1/pos.w";
   /* Compute wpos.w.  It's always in our setup, since it's needed to
    * interpolate the other attributes.
    */
   this->wpos_w = fs_reg(this, glsl_type::float_type);
   emit(FS_OPCODE_LINTERP, wpos_w, this->delta_x, this->delta_y,
        interp_reg(FRAG_ATTRIB_WPOS, 3));
   /* Compute the pixel 1/W value from wpos.w. */
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);
   this->current_annotation = NULL;
}

/** Emits the interpolation for the varying inputs. */
void
fs_visitor::emit_interpolation_setup_gen6()
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);

   /* If the pixel centers end up used, the setup is the same as for gen4. */
   this->current_annotation = "compute pixel centers";
   fs_reg int_pixel_x = fs_reg(this, glsl_type::uint_type);
   fs_reg int_pixel_y = fs_reg(this, glsl_type::uint_type);
   int_pixel_x.type = BRW_REGISTER_TYPE_UW;
   int_pixel_y.type = BRW_REGISTER_TYPE_UW;
   emit(BRW_OPCODE_ADD,
        int_pixel_x,
        fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
        fs_reg(brw_imm_v(0x10101010)));
   emit(BRW_OPCODE_ADD,
        int_pixel_y,
        fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
        fs_reg(brw_imm_v(0x11001100)));

   /* As of gen6, we can no longer mix float and int sources.  We have
    * to turn the integer pixel centers into floats for their actual
    * use.
    */
   this->pixel_x = fs_reg(this, glsl_type::float_type);
   this->pixel_y = fs_reg(this, glsl_type::float_type);
   emit(BRW_OPCODE_MOV, this->pixel_x, int_pixel_x);
   emit(BRW_OPCODE_MOV, this->pixel_y, int_pixel_y);

   this->current_annotation = "compute 1/pos.w";
   this->wpos_w = fs_reg(brw_vec8_grf(c->source_w_reg, 0));
   this->pixel_w = fs_reg(this, glsl_type::float_type);
   emit_math(FS_OPCODE_RCP, this->pixel_w, wpos_w);

   this->delta_x = fs_reg(brw_vec8_grf(2, 0));
   this->delta_y = fs_reg(brw_vec8_grf(3, 0));

   this->current_annotation = NULL;
}

void
fs_visitor::emit_fb_writes()
{
   this->current_annotation = "FB write header";
   GLboolean header_present = GL_TRUE;
   int nr = 0;

2012 if (intel->gen >= 6 &&
2013 !this->kill_emitted &&
2014 c->key.nr_color_regions == 1) {
2015 header_present = false;
2016 }
2017
2018 if (header_present) {
2019 /* m0, m1 header */
2020 nr += 2;
2021 }
2022
2023 if (c->aa_dest_stencil_reg) {
2024 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2025 fs_reg(brw_vec8_grf(c->aa_dest_stencil_reg, 0)));
2026 }
2027
2028 /* Reserve space for color. It'll be filled in per MRT below. */
2029 int color_mrf = nr;
2030 nr += 4;
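/* Sketch of the message being assembled (illustrative; each piece is
 * present only when the corresponding condition below holds):
 *
 *    [2 regs] m0..m1 header
 *    [1 reg ] antialiased-stencil alpha
 *    [4 regs] color, filled in per render target in the loop below
 *    [1 reg ] gl_FragDepth or payload depth
 *    [1 reg ] destination depth
 */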
2031
2032 if (c->source_depth_to_render_target) {
2033 if (c->computes_depth) {
2034 /* Hand over gl_FragDepth. */
2035 assert(this->frag_depth);
2036 fs_reg depth = *(variable_storage(this->frag_depth));
2037
2038 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++), depth);
2039 } else {
2040 /* Pass through the payload depth. */
2041 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2042 fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
2043 }
2044 }
2045
2046 if (c->dest_depth_reg) {
2047 emit(BRW_OPCODE_MOV, fs_reg(MRF, nr++),
2048 fs_reg(brw_vec8_grf(c->dest_depth_reg, 0)));
2049 }
2050
2051 fs_reg color = reg_undef;
2052 if (this->frag_color)
2053 color = *(variable_storage(this->frag_color));
2054 else if (this->frag_data) {
2055 color = *(variable_storage(this->frag_data));
2056 color.type = BRW_REGISTER_TYPE_F;
2057 }
2058
2059 for (int target = 0; target < c->key.nr_color_regions; target++) {
2060 this->current_annotation = ralloc_asprintf(this->mem_ctx,
2061 "FB write target %d",
2062 target);
2063 if (this->frag_color || this->frag_data) {
2064 for (int i = 0; i < 4; i++) {
2065 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + i), color);
2066 color.reg_offset++;
2067 }
2068 }
2069
2070 if (this->frag_color)
2071 color.reg_offset -= 4;
2072
2073 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2074 inst->target = target;
2075 inst->base_mrf = 0;
2076 inst->mlen = nr;
2077 if (target == c->key.nr_color_regions - 1)
2078 inst->eot = true;
2079 inst->header_present = header_present;
2080 }
2081
2082 if (c->key.nr_color_regions == 0) {
2083 if (c->key.alpha_test && (this->frag_color || this->frag_data)) {
2084 /* If the alpha test is enabled but there's no color buffer,
2085 * we still need to send alpha out the pipeline to our null
2086 * renderbuffer.
2087 */
2088 color.reg_offset += 3;
2089 emit(BRW_OPCODE_MOV, fs_reg(MRF, color_mrf + 3), color);
2090 }
2091
2092 fs_inst *inst = emit(FS_OPCODE_FB_WRITE);
2093 inst->base_mrf = 0;
2094 inst->mlen = nr;
2095 inst->eot = true;
2096 inst->header_present = header_present;
2097 }
2098
2099 this->current_annotation = NULL;
2100 }
2101
2102 void
2103 fs_visitor::generate_fb_write(fs_inst *inst)
2104 {
2105 GLboolean eot = inst->eot;
2106 struct brw_reg implied_header;
2107
2108 /* Header is 2 regs, g0 and g1 are the contents. g0 will be the
2109 * implied move; here we set up g1.
2110 */
2111 brw_push_insn_state(p);
2112 brw_set_mask_control(p, BRW_MASK_DISABLE);
2113 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2114
2115 if (inst->header_present) {
2116 if (intel->gen >= 6) {
2117 brw_MOV(p,
2118 brw_message_reg(inst->base_mrf),
2119 brw_vec8_grf(0, 0));
2120
2121 if (inst->target > 0) {
2122 /* Set the render target index for choosing BLEND_STATE. */
2123 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 2),
2124 BRW_REGISTER_TYPE_UD),
2125 brw_imm_ud(inst->target));
2126 }
2127
2128 /* Clear viewport index, render target array index. */
2129 brw_AND(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, 0, 0),
2130 BRW_REGISTER_TYPE_UD),
2131 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
2132 brw_imm_ud(0xf7ff));
2133
2134 implied_header = brw_null_reg();
2135 } else {
2136 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
2137 }
2138
2139 brw_MOV(p,
2140 brw_message_reg(inst->base_mrf + 1),
2141 brw_vec8_grf(1, 0));
2142 } else {
2143 implied_header = brw_null_reg();
2144 }
2145
2146 brw_pop_insn_state(p);
2147
2148 brw_fb_WRITE(p,
2149 8, /* dispatch_width */
2150 retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW),
2151 inst->base_mrf,
2152 implied_header,
2153 inst->target,
2154 inst->mlen,
2155 0,
2156 eot,
2157 inst->header_present);
2158 }
2159
2160 void
2161 fs_visitor::generate_linterp(fs_inst *inst,
2162 struct brw_reg dst, struct brw_reg *src)
2163 {
2164 struct brw_reg delta_x = src[0];
2165 struct brw_reg delta_y = src[1];
2166 struct brw_reg interp = src[2];
2167
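/* A note on the choice below: PLN evaluates the whole plane equation
 * (a * delta_x + b * delta_y + c) in one instruction, but it needs
 * the deltas in adjacent registers (and an even-aligned pair before
 * gen6), which is exactly what the condition checks. The LINE+MAC
 * pair computes the same value in two steps when PLN can't be used.
 */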
2168 if (brw->has_pln &&
2169 delta_y.nr == delta_x.nr + 1 &&
2170 (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
2171 brw_PLN(p, dst, interp, delta_x);
2172 } else {
2173 brw_LINE(p, brw_null_reg(), interp, delta_x);
2174 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
2175 }
2176 }
2177
2178 void
2179 fs_visitor::generate_math(fs_inst *inst,
2180 struct brw_reg dst, struct brw_reg *src)
2181 {
2182 int op;
2183
2184 switch (inst->opcode) {
2185 case FS_OPCODE_RCP:
2186 op = BRW_MATH_FUNCTION_INV;
2187 break;
2188 case FS_OPCODE_RSQ:
2189 op = BRW_MATH_FUNCTION_RSQ;
2190 break;
2191 case FS_OPCODE_SQRT:
2192 op = BRW_MATH_FUNCTION_SQRT;
2193 break;
2194 case FS_OPCODE_EXP2:
2195 op = BRW_MATH_FUNCTION_EXP;
2196 break;
2197 case FS_OPCODE_LOG2:
2198 op = BRW_MATH_FUNCTION_LOG;
2199 break;
2200 case FS_OPCODE_POW:
2201 op = BRW_MATH_FUNCTION_POW;
2202 break;
2203 case FS_OPCODE_SIN:
2204 op = BRW_MATH_FUNCTION_SIN;
2205 break;
2206 case FS_OPCODE_COS:
2207 op = BRW_MATH_FUNCTION_COS;
2208 break;
2209 default:
2210 assert(!"not reached: unknown math function");
2211 op = 0;
2212 break;
2213 }
2214
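/* Background for the split below (grounded in the asserts): on gen6+
 * math executes like an ordinary instruction with register sources
 * (mlen == 0), while pre-gen6 math is a send to the shared math unit
 * whose operands are staged through the MRF (mlen >= 1, base_mrf).
 */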
2215 if (intel->gen >= 6) {
2216 assert(inst->mlen == 0);
2217
2218 if (inst->opcode == FS_OPCODE_POW) {
2219 brw_math2(p, dst, op, src[0], src[1]);
2220 } else {
2221 brw_math(p, dst,
2222 op,
2223 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2224 BRW_MATH_SATURATE_NONE,
2225 0, src[0],
2226 BRW_MATH_DATA_VECTOR,
2227 BRW_MATH_PRECISION_FULL);
2228 }
2229 } else {
2230 assert(inst->mlen >= 1);
2231
2232 brw_math(p, dst,
2233 op,
2234 inst->saturate ? BRW_MATH_SATURATE_SATURATE :
2235 BRW_MATH_SATURATE_NONE,
2236 inst->base_mrf, src[0],
2237 BRW_MATH_DATA_VECTOR,
2238 BRW_MATH_PRECISION_FULL);
2239 }
2240 }
2241
2242 void
2243 fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2244 {
2245 int msg_type = -1;
2246 int rlen = 4;
2247 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
2248
2249 if (intel->gen >= 5) {
2250 switch (inst->opcode) {
2251 case FS_OPCODE_TEX:
2252 if (inst->shadow_compare) {
2253 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
2254 } else {
2255 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
2256 }
2257 break;
2258 case FS_OPCODE_TXB:
2259 if (inst->shadow_compare) {
2260 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
2261 } else {
2262 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
2263 }
2264 break;
2265 case FS_OPCODE_TXL:
2266 if (inst->shadow_compare) {
2267 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
2268 } else {
2269 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
2270 }
2271 break;
2272 case FS_OPCODE_TXD:
2273 assert(!"TXD isn't supported on gen5+ yet.");
2274 break;
2275 }
2276 } else {
2277 switch (inst->opcode) {
2278 case FS_OPCODE_TEX:
2279 /* Note that G45 and older determine shadow compare and dispatch width
2280 * from message length for most messages.
2281 */
2282 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
2283 if (inst->shadow_compare) {
2284 assert(inst->mlen == 6);
2285 } else {
2286 assert(inst->mlen <= 4);
2287 }
2288 break;
2289 case FS_OPCODE_TXB:
2290 if (inst->shadow_compare) {
2291 assert(inst->mlen == 6);
2292 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
2293 } else {
2294 assert(inst->mlen == 9);
2295 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
2296 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2297 }
2298 break;
2299 case FS_OPCODE_TXL:
2300 if (inst->shadow_compare) {
2301 assert(inst->mlen == 6);
2302 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
2303 } else {
2304 assert(inst->mlen == 9);
2305 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
2306 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
2307 }
2308 break;
2309 case FS_OPCODE_TXD:
2310 assert(!"TXD isn't supported on gen4 yet.");
2311 break;
2312 }
2313 }
2314 assert(msg_type != -1);
2315
2316 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
2317 rlen = 8;
2318 dst = vec16(dst);
2319 }
2320
2321 brw_SAMPLE(p,
2322 retype(dst, BRW_REGISTER_TYPE_UW),
2323 inst->base_mrf,
2324 src,
2325 SURF_INDEX_TEXTURE(inst->sampler),
2326 inst->sampler,
2327 WRITEMASK_XYZW,
2328 msg_type,
2329 rlen,
2330 inst->mlen,
2331 0,
2332 1,
2333 simd_mode);
2334 }
2335
2336
2337 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
2338 * looking like:
2339 *
2340 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
2341 *
2342 * and we're trying to produce:
2343 *
2344 * DDX DDY
2345 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
2346 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
2347 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
2348 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
2349 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
2350 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
2351 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
2352 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
2353 *
2354 * and add another set of two more subspans if in 16-pixel dispatch mode.
2355 *
2356 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
2357 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
2358 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
2359 * between each other. We could probably do it like ddx and swizzle the right
2360 * order later, but bail for now and just produce
2361 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
2362 */
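/* As a concrete reading of the regions used below (a sketch, assuming
 * the usual <vstride;width,hstride> region semantics): for DDX,
 * src0 = g(n).1<2;2,0>:F reads (ss0.tr, ss0.tr, ss0.br, ss0.br, ...)
 * and src1 = g(n).0<2;2,0>:F reads (ss0.tl, ss0.tl, ss0.bl, ss0.bl,
 * ...), so ADD dst, src0, -src1 produces the DDX column above. For
 * DDY, the <4;4,0> regions broadcast .tl and .bl across each subspan,
 * giving the simplified (tl - bl) result described above.
 */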
2363 void
2364 fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2365 {
2366 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
2367 BRW_REGISTER_TYPE_F,
2368 BRW_VERTICAL_STRIDE_2,
2369 BRW_WIDTH_2,
2370 BRW_HORIZONTAL_STRIDE_0,
2371 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2372 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
2373 BRW_REGISTER_TYPE_F,
2374 BRW_VERTICAL_STRIDE_2,
2375 BRW_WIDTH_2,
2376 BRW_HORIZONTAL_STRIDE_0,
2377 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2378 brw_ADD(p, dst, src0, negate(src1));
2379 }
2380
2381 void
2382 fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
2383 {
2384 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
2385 BRW_REGISTER_TYPE_F,
2386 BRW_VERTICAL_STRIDE_4,
2387 BRW_WIDTH_4,
2388 BRW_HORIZONTAL_STRIDE_0,
2389 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2390 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
2391 BRW_REGISTER_TYPE_F,
2392 BRW_VERTICAL_STRIDE_4,
2393 BRW_WIDTH_4,
2394 BRW_HORIZONTAL_STRIDE_0,
2395 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
2396 brw_ADD(p, dst, src0, negate(src1));
2397 }
2398
2399 void
2400 fs_visitor::generate_discard_not(fs_inst *inst, struct brw_reg mask)
2401 {
2402 if (intel->gen >= 6) {
2403 /* Gen6 no longer has the mask reg for us to just read the
2404 * active channels from. However, cmp updates just the channels
2405 * of the flag reg that are enabled, so we can get at the
2406 * channel enables that way. In this step, make a reg of ones
2407 * we'll compare to.
2408 */
2409 brw_MOV(p, mask, brw_imm_ud(1));
2410 } else {
2411 brw_push_insn_state(p);
2412 brw_set_mask_control(p, BRW_MASK_DISABLE);
2413 brw_NOT(p, mask, brw_mask_reg(1)); /* IMASK */
2414 brw_pop_insn_state(p);
2415 }
2416 }
2417
2418 void
2419 fs_visitor::generate_discard_and(fs_inst *inst, struct brw_reg mask)
2420 {
2421 if (intel->gen >= 6) {
2422 struct brw_reg f0 = brw_flag_reg();
2423 struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
2424
2425 brw_push_insn_state(p);
2426 brw_set_mask_control(p, BRW_MASK_DISABLE);
2427 brw_MOV(p, f0, brw_imm_uw(0xffff)); /* inactive channels undiscarded */
2428 brw_pop_insn_state(p);
2429
2430 brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
2431 BRW_CONDITIONAL_Z, mask, brw_imm_ud(0)); /* active channels fail test */
2432 /* Undo CMP's whacking of predication */
2433 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2434
2435 brw_push_insn_state(p);
2436 brw_set_mask_control(p, BRW_MASK_DISABLE);
2437 brw_AND(p, g1, f0, g1);
2438 brw_pop_insn_state(p);
2439 } else {
2440 struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
2441
2442 mask = brw_uw1_reg(mask.file, mask.nr, 0);
2443
2444 brw_push_insn_state(p);
2445 brw_set_mask_control(p, BRW_MASK_DISABLE);
2446 brw_AND(p, g0, mask, g0);
2447 brw_pop_insn_state(p);
2448 }
2449 }
2450
2451 void
2452 fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
2453 {
2454 assert(inst->mlen != 0);
2455
2456 brw_MOV(p,
2457 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
2458 retype(src, BRW_REGISTER_TYPE_UD));
2459 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
2460 inst->offset);
2461 }
2462
2463 void
2464 fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
2465 {
2466 assert(inst->mlen != 0);
2467
2468 /* Clear any post destination dependencies that would be ignored by
2469 * the block read. See the B-Spec for pre-gen5 send instruction.
2470 *
2471 * This could use a better solution, since texture sampling and
2472 * math reads could potentially run into it as well -- anywhere
2473 * that we have a SEND with a destination that is a register that
2474 * was written but not read within the last N instructions (what's
2475 * N? unsure). This is rare because of dead code elimination, but
2476 * not impossible.
2477 */
2478 if (intel->gen == 4 && !intel->is_g4x)
2479 brw_MOV(p, brw_null_reg(), dst);
2480
2481 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
2482 inst->offset);
2483
2484 if (intel->gen == 4 && !intel->is_g4x) {
2485 /* gen4 errata: destination from a send can't be used as a
2486 * destination until it's been read. Just read it so we don't
2487 * have to worry.
2488 */
2489 brw_MOV(p, brw_null_reg(), dst);
2490 }
2491 }
2492
2493
2494 void
2495 fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
2496 {
2497 assert(inst->mlen != 0);
2498
2499 /* Clear any post destination dependencies that would be ignored by
2500 * the block read. See the B-Spec for pre-gen5 send instruction.
2501 *
2502 * This could use a better solution, since texture sampling and
2503 * math reads could potentially run into it as well -- anywhere
2504 * that we have a SEND with a destination that is a register that
2505 * was written but not read within the last N instructions (what's
2506 * N? unsure). This is rare because of dead code elimination, but
2507 * not impossible.
2508 */
2509 if (intel->gen == 4 && !intel->is_g4x)
2510 brw_MOV(p, brw_null_reg(), dst);
2511
2512 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
2513 inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);
2514
2515 if (intel->gen == 4 && !intel->is_g4x) {
2516 /* gen4 errata: destination from a send can't be used as a
2517 * destination until it's been read. Just read it so we don't
2518 * have to worry.
2519 */
2520 brw_MOV(p, brw_null_reg(), dst);
2521 }
2522 }
2523
2524 /**
2525 * To be called after the last _mesa_add_state_reference() call, to
2526 * set up prog_data.param[] for assign_curb_setup() and
2527 * setup_pull_constants().
2528 */
2529 void
2530 fs_visitor::setup_paramvalues_refs()
2531 {
2532 /* Set up the pointers to ParamValues now that that array is finalized. */
2533 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
2534 c->prog_data.param[i] =
2535 fp->Base.Parameters->ParameterValues[this->param_index[i]] +
2536 this->param_offset[i];
2537 }
2538 }
2539
2540 void
2541 fs_visitor::assign_curb_setup()
2542 {
2543 c->prog_data.first_curbe_grf = c->nr_payload_regs;
2544 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
2545
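/* An illustrative example (numbers not from the source): with
 * nr_params == 20, curb_read_length is ALIGN(20, 8) / 8 == 3, and a
 * UNIFORM source with constant_nr == 11 maps below to the fixed reg
 * g(first_curbe_grf + 1), subregister 3.
 */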
2546 /* Map the offsets in the UNIFORM file to fixed HW regs. */
2547 foreach_iter(exec_list_iterator, iter, this->instructions) {
2548 fs_inst *inst = (fs_inst *)iter.get();
2549
2550 for (unsigned int i = 0; i < 3; i++) {
2551 if (inst->src[i].file == UNIFORM) {
2552 int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2553 struct brw_reg brw_reg = brw_vec1_grf(c->prog_data.first_curbe_grf +
2554 constant_nr / 8,
2555 constant_nr % 8);
2556
2557 inst->src[i].file = FIXED_HW_REG;
2558 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
2559 }
2560 }
2561 }
2562 }
2563
2564 void
2565 fs_visitor::calculate_urb_setup()
2566 {
2567 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2568 urb_setup[i] = -1;
2569 }
2570
2571 int urb_next = 0;
2572 /* Figure out where each of the incoming setup attributes lands. */
2573 if (intel->gen >= 6) {
2574 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2575 if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
2576 urb_setup[i] = urb_next++;
2577 }
2578 }
2579 } else {
2580 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
2581 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
2582 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
2583 int fp_index;
2584
2585 if (i >= VERT_RESULT_VAR0)
2586 fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
2587 else if (i <= VERT_RESULT_TEX7)
2588 fp_index = i;
2589 else
2590 fp_index = -1;
2591
2592 if (fp_index >= 0)
2593 urb_setup[fp_index] = urb_next++;
2594 }
2595 }
2596 }
2597
2598 /* Each attribute is 4 setup channels, each of which is half a reg. */
2599 c->prog_data.urb_read_length = urb_next * 2;
2600 }
2601
2602 void
2603 fs_visitor::assign_urb_setup()
2604 {
2605 int urb_start = c->prog_data.first_curbe_grf + c->prog_data.curb_read_length;
2606
2607 /* Offset all the urb_setup[] index by the actual position of the
2608 * setup regs, now that the location of the constants has been chosen.
2609 */
2610 foreach_iter(exec_list_iterator, iter, this->instructions) {
2611 fs_inst *inst = (fs_inst *)iter.get();
2612
2613 if (inst->opcode == FS_OPCODE_LINTERP) {
2614 assert(inst->src[2].file == FIXED_HW_REG);
2615 inst->src[2].fixed_hw_reg.nr += urb_start;
2616 }
2617
2618 if (inst->opcode == FS_OPCODE_CINTERP) {
2619 assert(inst->src[0].file == FIXED_HW_REG);
2620 inst->src[0].fixed_hw_reg.nr += urb_start;
2621 }
2622 }
2623
2624 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
2625 }
2626
2627 /**
2628 * Split large virtual GRFs into separate components if we can.
2629 *
2630 * This mostly duplicates what brw_fs_vector_splitting does,
2631 * but that's really conservative because it's afraid of doing
2632 * splitting that doesn't result in real progress after the rest of
2633 * the optimization phases, which would cause infinite looping in
2634 * optimization. We can do it once here, safely. This also has the
2635 * opportunity to split interpolated values, or maybe even uniforms,
2636 * which we don't have at the IR level.
2637 *
2638 * We want to split, because virtual GRFs are what we register
2639 * allocate and spill (due to contiguousness requirements for some
2640 * instructions), and they're what we naturally generate in the
2641 * codegen process, but most virtual GRFs don't actually need to be
2642 * contiguous sets of GRFs. If we split, we'll end up with reduced
2643 * live intervals and better dead code elimination and coalescing.
2644 */
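/* For example (an illustration, not from the source): a virtual vec4
 * temporary whose four components are written by independent MOVs
 * becomes four size-1 GRFs here, each with its own shorter live
 * interval for the allocator to work with.
 */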
2645 void
2646 fs_visitor::split_virtual_grfs()
2647 {
2648 int num_vars = this->virtual_grf_next;
2649 bool split_grf[num_vars];
2650 int new_virtual_grf[num_vars];
2651
2652 /* Try to split anything > 0 sized. */
2653 for (int i = 0; i < num_vars; i++) {
2654 if (this->virtual_grf_sizes[i] != 1)
2655 split_grf[i] = true;
2656 else
2657 split_grf[i] = false;
2658 }
2659
2660 if (brw->has_pln) {
2661 /* PLN opcodes rely on the delta_xy being contiguous. */
2662 split_grf[this->delta_x.reg] = false;
2663 }
2664
2665 foreach_iter(exec_list_iterator, iter, this->instructions) {
2666 fs_inst *inst = (fs_inst *)iter.get();
2667
2668 /* Texturing produces 4 contiguous registers, so no splitting. */
2669 if (inst->is_tex()) {
2670 split_grf[inst->dst.reg] = false;
2671 }
2672 }
2673
2674 /* Allocate new space for split regs. Note that the virtual
2675 * numbers will be contiguous.
2676 */
2677 for (int i = 0; i < num_vars; i++) {
2678 if (split_grf[i]) {
2679 new_virtual_grf[i] = virtual_grf_alloc(1);
2680 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
2681 int reg = virtual_grf_alloc(1);
2682 assert(reg == new_virtual_grf[i] + j - 1);
2683 (void) reg;
2684 }
2685 this->virtual_grf_sizes[i] = 1;
2686 }
2687 }
2688
2689 foreach_iter(exec_list_iterator, iter, this->instructions) {
2690 fs_inst *inst = (fs_inst *)iter.get();
2691
2692 if (inst->dst.file == GRF &&
2693 split_grf[inst->dst.reg] &&
2694 inst->dst.reg_offset != 0) {
2695 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
2696 inst->dst.reg_offset - 1);
2697 inst->dst.reg_offset = 0;
2698 }
2699 for (int i = 0; i < 3; i++) {
2700 if (inst->src[i].file == GRF &&
2701 split_grf[inst->src[i].reg] &&
2702 inst->src[i].reg_offset != 0) {
2703 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
2704 inst->src[i].reg_offset - 1);
2705 inst->src[i].reg_offset = 0;
2706 }
2707 }
2708 }
2709 this->live_intervals_valid = false;
2710 }
2711
2712 /**
2713 * Choose accesses from the UNIFORM file to demote to using the pull
2714 * constant buffer.
2715 *
2716 * We allow a fragment shader to use more than the GL-specified
2717 * minimum maximum number of fragment shader uniform components (64).
2718 * If there are too many, they would fill up all of the register space.
2719 * So, this will push some of them out to the pull constant buffer and
2720 * update the program to load them.
2721 */
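/* A worked example of the rewrite below (illustrative numbers): with
 * pull_uniform_base == 128, a use of uniform 133 gains a preceding
 * FS_OPCODE_PULL_CONSTANT_LOAD at offset ((133 - 128) * 4) & ~15 ==
 * 16 bytes (the oword holding params 132..135), and the use is
 * rewritten to read the loaded GRF smeared to component
 * (133 - 128) & 3 == 1.
 */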
2722 void
2723 fs_visitor::setup_pull_constants()
2724 {
2725 /* Only allow 16 registers (128 uniform components) as push constants. */
2726 unsigned int max_uniform_components = 16 * 8;
2727 if (c->prog_data.nr_params <= max_uniform_components)
2728 return;
2729
2730 /* Just demote the end of the list. We could probably do better
2731 * here, demoting things that are rarely used in the program first.
2732 */
2733 int pull_uniform_base = max_uniform_components;
2734 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
2735
2736 foreach_iter(exec_list_iterator, iter, this->instructions) {
2737 fs_inst *inst = (fs_inst *)iter.get();
2738
2739 for (int i = 0; i < 3; i++) {
2740 if (inst->src[i].file != UNIFORM)
2741 continue;
2742
2743 int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
2744 if (uniform_nr < pull_uniform_base)
2745 continue;
2746
2747 fs_reg dst = fs_reg(this, glsl_type::float_type);
2748 fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
2749 dst);
2750 pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
2751 pull->ir = inst->ir;
2752 pull->annotation = inst->annotation;
2753 pull->base_mrf = 14;
2754 pull->mlen = 1;
2755
2756 inst->insert_before(pull);
2757
2758 inst->src[i].file = GRF;
2759 inst->src[i].reg = dst.reg;
2760 inst->src[i].reg_offset = 0;
2761 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
2762 }
2763 }
2764
2765 for (int i = 0; i < pull_uniform_count; i++) {
2766 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
2767 c->prog_data.pull_param_convert[i] =
2768 c->prog_data.param_convert[pull_uniform_base + i];
2769 }
2770 c->prog_data.nr_params -= pull_uniform_count;
2771 c->prog_data.nr_pull_params = pull_uniform_count;
2772 }
2773
2774 void
2775 fs_visitor::calculate_live_intervals()
2776 {
2777 int num_vars = this->virtual_grf_next;
2778 int *def = ralloc_array(mem_ctx, int, num_vars);
2779 int *use = ralloc_array(mem_ctx, int, num_vars);
2780 int loop_depth = 0;
2781 int loop_start = 0;
2782 int bb_header_ip = 0;
2783
2784 if (this->live_intervals_valid)
2785 return;
2786
2787 for (int i = 0; i < num_vars; i++) {
2788 def[i] = MAX_INSTRUCTION;
2789 use[i] = -1;
2790 }
2791
2792 int ip = 0;
2793 foreach_iter(exec_list_iterator, iter, this->instructions) {
2794 fs_inst *inst = (fs_inst *)iter.get();
2795
2796 if (inst->opcode == BRW_OPCODE_DO) {
2797 if (loop_depth++ == 0)
2798 loop_start = ip;
2799 } else if (inst->opcode == BRW_OPCODE_WHILE) {
2800 loop_depth--;
2801
2802 if (loop_depth == 0) {
2803 /* Patches up the use of vars marked for being live across
2804 * the whole loop.
2805 */
2806 for (int i = 0; i < num_vars; i++) {
2807 if (use[i] == loop_start) {
2808 use[i] = ip;
2809 }
2810 }
2811 }
2812 } else {
2813 for (unsigned int i = 0; i < 3; i++) {
2814 if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
2815 int reg = inst->src[i].reg;
2816
2817 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2818 def[reg] >= bb_header_ip)) {
2819 use[reg] = ip;
2820 } else {
2821 def[reg] = MIN2(loop_start, def[reg]);
2822 use[reg] = loop_start;
2823
2824 /* Nobody else is going to go smash our start to
2825 * later in the loop now, because def[reg] now
2826 * points before the bb header.
2827 */
2828 }
2829 }
2830 }
2831 if (inst->dst.file == GRF && inst->dst.reg != 0) {
2832 int reg = inst->dst.reg;
2833
2834 if (!loop_depth || (this->virtual_grf_sizes[reg] == 1 &&
2835 !inst->predicated)) {
2836 def[reg] = MIN2(def[reg], ip);
2837 } else {
2838 def[reg] = MIN2(def[reg], loop_start);
2839 }
2840 }
2841 }
2842
2843 ip++;
2844
2845 /* Set the basic block header IP. This is used for determining
2846 * if a complete def of a single-register virtual GRF in a loop
2847 * dominates a use in the same basic block. It's a quick way to
2848 * reduce the live interval range of most registers used in a
2849 * loop.
2850 */
2851 if (inst->opcode == BRW_OPCODE_IF ||
2852 inst->opcode == BRW_OPCODE_ELSE ||
2853 inst->opcode == BRW_OPCODE_ENDIF ||
2854 inst->opcode == BRW_OPCODE_DO ||
2855 inst->opcode == BRW_OPCODE_WHILE ||
2856 inst->opcode == BRW_OPCODE_BREAK ||
2857 inst->opcode == BRW_OPCODE_CONTINUE) {
2858 bb_header_ip = ip;
2859 }
2860 }
2861
2862 ralloc_free(this->virtual_grf_def);
2863 ralloc_free(this->virtual_grf_use);
2864 this->virtual_grf_def = def;
2865 this->virtual_grf_use = use;
2866
2867 this->live_intervals_valid = true;
2868 }
2869
2870 /**
2871 * Attempts to move immediate constants into the immediate
2872 * constant slot of following instructions.
2873 *
2874 * Immediate constants are a bit tricky -- they have to be in the last
2875 * operand slot, and you can't do abs/negate on them.
2876 */
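/* For example: "MOV tmp, 2.0F" followed by "MUL dst, tmp, x" becomes
 * "MUL dst, x, 2.0F" -- the operands are commuted below so the
 * immediate can occupy the second source slot, and the MOV is then
 * left for dead code elimination.
 */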
2877
2878 bool
2879 fs_visitor::propagate_constants()
2880 {
2881 bool progress = false;
2882
2883 calculate_live_intervals();
2884
2885 foreach_iter(exec_list_iterator, iter, this->instructions) {
2886 fs_inst *inst = (fs_inst *)iter.get();
2887
2888 if (inst->opcode != BRW_OPCODE_MOV ||
2889 inst->predicated ||
2890 inst->dst.file != GRF || inst->src[0].file != IMM ||
2891 inst->dst.type != inst->src[0].type)
2892 continue;
2893
2894 /* Don't bother with cases where we should have had the
2895 * operation on the constant folded in GLSL already.
2896 */
2897 if (inst->saturate)
2898 continue;
2899
2900 /* Found a move of a constant to a GRF. Find anything else using the GRF
2901 * before it's written, and replace it with the constant if we can.
2902 */
2903 exec_list_iterator scan_iter = iter;
2904 scan_iter.next();
2905 for (; scan_iter.has_next(); scan_iter.next()) {
2906 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
2907
2908 if (scan_inst->opcode == BRW_OPCODE_DO ||
2909 scan_inst->opcode == BRW_OPCODE_WHILE ||
2910 scan_inst->opcode == BRW_OPCODE_ELSE ||
2911 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2912 break;
2913 }
2914
2915 for (int i = 2; i >= 0; i--) {
2916 if (scan_inst->src[i].file != GRF ||
2917 scan_inst->src[i].reg != inst->dst.reg ||
2918 scan_inst->src[i].reg_offset != inst->dst.reg_offset)
2919 continue;
2920
2921 /* Don't bother with cases where we should have had the
2922 * operation on the constant folded in GLSL already.
2923 */
2924 if (scan_inst->src[i].negate || scan_inst->src[i].abs)
2925 continue;
2926
2927 switch (scan_inst->opcode) {
2928 case BRW_OPCODE_MOV:
2929 scan_inst->src[i] = inst->src[0];
2930 progress = true;
2931 break;
2932
2933 case BRW_OPCODE_MUL:
2934 case BRW_OPCODE_ADD:
2935 if (i == 1) {
2936 scan_inst->src[i] = inst->src[0];
2937 progress = true;
2938 } else if (i == 0 && scan_inst->src[1].file != IMM) {
2939 /* Fit this constant in by commuting the operands */
2940 scan_inst->src[0] = scan_inst->src[1];
2941 scan_inst->src[1] = inst->src[0];
2942 progress = true;
2943 }
2944 break;
2945 case BRW_OPCODE_CMP:
2946 case BRW_OPCODE_SEL:
2947 if (i == 1) {
2948 scan_inst->src[i] = inst->src[0];
2949 progress = true;
2950 }
2951 }
2952 }
2953
2954 if (scan_inst->dst.file == GRF &&
2955 scan_inst->dst.reg == inst->dst.reg &&
2956 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
2957 scan_inst->is_tex())) {
2958 break;
2959 }
2960 }
2961 }
2962
2963 if (progress)
2964 this->live_intervals_valid = false;
2965
2966 return progress;
2967 }
2968 /**
2969 * Must be called after calculate_live_intervals() to remove unused
2970 * writes to registers -- register allocation will fail otherwise
2971 * because something defined but not used won't be considered to
2972 * interfere with other regs.
2973 */
2974 bool
2975 fs_visitor::dead_code_eliminate()
2976 {
2977 bool progress = false;
2978 int pc = 0;
2979
2980 calculate_live_intervals();
2981
2982 foreach_iter(exec_list_iterator, iter, this->instructions) {
2983 fs_inst *inst = (fs_inst *)iter.get();
2984
2985 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
2986 inst->remove();
2987 progress = true;
2988 }
2989
2990 pc++;
2991 }
2992
2993 if (progress)
2994 live_intervals_valid = false;
2995
2996 return progress;
2997 }
2998
2999 bool
3000 fs_visitor::register_coalesce()
3001 {
3002 bool progress = false;
3003 int if_depth = 0;
3004 int loop_depth = 0;
3005
3006 foreach_iter(exec_list_iterator, iter, this->instructions) {
3007 fs_inst *inst = (fs_inst *)iter.get();
3008
3009 /* Make sure that we dominate the instructions we're going to
3010 * scan for interference with our coalescing, or we won't have
3011 * scanned enough to see whether anything actually interferes.
3012 * We don't dominate the following instructions if we're in a
3013 * loop or an if block.
3014 */
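/* For example: if the candidate MOV sits inside an if block, a write
 * to its destination before the IF (reached on the other path) is
 * never seen by the forward scan below, so rewriting reads after the
 * ENDIF would be wrong.
 */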
3015 switch (inst->opcode) {
3016 case BRW_OPCODE_DO:
3017 loop_depth++;
3018 break;
3019 case BRW_OPCODE_WHILE:
3020 loop_depth--;
3021 break;
3022 case BRW_OPCODE_IF:
3023 if_depth++;
3024 break;
3025 case BRW_OPCODE_ENDIF:
3026 if_depth--;
3027 break;
3028 }
3029 if (loop_depth || if_depth)
3030 continue;
3031
3032 if (inst->opcode != BRW_OPCODE_MOV ||
3033 inst->predicated ||
3034 inst->saturate ||
3035 inst->dst.file != GRF || inst->src[0].file != GRF ||
3036 inst->dst.type != inst->src[0].type)
3037 continue;
3038
3039 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
3040
3041 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
3042 * them: check for no writes to either one until the exit of the
3043 * program.
3044 */
3045 bool interfered = false;
3046 exec_list_iterator scan_iter = iter;
3047 scan_iter.next();
3048 for (; scan_iter.has_next(); scan_iter.next()) {
3049 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3050
3051 if (scan_inst->dst.file == GRF) {
3052 if (scan_inst->dst.reg == inst->dst.reg &&
3053 (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
3054 scan_inst->is_tex())) {
3055 interfered = true;
3056 break;
3057 }
3058 if (scan_inst->dst.reg == inst->src[0].reg &&
3059 (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
3060 scan_inst->is_tex())) {
3061 interfered = true;
3062 break;
3063 }
3064 }
3065
3066 /* The gen6 MATH instruction can't handle source modifiers, so avoid
3067 * coalescing those for now. We should do something more specific.
3068 */
3069 if (intel->gen == 6 && scan_inst->is_math() && has_source_modifiers) {
3070 interfered = true;
3071 break;
3072 }
3073 }
3074 if (interfered) {
3075 continue;
3076 }
3077
3078 /* Rewrite the later usage to point at the source of the move to
3079 * be removed.
3080 */
3081 for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
3082 scan_iter.next()) {
3083 fs_inst *scan_inst = (fs_inst *)scan_iter.get();
3084
3085 for (int i = 0; i < 3; i++) {
3086 if (scan_inst->src[i].file == GRF &&
3087 scan_inst->src[i].reg == inst->dst.reg &&
3088 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
3089 scan_inst->src[i].reg = inst->src[0].reg;
3090 scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
3091 scan_inst->src[i].abs |= inst->src[0].abs;
3092 scan_inst->src[i].negate ^= inst->src[0].negate;
3093 scan_inst->src[i].smear = inst->src[0].smear;
3094 }
3095 }
3096 }
3097
3098 inst->remove();
3099 progress = true;
3100 }
3101
3102 if (progress)
3103 live_intervals_valid = false;
3104
3105 return progress;
3106 }
3107
3108
3109 bool
3110 fs_visitor::compute_to_mrf()
3111 {
3112 bool progress = false;
3113 int next_ip = 0;
3114
3115 calculate_live_intervals();
3116
3117 foreach_iter(exec_list_iterator, iter, this->instructions) {
3118 fs_inst *inst = (fs_inst *)iter.get();
3119
3120 int ip = next_ip;
3121 next_ip++;
3122
3123 if (inst->opcode != BRW_OPCODE_MOV ||
3124 inst->predicated ||
3125 inst->dst.file != MRF || inst->src[0].file != GRF ||
3126 inst->dst.type != inst->src[0].type ||
3127 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
3128 continue;
3129
3130 /* Can't compute-to-MRF this GRF if someone else was going to
3131 * read it later.
3132 */
3133 if (this->virtual_grf_use[inst->src[0].reg] > ip)
3134 continue;
3135
3136 /* Found a move of a GRF to a MRF. Let's see if we can go
3137 * rewrite the thing that made this GRF to write into the MRF.
3138 */
3139 fs_inst *scan_inst;
3140 for (scan_inst = (fs_inst *)inst->prev;
3141 scan_inst->prev != NULL;
3142 scan_inst = (fs_inst *)scan_inst->prev) {
3143 if (scan_inst->dst.file == GRF &&
3144 scan_inst->dst.reg == inst->src[0].reg) {
3145 /* Found the last thing to write our reg we want to turn
3146 * into a compute-to-MRF.
3147 */
3148
3149 if (scan_inst->is_tex()) {
3150 /* texturing writes several contiguous regs, so we can't
3151 * compute-to-mrf that.
3152 */
3153 break;
3154 }
3155
3156 /* If it's predicated, it (probably) didn't populate all
3157 * the channels.
3158 */
3159 if (scan_inst->predicated)
3160 break;
3161
3162 /* SEND instructions can't have MRF as a destination. */
3163 if (scan_inst->mlen)
3164 break;
3165
3166 if (intel->gen >= 6) {
3167 /* gen6 math instructions must have the destination be
3168 * GRF, so no compute-to-MRF for them.
3169 */
3170 if (scan_inst->is_math()) {
3171 break;
3172 }
3173 }
3174
3175 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
3176 /* Found the creator of our MRF's source value. */
3177 scan_inst->dst.file = MRF;
3178 scan_inst->dst.hw_reg = inst->dst.hw_reg;
3179 scan_inst->saturate |= inst->saturate;
3180 inst->remove();
3181 progress = true;
3182 }
3183 break;
3184 }
3185
3186 /* We don't handle flow control here. Most computation of
3187 * values that end up in MRFs happens shortly before the MRF
3188 * write anyway.
3189 */
3190 if (scan_inst->opcode == BRW_OPCODE_DO ||
3191 scan_inst->opcode == BRW_OPCODE_WHILE ||
3192 scan_inst->opcode == BRW_OPCODE_ELSE ||
3193 scan_inst->opcode == BRW_OPCODE_ENDIF) {
3194 break;
3195 }
3196
3197 /* You can't read from an MRF, so if someone else reads our
3198 * MRF's source GRF that we wanted to rewrite, that stops us.
3199 */
3200 bool interfered = false;
3201 for (int i = 0; i < 3; i++) {
3202 if (scan_inst->src[i].file == GRF &&
3203 scan_inst->src[i].reg == inst->src[0].reg &&
3204 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
3205 interfered = true;
3206 }
3207 }
3208 if (interfered)
3209 break;
3210
3211 if (scan_inst->dst.file == MRF &&
3212 scan_inst->dst.hw_reg == inst->dst.hw_reg) {
3213 /* Somebody else wrote our MRF here, so we can't
3214 * compute-to-MRF before that.
3215 */
3216 break;
3217 }
3218
3219 if (scan_inst->mlen > 0) {
3220 /* Found a SEND instruction, which means that there are
3221 * live values in MRFs from base_mrf to base_mrf +
3222 * scan_inst->mlen - 1. Don't go pushing our MRF write up
3223 * above it.
3224 */
3225 if (inst->dst.hw_reg >= scan_inst->base_mrf &&
3226 inst->dst.hw_reg < scan_inst->base_mrf + scan_inst->mlen) {
3227 break;
3228 }
3229 }
3230 }
3231 }
3232
3233 return progress;
3234 }
3235
3236 /**
3237 * Walks through basic blocks, looking for repeated MRF writes and
3238 * removing the later ones.
3239 */
3240 bool
3241 fs_visitor::remove_duplicate_mrf_writes()
3242 {
3243 fs_inst *last_mrf_move[16];
3244 bool progress = false;
3245
3246 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3247
3248 foreach_iter(exec_list_iterator, iter, this->instructions) {
3249 fs_inst *inst = (fs_inst *)iter.get();
3250
3251 switch (inst->opcode) {
3252 case BRW_OPCODE_DO:
3253 case BRW_OPCODE_WHILE:
3254 case BRW_OPCODE_IF:
3255 case BRW_OPCODE_ELSE:
3256 case BRW_OPCODE_ENDIF:
3257 memset(last_mrf_move, 0, sizeof(last_mrf_move));
3258 continue;
3259 default:
3260 break;
3261 }
3262
3263 if (inst->opcode == BRW_OPCODE_MOV &&
3264 inst->dst.file == MRF) {
3265 fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
3266 if (prev_inst && inst->equals(prev_inst)) {
3267 inst->remove();
3268 progress = true;
3269 continue;
3270 }
3271 }
3272
3273 /* Clear out the last-write records for MRFs that were overwritten. */
3274 if (inst->dst.file == MRF) {
3275 last_mrf_move[inst->dst.hw_reg] = NULL;
3276 }
3277
3278 if (inst->mlen > 0) {
3279 /* Found a SEND instruction, which will include two or fewer
3280 * implied MRF writes. We could do better here.
3281 */
3282 for (int i = 0; i < implied_mrf_writes(inst); i++) {
3283 last_mrf_move[inst->base_mrf + i] = NULL;
3284 }
3285 }
3286
3287 /* Clear out any MRF move records whose sources got overwritten. */
3288 if (inst->dst.file == GRF) {
3289 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
3290 if (last_mrf_move[i] &&
3291 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
3292 last_mrf_move[i] = NULL;
3293 }
3294 }
3295 }
3296
3297 if (inst->opcode == BRW_OPCODE_MOV &&
3298 inst->dst.file == MRF &&
3299 inst->src[0].file == GRF &&
3300 !inst->predicated) {
3301 last_mrf_move[inst->dst.hw_reg] = inst;
3302 }
3303 }
3304
3305 return progress;
3306 }
3307
3308 bool
3309 fs_visitor::virtual_grf_interferes(int a, int b)
3310 {
3311 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
3312 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
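/* e.g. def/use of [2, 10] for a and [10, 15] for b gives start ==
 * end == 10, i.e. no interference: the instruction at ip 10 may read
 * a and write b, so the two can share a register.
 */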
3313
3314 /* We can't handle dead register writes here, without iterating
3315 * over the whole instruction stream to find every single dead
3316 * write to that register to compare to the live interval of the
3317 * other register. Just assert that dead_code_eliminate() has been
3318 * called.
3319 */
3320 assert((this->virtual_grf_use[a] != -1 ||
3321 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
3322 (this->virtual_grf_use[b] != -1 ||
3323 this->virtual_grf_def[b] == MAX_INSTRUCTION));
3324
3325 return start < end;
3326 }
3327
3328 static struct brw_reg brw_reg_from_fs_reg(fs_reg *reg)
3329 {
3330 struct brw_reg brw_reg;
3331
3332 switch (reg->file) {
3333 case GRF:
3334 case ARF:
3335 case MRF:
3336 if (reg->smear == -1) {
3337 brw_reg = brw_vec8_reg(reg->file,
3338 reg->hw_reg, 0);
3339 } else {
3340 brw_reg = brw_vec1_reg(reg->file,
3341 reg->hw_reg, reg->smear);
3342 }
3343 brw_reg = retype(brw_reg, reg->type);
3344 break;
3345 case IMM:
3346 switch (reg->type) {
3347 case BRW_REGISTER_TYPE_F:
3348 brw_reg = brw_imm_f(reg->imm.f);
3349 break;
3350 case BRW_REGISTER_TYPE_D:
3351 brw_reg = brw_imm_d(reg->imm.i);
3352 break;
3353 case BRW_REGISTER_TYPE_UD:
3354 brw_reg = brw_imm_ud(reg->imm.u);
3355 break;
3356 default:
3357 assert(!"not reached");
3358 brw_reg = brw_null_reg();
3359 break;
3360 }
3361 break;
3362 case FIXED_HW_REG:
3363 brw_reg = reg->fixed_hw_reg;
3364 break;
3365 case BAD_FILE:
3366 /* Probably unused. */
3367 brw_reg = brw_null_reg();
3368 break;
3369 case UNIFORM:
3370 assert(!"not reached");
3371 brw_reg = brw_null_reg();
3372 break;
3373 default:
3374 assert(!"not reached");
3375 brw_reg = brw_null_reg();
3376 break;
3377 }
3378 if (reg->abs)
3379 brw_reg = brw_abs(brw_reg);
3380 if (reg->negate)
3381 brw_reg = negate(brw_reg);
3382
3383 return brw_reg;
3384 }
3385
3386 void
3387 fs_visitor::generate_code()
3388 {
3389 int last_native_inst = 0;
3390 const char *last_annotation_string = NULL;
3391 ir_instruction *last_annotation_ir = NULL;
3392
3393 int if_stack_array_size = 16;
3394 int loop_stack_array_size = 16;
3395 int if_stack_depth = 0, loop_stack_depth = 0;
3396 brw_instruction **if_stack =
3397 rzalloc_array(this->mem_ctx, brw_instruction *, if_stack_array_size);
3398 brw_instruction **loop_stack =
3399 rzalloc_array(this->mem_ctx, brw_instruction *, loop_stack_array_size);
3400 int *if_depth_in_loop =
3401 rzalloc_array(this->mem_ctx, int, loop_stack_array_size);
3402
3403
3404 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3405 printf("Native code for fragment shader %d:\n",
3406 ctx->Shader.CurrentFragmentProgram->Name);
3407 }
3408
3409 foreach_iter(exec_list_iterator, iter, this->instructions) {
3410 fs_inst *inst = (fs_inst *)iter.get();
3411 struct brw_reg src[3], dst;
3412
3413 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3414 if (last_annotation_ir != inst->ir) {
3415 last_annotation_ir = inst->ir;
3416 if (last_annotation_ir) {
3417 printf(" ");
3418 last_annotation_ir->print();
3419 printf("\n");
3420 }
3421 }
3422 if (last_annotation_string != inst->annotation) {
3423 last_annotation_string = inst->annotation;
3424 if (last_annotation_string)
3425 printf(" %s\n", last_annotation_string);
3426 }
3427 }
3428
3429 for (unsigned int i = 0; i < 3; i++) {
3430 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
3431 }
3432 dst = brw_reg_from_fs_reg(&inst->dst);
3433
3434 brw_set_conditionalmod(p, inst->conditional_mod);
3435 brw_set_predicate_control(p, inst->predicated);
3436 brw_set_saturate(p, inst->saturate);
3437
3438 switch (inst->opcode) {
3439 case BRW_OPCODE_MOV:
3440 brw_MOV(p, dst, src[0]);
3441 break;
3442 case BRW_OPCODE_ADD:
3443 brw_ADD(p, dst, src[0], src[1]);
3444 break;
3445 case BRW_OPCODE_MUL:
3446 brw_MUL(p, dst, src[0], src[1]);
3447 break;
3448
3449 case BRW_OPCODE_FRC:
3450 brw_FRC(p, dst, src[0]);
3451 break;
3452 case BRW_OPCODE_RNDD:
3453 brw_RNDD(p, dst, src[0]);
3454 break;
3455 case BRW_OPCODE_RNDE:
3456 brw_RNDE(p, dst, src[0]);
3457 break;
3458 case BRW_OPCODE_RNDZ:
3459 brw_RNDZ(p, dst, src[0]);
3460 break;
3461
3462 case BRW_OPCODE_AND:
3463 brw_AND(p, dst, src[0], src[1]);
3464 break;
3465 case BRW_OPCODE_OR:
3466 brw_OR(p, dst, src[0], src[1]);
3467 break;
3468 case BRW_OPCODE_XOR:
3469 brw_XOR(p, dst, src[0], src[1]);
3470 break;
3471 case BRW_OPCODE_NOT:
3472 brw_NOT(p, dst, src[0]);
3473 break;
3474 case BRW_OPCODE_ASR:
3475 brw_ASR(p, dst, src[0], src[1]);
3476 break;
3477 case BRW_OPCODE_SHR:
3478 brw_SHR(p, dst, src[0], src[1]);
3479 break;
3480 case BRW_OPCODE_SHL:
3481 brw_SHL(p, dst, src[0], src[1]);
3482 break;
3483
3484 case BRW_OPCODE_CMP:
3485 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
3486 break;
3487 case BRW_OPCODE_SEL:
3488 brw_SEL(p, dst, src[0], src[1]);
3489 break;
3490
3491 case BRW_OPCODE_IF:
3492 if (inst->src[0].file != BAD_FILE) {
3493 assert(intel->gen >= 6);
3494 if_stack[if_stack_depth] = gen6_IF(p, inst->conditional_mod, src[0], src[1]);
3495 } else {
3496 if_stack[if_stack_depth] = brw_IF(p, BRW_EXECUTE_8);
3497 }
3498 if_depth_in_loop[loop_stack_depth]++;
3499 if_stack_depth++;
3500 if (if_stack_array_size <= if_stack_depth) {
3501 if_stack_array_size *= 2;
3502 if_stack = reralloc(this->mem_ctx, if_stack, brw_instruction *,
3503 if_stack_array_size);
3504 }
3505 break;
3506
3507 case BRW_OPCODE_ELSE:
3508 if_stack[if_stack_depth - 1] =
3509 brw_ELSE(p, if_stack[if_stack_depth - 1]);
3510 break;
3511 case BRW_OPCODE_ENDIF:
3512 if_stack_depth--;
3513 brw_ENDIF(p, if_stack[if_stack_depth]);
3514 if_depth_in_loop[loop_stack_depth]--;
3515 break;
3516
3517 case BRW_OPCODE_DO:
3518 loop_stack[loop_stack_depth++] = brw_DO(p, BRW_EXECUTE_8);
3519 if (loop_stack_array_size <= loop_stack_depth) {
3520 loop_stack_array_size *= 2;
3521 loop_stack = reralloc(this->mem_ctx, loop_stack, brw_instruction *,
3522 loop_stack_array_size);
3523 if_depth_in_loop = reralloc(this->mem_ctx, if_depth_in_loop, int,
3524 loop_stack_array_size);
3525 }
3526 if_depth_in_loop[loop_stack_depth] = 0;
3527 break;
3528
3529 case BRW_OPCODE_BREAK:
3530 brw_BREAK(p, if_depth_in_loop[loop_stack_depth]);
3531 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3532 break;
3533 case BRW_OPCODE_CONTINUE:
3534 /* FINISHME: We need to write the loop instruction support still. */
3535 if (intel->gen >= 6)
3536 gen6_CONT(p, loop_stack[loop_stack_depth - 1]);
3537 else
3538 brw_CONT(p, if_depth_in_loop[loop_stack_depth]);
3539 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
3540 break;
3541
3542 case BRW_OPCODE_WHILE: {
3543 struct brw_instruction *inst0, *inst1;
3544 GLuint br = 1;
3545
3546 if (intel->gen >= 5)
3547 br = 2;
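/* A hedged aside: gen5+ appears to count jump distances in 64-bit
 * half-instruction units rather than whole 128-bit instructions,
 * hence br doubles the counts patched into BREAK/CONT below.
 */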
3548
3549 assert(loop_stack_depth > 0);
3550 loop_stack_depth--;
3551 inst0 = inst1 = brw_WHILE(p, loop_stack[loop_stack_depth]);
3552 if (intel->gen < 6) {
3553 /* patch all the BREAK/CONT instructions from last BGNLOOP */
3554 while (inst0 > loop_stack[loop_stack_depth]) {
3555 inst0--;
3556 if (inst0->header.opcode == BRW_OPCODE_BREAK &&
3557 inst0->bits3.if_else.jump_count == 0) {
3558 inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
3559 }
3560 else if (inst0->header.opcode == BRW_OPCODE_CONTINUE &&
3561 inst0->bits3.if_else.jump_count == 0) {
3562 inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
3563 }
3564 }
3565 }
3566 }
3567 break;
3568
3569 case FS_OPCODE_RCP:
3570 case FS_OPCODE_RSQ:
3571 case FS_OPCODE_SQRT:
3572 case FS_OPCODE_EXP2:
3573 case FS_OPCODE_LOG2:
3574 case FS_OPCODE_POW:
3575 case FS_OPCODE_SIN:
3576 case FS_OPCODE_COS:
3577 generate_math(inst, dst, src);
3578 break;
3579 case FS_OPCODE_CINTERP:
3580 brw_MOV(p, dst, src[0]);
3581 break;
3582 case FS_OPCODE_LINTERP:
3583 generate_linterp(inst, dst, src);
3584 break;
3585 case FS_OPCODE_TEX:
3586 case FS_OPCODE_TXB:
3587 case FS_OPCODE_TXD:
3588 case FS_OPCODE_TXL:
3589 generate_tex(inst, dst, src[0]);
3590 break;
3591 case FS_OPCODE_DISCARD_NOT:
3592 generate_discard_not(inst, dst);
3593 break;
3594 case FS_OPCODE_DISCARD_AND:
3595 generate_discard_and(inst, src[0]);
3596 break;
3597 case FS_OPCODE_DDX:
3598 generate_ddx(inst, dst, src[0]);
3599 break;
3600 case FS_OPCODE_DDY:
3601 generate_ddy(inst, dst, src[0]);
3602 break;
3603
3604 case FS_OPCODE_SPILL:
3605 generate_spill(inst, src[0]);
3606 break;
3607
3608 case FS_OPCODE_UNSPILL:
3609 generate_unspill(inst, dst);
3610 break;
3611
3612 case FS_OPCODE_PULL_CONSTANT_LOAD:
3613 generate_pull_constant_load(inst, dst);
3614 break;
3615
3616 case FS_OPCODE_FB_WRITE:
3617 generate_fb_write(inst);
3618 break;
3619 default:
3620 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
3621 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
3622 brw_opcodes[inst->opcode].name);
3623 } else {
3624 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
3625 }
3626 fail("unsupported opcode in FS\n");
3627 }
3628
3629 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3630 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
3631 if (0) {
3632 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3633 ((uint32_t *)&p->store[i])[3],
3634 ((uint32_t *)&p->store[i])[2],
3635 ((uint32_t *)&p->store[i])[1],
3636 ((uint32_t *)&p->store[i])[0]);
3637 }
3638 brw_disasm(stdout, &p->store[i], intel->gen);
3639 }
3640 }
3641
3642 last_native_inst = p->nr_insn;
3643 }
3644
3645 ralloc_free(if_stack);
3646 ralloc_free(loop_stack);
3647 ralloc_free(if_depth_in_loop);
3648
3649 brw_set_uip_jip(p);
3650
3651 /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
3652 * emit issues, it doesn't get the jump distances into the output,
3653 * which is often something we want to debug. So this is here in
3654 * case you're doing that.
3655 */
3656 if (0) {
3657 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3658 for (unsigned int i = 0; i < p->nr_insn; i++) {
3659 printf("0x%08x 0x%08x 0x%08x 0x%08x ",
3660 ((uint32_t *)&p->store[i])[3],
3661 ((uint32_t *)&p->store[i])[2],
3662 ((uint32_t *)&p->store[i])[1],
3663 ((uint32_t *)&p->store[i])[0]);
3664 brw_disasm(stdout, &p->store[i], intel->gen);
3665 }
3666 }
3667 }
3668 }
3669
3670 GLboolean
3671 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
3672 {
3673 struct intel_context *intel = &brw->intel;
3674 struct gl_context *ctx = &intel->ctx;
3675 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
3676
3677 if (!prog)
3678 return GL_FALSE;
3679
3680 struct brw_shader *shader =
3681 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3682 if (!shader)
3683 return GL_FALSE;
3684
3685 /* We always use 8-wide mode, at least for now. For one, flow
3686 * control only works in 8-wide. Also, when we're fragment shader
3687 * bound, we're almost always under register pressure as well, so
3688 * 8-wide would save us from the performance cliff of spilling
3689 * regs.
3690 */
3691 c->dispatch_width = 8;
3692
3693 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3694 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3695 _mesa_print_ir(shader->ir, NULL);
3696 printf("\n");
3697 }
3698
3699 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3700 */
3701 fs_visitor v(c, shader);
3702
3703 if (0) {
3704 v.emit_dummy_fs();
3705 } else {
3706 v.calculate_urb_setup();
3707 if (intel->gen < 6)
3708 v.emit_interpolation_setup_gen4();
3709 else
3710 v.emit_interpolation_setup_gen6();
3711
3712 /* Generate FS IR for main(). (the visitor only descends into
3713 * functions called "main").
3714 */
3715 foreach_iter(exec_list_iterator, iter, *shader->ir) {
3716 ir_instruction *ir = (ir_instruction *)iter.get();
3717 v.base_ir = ir;
3718 ir->accept(&v);
3719 }
3720
3721 v.emit_fb_writes();
3722
3723 v.split_virtual_grfs();
3724
3725 v.setup_paramvalues_refs();
3726 v.setup_pull_constants();
3727
3728 bool progress;
3729 do {
3730 progress = false;
3731
3732 progress = v.remove_duplicate_mrf_writes() || progress;
3733
3734 progress = v.propagate_constants() || progress;
3735 progress = v.register_coalesce() || progress;
3736 progress = v.compute_to_mrf() || progress;
3737 progress = v.dead_code_eliminate() || progress;
3738 } while (progress);
3739
3740 v.schedule_instructions();
3741
3742 v.assign_curb_setup();
3743 v.assign_urb_setup();
3744
3745 if (0) {
3746 /* Debug of register spilling: Go spill everything. */
3747 int virtual_grf_count = v.virtual_grf_next;
3748 for (int i = 1; i < virtual_grf_count; i++) {
3749 v.spill_reg(i);
3750 }
3751 }
3752
3753 if (0)
3754 v.assign_regs_trivial();
3755 else {
3756 while (!v.assign_regs()) {
3757 if (v.failed)
3758 break;
3759 }
3760 }
3761 }
3762
3763 if (!v.failed)
3764 v.generate_code();
3765
3766 assert(!v.failed); /* FINISHME: Cleanly fail, tested at link time, etc. */
3767
3768 if (v.failed)
3769 return GL_FALSE;
3770
3771 c->prog_data.total_grf = v.grf_used;
3772
3773 return GL_TRUE;
3774 }