nouveau: Add support for ARB_sampler_objects
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_emit.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_emit.cpp
25 *
26 * This file supports emitting code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 extern "C" {
31 #include "main/macros.h"
32 #include "brw_context.h"
33 #include "brw_eu.h"
34 } /* extern "C" */
35
36 #include "brw_fs.h"
37 #include "brw_fs_cfg.h"
38 #include "glsl/ir_print_visitor.h"
39
/**
 * Emit the render-target-write (framebuffer write) SEND for \p inst.
 *
 * Builds the optional message header in the MRFs, picks the dataport
 * message control matching the dispatch width / dual-source blending,
 * and emits the actual FB write via brw_fb_WRITE().
 */
void
fs_visitor::generate_fb_write(fs_inst *inst)
{
   bool eot = inst->eot;
   struct brw_reg implied_header;
   uint32_t msg_control;

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (inst->header_present) {
      if (intel->gen >= 6) {
         /* Gen6+: copy g0 into the first header MRF (compressed so both
          * halves are written), then patch in the render target index.
          */
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                           inst->base_mrf, 2),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         /* Header was fully constructed in MRFs; no implied header GRF. */
         implied_header = brw_null_reg();
      } else {
         /* Pre-gen6: g0 is supplied as the implied header source; g1 is
          * copied into the second header MRF explicitly.
          */
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p,
                 brw_message_reg(inst->base_mrf + 1),
                 brw_vec8_grf(1, 0));
      }
   } else {
      implied_header = brw_null_reg();
   }

   /* Select message control: dual-source blending, SIMD16, or SIMD8. */
   if (this->dual_src_output.file != BAD_FILE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else if (c->dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
                c->dispatch_width,
                inst->base_mrf,
                implied_header,
                msg_control,
                inst->target,
                inst->mlen,
                0,
                eot,
                inst->header_present);
}
102
103 /* Computes the integer pixel x,y values from the origin.
104 *
105 * This is the basis of gl_FragCoord computation, but is also used
106 * pre-gen6 for computing the deltas from v0 for computing
107 * interpolation.
108 */
/**
 * Compute per-channel integer pixel X or Y coordinates into \p dst.
 *
 * Reads the subspan origin coordinates packed in g1 (UW) and adds a
 * per-channel immediate-vector delta to expand them to the 2x2 subspan
 * pixel positions.
 */
void
fs_visitor::generate_pixel_xy(struct brw_reg dst, bool is_x)
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
   struct brw_reg src;
   struct brw_reg deltas;

   if (is_x) {
      /* X origins live at UW offset 4; deltas 0,1,0,1,... across a subspan */
      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
      deltas = brw_imm_v(0x10101010);
   } else {
      /* Y origins live at UW offset 5; deltas 0,0,1,1,... across a subspan */
      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
      deltas = brw_imm_v(0x11001100);
   }

   if (c->dispatch_width == 16) {
      dst = vec16(dst);
   }

   /* We do this 8 or 16-wide, but since the destination is UW we
    * don't do compression in the 16-wide case.
    */
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_ADD(p, dst, src, deltas);
   brw_pop_insn_state(p);
}
136
/**
 * Emit linear interpolation of a varying: dst = interp(delta_x, delta_y).
 *
 * Uses the single PLN instruction when the hardware has it and the
 * operand placement restrictions are met; otherwise falls back to the
 * two-instruction LINE+MAC sequence.
 */
void
fs_visitor::generate_linterp(fs_inst *inst,
                             struct brw_reg dst, struct brw_reg *src)
{
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = src[1];
   struct brw_reg interp = src[2];

   /* PLN requires the deltas in adjacent registers; pre-gen6 additionally
    * requires delta_x to start on an even register number.
    */
   if (brw->has_pln &&
       delta_y.nr == delta_x.nr + 1 &&
       (intel->gen >= 6 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}
154
/**
 * Emit a one-source math instruction (rcp, rsq, sin, ...) on gen7.
 *
 * Gen7 math takes its operand directly from the GRF (no message
 * payload), hence the mlen == 0 assertion.
 */
void
fs_visitor::generate_math1_gen7(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0)
{
   assert(inst->mlen == 0);
   brw_math(p, dst,
            brw_math_function(inst->opcode),
            inst->saturate ? BRW_MATH_SATURATE_SATURATE
                           : BRW_MATH_SATURATE_NONE,
            0, src0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}
169
/**
 * Emit a two-source math instruction (pow, int div/mod) on gen7.
 *
 * As with gen7 math1, operands come straight from the GRF with no
 * message payload (mlen == 0).
 */
void
fs_visitor::generate_math2_gen7(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(inst->mlen == 0);
   brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
}
179
/**
 * Emit a one-source math instruction on gen6.
 *
 * Gen6 math cannot be emitted compressed, so in 16-wide dispatch the
 * operation is issued once per half with explicit compression control.
 */
void
fs_visitor::generate_math1_gen6(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen == 0);

   /* First (or only) 8-wide half. */
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math(p, dst,
            op,
            inst->saturate ? BRW_MATH_SATURATE_SATURATE :
            BRW_MATH_SATURATE_NONE,
            0, src0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);

   if (c->dispatch_width == 16) {
      /* Second half, then restore compressed mode for following code. */
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math(p, sechalf(dst),
               op,
               inst->saturate ? BRW_MATH_SATURATE_SATURATE :
               BRW_MATH_SATURATE_NONE,
               0, sechalf(src0),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
210
/**
 * Emit a two-source math instruction on gen6.
 *
 * Like gen6 math1, this must be emitted uncompressed; 16-wide dispatch
 * issues the operation separately for each 8-wide half.
 */
void
fs_visitor::generate_math2_gen6(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen == 0);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math2(p, dst, op, src0, src1);

   if (c->dispatch_width == 16) {
      /* Second half, then restore compressed mode for following code. */
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
230
/**
 * Emit a math operation on gen4/gen5 via the extended math SEND.
 *
 * Pre-gen6 math is a message to the math unit: the operand payload was
 * staged in MRFs (mlen >= 1) starting at inst->base_mrf.  16-wide
 * dispatch issues one 8-wide message per half, with the second half's
 * payload one MRF later.
 */
void
fs_visitor::generate_math_gen4(fs_inst *inst,
                               struct brw_reg dst,
                               struct brw_reg src)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math(p, dst,
            op,
            inst->saturate ? BRW_MATH_SATURATE_SATURATE :
            BRW_MATH_SATURATE_NONE,
            inst->base_mrf, src,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);

   if (c->dispatch_width == 16) {
      /* Second half uses the next MRF for its payload. */
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math(p, sechalf(dst),
               op,
               inst->saturate ? BRW_MATH_SATURATE_SATURATE :
               BRW_MATH_SATURATE_NONE,
               inst->base_mrf + 1, sechalf(src),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
262
/**
 * Emit a sampler message for a texturing instruction.
 *
 * Selects the sampler return format from the destination type, picks
 * the message type (gen5+ unified encodings vs. the older per-SIMD
 * gen4 encodings), sets the SIMD mode and response length, and emits
 * the SEND via brw_SAMPLE().
 */
void
fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   uint32_t return_format;

   /* Return format follows the destination register type. */
   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   if (c->dispatch_width == 16)
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;

   if (intel->gen >= 5) {
      /* Gen5+ has one message-type namespace regardless of SIMD mode. */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
	 } else {
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
	 }
	 break;
      case SHADER_OPCODE_TXS:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually */
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
	 break;
      case SHADER_OPCODE_TXF:
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
	 break;
      default:
	 assert(!"not reached");
	 break;
      }
   } else {
      /* Gen4: message encodings differ per SIMD width, and some
       * operations are only available in one width; the mlen asserts
       * match what the visitor sets up for each message.
       */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
	 /* Note that G45 and older determines shadow compare and dispatch width
	  * from message length for most messages.
	  */
	 assert(c->dispatch_width == 8);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 6);
	 } else {
	    assert(inst->mlen <= 4);
	 }
	 break;
      case FS_OPCODE_TXB:
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXL:
	 if (inst->shadow_compare) {
	    assert(inst->mlen == 6);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
	 } else {
	    assert(inst->mlen == 9);
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 }
	 break;
      case SHADER_OPCODE_TXD:
	 /* There is no sample_d_c message; comparisons are done manually */
	 assert(inst->mlen == 7 || inst->mlen == 10);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
	 break;
      case SHADER_OPCODE_TXF:
	 assert(inst->mlen == 9);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      case SHADER_OPCODE_TXS:
	 assert(inst->mlen == 3);
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
	 break;
      default:
	 assert(!"not reached");
	 break;
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      /* SIMD16 returns twice the data: 8 regs for an XYZW result. */
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
	      retype(dst, BRW_REGISTER_TYPE_UW),
	      inst->base_mrf,
	      src,
	      SURF_INDEX_TEXTURE(inst->sampler),
	      inst->sampler,
	      WRITEMASK_XYZW,
	      msg_type,
	      rlen,
	      inst->mlen,
	      inst->header_present,
	      simd_mode,
	      return_format);
}
398
399
400 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
401 * looking like:
402 *
403 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
404 *
405 * and we're trying to produce:
406 *
407 * DDX DDY
408 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
409 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
410 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
411 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
412 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
413 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
414 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
415 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
416 *
417 * and add another set of two more subspans if in 16-pixel dispatch mode.
418 *
419 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
420 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
421 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
422 * between each other. We could probably do it like ddx and swizzle the right
423 * order later, but bail for now and just produce
424 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
425 */
/**
 * Emit the horizontal derivative (dFdx) of \p src into \p dst.
 *
 * Uses two overlapping region descriptions of the same register so a
 * single ADD computes (right pixel - left pixel) for every channel:
 * src0 starts at suboffset 1 (the "right" samples), src1 at suboffset 0
 * (the "left" samples), both with width 2 / hstride 0 so each value is
 * replicated across its pixel pair.  See the block comment above for
 * the subspan layout.
 */
void
fs_visitor::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
				 BRW_REGISTER_TYPE_F,
				 BRW_VERTICAL_STRIDE_2,
				 BRW_WIDTH_2,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
				 BRW_REGISTER_TYPE_F,
				 BRW_VERTICAL_STRIDE_2,
				 BRW_WIDTH_2,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}
443
444 /* The negate_value boolean is used to negate the derivative computation for
445 * FBOs, since they place the origin at the upper left instead of the lower
446 * left.
447 */
/**
 * Emit the vertical derivative (dFdy) of \p src into \p dst.
 *
 * src0 covers the top-row samples (suboffset 0) and src1 the bottom-row
 * samples (suboffset 2), each replicated 4-wide across its subspan; a
 * single ADD produces (top - bottom) or the negation.  \p negate_value
 * flips the sign to account for FBO rendering, where the origin is at
 * the upper left instead of the lower left.
 */
void
fs_visitor::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                         bool negate_value)
{
   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
				 BRW_REGISTER_TYPE_F,
				 BRW_VERTICAL_STRIDE_4,
				 BRW_WIDTH_4,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
				 BRW_REGISTER_TYPE_F,
				 BRW_VERTICAL_STRIDE_4,
				 BRW_WIDTH_4,
				 BRW_HORIZONTAL_STRIDE_0,
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   if (negate_value)
      brw_ADD(p, dst, src1, negate(src0));
   else
      brw_ADD(p, dst, src0, negate(src1));
}
469
/**
 * Emit the instruction sequence for a fragment discard.
 *
 * Clears the bits of the pixel-mask word in g1.7 (gen6+) or g0 (pre-gen6)
 * corresponding to the currently active channels, so later FB writes
 * drop the discarded pixels.  The flag register f0 is used as scratch.
 */
void
fs_visitor::generate_discard(fs_inst *inst)
{
   struct brw_reg f0 = brw_flag_reg();

   if (intel->gen >= 6) {
      struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
      struct brw_reg some_register;

      /* As of gen6, we no longer have the mask register to look at,
       * so life gets a bit more complicated.
       */

      /* Load the flag register with all ones. */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, f0, brw_imm_uw(0xffff));
      brw_pop_insn_state(p);

      /* Do a comparison that should always fail, to produce 0s in the flag
       * reg where we have active channels.
       */
      some_register = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      brw_CMP(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UD),
	      BRW_CONDITIONAL_NZ, some_register, some_register);

      /* Undo CMP's whacking of predication*/
      brw_set_predicate_control(p, BRW_PREDICATE_NONE);

      /* AND the mask word with f0 (which now has 0s for active channels),
       * clearing the discarded pixels' bits.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_AND(p, g1, f0, g1);
      brw_pop_insn_state(p);
   } else {
      struct brw_reg g0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      /* Unlike the 965, we have the mask reg, so we just need
       * somewhere to invert that (containing channels to be disabled)
       * so it can be ANDed with the mask of pixels still to be
       * written. Use the flag reg for consistency with gen6+.
       */
      brw_NOT(p, f0, brw_mask_reg(1)); /* IMASK */
      brw_AND(p, g0, f0, g0);

      brw_pop_insn_state(p);
   }
}
521
/**
 * Spill a register to the scratch (stack) buffer.
 *
 * Copies \p src into the message payload MRF and emits an oword block
 * write of one register at the instruction's scratch offset.
 */
void
fs_visitor::generate_spill(fs_inst *inst, struct brw_reg src)
{
   assert(inst->mlen != 0);

   brw_MOV(p,
	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
	   retype(src, BRW_REGISTER_TYPE_UD));
   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
				 inst->offset);
}
533
/**
 * Reload a previously spilled register from the scratch buffer into
 * \p dst via an oword block read, with the gen4 SEND-destination
 * workarounds around it.
 */
void
fs_visitor::generate_unspill(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);

   /* Clear any post destination dependencies that would be ignored by
    * the block read.  See the B-Spec for pre-gen5 send instruction.
    *
    * This could use a better solution, since texture sampling and
    * math reads could potentially run into it as well -- anywhere
    * that we have a SEND with a destination that is a register that
    * was written but not read within the last N instructions (what's
    * N?  unsure).  This is rare because of dead code elimination, but
    * not impossible.
    */
   if (intel->gen == 4 && !intel->is_g4x)
      brw_MOV(p, brw_null_reg(), dst);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
				inst->offset);

   if (intel->gen == 4 && !intel->is_g4x) {
      /* gen4 errata: destination from a send can't be used as a
       * destination until it's been read.  Just read it so we don't
       * have to worry.
       */
      brw_MOV(p, brw_null_reg(), dst);
   }
}
563
/**
 * Load a uniform that didn't fit in push constants from the constant
 * buffer surface into \p dst via an oword block read.  Uses the same
 * gen4 SEND-destination workarounds as generate_unspill().
 */
void
fs_visitor::generate_pull_constant_load(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);

   /* Clear any post destination dependencies that would be ignored by
    * the block read.  See the B-Spec for pre-gen5 send instruction.
    *
    * This could use a better solution, since texture sampling and
    * math reads could potentially run into it as well -- anywhere
    * that we have a SEND with a destination that is a register that
    * was written but not read within the last N instructions (what's
    * N?  unsure).  This is rare because of dead code elimination, but
    * not impossible.
    */
   if (intel->gen == 4 && !intel->is_g4x)
      brw_MOV(p, brw_null_reg(), dst);

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
			inst->offset, SURF_INDEX_FRAG_CONST_BUFFER);

   if (intel->gen == 4 && !intel->is_g4x) {
      /* gen4 errata: destination from a send can't be used as a
       * destination until it's been read.  Just read it so we don't
       * have to worry.
       */
      brw_MOV(p, brw_null_reg(), dst);
   }
}
593
594
595 /**
596 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
597 * into the flags register (f0.0).
598 *
599 * Used only on Gen6 and above.
600 */
void
fs_visitor::generate_mov_dispatch_to_flags()
{
   struct brw_reg f0 = brw_flag_reg();
   struct brw_reg g1 = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);

   assert (intel->gen >= 6);
   brw_push_insn_state(p);
   /* WE_all: the mask move must happen regardless of channel enables. */
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, f0, g1);
   brw_pop_insn_state(p);
}
613
614
615 static uint32_t brw_file_from_reg(fs_reg *reg)
616 {
617 switch (reg->file) {
618 case ARF:
619 return BRW_ARCHITECTURE_REGISTER_FILE;
620 case GRF:
621 return BRW_GENERAL_REGISTER_FILE;
622 case MRF:
623 return BRW_MESSAGE_REGISTER_FILE;
624 case IMM:
625 return BRW_IMMEDIATE_VALUE;
626 default:
627 assert(!"not reached");
628 return BRW_GENERAL_REGISTER_FILE;
629 }
630 }
631
/* Convert an fs_reg (the FS LIR register description) into a concrete
 * brw_reg hardware register encoding, applying type, smear, second-half,
 * absolute-value and negation modifiers.
 */
static struct brw_reg
brw_reg_from_fs_reg(fs_reg *reg)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case GRF:
   case ARF:
   case MRF:
      /* smear == -1 means the full vec8 register; otherwise pick the
       * single smeared component as a vec1.
       */
      if (reg->smear == -1) {
	 brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
      } else {
	 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
      }
      brw_reg = retype(brw_reg, reg->type);
      if (reg->sechalf)
	 brw_reg = sechalf(brw_reg);
      break;
   case IMM:
      switch (reg->type) {
      case BRW_REGISTER_TYPE_F:
	 brw_reg = brw_imm_f(reg->imm.f);
	 break;
      case BRW_REGISTER_TYPE_D:
	 brw_reg = brw_imm_d(reg->imm.i);
	 break;
      case BRW_REGISTER_TYPE_UD:
	 brw_reg = brw_imm_ud(reg->imm.u);
	 break;
      default:
	 assert(!"not reached");
	 brw_reg = brw_null_reg();
	 break;
      }
      break;
   case FIXED_HW_REG:
      brw_reg = reg->fixed_hw_reg;
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case UNIFORM:
      /* Uniforms should have been lowered to GRF/pull loads by now. */
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   default:
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   }
   /* Source modifiers apply on top of whatever encoding we built. */
   if (reg->abs)
      brw_reg = brw_abs(brw_reg);
   if (reg->negate)
      brw_reg = negate(brw_reg);

   return brw_reg;
}
690
/**
 * Main code-generation loop: walk the FS LIR instruction list and emit
 * native instructions for each one.
 *
 * Per instruction this converts the fs_reg operands to brw_regs, sets
 * up the shared instruction state (conditional mod, predication,
 * saturate, compression), then dispatches on the opcode to either a
 * direct brw_eu emitter or one of the generate_*() helpers above.
 * Under INTEL_DEBUG=wm it also prints basic-block boundaries, IR
 * annotations, and a disassembly of the emitted instructions.
 */
void
fs_visitor::generate_code()
{
   /* Index of the first native instruction emitted for the current LIR
    * instruction, so debug output can disassemble just-added insns.
    */
   int last_native_inst = p->nr_insn;
   const char *last_annotation_string = NULL;
   ir_instruction *last_annotation_ir = NULL;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("Native code for fragment shader %d (%d-wide dispatch):\n",
	     prog->Name, c->dispatch_width);
   }

   /* CFG is only needed for the debug block-boundary annotations. */
   fs_cfg *cfg = NULL;
   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      cfg = new(mem_ctx) fs_cfg(this);

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      struct brw_reg src[3], dst;

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 /* Print "START Bn" with predecessor edges when this instruction
	  * begins a basic block.
	  */
	 foreach_list(node, &cfg->block_list) {
	    fs_bblock_link *link = (fs_bblock_link *)node;
	    fs_bblock *block = link->block;

	    if (block->start == inst) {
	       printf("   START B%d", block->block_num);
	       foreach_list(predecessor_node, &block->parents) {
		  fs_bblock_link *predecessor_link =
		     (fs_bblock_link *)predecessor_node;
		  fs_bblock *predecessor_block = predecessor_link->block;
		  printf(" <-B%d", predecessor_block->block_num);
	       }
	       printf("\n");
	    }
	 }

	 /* Print the originating GLSL IR / annotation once per change. */
	 if (last_annotation_ir != inst->ir) {
	    last_annotation_ir = inst->ir;
	    if (last_annotation_ir) {
	       printf("   ");
	       last_annotation_ir->print();
	       printf("\n");
	    }
	 }
	 if (last_annotation_string != inst->annotation) {
	    last_annotation_string = inst->annotation;
	    if (last_annotation_string)
	       printf("   %s\n", last_annotation_string);
	 }
      }

      for (unsigned int i = 0; i < 3; i++) {
	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);

	 /* The accumulator result appears to get used for the
	  * conditional modifier generation.  When negating a UD
	  * value, there is a 33rd bit generated for the sign in the
	  * accumulator value, so now you can't check, for example,
	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
	  */
	 assert(!inst->conditional_mod ||
		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
		!inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      /* Shared instruction-word state for whatever we emit below. */
      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicated);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_saturate(p, inst->saturate);

      if (inst->force_uncompressed || c->dispatch_width == 8) {
	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      } else if (inst->force_sechalf) {
	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      } else {
	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
	 brw_MOV(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ADD:
	 brw_ADD(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MUL:
	 brw_MUL(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_MACH:
	 /* MACH reads/writes the accumulator; enable acc writes around it. */
	 brw_set_acc_write_control(p, 1);
	 brw_MACH(p, dst, src[0], src[1]);
	 brw_set_acc_write_control(p, 0);
	 break;

      case BRW_OPCODE_MAD:
	 /* Three-source MAD requires align16 mode and can't be emitted
	  * compressed, so 16-wide is split into two halves.
	  */
	 brw_set_access_mode(p, BRW_ALIGN_16);
	 if (c->dispatch_width == 16) {
	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
	    brw_MAD(p, dst, src[0], src[1], src[2]);
	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
	    brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
	 } else {
	    brw_MAD(p, dst, src[0], src[1], src[2]);
	 }
	 brw_set_access_mode(p, BRW_ALIGN_1);
	 break;

      case BRW_OPCODE_FRC:
	 brw_FRC(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDD:
	 brw_RNDD(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDE:
	 brw_RNDE(p, dst, src[0]);
	 break;
      case BRW_OPCODE_RNDZ:
	 brw_RNDZ(p, dst, src[0]);
	 break;

      case BRW_OPCODE_AND:
	 brw_AND(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_OR:
	 brw_OR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_XOR:
	 brw_XOR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_NOT:
	 brw_NOT(p, dst, src[0]);
	 break;
      case BRW_OPCODE_ASR:
	 brw_ASR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHR:
	 brw_SHR(p, dst, src[0], src[1]);
	 break;
      case BRW_OPCODE_SHL:
	 brw_SHL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_CMP:
	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
	 break;
      case BRW_OPCODE_SEL:
	 brw_SEL(p, dst, src[0], src[1]);
	 break;

      case BRW_OPCODE_IF:
	 if (inst->src[0].file != BAD_FILE) {
	    /* The instruction has an embedded compare (only allowed on gen6) */
	    assert(intel->gen == 6);
	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
	 } else {
	    brw_IF(p, c->dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
	 }
	 break;

      case BRW_OPCODE_ELSE:
	 brw_ELSE(p);
	 break;
      case BRW_OPCODE_ENDIF:
	 brw_ENDIF(p);
	 break;

      case BRW_OPCODE_DO:
	 brw_DO(p, BRW_EXECUTE_8);
	 break;

      case BRW_OPCODE_BREAK:
	 brw_BREAK(p);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;
      case BRW_OPCODE_CONTINUE:
	 /* FINISHME: We need to write the loop instruction support still. */
	 if (intel->gen >= 6)
	    gen6_CONT(p);
	 else
	    brw_CONT(p);
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
	 break;

      case BRW_OPCODE_WHILE:
	 brw_WHILE(p);
	 break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
	 /* One-source math: pick the generation-specific emitter. */
	 if (intel->gen >= 7) {
	    generate_math1_gen7(inst, dst, src[0]);
	 } else if (intel->gen == 6) {
	    generate_math1_gen6(inst, dst, src[0]);
	 } else {
	    generate_math_gen4(inst, dst, src[0]);
	 }
	 break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
	 /* Two-source math.  NOTE(review): the gen4 fallback passes only
	  * src[0] -- presumably src[1] was staged in the MRF payload by
	  * the visitor; confirm against the math message setup.
	  */
	 if (intel->gen >= 7) {
	    generate_math2_gen7(inst, dst, src[0], src[1]);
	 } else if (intel->gen == 6) {
	    generate_math2_gen6(inst, dst, src[0], src[1]);
	 } else {
	    generate_math_gen4(inst, dst, src[0]);
	 }
	 break;
      case FS_OPCODE_PIXEL_X:
	 generate_pixel_xy(dst, true);
	 break;
      case FS_OPCODE_PIXEL_Y:
	 generate_pixel_xy(dst, false);
	 break;
      case FS_OPCODE_CINTERP:
	 brw_MOV(p, dst, src[0]);
	 break;
      case FS_OPCODE_LINTERP:
	 generate_linterp(inst, dst, src);
	 break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
	 generate_tex(inst, dst, src[0]);
	 break;
      case FS_OPCODE_DISCARD:
	 generate_discard(inst);
	 break;
      case FS_OPCODE_DDX:
	 generate_ddx(inst, dst, src[0]);
	 break;
      case FS_OPCODE_DDY:
	 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
	  * guarantee that c->key.render_to_fbo is set).
	  */
	 assert(fp->UsesDFdy);
	 generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
	 break;

      case FS_OPCODE_SPILL:
	 generate_spill(inst, src[0]);
	 break;

      case FS_OPCODE_UNSPILL:
	 generate_unspill(inst, dst);
	 break;

      case FS_OPCODE_PULL_CONSTANT_LOAD:
	 generate_pull_constant_load(inst, dst);
	 break;

      case FS_OPCODE_FB_WRITE:
	 generate_fb_write(inst);
	 break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
	 generate_mov_dispatch_to_flags();
	 break;

      default:
	 if (inst->opcode < (int)ARRAY_SIZE(brw_opcodes)) {
	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
			  brw_opcodes[inst->opcode].name);
	 } else {
	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
	 }
	 fail("unsupported opcode in FS\n");
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 /* Disassemble the native instructions emitted for this LIR inst. */
	 for (unsigned int i = last_native_inst; i < p->nr_insn; i++) {
	    if (0) {
	       printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		      ((uint32_t *)&p->store[i])[3],
		      ((uint32_t *)&p->store[i])[2],
		      ((uint32_t *)&p->store[i])[1],
		      ((uint32_t *)&p->store[i])[0]);
	    }
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }

	 /* Print "END Bn" with successor edges when this instruction
	  * ends a basic block.
	  */
	 foreach_list(node, &cfg->block_list) {
	    fs_bblock_link *link = (fs_bblock_link *)node;
	    fs_bblock *block = link->block;

	    if (block->end == inst) {
	       printf("   END B%d", block->block_num);
	       foreach_list(successor_node, &block->children) {
		  fs_bblock_link *successor_link =
		     (fs_bblock_link *)successor_node;
		  fs_bblock *successor_block = successor_link->block;
		  printf(" ->B%d", successor_block->block_num);
	       }
	       printf("\n");
	    }
	 }
      }

      last_native_inst = p->nr_insn;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   /* Patch up jump targets now that all instructions are emitted. */
   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug.  So this is here in
    * case you're doing that.
    */
   if (0) {
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
	 for (unsigned int i = 0; i < p->nr_insn; i++) {
	    printf("0x%08x 0x%08x 0x%08x 0x%08x ",
		   ((uint32_t *)&p->store[i])[3],
		   ((uint32_t *)&p->store[i])[2],
		   ((uint32_t *)&p->store[i])[1],
		   ((uint32_t *)&p->store[i])[0]);
	    brw_disasm(stdout, &p->store[i], intel->gen);
	 }
      }
   }
}