i965: rename tex_ms to tex_cms
[mesa.git] / src / mesa / drivers / dri / i965 / gen8_fs_generator.cpp
/*
 * Copyright © 2010, 2011, 2012 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file gen8_fs_generator.cpp
 *
 * Code generation for Gen8+ hardware.
 */

extern "C" {
#include "main/macros.h"
#include "brw_context.h"
} /* extern "C" */

#include "brw_fs.h"
#include "brw_cfg.h"
#include "glsl/ir_print_visitor.h"

gen8_fs_generator::gen8_fs_generator(struct brw_context *brw,
                                     struct brw_wm_compile *c,
                                     struct gl_shader_program *shader_prog,
                                     struct gl_fragment_program *fp,
                                     bool dual_source_output)
   : gen8_generator(brw, shader_prog, fp ? &fp->Base : NULL, c), c(c), fp(fp),
     dual_source_output(dual_source_output)
{
   shader =
      shader_prog ? shader_prog->_LinkedShaders[MESA_SHADER_FRAGMENT] : NULL;
}

gen8_fs_generator::~gen8_fs_generator()
{
}

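/* Record that a surface index is referenced, growing the recorded binding
 * table size so state upload emits enough binding table entries.
 */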
void
gen8_fs_generator::mark_surface_used(unsigned surf_index)
{
   assert(surf_index < BRW_MAX_SURFACES);

   c->prog_data.base.binding_table.size_bytes =
      MAX2(c->prog_data.base.binding_table.size_bytes, (surf_index + 1) * 4);
}

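/* Emit the SENDC render target write message, building the optional message
 * header (pixel mask, source0 alpha bit, render target index) first.
 */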
void
gen8_fs_generator::generate_fb_write(fs_inst *ir)
{
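   /* When the shader can discard, f0.1 tracks the still-live pixels; copy it
    * into g1.7 so the header copied below carries the updated pixel mask.
    */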
   if (fp && fp->UsesKill) {
      gen8_instruction *mov =
         MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW),
             brw_flag_reg(0, 1));
      gen8_set_mask_control(mov, BRW_MASK_DISABLE);
   }

   if (ir->header_present) {
      gen8_instruction *mov =
         MOV_RAW(brw_message_reg(ir->base_mrf), brw_vec8_grf(0, 0));
      gen8_set_exec_size(mov, BRW_EXECUTE_16);

      if (ir->target > 0 && c->key.replicate_alpha) {
         /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
         OR(vec1(retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD)),
            vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
            brw_imm_ud(1 << 11));
      }

      if (ir->target > 0) {
         /* Set the render target index for choosing BLEND_STATE. */
         MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, ir->base_mrf, 2),
                    BRW_REGISTER_TYPE_UD),
             brw_imm_ud(ir->target));
      }
   }

   gen8_instruction *inst = next_inst(BRW_OPCODE_SENDC);
   gen8_set_dst(brw, inst, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW));
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));

   /* Set up the "Message Specific Control" bits for the Data Port Message
    * Descriptor.  These are documented in the "Render Target Write" message's
    * "Message Descriptor" documentation (vol5c.2).
    */
   uint32_t msg_type;
   /* Set the Message Type */
   if (this->dual_source_output)
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else if (dispatch_width == 16)
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t msg_control = msg_type;

   /* "Last Render Target Select" must be set on all writes to the last of
    * the render targets (if using MRT), or always for a single RT scenario.
    */
   if ((ir->target == c->key.nr_color_regions - 1) || !c->key.nr_color_regions)
      msg_control |= (1 << 4); /* Last Render Target Select */

   uint32_t surf_index =
      c->prog_data.binding_table.render_target_start + ir->target;

   gen8_set_dp_message(brw, inst,
                       GEN6_SFID_DATAPORT_RENDER_CACHE,
                       surf_index,
                       GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE,
                       msg_control,
                       ir->mlen,
                       0,
                       ir->header_present,
                       ir->eot);

   mark_surface_used(surf_index);
}

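/* PLN evaluates the plane equation, effectively interp[0] * delta_x +
 * interp[1] * delta_y + interp[3]; delta_y is read implicitly from the
 * register following delta_x, hence the adjacency assert below.
 */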
void
gen8_fs_generator::generate_linterp(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg *src)
{
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = src[1];
   struct brw_reg interp = src[2];

   (void) delta_y;
   assert(delta_y.nr == delta_x.nr + 1);
   PLN(dst, interp, delta_x);
}

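/* Lower a texturing instruction to a SEND to the sampler, selecting the
 * message type from the opcode and whether a shadow comparison is needed.
 */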
void
gen8_fs_generator::generate_tex(fs_inst *ir,
                                struct brw_reg dst,
                                struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   assert(src.file == BRW_GENERAL_REGISTER_FILE);

   if (dispatch_width == 16 && !ir->force_uncompressed && !ir->force_sechalf)
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;

   switch (ir->opcode) {
   case SHADER_OPCODE_TEX:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
      }
      break;
   case FS_OPCODE_TXB:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
      }
      break;
   case SHADER_OPCODE_TXL:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
      break;
   case SHADER_OPCODE_TXD:
      if (ir->shadow_compare) {
         msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
      }
      break;
   case SHADER_OPCODE_TXF:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
      break;
   case SHADER_OPCODE_TXF_MCS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
      break;
   case SHADER_OPCODE_LOD:
      msg_type = GEN5_SAMPLER_MESSAGE_LOD;
      break;
   case SHADER_OPCODE_TG4:
      if (ir->shadow_compare) {
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
      } else {
         assert(brw->gen >= 6);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      }
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(brw->gen >= 7);
      if (ir->shadow_compare) {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
      } else {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
      }
      break;
   default:
      assert(!"not reached");
      break;
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   if (ir->header_present) {
      /* The send-from-GRF for SIMD16 texturing with a header has an extra
       * hardware register allocated to it, which we need to skip over (since
       * our coordinates in the payload are in the even-numbered registers,
       * and the header comes right before the first one).
       */
      if (dispatch_width == 16)
         src.nr++;

      MOV_RAW(src, brw_vec8_grf(0, 0));

      if (ir->texture_offset) {
         /* Set the texel offset bits. */
         MOV_RAW(retype(brw_vec1_grf(src.nr, 2), BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(ir->texture_offset));
      }
   }

   uint32_t surf_index =
      c->prog_data.base.binding_table.texture_start + ir->sampler;

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, dst);
   gen8_set_src0(brw, inst, src);
   gen8_set_sampler_message(brw, inst,
                            surf_index,
                            ir->sampler,
                            msg_type,
                            rlen,
                            ir->mlen,
                            ir->header_present,
                            simd_mode);

   mark_surface_used(surf_index);
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 *  arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * and we're trying to produce:
 *
 *            DDX                    DDY
 * dst: (ss0.tr - ss0.tl)    (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)    (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)    (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)    (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)    (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)    (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)    (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)    (ss1.tr - ss1.br)
 *
 * with another set of two subspans added in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz = 0 gets us the same
 * result for each pair, and vertstride = 2 jumps us 2 elements after
 * processing a pair.  But for DDY, it's harder, as we want to produce the
 * pairs swizzled between each other.  We could probably do it like ddx and
 * swizzle the right order later, but bail for now and just produce
 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
 */
void
gen8_fs_generator::generate_ddx(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src)
{
   unsigned vstride, width;

   if (c->key.high_quality_derivatives) {
      /* Produce accurate derivatives. */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* Replicate the derivative at the top-left pixel to other pixels. */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

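   /* src0 starts at subregister 1 (the right pixel of each pair) and src1 at
    * subregister 0 (the left pixel), so src0 - src1 yields the X derivative.
    */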
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   ADD(dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
gen8_fs_generator::generate_ddy(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src,
                                bool negate_value)
{
   unsigned hstride;
   unsigned src0_swizzle;
   unsigned src1_swizzle;
   unsigned src1_subnr;

   if (c->key.high_quality_derivatives) {
      /* Produce accurate derivatives. */
      hstride = BRW_HORIZONTAL_STRIDE_1;
      src0_swizzle = BRW_SWIZZLE_XYXY;
      src1_swizzle = BRW_SWIZZLE_ZWZW;
      src1_subnr = 0;

      default_state.access_mode = BRW_ALIGN_16;
   } else {
      /* Replicate the derivative at the top-left pixel to other pixels. */
      hstride = BRW_HORIZONTAL_STRIDE_0;
      src0_swizzle = BRW_SWIZZLE_XYZW;
      src1_swizzle = BRW_SWIZZLE_XYZW;
      src1_subnr = 2;
   }

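   /* In the Align16 path, the XYXY/ZWZW swizzles select the top and bottom
    * rows of each subspan; in the Align1 path, src1 instead starts two floats
    * in, at the bottom-left pixel.
    */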
   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_4,
                                 BRW_WIDTH_4,
                                 hstride,
                                 src0_swizzle, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, src1_subnr,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_4,
                                 BRW_WIDTH_4,
                                 hstride,
                                 src1_swizzle, WRITEMASK_XYZW);

   if (negate_value)
      ADD(dst, src1, negate(src0));
   else
      ADD(dst, src0, negate(src1));

   default_state.access_mode = BRW_ALIGN_1;
}

void
gen8_fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);
   assert(!"TODO: Implement generate_scratch_write.");
}

void
gen8_fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);
   assert(!"TODO: Implement generate_scratch_read.");
}

void
gen8_fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);
   assert(!"TODO: Implement generate_scratch_read_gen7.");
}

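/* Load a vec4 of uniform data through the sampler cache with an LD message
 * in SIMD4x2 mode, using a dword offset that lives in a GRF.
 */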
void
gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(inst->mlen == 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference only the dword we need lest we anger validate_reg() with
    * reg.width > reg.execsize.
    */
   offset = brw_vec1_grf(offset.nr, 0);

   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_mask_control(send, BRW_MASK_DISABLE);

   /* We use the SIMD4x2 mode because we want to end up with 4 constants in
    * the destination, loaded consecutively from the same offset (taken from
    * the first component of the message payload; the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, offset);
   gen8_set_sampler_message(brw, send,
                            surf_index,
                            0, /* The LD message ignores the sampler unit. */
                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                            1, /* rlen */
                            1, /* mlen */
                            false, /* no header */
                            BRW_SAMPLER_SIMD_MODE_SIMD4X2);

   mark_surface_used(surf_index);
}

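/* Load pull constants where each channel supplies its own dword offset;
 * this is just an ordinary SIMD8/SIMD16 sampler LD message.
 */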
void
gen8_fs_generator::generate_varying_pull_constant_load(fs_inst *ir,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   /* Varying-offset pull constant loads are treated as an ordinary expression
    * on Gen7+, so the fact that it's a send message is hidden at the IR
    * level.
    */
   assert(!ir->header_present);
   assert(!ir->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   uint32_t simd_mode, rlen, mlen;
   if (dispatch_width == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, offset);
   gen8_set_sampler_message(brw, send,
                            surf_index,
                            0, /* The LD message ignores the sampler unit. */
                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                            rlen, /* rlen */
                            mlen, /* mlen */
                            false, /* no header */
                            simd_mode);

   mark_surface_used(surf_index);
}

/**
 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 * into the flags register (f0.0).
 */
void
gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst *ir)
{
   struct brw_reg flags = brw_flag_reg(0, ir->flag_subreg);
   struct brw_reg dispatch_mask =
      retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);

   gen8_instruction *mov = MOV(flags, dispatch_mask);
   gen8_set_mask_control(mov, BRW_MASK_DISABLE);
}

void
gen8_fs_generator::generate_discard_jump(fs_inst *ir)
{
   /* This HALT will be patched up at FB write time to point UIP at the end
    * of the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   discard_halt_patches.push_tail(new(mem_ctx) ip_record(nr_inst));

   HALT();
}

void
gen8_fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (discard_halt_patches.is_empty())
      return;

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   gen8_instruction *last_halt = HALT();
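   /* Jump offsets are in bytes, and one instruction is 16 bytes, so a UIP/JIP
    * of 16 simply falls through to the next instruction.
    */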
   gen8_set_uip(last_halt, 16);
   gen8_set_jip(last_halt, 16);

   int ip = nr_inst;

   foreach_list(node, &discard_halt_patches) {
      ip_record *patch_ip = (ip_record *) node;
      gen8_instruction *patch = &store[patch_ip->ip];
      assert(gen8_opcode(patch) == BRW_OPCODE_HALT);

      /* HALT takes an instruction distance from the pre-incremented IP. */
      gen8_set_uip(patch, (ip - patch_ip->ip) * 16);
   }

   this->discard_halt_patches.make_empty();
}

/**
 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
 * sampler LD messages.
 *
 * We don't want to bake it into the send message's code generation because
 * that means we don't get a chance to schedule the instruction.
 */
void
gen8_fs_generator::generate_set_simd4x2_offset(fs_inst *ir,
                                               struct brw_reg dst,
                                               struct brw_reg value)
{
   assert(value.file == BRW_IMMEDIATE_VALUE);
   MOV_RAW(retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
}

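/* The main code generation loop: lower each fs_inst to native Gen8
 * instructions, optionally printing basic blocks, IR annotations, and
 * disassembly when INTEL_DEBUG=wm is set.
 */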
void
gen8_fs_generator::generate_code(exec_list *instructions)
{
   int last_native_inst_offset = next_inst_offset;
   const char *last_annotation_string = NULL;
   const void *last_annotation_ir = NULL;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("Native code for fragment shader %d (SIMD%d dispatch):\n",
                shader_prog->Name, dispatch_width);
      } else if (fp) {
         printf("Native code for fragment program %d (SIMD%d dispatch):\n",
                prog->Id, dispatch_width);
      } else {
         printf("Native code for blorp program (SIMD%d dispatch):\n",
                dispatch_width);
      }
   }

   cfg_t *cfg = NULL;
   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      cfg = new(mem_ctx) cfg_t(instructions);

   foreach_list(node, instructions) {
      fs_inst *ir = (fs_inst *) node;
      struct brw_reg src[3], dst;

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         foreach_list(node, &cfg->block_list) {
            bblock_link *link = (bblock_link *)node;
            bblock_t *block = link->block;

            if (block->start == ir) {
               printf("   START B%d", block->block_num);
               foreach_list(predecessor_node, &block->parents) {
                  bblock_link *predecessor_link =
                     (bblock_link *)predecessor_node;
                  bblock_t *predecessor_block = predecessor_link->block;
                  printf(" <-B%d", predecessor_block->block_num);
               }
               printf("\n");
            }
         }

         if (last_annotation_ir != ir->ir) {
            last_annotation_ir = ir->ir;
            if (last_annotation_ir) {
               printf("   ");
               if (shader) {
                  ((ir_instruction *) ir->ir)->print();
               } else if (prog) {
                  const prog_instruction *fpi;
                  fpi = (const prog_instruction *) ir->ir;
                  printf("%d: ", (int)(fpi - prog->Instructions));
                  _mesa_fprint_instruction_opt(stdout,
                                               fpi,
                                               0, PROG_PRINT_DEBUG, NULL);
               }
               printf("\n");
            }
         }
         if (last_annotation_string != ir->annotation) {
            last_annotation_string = ir->annotation;
            if (last_annotation_string)
               printf("   %s\n", last_annotation_string);
         }
      }

      for (unsigned int i = 0; i < 3; i++) {
         src[i] = brw_reg_from_fs_reg(&ir->src[i]);

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!ir->conditional_mod ||
                ir->src[i].type != BRW_REGISTER_TYPE_UD ||
                !ir->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&ir->dst);

      default_state.conditional_mod = ir->conditional_mod;
      default_state.predicate = ir->predicate;
      default_state.predicate_inverse = ir->predicate_inverse;
      default_state.saturate = ir->saturate;
      default_state.flag_subreg_nr = ir->flag_subreg;

      if (dispatch_width == 16 && !ir->force_uncompressed)
         default_state.exec_size = BRW_EXECUTE_16;
      else
         default_state.exec_size = BRW_EXECUTE_8;

      /* fs_inst::force_sechalf is only used for original Gen4 code, so we
       * don't handle it.  Add qtr_control to default_state if that changes.
       */
      assert(!ir->force_sechalf);

      switch (ir->opcode) {
      case BRW_OPCODE_MOV:
         MOV(dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         ADD(dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         MUL(dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         MACH(dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         default_state.access_mode = BRW_ALIGN_16;
         MAD(dst, src[0], src[1], src[2]);
         default_state.access_mode = BRW_ALIGN_1;
         break;

      case BRW_OPCODE_LRP:
         default_state.access_mode = BRW_ALIGN_16;
         LRP(dst, src[0], src[1], src[2]);
         default_state.access_mode = BRW_ALIGN_1;
         break;

      case BRW_OPCODE_FRC:
         FRC(dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         RNDD(dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         RNDE(dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         RNDZ(dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         AND(dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         OR(dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         XOR(dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         NOT(dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         ASR(dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         SHR(dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         SHL(dst, src[0], src[1]);
         break;

      case BRW_OPCODE_F32TO16:
         F32TO16(dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         F16TO32(dst, src[0]);
         break;

      case BRW_OPCODE_CMP:
         CMP(dst, ir->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         SEL(dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFREV:
         /* BFREV only supports UD type for src and dst. */
         BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
               retype(src[0], BRW_REGISTER_TYPE_UD));
         break;

      case BRW_OPCODE_FBH:
         /* FBH only supports UD type for dst. */
         FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;

      case BRW_OPCODE_FBL:
         /* FBL only supports UD type for dst. */
         FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;

      case BRW_OPCODE_CBIT:
         /* CBIT only supports UD type for dst. */
         CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;

      case BRW_OPCODE_ADDC:
         ADDC(dst, src[0], src[1]);
         break;

      case BRW_OPCODE_SUBB:
         SUBB(dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         default_state.access_mode = BRW_ALIGN_16;
         BFE(dst, src[0], src[1], src[2]);
         default_state.access_mode = BRW_ALIGN_1;
         break;

      case BRW_OPCODE_BFI1:
         BFI1(dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFI2:
         default_state.access_mode = BRW_ALIGN_16;
         BFI2(dst, src[0], src[1], src[2]);
         default_state.access_mode = BRW_ALIGN_1;
         break;

      case BRW_OPCODE_IF:
         IF(BRW_PREDICATE_NORMAL);
         break;

      case BRW_OPCODE_ELSE:
         ELSE();
         break;

      case BRW_OPCODE_ENDIF:
         ENDIF();
         break;

      case BRW_OPCODE_DO:
         DO();
         break;

      case BRW_OPCODE_BREAK:
         BREAK();
         break;

      case BRW_OPCODE_CONTINUE:
         CONTINUE();
         break;

      case BRW_OPCODE_WHILE:
         WHILE();
         break;

      case SHADER_OPCODE_RCP:
         MATH(BRW_MATH_FUNCTION_INV, dst, src[0]);
         break;

      case SHADER_OPCODE_RSQ:
         MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]);
         break;

      case SHADER_OPCODE_SQRT:
         MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]);
         break;

      case SHADER_OPCODE_EXP2:
         MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]);
         break;

      case SHADER_OPCODE_LOG2:
         MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]);
         break;

      case SHADER_OPCODE_SIN:
         MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]);
         break;

      case SHADER_OPCODE_COS:
         MATH(BRW_MATH_FUNCTION_COS, dst, src[0]);
         break;

      case SHADER_OPCODE_INT_QUOTIENT:
         MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_INT_REMAINDER:
         MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_POW:
         MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PIXEL_X:
      case FS_OPCODE_PIXEL_Y:
         assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
         break;

      case FS_OPCODE_CINTERP:
         MOV(dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(ir, dst, src);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
         generate_tex(ir, dst, src[0]);
         break;

      case FS_OPCODE_DDX:
         generate_ddx(ir, dst, src[0]);
         break;
      case FS_OPCODE_DDY:
         /* Make sure fp->UsesDFdy flag got set (otherwise there's no
          * guarantee that c->key.render_to_fbo is set).
          */
         assert(fp->UsesDFdy);
         generate_ddy(ir, dst, src[0], c->key.render_to_fbo);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(ir, src[0]);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(ir, dst);
         break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
         generate_scratch_read_gen7(ir, dst);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         generate_uniform_pull_constant_load(ir, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load(ir, dst, src[0], src[1]);
         break;

      case FS_OPCODE_FB_WRITE:
         generate_fb_write(ir);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(ir);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(ir);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
         break;

      case FS_OPCODE_SET_SIMD4X2_OFFSET:
         generate_set_simd4x2_offset(ir, dst, src[0]);
         break;

      case FS_OPCODE_SET_OMASK:
         assert(!"XXX: Missing Gen8 scalar support for SET_OMASK");
         break;

      case FS_OPCODE_SET_SAMPLE_ID:
         assert(!"XXX: Missing Gen8 scalar support for SET_SAMPLE_ID");
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         assert(!"XXX: Missing Gen8 scalar support for PACK_HALF_2x16_SPLIT");
         break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         assert(!"XXX: Missing Gen8 scalar support for UNPACK_HALF_2x16_SPLIT");
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         patch_discard_jumps_to_fb_writes();
         break;

      default:
         if (ir->opcode < int(ARRAY_SIZE(opcode_descs))) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          opcode_descs[ir->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", ir->opcode);
         }
         abort();
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         disassemble(stdout, last_native_inst_offset, next_inst_offset);

         foreach_list(node, &cfg->block_list) {
            bblock_link *link = (bblock_link *)node;
            bblock_t *block = link->block;

            if (block->end == ir) {
               printf("   END B%d", block->block_num);
               foreach_list(successor_node, &block->children) {
                  bblock_link *successor_link =
                     (bblock_link *)successor_node;
                  bblock_t *successor_block = successor_link->block;
                  printf(" ->B%d", successor_block->block_num);
               }
               printf("\n");
            }
         }
      }

      last_native_inst_offset = next_inst_offset;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   patch_jump_targets();
}

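/* Generate the SIMD8 and (optionally) SIMD16 programs back to back in one
 * instruction store, recording the offset where the SIMD16 variant begins.
 */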
const unsigned *
gen8_fs_generator::generate_assembly(exec_list *simd8_instructions,
                                     exec_list *simd16_instructions,
                                     unsigned *assembly_size)
{
   assert(simd8_instructions || simd16_instructions);

   if (simd8_instructions) {
      dispatch_width = 8;
      generate_code(simd8_instructions);
   }

   if (simd16_instructions) {
      /* Align to a 64-byte boundary. */
      while ((nr_inst * sizeof(gen8_instruction)) % 64)
         NOP();

      /* Save off the start of this SIMD16 program. */
      c->prog_data.prog_offset_16 = nr_inst * sizeof(gen8_instruction);

      dispatch_width = 16;
      generate_code(simd16_instructions);
   }

   *assembly_size = next_inst_offset;
   return (const unsigned *) store;
}