e0b52126705cdcad30bcad9acc6533329fe56d02
[mesa.git] / src / mesa / drivers / dri / i965 / gen8_fs_generator.cpp
1 /*
2 * Copyright © 2010, 2011, 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
/** @file gen8_fs_generator.cpp
25 *
26 * Code generation for Gen8+ hardware.
27 */
28
29 extern "C" {
30 #include "main/macros.h"
31 #include "brw_context.h"
32 } /* extern "C" */
33
34 #include "brw_fs.h"
35 #include "brw_cfg.h"
36 #include "glsl/ir_print_visitor.h"
37
/**
 * Construct a Gen8+ fragment shader code generator.
 *
 * \param shader_prog GLSL shader program; may be NULL for non-GLSL paths
 *                    (e.g. ARB fragment programs or blorp — see the debug
 *                    output in generate_code()).
 * \param fp          gl_fragment_program; may be NULL (blorp), in which
 *                    case no gl_program base is handed to gen8_generator.
 * \param dual_source_output selects the dual-source render target write
 *                    message in generate_fb_write().
 */
gen8_fs_generator::gen8_fs_generator(struct brw_context *brw,
                                     struct brw_wm_compile *c,
                                     struct gl_shader_program *shader_prog,
                                     struct gl_fragment_program *fp,
                                     bool dual_source_output)
   : gen8_generator(brw, shader_prog, fp ? &fp->Base : NULL, c), c(c), fp(fp),
     dual_source_output(dual_source_output)
{
}
47
gen8_fs_generator::~gen8_fs_generator()
{
   /* Nothing to tear down: all allocations are owned elsewhere (mem_ctx). */
}
51
52 void
53 gen8_fs_generator::mark_surface_used(unsigned surf_index)
54 {
55 assert(surf_index < BRW_MAX_SURFACES);
56
57 c->prog_data.base.binding_table.size_bytes =
58 MAX2(c->prog_data.base.binding_table.size_bytes, (surf_index + 1) * 4);
59 }
60
/**
 * Emit the framebuffer write (SENDC to the render cache data port) for a
 * FS_OPCODE_FB_WRITE instruction, including any required header setup.
 */
void
gen8_fs_generator::generate_fb_write(fs_inst *ir)
{
   /* If the shader uses discard, copy flag register f0.1 into the pixel
    * mask field of r1 (force_writemask_all, so the copy itself isn't
    * predicated).  NOTE(review): assumes f0.1 holds the surviving-channel
    * mask at this point — confirm against the discard lowering.
    */
   if (fp && fp->UsesKill) {
      gen8_instruction *mov =
         MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW),
             brw_flag_reg(0, 1));
      gen8_set_mask_control(mov, BRW_MASK_DISABLE);
   }

   if (ir->header_present) {
      /* Build the message header by copying r0, then patch fields below. */
      gen8_instruction *mov =
         MOV_RAW(brw_message_reg(ir->base_mrf), brw_vec8_grf(0, 0));
      gen8_set_exec_size(mov, BRW_EXECUTE_16);

      if (ir->target > 0 && c->key.replicate_alpha) {
         /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
         OR(vec1(retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD)),
            vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
            brw_imm_ud(1 << 11));
      }

      if (ir->target > 0) {
         /* Set the render target index for choosing BLEND_STATE. */
         MOV(retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, ir->base_mrf, 2),
                    BRW_REGISTER_TYPE_UD),
             brw_imm_ud(ir->target));
      }
   }

   /* SENDC (send with dependency check) stalls until previous writes to
    * the same render target have cleared; destination is null since the
    * write returns nothing.
    */
   gen8_instruction *inst = next_inst(BRW_OPCODE_SENDC);
   gen8_set_dst(brw, inst, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW));
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));

   /* Set up the "Message Specific Control" bits for the Data Port Message
    * Descriptor.  These are documented in the "Render Target Write" message's
    * "Message Descriptor" documentation (vol5c.2).
    */
   uint32_t msg_type;
   /* Set the Message Type */
   if (this->dual_source_output)
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else if (dispatch_width == 16)
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t msg_control = msg_type;

   /* "Last Render Target Select" must be set on all writes to the last of
    * the render targets (if using MRT), or always for a single RT scenario.
    */
   if ((ir->target == c->key.nr_color_regions - 1) || !c->key.nr_color_regions)
      msg_control |= (1 << 4); /* Last Render Target Select */

   uint32_t surf_index =
      c->prog_data.binding_table.render_target_start + ir->target;

   gen8_set_dp_message(brw, inst,
                       GEN6_SFID_DATAPORT_RENDER_CACHE,
                       surf_index,
                       GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE,
                       msg_control,
                       ir->mlen,
                       0,
                       ir->header_present,
                       ir->eot);

   mark_surface_used(surf_index);
}
131
132 void
133 gen8_fs_generator::generate_linterp(fs_inst *inst,
134 struct brw_reg dst,
135 struct brw_reg *src)
136 {
137 struct brw_reg delta_x = src[0];
138 struct brw_reg delta_y = src[1];
139 struct brw_reg interp = src[2];
140
141 (void) delta_y;
142 assert(delta_y.nr == delta_x.nr + 1);
143 PLN(dst, interp, delta_x);
144 }
145
/**
 * Emit a SEND to the sampler for any of the texturing opcodes, translating
 * the IR opcode (plus shadow-compare state) into the sampler message type
 * and setting up the optional message header (for SIMD16 skips / texel
 * offsets).
 */
void
gen8_fs_generator::generate_tex(fs_inst *ir,
                                struct brw_reg dst,
                                struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4; /* response length in registers; doubled for SIMD16 below */
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   assert(src.file == BRW_GENERAL_REGISTER_FILE);

   if (dispatch_width == 16 && !ir->force_uncompressed && !ir->force_sechalf)
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;

   /* Map the IR texturing opcode to the hardware sampler message type.
    * Shadow comparisons use the *_COMPARE variants where they exist.
    */
   switch (ir->opcode) {
   case SHADER_OPCODE_TEX:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
      }
      break;
   case FS_OPCODE_TXB:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
      }
      break;
   case SHADER_OPCODE_TXL:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
      break;
   case SHADER_OPCODE_TXD:
      if (ir->shadow_compare) {
         msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
      }
      break;
   case SHADER_OPCODE_TXF:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
      break;
   case SHADER_OPCODE_TXF_UMS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
      break;
   case SHADER_OPCODE_TXF_MCS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
      break;
   case SHADER_OPCODE_LOD:
      msg_type = GEN5_SAMPLER_MESSAGE_LOD;
      break;
   case SHADER_OPCODE_TG4:
      if (ir->shadow_compare) {
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
      } else {
         assert(brw->gen >= 6);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      }
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(brw->gen >= 7);
      if (ir->shadow_compare) {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
      } else {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
      }
      break;
   default:
      assert(!"not reached");
      break;
   }
   assert(msg_type != -1);

   /* SIMD16 results take twice as many registers. */
   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   if (ir->header_present) {
      /* The send-from-GRF for SIMD16 texturing with a header has an extra
       * hardware register allocated to it, which we need to skip over (since
       * our coordinates in the payload are in the even-numbered registers,
       * and the header comes right before the first one.
       */
      if (dispatch_width == 16)
         src.nr++;

      /* Initialize the header from r0. */
      MOV_RAW(src, brw_vec8_grf(0, 0));

      if (ir->texture_offset) {
         /* Set the texel offset bits. */
         MOV_RAW(retype(brw_vec1_grf(src.nr, 2), BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(ir->texture_offset));
      }
   }

   uint32_t surf_index =
      c->prog_data.base.binding_table.texture_start + ir->sampler;

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, dst);
   gen8_set_src0(brw, inst, src);
   gen8_set_sampler_message(brw, inst,
                            surf_index,
                            ir->sampler,
                            msg_type,
                            rlen,
                            ir->mlen,
                            ir->header_present,
                            simd_mode);

   mark_surface_used(surf_index);
}
270
271
272 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
273 * looking like:
274 *
275 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
276 *
277 * and we're trying to produce:
278 *
279 * DDX DDY
280 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
281 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
282 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
283 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
284 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
285 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
286 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
287 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
288 *
289 * and add another set of two more subspans if in 16-pixel dispatch mode.
290 *
291 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
292 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
293 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
294 * between each other. We could probably do it like ddx and swizzle the right
295 * order later, but bail for now and just produce
296 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
297 */
298 void
299 gen8_fs_generator::generate_ddx(fs_inst *inst,
300 struct brw_reg dst,
301 struct brw_reg src)
302 {
303 unsigned vstride, width;
304
305 if (c->key.high_quality_derivatives) {
306 /* Produce accurate derivatives. */
307 vstride = BRW_VERTICAL_STRIDE_2;
308 width = BRW_WIDTH_2;
309 } else {
310 /* Replicate the derivative at the top-left pixel to other pixels. */
311 vstride = BRW_VERTICAL_STRIDE_4;
312 width = BRW_WIDTH_4;
313 }
314
315 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
316 BRW_REGISTER_TYPE_F,
317 vstride,
318 width,
319 BRW_HORIZONTAL_STRIDE_0,
320 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
321 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
322 BRW_REGISTER_TYPE_F,
323 vstride,
324 width,
325 BRW_HORIZONTAL_STRIDE_0,
326 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
327 ADD(dst, src0, negate(src1));
328 }
329
330 /* The negate_value boolean is used to negate the derivative computation for
331 * FBOs, since they place the origin at the upper left instead of the lower
332 * left.
333 */
334 void
335 gen8_fs_generator::generate_ddy(fs_inst *inst,
336 struct brw_reg dst,
337 struct brw_reg src,
338 bool negate_value)
339 {
340 unsigned hstride;
341 unsigned src0_swizzle;
342 unsigned src1_swizzle;
343 unsigned src1_subnr;
344
345 if (c->key.high_quality_derivatives) {
346 /* Produce accurate derivatives. */
347 hstride = BRW_HORIZONTAL_STRIDE_1;
348 src0_swizzle = BRW_SWIZZLE_XYXY;
349 src1_swizzle = BRW_SWIZZLE_ZWZW;
350 src1_subnr = 0;
351
352 default_state.access_mode = BRW_ALIGN_16;
353 } else {
354 /* Replicate the derivative at the top-left pixel to other pixels. */
355 hstride = BRW_HORIZONTAL_STRIDE_0;
356 src0_swizzle = BRW_SWIZZLE_XYZW;
357 src1_swizzle = BRW_SWIZZLE_XYZW;
358 src1_subnr = 2;
359 }
360
361 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
362 BRW_REGISTER_TYPE_F,
363 BRW_VERTICAL_STRIDE_4,
364 BRW_WIDTH_4,
365 hstride,
366 src0_swizzle, WRITEMASK_XYZW);
367 struct brw_reg src1 = brw_reg(src.file, src.nr, src1_subnr,
368 BRW_REGISTER_TYPE_F,
369 BRW_VERTICAL_STRIDE_4,
370 BRW_WIDTH_4,
371 hstride,
372 src1_swizzle, WRITEMASK_XYZW);
373
374 if (negate_value)
375 ADD(dst, src1, negate(src0));
376 else
377 ADD(dst, src0, negate(src1));
378
379 default_state.access_mode = BRW_ALIGN_1;
380 }
381
void
gen8_fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg dst)
{
   /* Unimplemented stub: spilling to scratch space is not yet supported on
    * this Gen8 path, so always trip in debug builds.
    */
   assert(inst->mlen != 0);
   assert(!"TODO: Implement generate_scratch_write.");
}
388
void
gen8_fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   /* Unimplemented stub: unspilling from scratch space is not yet supported
    * on this Gen8 path, so always trip in debug builds.
    */
   assert(inst->mlen != 0);
   assert(!"TODO: Implement generate_scratch_read.");
}
395
void
gen8_fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   /* Unimplemented stub: the Gen7-style scratch read message is not yet
    * wired up for Gen8, so always trip in debug builds.
    */
   assert(inst->mlen != 0);
   assert(!"TODO: Implement generate_scratch_read_gen7.");
}
402
/**
 * Load four consecutive constants from a constant buffer at a uniform
 * (non-varying) offset, using the sampler's SIMD4x2 LD message.
 */
void
gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(inst->mlen == 0);

   /* The surface index must be a compile-time immediate. */
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference only the dword we need lest we anger validate_reg() with
    * reg.width > reg.execsize.
    */
   offset = brw_vec1_grf(offset.nr, 0);

   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   /* Scalar load: run unmasked regardless of channel enables. */
   gen8_set_mask_control(send, BRW_MASK_DISABLE);

   /* We use the SIMD4x2 mode because we want to end up with 4 constants in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, offset);
   gen8_set_sampler_message(brw, send,
                            surf_index,
                            0, /* The LD message ignores the sampler unit. */
                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                            1, /* rlen */
                            1, /* mlen */
                            false, /* no header */
                            BRW_SAMPLER_SIMD_MODE_SIMD4X2);

   mark_surface_used(surf_index);
}
442
/**
 * Load constants from a constant buffer at a per-channel (varying) offset,
 * implemented as a sampler LD message sized to the dispatch width.
 */
void
gen8_fs_generator::generate_varying_pull_constant_load(fs_inst *ir,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(!ir->header_present);
   assert(!ir->mlen);

   /* The surface index must be a compile-time immediate. */
   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   /* SIMD16 doubles both the offset payload and the response size. */
   uint32_t simd_mode, rlen, mlen;
   if (dispatch_width == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, offset);
   gen8_set_sampler_message(brw, send,
                            surf_index,
                            0, /* The LD message ignores the sampler unit. */
                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                            rlen, /* rlen */
                            mlen, /* mlen */
                            false, /* no header */
                            simd_mode);

   mark_surface_used(surf_index);
}
484
485 /**
486 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
487 * into the flags register (f0.0).
488 */
489 void
490 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst *ir)
491 {
492 struct brw_reg flags = brw_flag_reg(0, ir->flag_subreg);
493 struct brw_reg dispatch_mask =
494 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
495
496 gen8_instruction *mov = MOV(flags, dispatch_mask);
497 gen8_set_mask_control(mov, BRW_MASK_DISABLE);
498 }
499
500 void
501 gen8_fs_generator::generate_discard_jump(fs_inst *ir)
502 {
503 /* This HALT will be patched up at FB write time to point UIP at the end of
504 * the program, and at brw_uip_jip() JIP will be set to the end of the
505 * current block (or the program).
506 */
507 discard_halt_patches.push_tail(new(mem_ctx) ip_record(nr_inst));
508
509 HALT();
510 }
511
/**
 * Emit the final HALT of the discard chain and patch every earlier
 * discard HALT's UIP to land on it.  Called from the
 * FS_OPCODE_PLACEHOLDER_HALT case in generate_code(); a no-op when
 * nothing was discarded.
 */
void
gen8_fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (discard_halt_patches.is_empty())
      return;

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   /* Jump offsets are in 16-byte units — presumably one gen8_instruction
    * (generate_assembly() sizes offsets with sizeof(gen8_instruction)).
    */
   gen8_instruction *last_halt = HALT();
   gen8_set_uip(last_halt, 16);
   gen8_set_jip(last_halt, 16);

   int ip = nr_inst;

   foreach_list(node, &discard_halt_patches) {
      ip_record *patch_ip = (ip_record *) node;
      gen8_instruction *patch = &store[patch_ip->ip];
      assert(gen8_opcode(patch) == BRW_OPCODE_HALT);

      /* HALT takes an instruction distance from the pre-incremented IP. */
      gen8_set_uip(patch, (ip - patch_ip->ip) * 16);
   }

   this->discard_halt_patches.make_empty();
}
546
547 /**
548 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
549 * sampler LD messages.
550 *
551 * We don't want to bake it into the send message's code generation because
552 * that means we don't get a chance to schedule the instruction.
553 */
554 void
555 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst *ir,
556 struct brw_reg dst,
557 struct brw_reg value)
558 {
559 assert(value.file == BRW_IMMEDIATE_VALUE);
560 MOV_RAW(retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
561 }
562
563 void
564 gen8_fs_generator::generate_code(exec_list *instructions)
565 {
566 int last_native_inst_offset = next_inst_offset;
567 const char *last_annotation_string = NULL;
568 const void *last_annotation_ir = NULL;
569
570 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
571 if (prog) {
572 printf("Native code for fragment shader %d (SIMD%d dispatch):\n",
573 shader_prog->Name, dispatch_width);
574 } else if (fp) {
575 printf("Native code for fragment program %d (SIMD%d dispatch):\n",
576 prog->Id, dispatch_width);
577 } else {
578 printf("Native code for blorp program (SIMD%d dispatch):\n",
579 dispatch_width);
580 }
581 }
582
583 cfg_t *cfg = NULL;
584 if (unlikely(INTEL_DEBUG & DEBUG_WM))
585 cfg = new(mem_ctx) cfg_t(instructions);
586
587 foreach_list(node, instructions) {
588 fs_inst *ir = (fs_inst *) node;
589 struct brw_reg src[3], dst;
590
591 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
592 foreach_list(node, &cfg->block_list) {
593 bblock_link *link = (bblock_link *)node;
594 bblock_t *block = link->block;
595
596 if (block->start == ir) {
597 printf(" START B%d", block->block_num);
598 foreach_list(predecessor_node, &block->parents) {
599 bblock_link *predecessor_link =
600 (bblock_link *)predecessor_node;
601 bblock_t *predecessor_block = predecessor_link->block;
602 printf(" <-B%d", predecessor_block->block_num);
603 }
604 printf("\n");
605 }
606 }
607
608 if (last_annotation_ir != ir->ir) {
609 last_annotation_ir = ir->ir;
610 if (last_annotation_ir) {
611 printf(" ");
612 if (prog) {
613 ((ir_instruction *) ir->ir)->print();
614 } else if (prog) {
615 const prog_instruction *fpi;
616 fpi = (const prog_instruction *) ir->ir;
617 printf("%d: ", (int)(fpi - prog->Instructions));
618 _mesa_fprint_instruction_opt(stdout,
619 fpi,
620 0, PROG_PRINT_DEBUG, NULL);
621 }
622 printf("\n");
623 }
624 }
625 if (last_annotation_string != ir->annotation) {
626 last_annotation_string = ir->annotation;
627 if (last_annotation_string)
628 printf(" %s\n", last_annotation_string);
629 }
630 }
631
632 for (unsigned int i = 0; i < 3; i++) {
633 src[i] = brw_reg_from_fs_reg(&ir->src[i]);
634
635 /* The accumulator result appears to get used for the
636 * conditional modifier generation. When negating a UD
637 * value, there is a 33rd bit generated for the sign in the
638 * accumulator value, so now you can't check, for example,
639 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
640 */
641 assert(!ir->conditional_mod ||
642 ir->src[i].type != BRW_REGISTER_TYPE_UD ||
643 !ir->src[i].negate);
644 }
645 dst = brw_reg_from_fs_reg(&ir->dst);
646
647 default_state.conditional_mod = ir->conditional_mod;
648 default_state.predicate = ir->predicate;
649 default_state.predicate_inverse = ir->predicate_inverse;
650 default_state.saturate = ir->saturate;
651 default_state.flag_subreg_nr = ir->flag_subreg;
652
653 if (dispatch_width == 16 && !ir->force_uncompressed)
654 default_state.exec_size = BRW_EXECUTE_16;
655 else
656 default_state.exec_size = BRW_EXECUTE_8;
657
658 /* fs_inst::force_sechalf is only used for original Gen4 code, so we
659 * don't handle it. Add qtr_control to default_state if that changes.
660 */
661 assert(!ir->force_sechalf);
662
663 switch (ir->opcode) {
664 case BRW_OPCODE_MOV:
665 MOV(dst, src[0]);
666 break;
667 case BRW_OPCODE_ADD:
668 ADD(dst, src[0], src[1]);
669 break;
670 case BRW_OPCODE_MUL:
671 MUL(dst, src[0], src[1]);
672 break;
673 case BRW_OPCODE_MACH:
674 MACH(dst, src[0], src[1]);
675 break;
676
677 case BRW_OPCODE_MAD:
678 default_state.access_mode = BRW_ALIGN_16;
679 MAD(dst, src[0], src[1], src[2]);
680 default_state.access_mode = BRW_ALIGN_1;
681 break;
682
683 case BRW_OPCODE_LRP:
684 default_state.access_mode = BRW_ALIGN_16;
685 LRP(dst, src[0], src[1], src[2]);
686 default_state.access_mode = BRW_ALIGN_1;
687 break;
688
689
690 case BRW_OPCODE_FRC:
691 FRC(dst, src[0]);
692 break;
693 case BRW_OPCODE_RNDD:
694 RNDD(dst, src[0]);
695 break;
696 case BRW_OPCODE_RNDE:
697 RNDE(dst, src[0]);
698 break;
699 case BRW_OPCODE_RNDZ:
700 RNDZ(dst, src[0]);
701 break;
702
703 case BRW_OPCODE_AND:
704 AND(dst, src[0], src[1]);
705 break;
706 case BRW_OPCODE_OR:
707 OR(dst, src[0], src[1]);
708 break;
709 case BRW_OPCODE_XOR:
710 XOR(dst, src[0], src[1]);
711 break;
712 case BRW_OPCODE_NOT:
713 NOT(dst, src[0]);
714 break;
715 case BRW_OPCODE_ASR:
716 ASR(dst, src[0], src[1]);
717 break;
718 case BRW_OPCODE_SHR:
719 SHR(dst, src[0], src[1]);
720 break;
721 case BRW_OPCODE_SHL:
722 SHL(dst, src[0], src[1]);
723 break;
724
725 case BRW_OPCODE_F32TO16:
726 F32TO16(dst, src[0]);
727 break;
728 case BRW_OPCODE_F16TO32:
729 F16TO32(dst, src[0]);
730 break;
731
732 case BRW_OPCODE_CMP:
733 CMP(dst, ir->conditional_mod, src[0], src[1]);
734 break;
735 case BRW_OPCODE_SEL:
736 SEL(dst, src[0], src[1]);
737 break;
738
739 case BRW_OPCODE_BFREV:
740 /* BFREV only supports UD type for src and dst. */
741 BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
742 retype(src[0], BRW_REGISTER_TYPE_UD));
743 break;
744
745 case BRW_OPCODE_FBH:
746 /* FBH only supports UD type for dst. */
747 FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
748 break;
749
750 case BRW_OPCODE_FBL:
751 /* FBL only supports UD type for dst. */
752 FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
753 break;
754
755 case BRW_OPCODE_CBIT:
756 /* CBIT only supports UD type for dst. */
757 CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
758 break;
759
760 case BRW_OPCODE_ADDC:
761 ADDC(dst, src[0], src[1]);
762 break;
763
764 case BRW_OPCODE_SUBB:
765 SUBB(dst, src[0], src[1]);
766 break;
767
768 case BRW_OPCODE_BFE:
769 default_state.access_mode = BRW_ALIGN_16;
770 BFE(dst, src[0], src[1], src[2]);
771 default_state.access_mode = BRW_ALIGN_1;
772 break;
773
774 case BRW_OPCODE_BFI1:
775 BFI1(dst, src[0], src[1]);
776 break;
777
778 case BRW_OPCODE_BFI2:
779 default_state.access_mode = BRW_ALIGN_16;
780 BFI2(dst, src[0], src[1], src[2]);
781 default_state.access_mode = BRW_ALIGN_1;
782 break;
783
784 case BRW_OPCODE_IF:
785 IF(BRW_PREDICATE_NORMAL);
786 break;
787
788 case BRW_OPCODE_ELSE:
789 ELSE();
790 break;
791
792 case BRW_OPCODE_ENDIF:
793 ENDIF();
794 break;
795
796 case BRW_OPCODE_DO:
797 DO();
798 break;
799
800 case BRW_OPCODE_BREAK:
801 BREAK();
802 break;
803
804 case BRW_OPCODE_CONTINUE:
805 CONTINUE();
806 break;
807
808 case BRW_OPCODE_WHILE:
809 WHILE();
810 break;
811
812 case SHADER_OPCODE_RCP:
813 MATH(BRW_MATH_FUNCTION_INV, dst, src[0]);
814 break;
815
816 case SHADER_OPCODE_RSQ:
817 MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]);
818 break;
819
820 case SHADER_OPCODE_SQRT:
821 MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]);
822 break;
823
824 case SHADER_OPCODE_EXP2:
825 MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]);
826 break;
827
828 case SHADER_OPCODE_LOG2:
829 MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]);
830 break;
831
832 case SHADER_OPCODE_SIN:
833 MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]);
834 break;
835
836 case SHADER_OPCODE_COS:
837 MATH(BRW_MATH_FUNCTION_COS, dst, src[0]);
838 break;
839
840 case SHADER_OPCODE_INT_QUOTIENT:
841 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
842 break;
843
844 case SHADER_OPCODE_INT_REMAINDER:
845 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
846 break;
847
848 case SHADER_OPCODE_POW:
849 MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
850 break;
851
852 case FS_OPCODE_PIXEL_X:
853 case FS_OPCODE_PIXEL_Y:
854 assert(!"FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
855 break;
856
857 case FS_OPCODE_CINTERP:
858 MOV(dst, src[0]);
859 break;
860 case FS_OPCODE_LINTERP:
861 generate_linterp(ir, dst, src);
862 break;
863 case SHADER_OPCODE_TEX:
864 case FS_OPCODE_TXB:
865 case SHADER_OPCODE_TXD:
866 case SHADER_OPCODE_TXF:
867 case SHADER_OPCODE_TXF_CMS:
868 case SHADER_OPCODE_TXF_UMS:
869 case SHADER_OPCODE_TXF_MCS:
870 case SHADER_OPCODE_TXL:
871 case SHADER_OPCODE_TXS:
872 case SHADER_OPCODE_LOD:
873 case SHADER_OPCODE_TG4:
874 case SHADER_OPCODE_TG4_OFFSET:
875 generate_tex(ir, dst, src[0]);
876 break;
877
878 case FS_OPCODE_DDX:
879 generate_ddx(ir, dst, src[0]);
880 break;
881 case FS_OPCODE_DDY:
882 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
883 * guarantee that c->key.render_to_fbo is set).
884 */
885 assert(fp->UsesDFdy);
886 generate_ddy(ir, dst, src[0], c->key.render_to_fbo);
887 break;
888
889 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
890 generate_scratch_write(ir, src[0]);
891 break;
892
893 case SHADER_OPCODE_GEN4_SCRATCH_READ:
894 generate_scratch_read(ir, dst);
895 break;
896
897 case SHADER_OPCODE_GEN7_SCRATCH_READ:
898 generate_scratch_read_gen7(ir, dst);
899 break;
900
901 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
902 generate_uniform_pull_constant_load(ir, dst, src[0], src[1]);
903 break;
904
905 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
906 generate_varying_pull_constant_load(ir, dst, src[0], src[1]);
907 break;
908
909 case FS_OPCODE_FB_WRITE:
910 generate_fb_write(ir);
911 break;
912
913 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
914 generate_mov_dispatch_to_flags(ir);
915 break;
916
917 case FS_OPCODE_DISCARD_JUMP:
918 generate_discard_jump(ir);
919 break;
920
921 case SHADER_OPCODE_SHADER_TIME_ADD:
922 assert(!"XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
923 break;
924
925 case SHADER_OPCODE_UNTYPED_ATOMIC:
926 assert(!"XXX: Missing Gen8 scalar support for untyped atomics");
927 break;
928
929 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
930 assert(!"XXX: Missing Gen8 scalar support for untyped surface reads");
931 break;
932
933 case FS_OPCODE_SET_SIMD4X2_OFFSET:
934 generate_set_simd4x2_offset(ir, dst, src[0]);
935 break;
936
937 case FS_OPCODE_SET_OMASK:
938 assert(!"XXX: Missing Gen8 scalar support for SET_OMASK");
939 break;
940
941 case FS_OPCODE_SET_SAMPLE_ID:
942 assert(!"XXX: Missing Gen8 scalar support for SET_SAMPLE_ID");
943 break;
944
945 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
946 assert(!"XXX: Missing Gen8 scalar support for PACK_HALF_2x16_SPLIT");
947 break;
948
949 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
950 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
951 assert(!"XXX: Missing Gen8 scalar support for UNPACK_HALF_2x16_SPLIT");
952 break;
953
954 case FS_OPCODE_PLACEHOLDER_HALT:
955 /* This is the place where the final HALT needs to be inserted if
956 * we've emitted any discards. If not, this will emit no code.
957 */
958 patch_discard_jumps_to_fb_writes();
959 break;
960
961 default:
962 if (ir->opcode < int(ARRAY_SIZE(opcode_descs))) {
963 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
964 opcode_descs[ir->opcode].name);
965 } else {
966 _mesa_problem(ctx, "Unsupported opcode %d in FS", ir->opcode);
967 }
968 abort();
969 }
970
971 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
972 disassemble(stdout, last_native_inst_offset, next_inst_offset);
973
974 foreach_list(node, &cfg->block_list) {
975 bblock_link *link = (bblock_link *)node;
976 bblock_t *block = link->block;
977
978 if (block->end == ir) {
979 printf(" END B%d", block->block_num);
980 foreach_list(successor_node, &block->children) {
981 bblock_link *successor_link =
982 (bblock_link *)successor_node;
983 bblock_t *successor_block = successor_link->block;
984 printf(" ->B%d", successor_block->block_num);
985 }
986 printf("\n");
987 }
988 }
989 }
990
991 last_native_inst_offset = next_inst_offset;
992 }
993
994 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
995 printf("\n");
996 }
997
998 patch_jump_targets();
999 }
1000
/**
 * Generate machine code for the SIMD8 and/or SIMD16 variants of the shader
 * and return a pointer to the combined instruction store.
 *
 * The SIMD16 program, if any, is appended after the SIMD8 one at a
 * 64-byte-aligned offset, which is recorded in prog_data.prog_offset_16
 * so the state upload code can point 3DSTATE_PS at it.
 *
 * \param assembly_size  out: total size of the emitted code in bytes.
 */
const unsigned *
gen8_fs_generator::generate_assembly(exec_list *simd8_instructions,
                                     exec_list *simd16_instructions,
                                     unsigned *assembly_size)
{
   assert(simd8_instructions || simd16_instructions);

   if (simd8_instructions) {
      dispatch_width = 8;
      generate_code(simd8_instructions);
   }

   if (simd16_instructions) {
      /* Align to a 64-byte boundary. */
      while ((nr_inst * sizeof(gen8_instruction)) % 64)
         NOP();

      /* Save off the start of this SIMD16 program */
      c->prog_data.prog_offset_16 = nr_inst * sizeof(gen8_instruction);

      dispatch_width = 16;
      generate_code(simd16_instructions);
   }

   *assembly_size = next_inst_offset;
   return (const unsigned *) store;
}