i965: Return NONE from brw_swap_cmod on unknown input.
[mesa.git] / src / mesa / drivers / dri / i965 / gen8_fs_generator.cpp
1 /*
2 * Copyright © 2010, 2011, 2012 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
/** @file gen8_fs_generator.cpp
25 *
26 * Code generation for Gen8+ hardware.
27 */
28
29 extern "C" {
30 #include "main/macros.h"
31 #include "brw_context.h"
32 } /* extern "C" */
33
34 #include "brw_fs.h"
35 #include "brw_cfg.h"
36 #include "glsl/ir_print_visitor.h"
37
38 gen8_fs_generator::gen8_fs_generator(struct brw_context *brw,
39 void *mem_ctx,
40 const struct brw_wm_prog_key *key,
41 struct brw_wm_prog_data *prog_data,
42 struct gl_shader_program *shader_prog,
43 struct gl_fragment_program *fp,
44 bool dual_source_output)
45 : gen8_generator(brw, shader_prog, fp ? &fp->Base : NULL, mem_ctx),
46 key(key), prog_data(prog_data),
47 fp(fp), dual_source_output(dual_source_output)
48 {
49 }
50
/* Trivial destructor: this class owns no resources of its own. */
gen8_fs_generator::~gen8_fs_generator()
{
}
54
/**
 * Emit the SENDC message (and any required header setup) for a framebuffer
 * write, using the render target index, EOT flag, and predication recorded
 * in the FB_WRITE IR instruction.
 */
void
gen8_fs_generator::generate_fb_write(fs_inst *ir)
{
   /* Disable the discard condition while setting up the header. */
   default_state.predicate = BRW_PREDICATE_NONE;
   default_state.predicate_inverse = false;
   default_state.flag_subreg_nr = 0;

   if (ir->header_present) {
      /* The GPU will use the predicate on SENDC, unless the header is present.
       */
      if (fp && fp->UsesKill) {
         /* Copy the discard flag (f0.1) into g1.7 so the header carries the
          * live-channel mask.  WE_all so disabled channels still write it.
          */
         gen8_instruction *mov =
            MOV(retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW),
                brw_flag_reg(0, 1));
         gen8_set_mask_control(mov, BRW_MASK_DISABLE);
      }

      /* Start the header from g0. */
      gen8_instruction *mov =
         MOV_RAW(brw_message_reg(ir->base_mrf), brw_vec8_grf(0, 0));
      gen8_set_exec_size(mov, BRW_EXECUTE_16);

      if (ir->target > 0 && key->replicate_alpha) {
         /* Set "Source0 Alpha Present to RenderTarget" bit in the header. */
         gen8_instruction *inst =
            OR(get_element_ud(brw_message_reg(ir->base_mrf), 0),
               vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
               brw_imm_ud(1 << 11));
         gen8_set_mask_control(inst, BRW_MASK_DISABLE);
      }

      if (ir->target > 0) {
         /* Set the render target index for choosing BLEND_STATE. */
         MOV_RAW(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, ir->base_mrf, 2),
                 brw_imm_ud(ir->target));
      }
   }

   /* Set the predicate back to get the conditional write if necessary for
    * discards.
    */
   default_state.predicate = ir->predicate;
   default_state.predicate_inverse = ir->predicate_inverse;
   default_state.flag_subreg_nr = ir->flag_subreg;

   gen8_instruction *inst = next_inst(BRW_OPCODE_SENDC);
   gen8_set_dst(brw, inst, retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW));
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));

   /* Set up the "Message Specific Control" bits for the Data Port Message
    * Descriptor.  These are documented in the "Render Target Write" message's
    * "Message Descriptor" documentation (vol5c.2).
    */
   uint32_t msg_type;
   /* Set the Message Type */
   if (this->dual_source_output)
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else if (dispatch_width == 16)
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_type = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t msg_control = msg_type;

   /* Set "Last Render Target Select" on the final FB write. */
   if (ir->eot)
      msg_control |= (1 << 4); /* Last Render Target Select */

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + ir->target;

   gen8_set_dp_message(brw, inst,
                       GEN6_SFID_DATAPORT_RENDER_CACHE,
                       surf_index,
                       GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE,
                       msg_control,
                       ir->mlen,
                       0,
                       ir->header_present,
                       ir->eot);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
138
139 void
140 gen8_fs_generator::generate_linterp(fs_inst *inst,
141 struct brw_reg dst,
142 struct brw_reg *src)
143 {
144 struct brw_reg delta_x = src[0];
145 struct brw_reg delta_y = src[1];
146 struct brw_reg interp = src[2];
147
148 (void) delta_y;
149 assert(delta_y.nr == delta_x.nr + 1);
150 PLN(dst, interp, delta_x);
151 }
152
/**
 * Emit a SEND to the sampler unit for a texturing operation, including any
 * header setup required for texel offsets or sampler indices >= 16.
 */
void
gen8_fs_generator::generate_tex(fs_inst *ir,
                                struct brw_reg dst,
                                struct brw_reg src,
                                struct brw_reg sampler_index)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;

   assert(src.file == BRW_GENERAL_REGISTER_FILE);

   /* Halved SIMD16 instructions (force_uncompressed/force_sechalf) still use
    * SIMD8 sampler messages.
    */
   if (dispatch_width == 16 && !ir->force_uncompressed && !ir->force_sechalf)
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;

   /* Map the IR opcode (and shadow-compare state) to the hardware sampler
    * message type.
    */
   switch (ir->opcode) {
   case SHADER_OPCODE_TEX:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
      }
      break;
   case FS_OPCODE_TXB:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
      }
      break;
   case SHADER_OPCODE_TXL:
      if (ir->shadow_compare) {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
      }
      break;
   case SHADER_OPCODE_TXS:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
      break;
   case SHADER_OPCODE_TXD:
      if (ir->shadow_compare) {
         msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
      } else {
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
      }
      break;
   case SHADER_OPCODE_TXF:
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
      break;
   case SHADER_OPCODE_TXF_CMS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
      break;
   case SHADER_OPCODE_TXF_UMS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
      break;
   case SHADER_OPCODE_TXF_MCS:
      msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
      break;
   case SHADER_OPCODE_LOD:
      msg_type = GEN5_SAMPLER_MESSAGE_LOD;
      break;
   case SHADER_OPCODE_TG4:
      if (ir->shadow_compare) {
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
      } else {
         assert(brw->gen >= 6);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
      }
      break;
   case SHADER_OPCODE_TG4_OFFSET:
      assert(brw->gen >= 7);
      if (ir->shadow_compare) {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
      } else {
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
      }
      break;
   default:
      unreachable("not reached");
   }
   assert(msg_type != -1);

   /* SIMD16 messages return twice as many registers and need a SIMD16 dst. */
   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   /* Only immediate sampler indices are handled here. */
   assert(sampler_index.file == BRW_IMMEDIATE_VALUE);
   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   uint32_t sampler = sampler_index.dw1.ud;

   if (ir->header_present) {
      /* The send-from-GRF for SIMD16 texturing with a header has an extra
       * hardware register allocated to it, which we need to skip over (since
       * our coordinates in the payload are in the even-numbered registers,
       * and the header comes right before the first one.
       */
      if (dispatch_width == 16)
         src.nr++;

      /* Header setup is always done with SIMD8-wide raw moves. */
      unsigned save_exec_size = default_state.exec_size;
      default_state.exec_size = BRW_EXECUTE_8;

      /* Initialize the header from g0. */
      MOV_RAW(src, brw_vec8_grf(0, 0));

      if (ir->texture_offset) {
         /* Set the texel offset bits. */
         MOV_RAW(retype(brw_vec1_grf(src.nr, 2), BRW_REGISTER_TYPE_UD),
                 brw_imm_ud(ir->texture_offset));
      }

      if (sampler >= 16) {
         /* The "Sampler Index" field can only store values between 0 and 15.
          * However, we can add an offset to the "Sampler State Pointer"
          * field, effectively selecting a different set of 16 samplers.
          *
          * The "Sampler State Pointer" needs to be aligned to a 32-byte
          * offset, and each sampler state is only 16-bytes, so we can't
          * exclusively use the offset - we have to use both.
          */
         const int sampler_state_size = 16; /* 16 bytes */
         gen8_instruction *add =
            ADD(get_element_ud(src, 3),
                get_element_ud(brw_vec8_grf(0, 0), 3),
                brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
         gen8_set_mask_control(add, BRW_MASK_DISABLE);
      }

      default_state.exec_size = save_exec_size;
   }

   uint32_t surf_index =
      prog_data->base.binding_table.texture_start + sampler;

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, dst);
   gen8_set_src0(brw, inst, src);
   gen8_set_sampler_message(brw, inst,
                            surf_index,
                            sampler % 16,
                            msg_type,
                            rlen,
                            ir->mlen,
                            ir->header_present,
                            simd_mode);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
304
305
306 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
307 * looking like:
308 *
309 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
310 *
311 * and we're trying to produce:
312 *
313 * DDX DDY
314 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
315 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
316 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
317 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
318 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
319 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
320 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
321 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
322 *
323 * and add another set of two more subspans if in 16-pixel dispatch mode.
324 *
325 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
326 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
327 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
328 * between each other. We could probably do it like ddx and swizzle the right
329 * order later, but bail for now and just produce
330 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
331 */
332 void
333 gen8_fs_generator::generate_ddx(fs_inst *inst,
334 struct brw_reg dst,
335 struct brw_reg src)
336 {
337 unsigned vstride, width;
338
339 if (key->high_quality_derivatives) {
340 /* Produce accurate derivatives. */
341 vstride = BRW_VERTICAL_STRIDE_2;
342 width = BRW_WIDTH_2;
343 } else {
344 /* Replicate the derivative at the top-left pixel to other pixels. */
345 vstride = BRW_VERTICAL_STRIDE_4;
346 width = BRW_WIDTH_4;
347 }
348
349 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
350 BRW_REGISTER_TYPE_F,
351 vstride,
352 width,
353 BRW_HORIZONTAL_STRIDE_0,
354 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
355 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
356 BRW_REGISTER_TYPE_F,
357 vstride,
358 width,
359 BRW_HORIZONTAL_STRIDE_0,
360 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
361 ADD(dst, src0, negate(src1));
362 }
363
364 /* The negate_value boolean is used to negate the derivative computation for
365 * FBOs, since they place the origin at the upper left instead of the lower
366 * left.
367 */
368 void
369 gen8_fs_generator::generate_ddy(fs_inst *inst,
370 struct brw_reg dst,
371 struct brw_reg src,
372 bool negate_value)
373 {
374 unsigned hstride;
375 unsigned src0_swizzle;
376 unsigned src1_swizzle;
377 unsigned src1_subnr;
378
379 if (key->high_quality_derivatives) {
380 /* Produce accurate derivatives. */
381 hstride = BRW_HORIZONTAL_STRIDE_1;
382 src0_swizzle = BRW_SWIZZLE_XYXY;
383 src1_swizzle = BRW_SWIZZLE_ZWZW;
384 src1_subnr = 0;
385
386 default_state.access_mode = BRW_ALIGN_16;
387 } else {
388 /* Replicate the derivative at the top-left pixel to other pixels. */
389 hstride = BRW_HORIZONTAL_STRIDE_0;
390 src0_swizzle = BRW_SWIZZLE_XYZW;
391 src1_swizzle = BRW_SWIZZLE_XYZW;
392 src1_subnr = 2;
393 }
394
395 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
396 BRW_REGISTER_TYPE_F,
397 BRW_VERTICAL_STRIDE_4,
398 BRW_WIDTH_4,
399 hstride,
400 src0_swizzle, WRITEMASK_XYZW);
401 struct brw_reg src1 = brw_reg(src.file, src.nr, src1_subnr,
402 BRW_REGISTER_TYPE_F,
403 BRW_VERTICAL_STRIDE_4,
404 BRW_WIDTH_4,
405 hstride,
406 src1_swizzle, WRITEMASK_XYZW);
407
408 if (negate_value)
409 ADD(dst, src1, negate(src0));
410 else
411 ADD(dst, src0, negate(src1));
412
413 default_state.access_mode = BRW_ALIGN_1;
414 }
415
/**
 * Emit an OWord block write that spills `src` to scratch space (stateless
 * access, binding table entry 255) at the OWord offset from ir->offset.
 */
void
gen8_fs_generator::generate_scratch_write(fs_inst *ir, struct brw_reg src)
{
   /* Copy the payload into the MRF following the header register. */
   MOV(retype(brw_message_reg(ir->base_mrf + 1), BRW_REGISTER_TYPE_UD),
       retype(src, BRW_REGISTER_TYPE_UD));

   struct brw_reg mrf =
      retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD);

   const int num_regs = dispatch_width / 8;

   /* A register is 32 bytes, i.e. two 16-byte OWords. */
   uint32_t msg_control;
   if (num_regs == 1)
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
   else
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   unsigned save_exec_size = default_state.exec_size;
   default_state.exec_size = BRW_EXECUTE_8;

   MOV_RAW(mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   /* set message header global offset field (reg 0, element 2) */
   MOV_RAW(get_element_ud(mrf, 2), brw_imm_ud(ir->offset / 16));

   struct brw_reg dst;
   if (dispatch_width == 16)
      dst = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dst = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   /* NOTE(review): exec size is forced to 16 for the SEND even in SIMD8
    * dispatch — presumably intentional, but worth confirming against the
    * OWord block write documentation.
    */
   default_state.exec_size = BRW_EXECUTE_16;

   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, mrf);
   gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE,
                       255, /* binding table index: stateless access */
                       GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE,
                       msg_control,
                       1 + num_regs, /* mlen */
                       0, /* rlen */
                       true, /* header present */
                       false); /* EOT */

   default_state.exec_size = save_exec_size;
}
467
/**
 * Emit an OWord block read that fills `dst` from scratch space (stateless
 * access, binding table entry 255) at the OWord offset from ir->offset.
 */
void
gen8_fs_generator::generate_scratch_read(fs_inst *ir, struct brw_reg dst)
{
   struct brw_reg mrf =
      retype(brw_message_reg(ir->base_mrf), BRW_REGISTER_TYPE_UD);

   const int num_regs = dispatch_width / 8;

   /* A register is 32 bytes, i.e. two 16-byte OWords. */
   uint32_t msg_control;
   if (num_regs == 1)
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
   else
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;

   /* Header setup is done with SIMD8-wide raw moves. */
   unsigned save_exec_size = default_state.exec_size;
   default_state.exec_size = BRW_EXECUTE_8;

   MOV_RAW(mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
   /* set message header global offset field (reg 0, element 2) */
   MOV_RAW(get_element_ud(mrf, 2), brw_imm_ud(ir->offset / 16));

   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, send, retype(dst, BRW_REGISTER_TYPE_UW));
   gen8_set_src0(brw, send, mrf);
   gen8_set_dp_message(brw, send, GEN7_SFID_DATAPORT_DATA_CACHE,
                       255, /* binding table index: stateless access */
                       BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                       msg_control,
                       1, /* mlen */
                       num_regs, /* rlen */
                       true, /* header present */
                       false); /* EOT */

   default_state.exec_size = save_exec_size;
}
503
504 void
505 gen8_fs_generator::generate_scratch_read_gen7(fs_inst *ir, struct brw_reg dst)
506 {
507 unsigned save_exec_size = default_state.exec_size;
508 gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
509
510 int num_regs = dispatch_width / 8;
511
512 /* According to the docs, offset is "A 12-bit HWord offset into the memory
513 * Immediate Memory buffer as specified by binding table 0xFF." An HWORD
514 * is 32 bytes, which happens to be the size of a register.
515 */
516 int offset = ir->offset / REG_SIZE;
517
518 /* The HW requires that the header is present; this is to get the g0.5
519 * scratch offset.
520 */
521 gen8_set_src0(brw, send, brw_vec8_grf(0, 0));
522 gen8_set_dst(brw, send, retype(dst, BRW_REGISTER_TYPE_UW));
523 gen8_set_dp_scratch_message(brw, send,
524 false, /* scratch read */
525 false, /* OWords */
526 false, /* invalidate after read */
527 num_regs,
528 offset,
529 1, /* mlen - just g0 */
530 num_regs, /* rlen */
531 true, /* header present */
532 false); /* EOT */
533
534 default_state.exec_size = save_exec_size;
535 }
536
/**
 * Emit a SIMD4x2 sampler LD message that loads four consecutive constants
 * from the buffer at `index`, starting at the offset held in `offset`.
 */
void
gen8_fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(inst->mlen == 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference only the dword we need lest we anger validate_reg() with
    * reg.width > reg.execsize.
    */
   offset = brw_vec1_grf(offset.nr, 0);

   gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
   gen8_set_mask_control(send, BRW_MASK_DISABLE);

   /* We use the SIMD4x2 mode because we want to end up with 4 constants in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;
   gen8_set_dst(brw, send, dst);
   gen8_set_src0(brw, send, offset);
   gen8_set_sampler_message(brw, send,
                            surf_index,
                            0, /* The LD message ignores the sampler unit. */
                            GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                            1, /* rlen */
                            1, /* mlen */
                            false, /* no header */
                            BRW_SAMPLER_SIMD_MODE_SIMD4X2);

   brw_mark_surface_used(&prog_data->base, surf_index);
}
576
577 void
578 gen8_fs_generator::generate_varying_pull_constant_load(fs_inst *ir,
579 struct brw_reg dst,
580 struct brw_reg index,
581 struct brw_reg offset)
582 {
583 /* Varying-offset pull constant loads are treated as a normal expression on
584 * gen7, so the fact that it's a send message is hidden at the IR level.
585 */
586 assert(!ir->header_present);
587 assert(!ir->mlen);
588
589 assert(index.file == BRW_IMMEDIATE_VALUE &&
590 index.type == BRW_REGISTER_TYPE_UD);
591 uint32_t surf_index = index.dw1.ud;
592
593 uint32_t simd_mode, rlen, mlen;
594 if (dispatch_width == 16) {
595 mlen = 2;
596 rlen = 8;
597 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
598 } else {
599 mlen = 1;
600 rlen = 4;
601 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
602 }
603
604 gen8_instruction *send = next_inst(BRW_OPCODE_SEND);
605 gen8_set_dst(brw, send, dst);
606 gen8_set_src0(brw, send, offset);
607 gen8_set_sampler_message(brw, send,
608 surf_index,
609 0, /* The LD message ignore the sampler unit. */
610 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
611 rlen, /* rlen */
612 mlen, /* mlen */
613 false, /* no header */
614 simd_mode);
615
616 brw_mark_surface_used(&prog_data->base, surf_index);
617 }
618
619 /**
620 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
621 * into the flags register (f0.0).
622 */
623 void
624 gen8_fs_generator::generate_mov_dispatch_to_flags(fs_inst *ir)
625 {
626 struct brw_reg flags = brw_flag_reg(0, ir->flag_subreg);
627 struct brw_reg dispatch_mask =
628 retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
629
630 gen8_instruction *mov = MOV(flags, dispatch_mask);
631 gen8_set_mask_control(mov, BRW_MASK_DISABLE);
632 }
633
634 void
635 gen8_fs_generator::generate_discard_jump(fs_inst *ir)
636 {
637 /* This HALT will be patched up at FB write time to point UIP at the end of
638 * the program, and at brw_uip_jip() JIP will be set to the end of the
639 * current block (or the program).
640 */
641 discard_halt_patches.push_tail(new(mem_ctx) ip_record(nr_inst));
642
643 HALT();
644 }
645
/**
 * Fix up the UIPs of every HALT recorded by generate_discard_jump() to point
 * at a final HALT emitted here.
 *
 * Returns true if a final HALT was emitted (i.e. there were discards), so
 * the caller knows the instruction stream grew.
 */
bool
gen8_fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (discard_halt_patches.is_empty())
      return false;

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   /* JIP/UIP offsets are in bytes; 16 is one instruction, i.e. "next". */
   gen8_instruction *last_halt = HALT();
   gen8_set_uip(last_halt, 16);
   gen8_set_jip(last_halt, 16);

   int ip = nr_inst;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      gen8_instruction *patch = &store[patch_ip->ip];
      assert(gen8_opcode(patch) == BRW_OPCODE_HALT);

      /* HALT takes an instruction distance from the pre-incremented IP. */
      gen8_set_uip(patch, (ip - patch_ip->ip) * 16);
   }

   this->discard_halt_patches.make_empty();
   return true;
}
680
681 /**
682 * Sets the first dword of a vgrf for simd4x2 uniform pull constant
683 * sampler LD messages.
684 *
685 * We don't want to bake it into the send message's code generation because
686 * that means we don't get a chance to schedule the instruction.
687 */
688 void
689 gen8_fs_generator::generate_set_simd4x2_offset(fs_inst *ir,
690 struct brw_reg dst,
691 struct brw_reg value)
692 {
693 assert(value.file == BRW_IMMEDIATE_VALUE);
694 MOV_RAW(retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
695 }
696
697 /**
698 * Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
699 * (when mask is passed as a uniform) of register mask before moving it
700 * to register dst.
701 */
702 void
703 gen8_fs_generator::generate_set_omask(fs_inst *inst,
704 struct brw_reg dst,
705 struct brw_reg mask)
706 {
707 assert(dst.type == BRW_REGISTER_TYPE_UW);
708
709 if (dispatch_width == 16)
710 dst = vec16(dst);
711
712 if (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
713 mask.width == BRW_WIDTH_8 &&
714 mask.hstride == BRW_HORIZONTAL_STRIDE_1) {
715 mask = stride(mask, 16, 8, 2);
716 } else {
717 assert(mask.vstride == BRW_VERTICAL_STRIDE_0 &&
718 mask.width == BRW_WIDTH_1 &&
719 mask.hstride == BRW_HORIZONTAL_STRIDE_0);
720 }
721
722 gen8_instruction *mov = MOV(dst, retype(mask, dst.type));
723 gen8_set_mask_control(mov, BRW_MASK_DISABLE);
724 }
725
726 /**
727 * Do a special ADD with vstride=1, width=4, hstride=0 for src1.
728 */
729 void
730 gen8_fs_generator::generate_set_sample_id(fs_inst *ir,
731 struct brw_reg dst,
732 struct brw_reg src0,
733 struct brw_reg src1)
734 {
735 assert(dst.type == BRW_REGISTER_TYPE_D || dst.type == BRW_REGISTER_TYPE_UD);
736 assert(src0.type == BRW_REGISTER_TYPE_D || src0.type == BRW_REGISTER_TYPE_UD);
737
738 struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
739
740 unsigned save_exec_size = default_state.exec_size;
741 default_state.exec_size = BRW_EXECUTE_8;
742
743 gen8_instruction *add = ADD(dst, src0, reg);
744 gen8_set_mask_control(add, BRW_MASK_DISABLE);
745 if (dispatch_width == 16) {
746 add = ADD(offset(dst, 1), offset(src0, 1), suboffset(reg, 2));
747 gen8_set_mask_control(add, BRW_MASK_DISABLE);
748 }
749
750 default_state.exec_size = save_exec_size;
751 }
752
753 /**
754 * Change the register's data type from UD to HF, doubling the strides in order
755 * to compensate for halving the data type width.
756 */
757 static struct brw_reg
758 ud_reg_to_hf(struct brw_reg r)
759 {
760 assert(r.type == BRW_REGISTER_TYPE_UD);
761 r.type = BRW_REGISTER_TYPE_HF;
762
763 /* The BRW_*_STRIDE enums are defined so that incrementing the field
764 * doubles the real stride.
765 */
766 if (r.hstride != 0)
767 ++r.hstride;
768 if (r.vstride != 0)
769 ++r.vstride;
770
771 return r;
772 }
773
/**
 * Implement packHalf2x16: convert the floats `x` and `y` to half-float and
 * pack them into the low/high words of each UD channel of `dst`.
 */
void
gen8_fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                                 struct brw_reg dst,
                                                 struct brw_reg x,
                                                 struct brw_reg y)
{
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* HF view of dst: writes land in the low 16 bits of each channel. */
   struct brw_reg dst_hf = ud_reg_to_hf(dst);

   /* Give each 32-bit channel of dst the form below , where "." means
    * unchanged.
    * 0x....hhhh
    */
   MOV(dst_hf, y);

   /* Now the form:
    * 0xhhhh0000
    */
   SHL(dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    * 0xhhhhllll
    */
   MOV(dst_hf, x);
}
802
803 void
804 gen8_fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
805 struct brw_reg dst,
806 struct brw_reg src)
807 {
808 assert(dst.type == BRW_REGISTER_TYPE_F);
809 assert(src.type == BRW_REGISTER_TYPE_UD);
810
811 struct brw_reg src_hf = ud_reg_to_hf(src);
812
813 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
814 * For the Y case, we wish to access only the upper word; therefore
815 * a 16-bit subregister offset is needed.
816 */
817 assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
818 inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
819 if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
820 src_hf.subnr += 2;
821
822 MOV(dst, src_hf);
823 }
824
/**
 * Emit a SEND performing an untyped atomic operation through data cache
 * port 1, with return data requested into `dst`.
 */
void
gen8_fs_generator::generate_untyped_atomic(fs_inst *ir,
                                           struct brw_reg dst,
                                           struct brw_reg atomic_op,
                                           struct brw_reg surf_index)
{
   assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
          atomic_op.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);
   /* The atomic opcode must fit in the low 4 bits of the message control. */
   assert((atomic_op.dw1.ud & ~0xf) == 0);

   unsigned msg_control =
      atomic_op.dw1.ud | /* Atomic Operation Type: BRW_AOP_* */
      ((dispatch_width == 16 ? 0 : 1) << 4) | /* SIMD Mode (0: SIMD16, 1: SIMD8) */
      (1 << 5); /* Return data expected */

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, retype(dst, BRW_REGISTER_TYPE_UD));
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
   gen8_set_dp_message(brw, inst, HSW_SFID_DATAPORT_DATA_CACHE_1,
                       surf_index.dw1.ud,
                       HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP,
                       msg_control,
                       ir->mlen,
                       dispatch_width / 8,
                       ir->header_present,
                       false);

   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}
856
/**
 * Emit a SEND performing an untyped surface read (red channel only) through
 * data cache port 1.
 */
void
gen8_fs_generator::generate_untyped_surface_read(fs_inst *ir,
                                                 struct brw_reg dst,
                                                 struct brw_reg surf_index)
{
   assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

   unsigned msg_control = 0xe | /* Enable only the R channel */
      ((dispatch_width == 16 ? 1 : 2) << 4); /* SIMD Mode (1: SIMD16, 2: SIMD8) */

   gen8_instruction *inst = next_inst(BRW_OPCODE_SEND);
   gen8_set_dst(brw, inst, retype(dst, BRW_REGISTER_TYPE_UD));
   gen8_set_src0(brw, inst, brw_message_reg(ir->base_mrf));
   gen8_set_dp_message(brw, inst, HSW_SFID_DATAPORT_DATA_CACHE_1,
                       surf_index.dw1.ud,
                       HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ,
                       msg_control,
                       ir->mlen,
                       dispatch_width / 8,
                       ir->header_present,
                       false);

   brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
}
882
883 void
884 gen8_fs_generator::generate_code(exec_list *instructions)
885 {
886 int start_offset = next_inst_offset;
887
888 struct annotation_info annotation;
889 memset(&annotation, 0, sizeof(annotation));
890
891 cfg_t *cfg = NULL;
892 if (unlikely(INTEL_DEBUG & DEBUG_WM))
893 cfg = new(mem_ctx) cfg_t(instructions);
894
895 foreach_in_list(fs_inst, ir, instructions) {
896 struct brw_reg src[3], dst;
897
898 if (unlikely(INTEL_DEBUG & DEBUG_WM))
899 annotate(brw, &annotation, cfg, ir, next_inst_offset);
900
901 for (unsigned int i = 0; i < 3; i++) {
902 src[i] = brw_reg_from_fs_reg(&ir->src[i]);
903
904 /* The accumulator result appears to get used for the
905 * conditional modifier generation. When negating a UD
906 * value, there is a 33rd bit generated for the sign in the
907 * accumulator value, so now you can't check, for example,
908 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
909 */
910 assert(!ir->conditional_mod ||
911 ir->src[i].type != BRW_REGISTER_TYPE_UD ||
912 !ir->src[i].negate);
913 }
914 dst = brw_reg_from_fs_reg(&ir->dst);
915
916 default_state.conditional_mod = ir->conditional_mod;
917 default_state.predicate = ir->predicate;
918 default_state.predicate_inverse = ir->predicate_inverse;
919 default_state.saturate = ir->saturate;
920 default_state.mask_control = ir->force_writemask_all;
921 default_state.flag_subreg_nr = ir->flag_subreg;
922
923 if (dispatch_width == 16 && !ir->force_uncompressed && !ir->force_sechalf)
924 default_state.exec_size = BRW_EXECUTE_16;
925 else
926 default_state.exec_size = BRW_EXECUTE_8;
927
928 if (ir->force_uncompressed || dispatch_width == 8)
929 default_state.qtr_control = GEN6_COMPRESSION_1Q;
930 else if (ir->force_sechalf)
931 default_state.qtr_control = GEN6_COMPRESSION_2Q;
932 else
933 default_state.qtr_control = GEN6_COMPRESSION_1H;
934
935 switch (ir->opcode) {
936 case BRW_OPCODE_MOV:
937 MOV(dst, src[0]);
938 break;
939 case BRW_OPCODE_ADD:
940 ADD(dst, src[0], src[1]);
941 break;
942 case BRW_OPCODE_MUL:
943 MUL(dst, src[0], src[1]);
944 break;
945 case BRW_OPCODE_MACH:
946 MACH(dst, src[0], src[1]);
947 break;
948
949 case BRW_OPCODE_MAD:
950 default_state.access_mode = BRW_ALIGN_16;
951 MAD(dst, src[0], src[1], src[2]);
952 default_state.access_mode = BRW_ALIGN_1;
953 break;
954
955 case BRW_OPCODE_LRP:
956 default_state.access_mode = BRW_ALIGN_16;
957 LRP(dst, src[0], src[1], src[2]);
958 default_state.access_mode = BRW_ALIGN_1;
959 break;
960
961
962 case BRW_OPCODE_FRC:
963 FRC(dst, src[0]);
964 break;
965 case BRW_OPCODE_RNDD:
966 RNDD(dst, src[0]);
967 break;
968 case BRW_OPCODE_RNDE:
969 RNDE(dst, src[0]);
970 break;
971 case BRW_OPCODE_RNDZ:
972 RNDZ(dst, src[0]);
973 break;
974
975 case BRW_OPCODE_AND:
976 AND(dst, src[0], src[1]);
977 break;
978 case BRW_OPCODE_OR:
979 OR(dst, src[0], src[1]);
980 break;
981 case BRW_OPCODE_XOR:
982 XOR(dst, src[0], src[1]);
983 break;
984 case BRW_OPCODE_NOT:
985 NOT(dst, src[0]);
986 break;
987 case BRW_OPCODE_ASR:
988 ASR(dst, src[0], src[1]);
989 break;
990 case BRW_OPCODE_SHR:
991 SHR(dst, src[0], src[1]);
992 break;
993 case BRW_OPCODE_SHL:
994 SHL(dst, src[0], src[1]);
995 break;
996
997 case BRW_OPCODE_F32TO16:
998 MOV(retype(dst, BRW_REGISTER_TYPE_HF), src[0]);
999 break;
1000 case BRW_OPCODE_F16TO32:
1001 MOV(dst, retype(src[0], BRW_REGISTER_TYPE_HF));
1002 break;
1003
1004 case BRW_OPCODE_CMP:
1005 CMP(dst, ir->conditional_mod, src[0], src[1]);
1006 break;
1007 case BRW_OPCODE_SEL:
1008 SEL(dst, src[0], src[1]);
1009 break;
1010
1011 case BRW_OPCODE_BFREV:
1012 /* BFREV only supports UD type for src and dst. */
1013 BFREV(retype(dst, BRW_REGISTER_TYPE_UD),
1014 retype(src[0], BRW_REGISTER_TYPE_UD));
1015 break;
1016
1017 case BRW_OPCODE_FBH:
1018 /* FBH only supports UD type for dst. */
1019 FBH(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1020 break;
1021
1022 case BRW_OPCODE_FBL:
1023 /* FBL only supports UD type for dst. */
1024 FBL(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1025 break;
1026
1027 case BRW_OPCODE_CBIT:
1028 /* CBIT only supports UD type for dst. */
1029 CBIT(retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1030 break;
1031
1032 case BRW_OPCODE_ADDC:
1033 ADDC(dst, src[0], src[1]);
1034 break;
1035
1036 case BRW_OPCODE_SUBB:
1037 SUBB(dst, src[0], src[1]);
1038 break;
1039
1040 case BRW_OPCODE_BFE:
1041 default_state.access_mode = BRW_ALIGN_16;
1042 BFE(dst, src[0], src[1], src[2]);
1043 default_state.access_mode = BRW_ALIGN_1;
1044 break;
1045
1046 case BRW_OPCODE_BFI1:
1047 BFI1(dst, src[0], src[1]);
1048 break;
1049
1050 case BRW_OPCODE_BFI2:
1051 default_state.access_mode = BRW_ALIGN_16;
1052 BFI2(dst, src[0], src[1], src[2]);
1053 default_state.access_mode = BRW_ALIGN_1;
1054 break;
1055
1056 case BRW_OPCODE_IF:
1057 IF(BRW_PREDICATE_NORMAL);
1058 break;
1059
1060 case BRW_OPCODE_ELSE:
1061 ELSE();
1062 break;
1063
1064 case BRW_OPCODE_ENDIF:
1065 ENDIF();
1066 break;
1067
1068 case BRW_OPCODE_DO:
1069 DO();
1070 break;
1071
1072 case BRW_OPCODE_BREAK:
1073 BREAK();
1074 break;
1075
1076 case BRW_OPCODE_CONTINUE:
1077 CONTINUE();
1078 break;
1079
1080 case BRW_OPCODE_WHILE:
1081 WHILE();
1082 break;
1083
1084 case SHADER_OPCODE_RCP:
1085 MATH(BRW_MATH_FUNCTION_INV, dst, src[0]);
1086 break;
1087
1088 case SHADER_OPCODE_RSQ:
1089 MATH(BRW_MATH_FUNCTION_RSQ, dst, src[0]);
1090 break;
1091
1092 case SHADER_OPCODE_SQRT:
1093 MATH(BRW_MATH_FUNCTION_SQRT, dst, src[0]);
1094 break;
1095
1096 case SHADER_OPCODE_EXP2:
1097 MATH(BRW_MATH_FUNCTION_EXP, dst, src[0]);
1098 break;
1099
1100 case SHADER_OPCODE_LOG2:
1101 MATH(BRW_MATH_FUNCTION_LOG, dst, src[0]);
1102 break;
1103
1104 case SHADER_OPCODE_SIN:
1105 MATH(BRW_MATH_FUNCTION_SIN, dst, src[0]);
1106 break;
1107
1108 case SHADER_OPCODE_COS:
1109 MATH(BRW_MATH_FUNCTION_COS, dst, src[0]);
1110 break;
1111
1112 case SHADER_OPCODE_INT_QUOTIENT:
1113 MATH(BRW_MATH_FUNCTION_INT_DIV_QUOTIENT, dst, src[0], src[1]);
1114 break;
1115
1116 case SHADER_OPCODE_INT_REMAINDER:
1117 MATH(BRW_MATH_FUNCTION_INT_DIV_REMAINDER, dst, src[0], src[1]);
1118 break;
1119
1120 case SHADER_OPCODE_POW:
1121 MATH(BRW_MATH_FUNCTION_POW, dst, src[0], src[1]);
1122 break;
1123
1124 case FS_OPCODE_PIXEL_X:
1125 case FS_OPCODE_PIXEL_Y:
1126 unreachable("FS_OPCODE_PIXEL_X and FS_OPCODE_PIXEL_Y are only for Gen4-5.");
1127
1128 case FS_OPCODE_CINTERP:
1129 MOV(dst, src[0]);
1130 break;
1131 case FS_OPCODE_LINTERP:
1132 generate_linterp(ir, dst, src);
1133 break;
1134 case SHADER_OPCODE_TEX:
1135 case FS_OPCODE_TXB:
1136 case SHADER_OPCODE_TXD:
1137 case SHADER_OPCODE_TXF:
1138 case SHADER_OPCODE_TXF_CMS:
1139 case SHADER_OPCODE_TXF_UMS:
1140 case SHADER_OPCODE_TXF_MCS:
1141 case SHADER_OPCODE_TXL:
1142 case SHADER_OPCODE_TXS:
1143 case SHADER_OPCODE_LOD:
1144 case SHADER_OPCODE_TG4:
1145 case SHADER_OPCODE_TG4_OFFSET:
1146 generate_tex(ir, dst, src[0], src[1]);
1147 break;
1148
1149 case FS_OPCODE_DDX:
1150 generate_ddx(ir, dst, src[0]);
1151 break;
1152 case FS_OPCODE_DDY:
1153 /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1154 * guarantee that key->render_to_fbo is set).
1155 */
1156 assert(fp->UsesDFdy);
1157 generate_ddy(ir, dst, src[0], key->render_to_fbo);
1158 break;
1159
1160 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1161 generate_scratch_write(ir, src[0]);
1162 break;
1163
1164 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1165 generate_scratch_read(ir, dst);
1166 break;
1167
1168 case SHADER_OPCODE_GEN7_SCRATCH_READ:
1169 generate_scratch_read_gen7(ir, dst);
1170 break;
1171
1172 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1173 generate_uniform_pull_constant_load(ir, dst, src[0], src[1]);
1174 break;
1175
1176 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1177 generate_varying_pull_constant_load(ir, dst, src[0], src[1]);
1178 break;
1179
1180 case FS_OPCODE_FB_WRITE:
1181 generate_fb_write(ir);
1182 break;
1183
1184 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1185 generate_mov_dispatch_to_flags(ir);
1186 break;
1187
1188 case FS_OPCODE_DISCARD_JUMP:
1189 generate_discard_jump(ir);
1190 break;
1191
1192 case SHADER_OPCODE_SHADER_TIME_ADD:
1193 unreachable("XXX: Missing Gen8 scalar support for INTEL_DEBUG=shader_time");
1194
1195 case SHADER_OPCODE_UNTYPED_ATOMIC:
1196 generate_untyped_atomic(ir, dst, src[0], src[1]);
1197 break;
1198
1199 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1200 generate_untyped_surface_read(ir, dst, src[0]);
1201 break;
1202
1203 case FS_OPCODE_SET_SIMD4X2_OFFSET:
1204 generate_set_simd4x2_offset(ir, dst, src[0]);
1205 break;
1206
1207 case FS_OPCODE_SET_OMASK:
1208 generate_set_omask(ir, dst, src[0]);
1209 break;
1210
1211 case FS_OPCODE_SET_SAMPLE_ID:
1212 generate_set_sample_id(ir, dst, src[0], src[1]);
1213 break;
1214
1215 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
1216 generate_pack_half_2x16_split(ir, dst, src[0], src[1]);
1217 break;
1218
1219 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
1220 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
1221 generate_unpack_half_2x16_split(ir, dst, src[0]);
1222 break;
1223
1224 case FS_OPCODE_PLACEHOLDER_HALT:
1225 /* This is the place where the final HALT needs to be inserted if
1226 * we've emitted any discards. If not, this will emit no code.
1227 */
1228 if (!patch_discard_jumps_to_fb_writes()) {
1229 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1230 annotation.ann_count--;
1231 }
1232 }
1233 break;
1234
1235 default:
1236 if (ir->opcode < int(ARRAY_SIZE(opcode_descs))) {
1237 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1238 opcode_descs[ir->opcode].name);
1239 } else {
1240 _mesa_problem(ctx, "Unsupported opcode %d in FS", ir->opcode);
1241 }
1242 abort();
1243 }
1244 }
1245
1246 patch_jump_targets();
1247 annotation_finalize(&annotation, next_inst_offset);
1248
1249 int before_size = next_inst_offset - start_offset;
1250
1251 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1252 if (shader_prog) {
1253 fprintf(stderr,
1254 "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
1255 shader_prog->Label ? shader_prog->Label : "unnamed",
1256 shader_prog->Name, dispatch_width);
1257 } else if (fp) {
1258 fprintf(stderr,
1259 "Native code for fragment program %d (SIMD%d dispatch):\n",
1260 prog->Id, dispatch_width);
1261 } else {
1262 fprintf(stderr, "Native code for blorp program (SIMD%d dispatch):\n",
1263 dispatch_width);
1264 }
1265 fprintf(stderr, "SIMD%d shader: %d instructions.\n",
1266 dispatch_width, before_size / 16);
1267
1268 dump_assembly(store, annotation.ann_count, annotation.ann, brw, prog);
1269 ralloc_free(annotation.ann);
1270 }
1271 }
1272
1273 const unsigned *
1274 gen8_fs_generator::generate_assembly(exec_list *simd8_instructions,
1275 exec_list *simd16_instructions,
1276 unsigned *assembly_size)
1277 {
1278 assert(simd8_instructions || simd16_instructions);
1279
1280 if (simd8_instructions) {
1281 dispatch_width = 8;
1282 generate_code(simd8_instructions);
1283 }
1284
1285 if (simd16_instructions) {
1286 /* Align to a 64-byte boundary. */
1287 while (next_inst_offset % 64)
1288 NOP();
1289
1290 /* Save off the start of this SIMD16 program */
1291 prog_data->prog_offset_16 = next_inst_offset;
1292
1293 dispatch_width = 16;
1294 generate_code(simd16_instructions);
1295 }
1296
1297 *assembly_size = next_inst_offset;
1298 return (const unsigned *) store;
1299 }