i965/vec4: Only zero out unused message components when there are any.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs_emit.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_emit.cpp
25 *
26 * This file supports emitting code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 extern "C" {
31 #include "main/macros.h"
32 #include "brw_context.h"
33 #include "brw_eu.h"
34 } /* extern "C" */
35
36 #include "brw_fs.h"
37 #include "brw_cfg.h"
38
/**
 * Construct the generator that lowers the FS LIR to native Gen instructions.
 *
 * \param prog               may be NULL for non-GLSL (ARB_fp / fixed-function)
 *                           programs, in which case there is no linked shader.
 * \param dual_source_output selects the dual-source render-target write
 *                           message in generate_fb_write().
 */
fs_generator::fs_generator(struct brw_context *brw,
                           struct brw_wm_compile *c,
                           struct gl_shader_program *prog,
                           struct gl_fragment_program *fp,
                           bool dual_source_output)

   : brw(brw), c(c), prog(prog), fp(fp), dual_source_output(dual_source_output)
{
   ctx = &brw->ctx;

   shader = prog ? prog->_LinkedShaders[MESA_SHADER_FRAGMENT] : NULL;

   /* Parent all generator allocations to the compile so they are released
    * together with it.
    */
   mem_ctx = c;

   p = rzalloc(mem_ctx, struct brw_compile);
   brw_init_compile(brw, p, mem_ctx);
}
56
/* Nothing to tear down explicitly: everything was ralloc'd under mem_ctx
 * (the compile), which owns the allocations.
 */
fs_generator::~fs_generator()
{
}
60
61 void
62 fs_generator::mark_surface_used(unsigned surf_index)
63 {
64 assert(surf_index < BRW_MAX_WM_SURFACES);
65
66 c->prog_data.binding_table_size =
67 MAX2(c->prog_data.binding_table_size, surf_index + 1);
68 }
69
/**
 * Back-patch the UIP field of every HALT emitted for a discard so that it
 * points at the final HALT emitted here, just before the FB write.
 *
 * Only relevant on gen6+ (generate_discard_jump() asserts the same), and a
 * no-op if no discard HALTs were recorded.
 */
void
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (brw->gen < 6 || this->discard_halt_patches.is_empty())
      return;

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   struct brw_instruction *last_halt = gen6_HALT(p);
   /* uip/jip are in half-instruction units; 2 == the next instruction. */
   last_halt->bits3.break_cont.uip = 2;
   last_halt->bits3.break_cont.jip = 2;

   /* IP (in whole instructions) of the HALT we just emitted; nr_insn was
    * already incremented past it.
    */
   int ip = p->nr_insn;

   foreach_list(node, &this->discard_halt_patches) {
      ip_record *patch_ip = (ip_record *)node;
      struct brw_instruction *patch = &p->store[patch_ip->ip];

      assert(patch->header.opcode == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
   }

   this->discard_halt_patches.make_empty();
}
104
/**
 * Emit the render-target write message (and any message-header setup it
 * needs) for an FS_OPCODE_FB_WRITE instruction.
 */
void
fs_generator::generate_fb_write(fs_inst *inst)
{
   bool eot = inst->eot;
   struct brw_reg implied_header;
   uint32_t msg_control;

   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
    * move, here's g1.
    */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   if (fp->UsesKill) {
      struct brw_reg pixel_mask;

      /* NOTE(review): the pixel-mask dword lives at g1.7 on gen6+ and at
       * g0.0 before that; the flag register f0.1 is presumed to hold the
       * still-live channel mask after discards — confirm against the
       * discard lowering.
       */
      if (brw->gen >= 6)
         pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
      else
         pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

      brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
   }

   if (inst->header_present) {
      if (brw->gen >= 6) {
         /* On gen6+ the header is a single compressed copy of g0. */
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);

         if (inst->target > 0 && c->key.replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                           inst->base_mrf, 2),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         implied_header = brw_null_reg();
      } else {
         /* Pre-gen6: g0 is moved implicitly by the send; copy g1 into the
          * second header register ourselves.
          */
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p,
                 brw_message_reg(inst->base_mrf + 1),
                 brw_vec8_grf(1, 0));
      }
   } else {
      implied_header = brw_null_reg();
   }

   /* Pick the message control for the payload layout / dispatch width. */
   if (this->dual_source_output)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else if (dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   brw_pop_insn_state(p);

   brw_fb_WRITE(p,
                dispatch_width,
                inst->base_mrf,
                implied_header,
                msg_control,
                SURF_INDEX_DRAW(inst->target),
                inst->mlen,
                0,
                eot,
                inst->header_present);

   mark_surface_used(SURF_INDEX_DRAW(inst->target));
}
190
191 /* Computes the integer pixel x,y values from the origin.
192 *
193 * This is the basis of gl_FragCoord computation, but is also used
194 * pre-gen6 for computing the deltas from v0 for computing
195 * interpolation.
196 */
void
fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
{
   /* g1 holds the per-subspan screen coordinates as UW pairs; the X values
    * start at UW element 4 and the Y values at element 5.
    */
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
   struct brw_reg src;
   struct brw_reg deltas;

   if (is_x) {
      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
      /* Per-channel X offsets within a subspan: 0,1,0,1,... */
      deltas = brw_imm_v(0x10101010);
   } else {
      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
      /* Per-channel Y offsets within a subspan: 0,0,1,1,... */
      deltas = brw_imm_v(0x11001100);
   }

   if (dispatch_width == 16) {
      dst = vec16(dst);
   }

   /* We do this 8 or 16-wide, but since the destination is UW we
    * don't do compression in the 16-wide case.
    */
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_ADD(p, dst, src, deltas);
   brw_pop_insn_state(p);
}
224
225 void
226 fs_generator::generate_linterp(fs_inst *inst,
227 struct brw_reg dst, struct brw_reg *src)
228 {
229 struct brw_reg delta_x = src[0];
230 struct brw_reg delta_y = src[1];
231 struct brw_reg interp = src[2];
232
233 if (brw->has_pln &&
234 delta_y.nr == delta_x.nr + 1 &&
235 (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
236 brw_PLN(p, dst, interp, delta_x);
237 } else {
238 brw_LINE(p, brw_null_reg(), interp, delta_x);
239 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
240 }
241 }
242
/**
 * Emit a one-source math instruction on gen7, where math executes directly
 * on the EU (no message payload, hence mlen == 0) and handles both halves
 * of a SIMD16 dispatch in one instruction.
 */
void
fs_generator::generate_math1_gen7(fs_inst *inst,
                                  struct brw_reg dst,
                                  struct brw_reg src0)
{
   assert(inst->mlen == 0);
   brw_math(p, dst,
            brw_math_function(inst->opcode),
            0, src0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}
255
/**
 * Emit a two-source math instruction (e.g. POW, INT_QUOTIENT) on gen7;
 * like the one-source case it needs no message payload.
 */
void
fs_generator::generate_math2_gen7(fs_inst *inst,
                                  struct brw_reg dst,
                                  struct brw_reg src0,
                                  struct brw_reg src1)
{
   assert(inst->mlen == 0);
   brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
}
265
/**
 * Emit a one-source math instruction on gen6.  Math is an EU instruction
 * here (mlen == 0) but cannot execute compressed, so SIMD16 is split into
 * two uncompressed halves.
 */
void
fs_generator::generate_math1_gen6(fs_inst *inst,
                                  struct brw_reg dst,
                                  struct brw_reg src0)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen == 0);

   /* First (or only) half. */
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math(p, dst,
            op,
            0, src0,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);

   if (dispatch_width == 16) {
      /* Second half operates on the upper 8 channels. */
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math(p, sechalf(dst),
               op,
               0, sechalf(src0),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
292
/**
 * Emit a two-source math instruction on gen6, split into two uncompressed
 * halves for SIMD16 just like the one-source variant.
 */
void
fs_generator::generate_math2_gen6(fs_inst *inst,
                                  struct brw_reg dst,
                                  struct brw_reg src0,
                                  struct brw_reg src1)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen == 0);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math2(p, dst, op, src0, src1);

   if (dispatch_width == 16) {
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
312
/**
 * Emit math on gen4, where it is a send to the shared math unit: operands
 * go through MRFs (mlen >= 1) and SIMD16 requires two messages with
 * adjacent base MRFs.
 */
void
fs_generator::generate_math_gen4(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg src)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_math(p, dst,
            op,
            inst->base_mrf, src,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);

   if (dispatch_width == 16) {
      /* Second message for the upper 8 channels, one MRF further along. */
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_math(p, sechalf(dst),
               op,
               inst->base_mrf + 1, sechalf(src),
               BRW_MATH_DATA_VECTOR,
               BRW_MATH_PRECISION_FULL);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}
340
/**
 * Emit math on G45/gen5.  Most functions can run compressed here, so no
 * SIMD16 split is needed; POW and the integer division opcodes still use
 * the gen4 path.
 */
void
fs_generator::generate_math_g45(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src)
{
   if (inst->opcode == SHADER_OPCODE_POW ||
       inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
       inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
      generate_math_gen4(inst, dst, src);
      return;
   }

   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   brw_math(p, dst,
            op,
            inst->base_mrf, src,
            BRW_MATH_DATA_VECTOR,
            BRW_MATH_PRECISION_FULL);
}
363
/**
 * Emit a sampler message for any of the texturing opcodes.
 *
 * Selects the return format from the destination type, the message type
 * from the opcode (with separate encodings for gen5+ and gen4/G45), the
 * SIMD mode and response length from the dispatch width, and sets up the
 * message header when a texel offset is present.
 */
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   uint32_t return_format;

   /* Ask the sampler to return data in the register type we'll read. */
   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   if (dispatch_width == 16)
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;

   if (brw->gen >= 5) {
      /* Gen5+ message types carry the shadow-compare variant explicitly. */
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(brw->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_MS:
         if (brw->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            /* No multisample-fetch message before gen7; fall back to LD. */
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      default:
         assert(!"not reached");
         break;
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determines shadow compare and dispatch width
          * from message length for most messages.
          */
         assert(dispatch_width == 8);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
         } else {
            assert(inst->mlen <= 4);
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen == 9);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         assert(!"not reached");
         break;
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      /* SIMD16 responses are twice as long (8 regs for 4 components). */
      rlen = 8;
      dst = vec16(dst);
   }

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->texture_offset) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      /* Explicitly set up the message header by copying g0 to the MRF. */
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
              retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* Then set the offset bits in DWord 2. */
      brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                     inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(inst->texture_offset));
      brw_pop_insn_state(p);
   } else if (inst->header_present) {
      /* Set up an implied move from g0 to the MRF. */
      src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              SURF_INDEX_TEXTURE(inst->sampler),
              inst->sampler,
              msg_type,
              rlen,
              inst->mlen,
              inst->header_present,
              simd_mode,
              return_format);

   mark_surface_used(SURF_INDEX_TEXTURE(inst->sampler));
}
536
537
538 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
539 * looking like:
540 *
541 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
542 *
543 * and we're trying to produce:
544 *
545 * DDX DDY
546 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
547 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
548 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
549 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
550 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
551 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
552 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
553 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
554 *
555 * and add another set of two more subspans if in 16-pixel dispatch mode.
556 *
557 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
558 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
559 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
560 * between each other. We could probably do it like ddx and swizzle the right
561 * order later, but bail for now and just produce
562 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
563 */
/* See the subspan-layout comment above: per pair of pixels, DDX is
 * (right - left), achieved with width=2/hstride=0 regions offset by one
 * element from each other.
 */
void
fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
{
   /* Odd elements (tr, br of each subspan), each replicated twice. */
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_2,
                                 BRW_WIDTH_2,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   /* Even elements (tl, bl of each subspan), each replicated twice. */
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_2,
                                 BRW_WIDTH_2,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}
581
582 /* The negate_value boolean is used to negate the derivative computation for
583 * FBOs, since they place the origin at the upper left instead of the lower
584 * left.
585 */
void
fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           bool negate_value)
{
   /* Top row of each subspan (tl), replicated across all four channels. */
   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_4,
                                 BRW_WIDTH_4,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   /* Bottom row of each subspan (bl), likewise replicated. */
   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                 BRW_REGISTER_TYPE_F,
                                 BRW_VERTICAL_STRIDE_4,
                                 BRW_WIDTH_4,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   /* negate_value flips the subtraction direction to account for FBOs
    * placing the origin at the top-left instead of bottom-left (see the
    * comment above this function).
    */
   if (negate_value)
      brw_ADD(p, dst, src1, negate(src0));
   else
      brw_ADD(p, dst, src0, negate(src1));
}
607
/**
 * Emit the HALT for a discard and record its IP so that
 * patch_discard_jumps_to_fb_writes() can later point its UIP at the final
 * HALT before the FB write.
 */
void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(brw->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));

   /* HALT must execute for all channels, not just non-discarded ones. */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   gen6_HALT(p);
   brw_pop_insn_state(p);
}
624
/**
 * Spill a register to scratch space: copy it into the message payload
 * (base_mrf + 1; base_mrf itself holds the header) and emit a one-OWord-pair
 * scratch write at the instruction's scratch offset.
 */
void
fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
{
   assert(inst->mlen != 0);

   brw_MOV(p,
           retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
           retype(src, BRW_REGISTER_TYPE_UD));
   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
                                 inst->offset);
}
636
/**
 * Reload a previously spilled register from scratch space at the
 * instruction's scratch offset.
 */
void
fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
                                inst->offset);
}
645
/**
 * Load a uniform (constant-offset) pull constant via an OWord block read.
 * Both the surface index and the read offset must be immediates here; the
 * gen7 variant below handles the message-less path.
 */
void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.dw1.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);

   mark_surface_used(surf_index);
}
667
/**
 * Gen7 uniform pull constant load: a headerless SIMD4x2 sampler LD from the
 * constant surface, with the DWord offset supplied in a GRF (written by
 * generate_set_simd4x2_offset()).
 */
void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(inst->mlen == 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* Emit the SEND itself uncompressed and with the mask disabled; the
    * instruction fields are filled in below.
    */
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, offset);
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* LD message ignores sampler unit */
                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                           1, /* rlen */
                           1, /* mlen */
                           false, /* no header */
                           BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                           0);

   mark_surface_used(surf_index);
}
709
/**
 * Pre-gen7 varying-offset pull constant load: a sampler LD message with a
 * header, where the per-channel offsets are copied into the payload MRF
 * before the send.
 */
void
fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(brw->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_present);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (dispatch_width == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (brw->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->regs_written == 8);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   /* Copy the offsets into the payload, after the header register. */
   struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
                                      BRW_REGISTER_TYPE_D);
   brw_MOV(p, offset_mrf, offset);

   /* Resolve the implied g0 header move before emitting the send. */
   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   send->header.compression_control = BRW_COMPRESSION_NONE;
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      send->header.destreg__conditionalmod = inst->base_mrf;

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_present,
                           simd_mode,
                           return_format);

   mark_surface_used(surf_index);
}
776
/**
 * Gen7 varying-offset pull constant load: a headerless sampler LD with the
 * per-channel offsets passed directly in GRFs.
 */
void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(brw->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(!inst->header_present);
   assert(!inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   /* One payload reg of offsets per 8 channels; response doubles likewise. */
   uint32_t simd_mode, rlen, mlen;
   if (dispatch_width == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, dst);
   brw_set_src0(p, send, offset);
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* LD message ignores sampler unit */
                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                           rlen,
                           mlen,
                           false, /* no header */
                           simd_mode,
                           0);

   mark_surface_used(surf_index);
}
820
821 /**
822 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
823 * into the flags register (f0.0).
824 *
825 * Used only on Gen6 and above.
826 */
void
fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
{
   struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
   struct brw_reg dispatch_mask;

   /* The dispatched-pixel mask lives at g1.7 on gen6+; the gen < 6 branch
    * reads g0.0 (see the header comment noting this path is gen6+ only).
    */
   if (brw->gen >= 6)
      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   else
      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

   /* The MOV must execute regardless of the current execution mask. */
   brw_push_insn_state(p);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, flags, dispatch_mask);
   brw_pop_insn_state(p);
}
843
844
845 static uint32_t brw_file_from_reg(fs_reg *reg)
846 {
847 switch (reg->file) {
848 case ARF:
849 return BRW_ARCHITECTURE_REGISTER_FILE;
850 case GRF:
851 return BRW_GENERAL_REGISTER_FILE;
852 case MRF:
853 return BRW_MESSAGE_REGISTER_FILE;
854 case IMM:
855 return BRW_IMMEDIATE_VALUE;
856 default:
857 assert(!"not reached");
858 return BRW_GENERAL_REGISTER_FILE;
859 }
860 }
861
/**
 * Convert an FS LIR register to the concrete struct brw_reg consumed by the
 * brw_eu emission helpers, including immediate encoding, second-half
 * (compressed) addressing, and abs/negate modifiers.
 */
static struct brw_reg
brw_reg_from_fs_reg(fs_reg *reg)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case GRF:
   case ARF:
   case MRF:
      /* smear >= 0 selects a single scalar element; otherwise use the full
       * vec8 register.
       */
      if (reg->smear == -1) {
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
      } else {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
      }
      brw_reg = retype(brw_reg, reg->type);
      if (reg->sechalf)
         brw_reg = sechalf(brw_reg);
      break;
   case IMM:
      switch (reg->type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(reg->imm.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(reg->imm.i);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(reg->imm.u);
         break;
      default:
         assert(!"not reached");
         brw_reg = brw_null_reg();
         break;
      }
      break;
   case HW_REG:
      /* Already a fully specified hardware register. */
      brw_reg = reg->fixed_hw_reg;
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case UNIFORM:
      /* Uniforms should have been lowered to pull/push constants by now. */
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   default:
      assert(!"not reached");
      brw_reg = brw_null_reg();
      break;
   }
   if (reg->abs)
      brw_reg = brw_abs(brw_reg);
   if (reg->negate)
      brw_reg = negate(brw_reg);

   return brw_reg;
}
920
921 /**
922 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
923 * sampler LD messages.
924 *
925 * We don't want to bake it into the send message's code generation because
926 * that means we don't get a chance to schedule the instructions.
927 */
void
fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
                                          struct brw_reg dst,
                                          struct brw_reg value)
{
   assert(value.file == BRW_IMMEDIATE_VALUE);

   /* Write just dword 0 of the destination, unconditionally for all
    * channels, since only the first component of the SIMD4x2 message is
    * read (see generate_uniform_pull_constant_load_gen7()).
    */
   brw_push_insn_state(p);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
   brw_pop_insn_state(p);
}
941
942 /**
943 * Change the register's data type from UD to W, doubling the strides in order
944 * to compensate for halving the data type width.
945 */
946 static struct brw_reg
947 ud_reg_to_w(struct brw_reg r)
948 {
949 assert(r.type == BRW_REGISTER_TYPE_UD);
950 r.type = BRW_REGISTER_TYPE_W;
951
952 /* The BRW_*_STRIDE enums are defined so that incrementing the field
953 * doubles the real stride.
954 */
955 if (r.hstride != 0)
956 ++r.hstride;
957 if (r.vstride != 0)
958 ++r.vstride;
959
960 return r;
961 }
962
/**
 * Emit the three-instruction packHalf2x16 sequence: convert y into the low
 * word, shift it into the high word, then convert x into the low word.
 * The instruction order is load-bearing — the SHL must sit between the two
 * conversions.
 */
void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   assert(brw->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = ud_reg_to_w(dst);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}
1001
/**
 * Emit one half of unpackHalf2x16: convert either the low (SPLIT_X) or high
 * (SPLIT_Y) word of each packed dword to float.
 */
void
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
                                              struct brw_reg dst,
                                              struct brw_reg src)
{
   assert(brw->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    */
   struct brw_reg src_w = ud_reg_to_w(src);

   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
    * For the Y case, we wish to access only the upper word; therefore
    * a 16-bit subregister offset is needed.
    */
   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
      src_w.subnr += 2;   /* skip the low word of each dword */

   brw_F16TO32(p, dst, src_w);
}
1030
1031 void
1032 fs_generator::generate_shader_time_add(fs_inst *inst,
1033 struct brw_reg payload,
1034 struct brw_reg offset,
1035 struct brw_reg value)
1036 {
1037 assert(brw->gen >= 7);
1038 brw_push_insn_state(p);
1039 brw_set_mask_control(p, true);
1040
1041 assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1042 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1043 offset.type);
1044 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1045 value.type);
1046
1047 assert(offset.file == BRW_IMMEDIATE_VALUE);
1048 if (value.file == BRW_GENERAL_REGISTER_FILE) {
1049 value.width = BRW_WIDTH_1;
1050 value.hstride = BRW_HORIZONTAL_STRIDE_0;
1051 value.vstride = BRW_VERTICAL_STRIDE_0;
1052 } else {
1053 assert(value.file == BRW_IMMEDIATE_VALUE);
1054 }
1055
1056 /* Trying to deal with setup of the params from the IR is crazy in the FS8
1057 * case, and we don't really care about squeezing every bit of performance
1058 * out of this path, so we just emit the MOVs from here.
1059 */
1060 brw_MOV(p, payload_offset, offset);
1061 brw_MOV(p, payload_value, value);
1062 brw_shader_time_add(p, payload, SURF_INDEX_WM_SHADER_TIME);
1063 brw_pop_insn_state(p);
1064
1065 mark_surface_used(SURF_INDEX_WM_SHADER_TIME);
1066 }
1067
/**
 * Walk the FS LIR instruction list and emit native gen instructions for each
 * one through the brw_eu assembler in \c p.
 *
 * For every IR instruction this sets up the per-instruction execution state
 * (conditional modifier, predication, flag register, saturate, writemask and
 * compression control) before dispatching on the opcode.  Under
 * INTEL_DEBUG=wm it also prints the IR/annotation for each instruction,
 * basic-block boundaries from a freshly built CFG, and a disassembly of the
 * native instructions generated for it.
 */
void
fs_generator::generate_code(exec_list *instructions)
{
   /* Offset of the first native instruction not yet disassembled, so the
    * DEBUG_WM path below can dump exactly the code generated per IR
    * instruction.
    */
   int last_native_insn_offset = p->next_insn_offset;
   /* Last annotation/IR pointer printed, to avoid repeating identical
    * annotations for consecutive instructions.
    */
   const char *last_annotation_string = NULL;
   const void *last_annotation_ir = NULL;

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         /* GLSL path: identified by the shader program name. */
         printf("Native code for fragment shader %d (%d-wide dispatch):\n",
                prog->Name, dispatch_width);
      } else {
         /* Fixed-function / ARB_fp path: identified by the program Id. */
         printf("Native code for fragment program %d (%d-wide dispatch):\n",
                fp->Base.Id, dispatch_width);
      }
   }

   /* The CFG is only needed for the debug block-boundary output, so only
    * build it when DEBUG_WM is set.
    */
   cfg_t *cfg = NULL;
   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      cfg = new(mem_ctx) cfg_t(mem_ctx, instructions);

   foreach_list(node, instructions) {
      fs_inst *inst = (fs_inst *)node;
      struct brw_reg src[3], dst;

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         /* Print a "START Bn" banner (with predecessor edges) when this
          * instruction opens a basic block.  Note the inner |node| shadows
          * the outer loop variable; it only walks the CFG's block list.
          */
         foreach_list(node, &cfg->block_list) {
            bblock_link *link = (bblock_link *)node;
            bblock_t *block = link->block;

            if (block->start == inst) {
               printf(" START B%d", block->block_num);
               foreach_list(predecessor_node, &block->parents) {
                  bblock_link *predecessor_link =
                     (bblock_link *)predecessor_node;
                  bblock_t *predecessor_block = predecessor_link->block;
                  printf(" <-B%d", predecessor_block->block_num);
               }
               printf("\n");
            }
         }

         /* Print the originating IR (GLSL ir_instruction or Mesa
          * prog_instruction) only when it changes between instructions.
          */
         if (last_annotation_ir != inst->ir) {
            last_annotation_ir = inst->ir;
            if (last_annotation_ir) {
               printf(" ");
               if (shader)
                  ((ir_instruction *)inst->ir)->print();
               else {
                  const prog_instruction *fpi;
                  fpi = (const prog_instruction *)inst->ir;
                  printf("%d: ", (int)(fpi - fp->Base.Instructions));
                  _mesa_fprint_instruction_opt(stdout,
                                               fpi,
                                               0, PROG_PRINT_DEBUG, NULL);
               }
               printf("\n");
            }
         }
         /* Likewise for free-form annotation strings. */
         if (last_annotation_string != inst->annotation) {
            last_annotation_string = inst->annotation;
            if (last_annotation_string)
               printf(" %s\n", last_annotation_string);
         }
      }

      /* Translate the FS register operands into brw_reg form. */
      for (unsigned int i = 0; i < 3; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);

         /* The accumulator result appears to get used for the
          * conditional modifier generation. When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      /* Program the default instruction state the assembler will stamp onto
       * the next emitted instruction(s).  This must happen before the
       * opcode dispatch below.
       */
      brw_set_conditionalmod(p, inst->conditional_mod);
      brw_set_predicate_control(p, inst->predicate);
      brw_set_predicate_inverse(p, inst->predicate_inverse);
      brw_set_flag_reg(p, 0, inst->flag_subreg);
      brw_set_saturate(p, inst->saturate);
      brw_set_mask_control(p, inst->force_writemask_all);

      /* Choose compression: 8-wide dispatch (or forced-uncompressed
       * instructions) emit a single uncompressed instruction; force_sechalf
       * targets only the second half of a 16-wide dispatch; otherwise emit
       * a compressed instruction covering all 16 channels.
       */
      if (inst->force_uncompressed || dispatch_width == 8) {
         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      } else if (inst->force_sechalf) {
         brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
      } else {
         brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         /* Toggle accumulator write control around MACH so the multiply
          * result is written to the accumulator for this instruction only.
          */
         brw_set_acc_write_control(p, 1);
         brw_MACH(p, dst, src[0], src[1]);
         brw_set_acc_write_control(p, 0);
         break;

      case BRW_OPCODE_MAD:
         /* Three-source instruction: emitted in Align16 mode, and in
          * 16-wide dispatch split into two uncompressed 8-wide halves
          * (presumably 3-src can't be compressed -- confirm against PRM).
          */
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_MAD(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_MAD(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_LRP:
         /* Three-source instruction; same Align16 / half-split handling
          * as MAD above.
          */
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_LRP(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_LRP(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFREV:
         /* BFREV only supports UD type for src and dst. */
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         /* FBH only supports UD type for dst. */
         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_FBL:
         /* FBL only supports UD type for dst. */
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_CBIT:
         /* CBIT only supports UD type for dst. */
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;

      case BRW_OPCODE_BFE:
         /* Three-source instruction; same Align16 / half-split handling
          * as MAD above.
          */
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFE(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFE(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_BFI1:
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         /* Three-source instruction; same Align16 / half-split handling
          * as MAD above.
          */
         brw_set_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16) {
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFI2(p, dst, src[0], src[1], src[2]);
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFI2(p, dst, src[0], src[1], src[2]);
         }
         brw_set_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(brw->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, BRW_EXECUTE_8);
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         /* Clear the predicate the default state set above so it doesn't
          * leak into following instructions inside the assembler helper.
          */
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         /* FINISHME: We need to write the loop instruction support still. */
         if (brw->gen >= 6)
            gen6_CONT(p);
         else
            brw_CONT(p);
         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         /* Single-operand math: dispatch to the per-generation math
          * emitter (the math unit interface changed across gens).
          */
         if (brw->gen >= 7) {
            generate_math1_gen7(inst, dst, src[0]);
         } else if (brw->gen == 6) {
            generate_math1_gen6(inst, dst, src[0]);
         } else if (brw->gen == 5 || brw->is_g4x) {
            generate_math_g45(inst, dst, src[0]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         /* Two-operand math; pre-gen6 falls back to the gen4 emitter. */
         if (brw->gen >= 7) {
            generate_math2_gen7(inst, dst, src[0], src[1]);
         } else if (brw->gen == 6) {
            generate_math2_gen6(inst, dst, src[0], src[1]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case FS_OPCODE_PIXEL_X:
         generate_pixel_xy(dst, true);
         break;
      case FS_OPCODE_PIXEL_Y:
         generate_pixel_xy(dst, false);
         break;
      case FS_OPCODE_CINTERP:
         /* Constant interpolation is just a MOV of the setup value. */
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_MS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
         /* All sampler messages share one emitter; src[0] is the message
          * payload start.
          */
         generate_tex(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDX:
         generate_ddx(inst, dst, src[0]);
         break;
      case FS_OPCODE_DDY:
         /* Make sure fp->UsesDFdy flag got set (otherwise there's no
          * guarantee that c->key.render_to_fbo is set).
          */
         assert(fp->UsesDFdy);
         generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
         break;

      case FS_OPCODE_SPILL:
         generate_spill(inst, src[0]);
         break;

      case FS_OPCODE_UNSPILL:
         generate_unspill(inst, dst);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
         generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case FS_OPCODE_SET_SIMD4X2_OFFSET:
         generate_set_simd4x2_offset(inst, dst, src[0]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards. If not, this will emit no code.
          */
         patch_discard_jumps_to_fb_writes();
         break;

      default:
         /* Unknown opcode is a compiler bug: report it (by name when it is
          * a known hardware opcode) and abort rather than emit bad code.
          */
         if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
                          opcode_descs[inst->opcode].name);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
         }
         abort();
      }

      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
         /* Disassemble just the native instructions generated for this IR
          * instruction, then print an "END Bn" banner (with successor
          * edges) when this instruction closes a basic block.
          */
         brw_dump_compile(p, stdout,
                          last_native_insn_offset, p->next_insn_offset);

         foreach_list(node, &cfg->block_list) {
            bblock_link *link = (bblock_link *)node;
            bblock_t *block = link->block;

            if (block->end == inst) {
               printf(" END B%d", block->block_num);
               foreach_list(successor_node, &block->children) {
                  bblock_link *successor_link =
                     (bblock_link *)successor_node;
                  bblock_t *successor_block = successor_link->block;
                  printf(" ->B%d", successor_block->block_num);
               }
               printf("\n");
            }
         }
      }

      last_native_insn_offset = p->next_insn_offset;
   }

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      printf("\n");
   }

   /* Resolve the UIP/JIP jump targets of the flow-control instructions
    * now that all instructions have been emitted.
    */
   brw_set_uip_jip(p);

   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
    * emit issues, it doesn't get the jump distances into the output,
    * which is often something we want to debug. So this is here in
    * case you're doing that.
    */
   if (0) {
      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
   }
}
1511
1512 const unsigned *
1513 fs_generator::generate_assembly(exec_list *simd8_instructions,
1514 exec_list *simd16_instructions,
1515 unsigned *assembly_size)
1516 {
1517 dispatch_width = 8;
1518 generate_code(simd8_instructions);
1519
1520 if (simd16_instructions) {
1521 /* We have to do a compaction pass now, or the one at the end of
1522 * execution will squash down where our prog_offset start needs
1523 * to be.
1524 */
1525 brw_compact_instructions(p);
1526
1527 /* align to 64 byte boundary. */
1528 while ((p->nr_insn * sizeof(struct brw_instruction)) % 64) {
1529 brw_NOP(p);
1530 }
1531
1532 /* Save off the start of this 16-wide program */
1533 c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction);
1534
1535 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1536
1537 dispatch_width = 16;
1538 generate_code(simd16_instructions);
1539 }
1540
1541 return brw_get_program(p, assembly_size);
1542 }