i965/fs: Remove force_writemask_all assertion for execsize < 8.
mesa.git: src/mesa/drivers/dri/i965/brw_fs_generator.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

extern "C" {
#include "main/macros.h"
#include "brw_context.h"
#include "brw_eu.h"
} /* extern "C" */

#include "brw_fs.h"
#include "brw_cfg.h"

static uint32_t brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case GRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   default:
      unreachable("not reached");
   }
}

static struct brw_reg
brw_reg_from_fs_reg(fs_reg *reg)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case GRF:
   case MRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
      } else if (reg->width < 8) {
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, reg->width * reg->stride,
                          reg->width, reg->stride);
      } else {
         /* From the Haswell PRM:
          *
          *     VertStride must be used to cross GRF register boundaries. This
          *     rule implies that elements within a 'Width' cannot cross GRF
          *     boundaries.
          *
          * So, for registers with width > 8, we have to use a width of 8
          * and trust the compression state to sort out the exec size.
          */
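         /* For example, a packed float (stride 1) yields the region
          * <8;8,1>:F here; in SIMD16 the compression state extends that to
          * cover all 16 channels.
          */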
         brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
         brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->subreg_offset);
      break;
   case IMM:
      switch (reg->type) {
      case BRW_REGISTER_TYPE_F:
         brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
         break;
      case BRW_REGISTER_TYPE_D:
         brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UD:
         brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_W:
         brw_reg = brw_imm_w(reg->fixed_hw_reg.dw1.d);
         break;
      case BRW_REGISTER_TYPE_UW:
         brw_reg = brw_imm_uw(reg->fixed_hw_reg.dw1.ud);
         break;
      case BRW_REGISTER_TYPE_VF:
         brw_reg = brw_imm_vf(reg->fixed_hw_reg.dw1.ud);
         break;
      default:
         unreachable("not reached");
      }
      break;
   case HW_REG:
      assert(reg->type == reg->fixed_hw_reg.type);
      brw_reg = reg->fixed_hw_reg;
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   default:
      unreachable("not reached");
   }
   if (reg->abs)
      brw_reg = brw_abs(brw_reg);
   if (reg->negate)
      brw_reg = negate(brw_reg);

   return brw_reg;
}

fs_generator::fs_generator(struct brw_context *brw,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           struct gl_program *prog,
                           bool runtime_check_aads_emit,
                           const char *stage_abbrev)

   : brw(brw), key(key),
     prog_data(prog_data),
     prog(prog), runtime_check_aads_emit(runtime_check_aads_emit),
     debug_flag(false), stage_abbrev(stage_abbrev), mem_ctx(mem_ctx)
{
   ctx = &brw->ctx;

   p = rzalloc(mem_ctx, struct brw_compile);
   brw_init_compile(brw, p, mem_ctx);
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (brw->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

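   /* brw_jump_scale() gives the per-generation conversion factor from whole
    * instructions to the units that JIP/UIP are counted in.
    */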
   int scale = brw_jump_scale(brw);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(brw, last_halt, 1 * scale);
   brw_inst_set_jip(brw, last_halt, 1 * scale);
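   /* A UIP/JIP of one instruction makes this final HALT fall straight
    * through; it exists only so that every channel ends up halting to the
    * same UIP, per the requirement above.
    */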

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(brw, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(brw, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   if (brw->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
   else if (dispatch_width == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   brw_fb_WRITE(p,
                dispatch_width,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                inst->header_present);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (brw->gen < 8 && !brw->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* Header is 2 regs, g0 and g1 are the contents.  g0 is filled in by the
    * hardware's implied move; we set up g1 here.
    */
   if (inst->header_present) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (brw->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (brw->gen >= 6) {
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(brw->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(brw, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(brw, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(brw, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   brw_inst_set_mlen(brw, insn, inst->mlen);
   brw_inst_set_rlen(brw, insn, 0);
   brw_inst_set_eot(brw, insn, inst->eot);
   brw_inst_set_header_present(brw, insn, true);
   brw_inst_set_urb_global_offset(brw, insn, inst->offset);
}

void
fs_generator::generate_blorp_fb_write(fs_inst *inst)
{
   brw_fb_WRITE(p,
                16 /* dispatch_width */,
                brw_message_reg(inst->base_mrf),
                brw_reg_from_fs_reg(&inst->src[0]),
                BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
                inst->target,
                inst->mlen,
                0,
                true,
                inst->header_present);
}

/* Computes the integer pixel x,y values from the origin.
 *
 * This is the basis of gl_FragCoord computation, but is also used
 * pre-gen6 for computing the deltas from v0 for computing
 * interpolation.
 */
void
fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
{
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
   struct brw_reg src;
   struct brw_reg deltas;

   if (is_x) {
      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
      deltas = brw_imm_v(0x10101010);
   } else {
      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
      deltas = brw_imm_v(0x11001100);
   }

   if (dispatch_width == 16) {
      dst = vec16(dst);
   }

   /* We do this SIMD8 or SIMD16, but since the destination is UW we
    * don't do compression in the SIMD16 case.
    */
   brw_push_insn_state(p);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_ADD(p, dst, src, deltas);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = src[1];
   struct brw_reg interp = src[2];

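   /* PLN reads the X/Y deltas from a pair of adjacent registers, and before
    * Gen6 that pair must also start on an even register number; otherwise we
    * fall back to the equivalent LINE + MAC sequence.
    */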
   if (brw->has_pln &&
       delta_y.nr == delta_x.nr + 1 &&
       (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

void
fs_generator::generate_math_gen6(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   int op = brw_math_function(inst->opcode);
   bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;

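   /* Gen6 math is an ordinary ALU instruction, but it cannot be run as a
    * compressed instruction, so a SIMD16 math operation is issued as two
    * SIMD8 halves.
    */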
   if (dispatch_width == 8) {
      gen6_math(p, dst, op, src0, src1);
   } else if (dispatch_width == 16) {
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      gen6_math(p, firsthalf(dst), op, firsthalf(src0), firsthalf(src1));
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      gen6_math(p, sechalf(dst), op, sechalf(src0),
                binop ? sechalf(src1) : brw_null_reg());
      brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}

void
fs_generator::generate_math_gen4(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg src)
{
   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   if (dispatch_width == 8) {
      gen4_math(p, dst,
                op,
                inst->base_mrf, src,
                BRW_MATH_PRECISION_FULL);
   } else if (dispatch_width == 16) {
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      gen4_math(p, firsthalf(dst),
                op,
                inst->base_mrf, firsthalf(src),
                BRW_MATH_PRECISION_FULL);
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      gen4_math(p, sechalf(dst),
                op,
                inst->base_mrf + 1, sechalf(src),
                BRW_MATH_PRECISION_FULL);

      brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }
}

void
fs_generator::generate_math_g45(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg src)
{
   if (inst->opcode == SHADER_OPCODE_POW ||
       inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
       inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
      generate_math_gen4(inst, dst, src);
      return;
   }

   int op = brw_math_function(inst->opcode);

   assert(inst->mlen >= 1);

   gen4_math(p, dst,
             op,
             inst->base_mrf, src,
             BRW_MATH_PRECISION_FULL);
}

void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg sampler_index)
{
   int msg_type = -1;
   int rlen = 4;
   uint32_t simd_mode;
   uint32_t return_format;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (brw->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(brw->gen >= 8 || brw->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (brw->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(brw->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(brw->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(brw->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(brw->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determine shadow compare and dispatch
          * width from message length for most messages.
          */
         assert(dispatch_width == 8);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
         } else {
            assert(inst->mlen <= 4);
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen == 9);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
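      /* In SIMD16 each of the four result channels occupies two registers,
       * so the response length doubles.
       */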
      rlen = 8;
      dst = vec16(dst);
   }

   assert(brw->gen < 7 || !inst->header_present ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_present) {
      if (brw->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (brw->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                    brw_imm_ud(inst->offset));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
                                        inst->opcode == SHADER_OPCODE_TG4_OFFSET)
                                       ? prog_data->binding_table.gather_texture_start
                                       : prog_data->binding_table.texture_start;

   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t sampler = sampler_index.dw1.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 sampler + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 rlen,
                 inst->mlen,
                 inst->header_present,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, sampler + base_binding_table_index);
   } else {
      /* Non-const sampler index */
      /* Note: this clobbers `dst` as a temporary before emitting the send */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));

      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* Some care required: `sampler` and `temp` may alias:
       *    addr = sampler & 0xff
       *    temp = (sampler << 8) & 0xf00
       *    addr = addr | temp
       */
      brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
      brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
      brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
      brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
      brw_OR(p, addr, addr, temp);

      /* a0.0 |= <descriptor> */
      brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
      brw_set_sampler_message(p, insn_or,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              rlen,
                              inst->mlen /* mlen */,
                              inst->header_present /* header */,
                              simd_mode,
                              return_format);
      brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
      brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
      brw_set_src0(p, insn_or, addr);
      brw_set_dest(p, insn_or, addr);

      /* dst = send(offset, a0.0) */
      brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn_send, dst);
      brw_set_src0(p, insn_send, src);
      brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);

      brw_pop_insn_state(p);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *            DDX                    DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
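   /* src0 is offset one float from src1; with horizontal stride 0 each
    * element is replicated across its width group, so the single ADD below
    * produces (right pixel - left pixel) for every channel.
    */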
   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src,
                           bool negate_value)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
       * Region Restrictions):
       *
       *     In Align16 access mode, SIMD16 is not allowed for DW operations
       *     and SIMD8 is not allowed for DF operations.
       *
       * In this context, "DW operations" means "operations acting on 32-bit
       * values", so it includes operations on floats.
       *
       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
       * (Instruction Compression -> Rules and Restrictions):
       *
       *     A compressed instruction must be in Align1 access mode. Align16
       *     mode instructions cannot be compressed.
       *
       * Similar text exists in the g45 PRM.
       *
       * On these platforms, if we're building a SIMD16 shader, we need to
       * manually unroll to a pair of SIMD8 instructions.
       */
      bool unroll_to_simd8 =
         (dispatch_width == 16 &&
          (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));

      /* produce accurate derivatives */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      if (unroll_to_simd8) {
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         if (negate_value) {
            brw_ADD(p, firsthalf(dst), firsthalf(src1), negate(firsthalf(src0)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src1), negate(sechalf(src0)));
         } else {
            brw_ADD(p, firsthalf(dst), firsthalf(src0), negate(firsthalf(src1)));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_ADD(p, sechalf(dst), sechalf(src0), negate(sechalf(src1)));
         }
      } else {
         if (negate_value)
            brw_ADD(p, dst, src1, negate(src0));
         else
            brw_ADD(p, dst, src0, negate(src1));
      }
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      if (negate_value)
         brw_ADD(p, dst, src1, negate(src0));
      else
         brw_ADD(p, dst, src0, negate(src1));
   }
}

void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(brw->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   gen6_HALT(p);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
{
   assert(inst->mlen != 0);

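   /* The payload is the scratch-write header in m<base_mrf> (filled in by
    * brw_oword_block_write_scratch() from g0), followed by the data to be
    * written in m<base_mrf + 1>.
    */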
   brw_MOV(p,
           brw_uvec_mrf(inst->exec_size, (inst->base_mrf + 1), 0),
           retype(src, BRW_REGISTER_TYPE_UD));
   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
                                 inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->mlen != 0);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
                                inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.dw1.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);

   brw_mark_surface_used(prog_data, surf_index);
}

void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(inst->mlen == 0);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;

   struct brw_reg src = offset;
   bool header_present = false;
   int mlen = 1;

   if (brw->gen >= 9) {
      /* Skylake requires a message header in order to use SIMD4x2 mode. */
      src = retype(brw_vec4_grf(offset.nr - 1, 0), BRW_REGISTER_TYPE_UD);
      mlen = 2;
      header_present = true;

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_MOV(p, src, retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      brw_MOV(p, get_element_ud(src, 2),
              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
      brw_pop_insn_state(p);
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.dw1.ud;

      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_pop_insn_state(p);

      brw_set_dest(p, send, dst);
      brw_set_src0(p, send, src);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* a0.0 |= <descriptor> */
      brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
      brw_set_sampler_message(p, insn_or,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1 /* rlen */,
                              mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);
      brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
      brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
      brw_set_src0(p, insn_or, addr);
      brw_set_dest(p, insn_or, addr);

      /* dst = send(offset, a0.0) */
      brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn_send, dst);
      brw_set_src0(p, insn_send, src);
      brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);

      brw_pop_insn_state(p);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */

   }
}

void
fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(brw->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_present);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.dw1.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (dispatch_width == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (brw->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->regs_written == 8);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
                                      BRW_REGISTER_TYPE_D);
   brw_MOV(p, offset_mrf, offset);

   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_qtr_control(brw, send, BRW_COMPRESSION_NONE);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (brw->gen < 6)
      brw_inst_set_base_mrf(brw, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_present,
                           simd_mode,
                           return_format);

   brw_mark_surface_used(prog_data, surf_index);
}

void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(brw->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(!inst->header_present);
   assert(!inst->mlen);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   uint32_t simd_mode, rlen, mlen;
   if (dispatch_width == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.dw1.ud;

      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, send, offset);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen,
                              mlen,
                              false, /* no header */
                              simd_mode,
                              0);

      brw_mark_surface_used(prog_data, surf_index);

   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* a0.0 |= <descriptor> */
      brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
      brw_set_sampler_message(p, insn_or,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen /* rlen */,
                              mlen /* mlen */,
                              false /* header */,
                              simd_mode,
                              0);
      brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
      brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
      brw_set_src0(p, insn_or, addr);
      brw_set_dest(p, insn_or, addr);

      /* dst = send(offset, a0.0) */
      brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, insn_send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, insn_send, offset);
      brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);

      brw_pop_insn_state(p);

      /* visitor knows more than we do about the surface limit required,
       * so has already done marking.
       */
   }
}

/**
 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 * into the flags register (f0.0).
 *
 * Used only on Gen6 and above.
 */
void
fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
{
   struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
   struct brw_reg dispatch_mask;

   if (brw->gen >= 6)
      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   else
      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, flags, dispatch_mask);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                struct brw_reg dst,
                                                struct brw_reg src,
                                                struct brw_reg msg_data,
                                                unsigned msg_type)
{
   assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
          msg_data.type == BRW_REGISTER_TYPE_UD);

   brw_pixel_interpolator_query(p,
                                retype(dst, BRW_REGISTER_TYPE_UW),
                                src,
                                inst->pi_noperspective,
                                msg_type,
                                msg_data.dw1.ud,
                                inst->mlen,
                                inst->regs_written);
}


/**
 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
 * sampler LD messages.
 *
 * We don't want to bake it into the send message's code generation because
 * that means we don't get a chance to schedule the instructions.
 */
void
fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
                                          struct brw_reg dst,
                                          struct brw_reg value)
{
   assert(value.file == BRW_IMMEDIATE_VALUE);

   brw_push_insn_state(p);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
   brw_pop_insn_state(p);
}

/* Sets vstride=16, width=8, hstride=2 or vstride=0, width=1, hstride=0
 * (when mask is passed as a uniform) of register mask before moving it
 * to register dst.
 */
void
fs_generator::generate_set_omask(fs_inst *inst,
                                 struct brw_reg dst,
                                 struct brw_reg mask)
{
   bool stride_8_8_1 =
      (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
       mask.width == BRW_WIDTH_8 &&
       mask.hstride == BRW_HORIZONTAL_STRIDE_1);

   bool stride_0_1_0 = has_scalar_region(mask);

   assert(stride_8_8_1 || stride_0_1_0);
   assert(dst.type == BRW_REGISTER_TYPE_UW);

   if (dispatch_width == 16)
      dst = vec16(dst);
   brw_push_insn_state(p);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);

   if (stride_8_8_1) {
      brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
   } else if (stride_0_1_0) {
      brw_MOV(p, dst, retype(mask, dst.type));
   }
   brw_pop_insn_state(p);
}

/* Sets vstride=1, width=4, hstride=0 of register src1 during
 * the ADD instruction.
 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
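   /* The <1;4,0> region above replicates each 16-bit sample id across one
    * 4-pixel subspan, so a SIMD8 ADD reads {s0,s0,s0,s0, s1,s1,s1,s1}; the
    * second SIMD8 half starts two elements further in.
    */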
   if (dispatch_width == 8) {
      brw_ADD(p, dst, src0, reg);
   } else if (dispatch_width == 16) {
      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
   }
   brw_pop_insn_state(p);
}

/**
 * Change the register's data type from UD to W, doubling the strides in order
 * to compensate for halving the data type width.
 */
static struct brw_reg
ud_reg_to_w(struct brw_reg r)
{
   assert(r.type == BRW_REGISTER_TYPE_UD);
   r.type = BRW_REGISTER_TYPE_W;

   /* The BRW_*_STRIDE enums are defined so that incrementing the field
    * doubles the real stride.
    */
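   /* For example, a <8;8,1>:UD region becomes <16;8,2>:W, so each W element
    * still starts at the dword offset its UD counterpart occupied.
    */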
   if (r.hstride != 0)
      ++r.hstride;
   if (r.vstride != 0)
      ++r.vstride;

   return r;
}

void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   assert(brw->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *     Because this instruction does not have a 16-bit floating-point type,
    *     the destination data type must be Word (W).
    *
    *     The destination must be DWord-aligned and specify a horizontal stride
    *     (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *     each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = ud_reg_to_w(dst);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}

void
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
                                              struct brw_reg dst,
                                              struct brw_reg src)
{
   assert(brw->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *     Because this instruction does not have a 16-bit floating-point type,
    *     the source data type must be Word (W). The destination type must be
    *     F (Float).
    */
   struct brw_reg src_w = ud_reg_to_w(src);

   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
    * For the Y case, we wish to access only the upper word; therefore
    * a 16-bit subregister offset is needed.
    */
   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
      src_w.subnr += 2;

   brw_F16TO32(p, dst, src_w);
}

void
fs_generator::generate_shader_time_add(fs_inst *inst,
                                       struct brw_reg payload,
                                       struct brw_reg offset,
                                       struct brw_reg value)
{
   assert(brw->gen >= 7);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, true);

   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
                                          offset.type);
   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
                                         value.type);

   assert(offset.file == BRW_IMMEDIATE_VALUE);
   if (value.file == BRW_GENERAL_REGISTER_FILE) {
      value.width = BRW_WIDTH_1;
      value.hstride = BRW_HORIZONTAL_STRIDE_0;
      value.vstride = BRW_VERTICAL_STRIDE_0;
   } else {
      assert(value.file == BRW_IMMEDIATE_VALUE);
   }

   /* Trying to deal with setup of the params from the IR is crazy in the FS8
    * case, and we don't really care about squeezing every bit of performance
    * out of this path, so we just emit the MOVs from here.
    */
   brw_MOV(p, payload_offset, offset);
   brw_MOV(p, payload_value, value);
   brw_shader_time_add(p, payload,
                       prog_data->binding_table.shader_time_start);
   brw_pop_insn_state(p);

   brw_mark_surface_used(prog_data,
                         prog_data->binding_table.shader_time_start);
}

void
fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst,
                                      struct brw_reg payload,
                                      struct brw_reg atomic_op,
                                      struct brw_reg surf_index)
{
   assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
          atomic_op.type == BRW_REGISTER_TYPE_UD &&
          surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

   brw_untyped_atomic(p, dst, payload, atomic_op.dw1.ud, surf_index.dw1.ud,
                      inst->mlen, inst->exec_size / 8);

   brw_mark_surface_used(prog_data, surf_index.dw1.ud);
}

void
fs_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst,
                                            struct brw_reg payload,
                                            struct brw_reg surf_index)
{
   assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
          surf_index.type == BRW_REGISTER_TYPE_UD);

   brw_untyped_surface_read(p, dst, payload,
                            surf_index.dw1.ud,
                            inst->mlen, inst->exec_size / 8);

   brw_mark_surface_used(prog_data, surf_index.dw1.ud);
}

void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
{
   /* align to 64 byte boundary. */
   while (p->next_insn_offset % 64)
      brw_NOP(p);

   this->dispatch_width = dispatch_width;
   if (dispatch_width == 16)
      brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

   int start_offset = p->next_insn_offset;
   int loop_count = 0;

   struct annotation_info annotation;
   memset(&annotation, 0, sizeof(annotation));

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      struct brw_reg src[3], dst;
      unsigned int last_insn_offset = p->next_insn_offset;
      bool multiple_instructions_emitted = false;

      if (unlikely(debug_flag))
         annotate(brw, &annotation, cfg, inst, p->next_insn_offset);

      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(&inst->src[i]);

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(&inst->dst);

      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);

      switch (inst->exec_size) {
      case 1:
      case 2:
      case 4:
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         break;
      case 8:
         if (inst->force_sechalf) {
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
         } else {
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         }
         break;
      case 16:
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         break;
      default:
         unreachable("Invalid instruction width");
      }

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_AVG:
         brw_AVG(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_MACH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_LINE:
         brw_LINE(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         assert(brw->gen >= 6);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_inst *f = brw_MAD(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_inst *s = brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

            if (inst->conditional_mod) {
               brw_inst_set_cond_modifier(brw, f, inst->conditional_mod);
               brw_inst_set_cond_modifier(brw, s, inst->conditional_mod);
               multiple_instructions_emitted = true;
            }
         } else {
            brw_MAD(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_LRP:
         assert(brw->gen >= 6);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_inst *f = brw_LRP(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_inst *s = brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

            if (inst->conditional_mod) {
               brw_inst_set_cond_modifier(brw, f, inst->conditional_mod);
               brw_inst_set_cond_modifier(brw, s, inst->conditional_mod);
               multiple_instructions_emitted = true;
            }
         } else {
            brw_LRP(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         assert(brw->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(brw->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround
          * says that when the destination is a GRF, the dependency-clear bit
          * on the flag register is cleared early.
1745 *
1746 * Suggested workarounds are to disable coissuing CMP instructions
1747 * or to split CMP(16) instructions into two CMP(8) instructions.
1748 *
1749 * We choose to split into CMP(8) instructions since disabling
1750 * coissuing would affect CMP instructions not otherwise affected by
1751 * the errata.
1752 */
         if (dispatch_width == 16 && brw->gen == 7 && !brw->is_haswell) {
            if (dst.file == BRW_GENERAL_REGISTER_FILE) {
               brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
               brw_CMP(p, firsthalf(dst), inst->conditional_mod,
                       firsthalf(src[0]), firsthalf(src[1]));
               brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
               brw_CMP(p, sechalf(dst), inst->conditional_mod,
                       sechalf(src[0]), sechalf(src[1]));
               brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);

               multiple_instructions_emitted = true;
            } else if (dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
               /* For unknown reasons, the aforementioned workaround is not
                * sufficient. Overriding the type when the destination is the
                * null register is necessary but not sufficient by itself.
                */
               assert(dst.nr == BRW_ARF_NULL);
               dst.type = BRW_REGISTER_TYPE_D;
               brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
            } else {
               unreachable("not reached");
            }
         } else {
            brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         }
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFREV:
         assert(brw->gen >= 7);
         /* BFREV only supports UD type for src and dst. */
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(brw->gen >= 7);
         /* FBH only supports UD type for dst. */
         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(brw->gen >= 7);
         /* FBL only supports UD type for dst. */
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(brw->gen >= 7);
         /* CBIT only supports UD type for dst. */
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_ADDC:
         assert(brw->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(brw->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(brw->gen >= 7);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFE(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFE(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_BFI1:
         assert(brw->gen >= 7);
         /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
          * should
          *
          *    "Force BFI instructions to be executed always in SIMD8."
          */
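         /* Note that BRW_COMPRESSION_NONE followed by
          * BRW_COMPRESSION_2NDHALF emits two uncompressed SIMD8
          * instructions acting on the low and high eight channels
          * respectively, with firsthalf()/sechalf() selecting the matching
          * register halves.
          */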
         if (dispatch_width == 16 && brw->is_haswell) {
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFI1(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFI1(p, dst, src[0], src[1]);
         }
         break;
      case BRW_OPCODE_BFI2:
         assert(brw->gen >= 7);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
          * should
          *
          *    "Force BFI instructions to be executed always in SIMD8."
          *
          * Otherwise we would be able to emit compressed instructions like we
          * do for the other three-source instructions.
          */
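         /* Three-source instructions such as BFI2 use the align16 encoding
          * on these generations, which is why the access mode is switched
          * around the emit and restored to align1 afterward.
          */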
         if (dispatch_width == 16 && brw->gen < 8) {
            brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
            brw_BFI2(p, firsthalf(dst), firsthalf(src[0]), firsthalf(src[1]), firsthalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
            brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
            brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         } else {
            brw_BFI2(p, dst, src[0], src[1], src[2]);
         }
         brw_set_default_access_mode(p, BRW_ALIGN_1);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
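            /* Embedding the compare lets gen6 fold what would otherwise be
             * a separate CMP into the IF itself; on other generations the
             * IF is typically predicated on a flag value written earlier.
             */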
            assert(brw->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, BRW_EXECUTE_8);
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(brw->gen < 6 || inst->mlen == 0);
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
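         /* On gen6+ math is an ordinary execution-unit instruction, so no
          * message payload is expected (hence mlen == 0 above); gen4/5
          * instead send a message to the shared math function, which the
          * generation-specific helpers below take care of.
          */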
         if (brw->gen >= 7) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
                      brw_null_reg());
         } else if (brw->gen == 6) {
            generate_math_gen6(inst, dst, src[0], brw_null_reg());
         } else if (brw->gen == 5 || brw->is_g4x) {
            generate_math_g45(inst, dst, src[0]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         assert(brw->gen < 6 || inst->mlen == 0);
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (brw->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) {
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else if (brw->gen >= 6) {
            generate_math_gen6(inst, dst, src[0], src[1]);
         } else {
            generate_math_gen4(inst, dst, src[0]);
         }
         break;
      case FS_OPCODE_PIXEL_X:
         generate_pixel_xy(dst, true);
         break;
      case FS_OPCODE_PIXEL_Y:
         generate_pixel_xy(dst, false);
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_UMS:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
         generate_tex(inst, dst, src[0], src[1]);
         break;
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst->opcode, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         assert(src[1].file == BRW_IMMEDIATE_VALUE);
         generate_ddy(inst->opcode, dst, src[0], src[1].dw1.ud);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(inst, src[0]);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(inst, dst);
         break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
         generate_scratch_read_gen7(inst, dst);
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
         generate_urb_write(inst, src[0]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
         generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst, src[0]);
         break;

      case FS_OPCODE_BLORP_FB_WRITE:
         generate_blorp_fb_write(inst);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
         generate_untyped_atomic(inst, dst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         generate_untyped_surface_read(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_SET_SIMD4X2_OFFSET:
         generate_set_simd4x2_offset(inst, dst, src[0]);
         break;

      case FS_OPCODE_SET_OMASK:
         generate_set_omask(inst, dst, src[0]);
         break;

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards. If not, this will emit no code.
          */
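         /* If no HALT is emitted, the annotation allocated for this IR
          * instruction presumably has to be dropped so the annotation
          * count stays in sync with the instructions actually generated.
          */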
         if (!patch_discard_jumps_to_fb_writes()) {
            if (unlikely(debug_flag)) {
               annotation.ann_count--;
            }
         }
         break;

      case FS_OPCODE_INTERPOLATE_AT_CENTROID:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         break;

      default:
         if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
            _mesa_problem(ctx, "Unsupported opcode `%s' in %s",
                          opcode_descs[inst->opcode].name, stage_abbrev);
         } else {
            _mesa_problem(ctx, "Unsupported opcode %d in %s", inst->opcode,
                          stage_abbrev);
         }
         abort();

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      if (multiple_instructions_emitted)
         continue;

      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

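         /* Each (uncompacted) native instruction is 16 bytes, so the
          * assertion above verifies that exactly one instruction was
          * emitted, and the division below locates it in the store.
          */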
         brw_inst *last = &p->store[last_insn_offset / 16];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(brw, last, inst->no_dd_check);
      }
   }

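   /* All instructions are now at their final offsets, so the JIP/UIP
    * jump targets of the flow-control instructions can be patched.
    */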
   brw_set_uip_jip(p);
   annotation_finalize(&annotation, p->next_insn_offset);

   int before_size = p->next_insn_offset - start_offset;
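   /* Compaction rewrites eligible instructions into the 8-byte compacted
    * format (half the normal 16 bytes). For example, a 1600-byte program
    * in which half of the 100 instructions compact would shrink to
    * roughly 1200 bytes.
    */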
   brw_compact_instructions(p, start_offset, annotation.ann_count,
                            annotation.ann);
   int after_size = p->next_insn_offset - start_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s\n"
              "SIMD%d shader: %d instructions. %d loops. Compacted %d to %d"
              " bytes (%.0f%%)\n",
              shader_name,
              dispatch_width, before_size / 16, loop_count, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog);
      ralloc_free(annotation.ann);
   }

   static GLuint msg_id = 0;
   _mesa_gl_debug(&brw->ctx, &msg_id,
                  MESA_DEBUG_SOURCE_SHADER_COMPILER,
                  MESA_DEBUG_TYPE_OTHER,
                  MESA_DEBUG_SEVERITY_NOTIFICATION,
                  "%s SIMD%d shader: %d inst, %d loops, "
                  "compacted %d to %d bytes.\n",
                  stage_abbrev, dispatch_width, before_size / 16, loop_count,
                  before_size, after_size);

   return start_offset;
}

const unsigned *
fs_generator::get_assembly(unsigned int *assembly_size)
{
   return brw_get_program(p, assembly_size);
}