i965/fs: Don't store gl_fragment_program* in fs_generator
[mesa.git] src/mesa/drivers/dri/i965/brw_fs_generator.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_generator.cpp
25 *
26 * This file supports generating code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 extern "C" {
31 #include "main/macros.h"
32 #include "brw_context.h"
33 #include "brw_eu.h"
34 } /* extern "C" */
35
36 #include "brw_fs.h"
37 #include "brw_cfg.h"
38
39 fs_generator::fs_generator(struct brw_context *brw,
40 void *mem_ctx,
41 const struct brw_wm_prog_key *key,
42 struct brw_wm_prog_data *prog_data,
43 struct gl_shader_program *shader_prog,
44 struct gl_fragment_program *fp,
45 bool runtime_check_aads_emit,
46 bool debug_flag)
47
48 : brw(brw), stage(MESA_SHADER_FRAGMENT), key(key),
49 prog_data(prog_data), shader_prog(shader_prog), prog(&fp->Base),
50 runtime_check_aads_emit(runtime_check_aads_emit),
51 debug_flag(debug_flag), mem_ctx(mem_ctx)
52 {
53 ctx = &brw->ctx;
54
55 p = rzalloc(mem_ctx, struct brw_compile);
56 brw_init_compile(brw, p, mem_ctx);
57 }
58
59 fs_generator::~fs_generator()
60 {
61 }
62
63 bool
64 fs_generator::patch_discard_jumps_to_fb_writes()
65 {
66 if (brw->gen < 6 || this->discard_halt_patches.is_empty())
67 return false;
68
69 int scale = brw_jump_scale(brw);
70
71 /* There is a somewhat strange undocumented requirement of using
72 * HALT, according to the simulator. If some channel has HALTed to
73 * a particular UIP, then by the end of the program, every channel
74 * must have HALTed to that UIP. Furthermore, the tracking is a
75 * stack, so you can't do the final halt of a UIP after starting
76 * halting to a new UIP.
77 *
78 * Symptoms of not emitting this instruction on actual hardware
79 * included GPU hangs and sparkly rendering on the piglit discard
80 * tests.
81 */
82 brw_inst *last_halt = gen6_HALT(p);
83 brw_inst_set_uip(brw, last_halt, 1 * scale);
84 brw_inst_set_jip(brw, last_halt, 1 * scale);
85
86 int ip = p->nr_insn;
87
88 foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
89 brw_inst *patch = &p->store[patch_ip->ip];
90
91 assert(brw_inst_opcode(brw, patch) == BRW_OPCODE_HALT);
92 /* HALT takes a half-instruction distance from the pre-incremented IP. */
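      /* Worked example (an illustrative reading of this code, not from the
       * original source): if a discard HALT sits at instruction index 10 and
       * the final HALT above was emitted as instruction 20, then ip == 21
       * here and the patched UIP becomes (21 - 10) * scale.  Both that HALT
       * and the final one (whose UIP is 1 * scale) then appear to resolve to
       * instruction 21, the first instruction past the final HALT, so every
       * channel halts to the same UIP as the comment above requires.
       */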
93 brw_inst_set_uip(brw, patch, (ip - patch_ip->ip) * scale);
94 }
95
96 this->discard_halt_patches.make_empty();
97 return true;
98 }
99
100 void
101 fs_generator::fire_fb_write(fs_inst *inst,
102 GLuint base_reg,
103 struct brw_reg implied_header,
104 GLuint nr)
105 {
106 uint32_t msg_control;
107
108 if (brw->gen < 6) {
109 brw_push_insn_state(p);
110 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
111 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
112 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
113 brw_MOV(p,
114 brw_message_reg(base_reg + 1),
115 brw_vec8_grf(1, 0));
116 brw_pop_insn_state(p);
117 }
118
119 if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
120 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
121 else if (prog_data->dual_src_blend)
122 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
123 else if (dispatch_width == 16)
124 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
125 else
126 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
127
128 uint32_t surf_index =
129 prog_data->binding_table.render_target_start + inst->target;
130
131 brw_fb_WRITE(p,
132 dispatch_width,
133 base_reg,
134 implied_header,
135 msg_control,
136 surf_index,
137 nr,
138 0,
139 inst->eot,
140 inst->header_present);
141
142 brw_mark_surface_used(&prog_data->base, surf_index);
143 }
144
145 void
146 fs_generator::generate_fb_write(fs_inst *inst)
147 {
148 assert(stage == MESA_SHADER_FRAGMENT);
149 gl_fragment_program *fp = (gl_fragment_program *) prog;
150 struct brw_reg implied_header;
151
152    /* The header is 2 regs; g0 and g1 are the contents.  g0 is handled by
153     * the implied move, so g1 is set up here.
154 */
155 if (inst->header_present) {
156 brw_push_insn_state(p);
157 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
158 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
159 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
160 brw_set_default_flag_reg(p, 0, 0);
161
162 /* On HSW, the GPU will use the predicate on SENDC, unless the header is
163 * present.
164 */
165 if (prog_data->uses_kill || key->alpha_test_func) {
166 struct brw_reg pixel_mask;
167
168 if (brw->gen >= 6)
169 pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
170 else
171 pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
172
173 brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
174 }
175
176 if (brw->gen >= 6) {
177 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
178 brw_MOV(p,
179 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
180 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
181 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
182
183 if (inst->target > 0 && key->replicate_alpha) {
184 /* Set "Source0 Alpha Present to RenderTarget" bit in message
185 * header.
186 */
187 brw_OR(p,
188 vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
189 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
190 brw_imm_ud(0x1 << 11));
191 }
192
193 if (inst->target > 0) {
194 /* Set the render target index for choosing BLEND_STATE. */
195 brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
196 inst->base_mrf, 2),
197 BRW_REGISTER_TYPE_UD),
198 brw_imm_ud(inst->target));
199 }
200
201 implied_header = brw_null_reg();
202 } else {
203 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
204 }
205
206 brw_pop_insn_state(p);
207 } else {
208 implied_header = brw_null_reg();
209 }
210
211 if (!runtime_check_aads_emit) {
212 fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen);
213 } else {
214 /* This can only happen in gen < 6 */
215 assert(brw->gen < 6);
216
217 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
218
219 /* Check runtime bit to detect if we have to send AA data or not */
220 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
221 brw_AND(p,
222 v1_null_ud,
223 retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
224 brw_imm_ud(1<<26));
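      /* Illustrative note, inferred from this code rather than from any
       * documentation: bit 26 of g1.6 acts as the runtime "AA data present"
       * flag.  When it is set, the predicated JMPI below skips the no-AA
       * write and the full message (base_mrf, mlen) is sent; when it is
       * clear, the shorter message (base_mrf + 1, mlen - 1) without the AA
       * payload register is sent instead.
       */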
225 brw_inst_set_cond_modifier(brw, brw_last_inst, BRW_CONDITIONAL_NZ);
226
227 int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
228 brw_inst_set_exec_size(brw, brw_last_inst, BRW_EXECUTE_1);
229 {
230 /* Don't send AA data */
231 fire_fb_write(inst, inst->base_mrf+1, implied_header, inst->mlen-1);
232 }
233 brw_land_fwd_jump(p, jmp);
234 fire_fb_write(inst, inst->base_mrf, implied_header, inst->mlen);
235 }
236 }
237
238 void
239 fs_generator::generate_blorp_fb_write(fs_inst *inst)
240 {
241 brw_fb_WRITE(p,
242 16 /* dispatch_width */,
243 inst->base_mrf,
244 brw_reg_from_fs_reg(&inst->src[0]),
245 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
246 inst->target,
247 inst->mlen,
248 0,
249 true,
250 inst->header_present);
251 }
252
253 /* Computes the integer pixel x,y values from the origin.
254 *
255  * This is the basis of the gl_FragCoord computation, but it is also
256  * used pre-gen6 to compute the deltas from v0 that are used for
257  * interpolation.
258 */
259 void
260 fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
261 {
262 struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
263 struct brw_reg src;
264 struct brw_reg deltas;
265
266 if (is_x) {
267 src = stride(suboffset(g1_uw, 4), 2, 4, 0);
268 deltas = brw_imm_v(0x10101010);
269 } else {
270 src = stride(suboffset(g1_uw, 5), 2, 4, 0);
271 deltas = brw_imm_v(0x11001100);
272 }
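   /* Worked example of the packed immediates above (an informal decoding;
    * brw_imm_v holds eight signed 4-bit values, lowest nibble first):
    * 0x10101010 unpacks to <0,1,0,1,0,1,0,1> and 0x11001100 unpacks to
    * <0,0,1,1,0,0,1,1>.  Added to the subspan origin, which the <2;4,0>
    * region replicates four times, these yield the per-pixel X offsets
    * (0,1,0,1) and Y offsets (0,0,1,1) of each 2x2 subspan.
    */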
273
274 if (dispatch_width == 16) {
275 dst = vec16(dst);
276 }
277
278 /* We do this SIMD8 or SIMD16, but since the destination is UW we
279 * don't do compression in the SIMD16 case.
280 */
281 brw_push_insn_state(p);
282 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
283 brw_ADD(p, dst, src, deltas);
284 brw_pop_insn_state(p);
285 }
286
287 void
288 fs_generator::generate_linterp(fs_inst *inst,
289 struct brw_reg dst, struct brw_reg *src)
290 {
291 struct brw_reg delta_x = src[0];
292 struct brw_reg delta_y = src[1];
293 struct brw_reg interp = src[2];
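   /* A sketch of what the two paths compute (my summary of the usual EU
    * semantics, not from the original source): PLN evaluates the whole plane
    * equation in one instruction,
    *    dst = interp[0] * delta_x + interp[1] * delta_y + interp[3],
    * but needs delta_x/delta_y in consecutive registers (an even-aligned
    * pair pre-gen6).  The fallback builds the same value in two steps:
    * LINE accumulates interp[0] * delta_x + interp[3], and MAC adds
    * interp[1] * delta_y on top.
    */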
294
295 if (brw->has_pln &&
296 delta_y.nr == delta_x.nr + 1 &&
297 (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
298 brw_PLN(p, dst, interp, delta_x);
299 } else {
300 brw_LINE(p, brw_null_reg(), interp, delta_x);
301 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
302 }
303 }
304
305 void
306 fs_generator::generate_math_gen6(fs_inst *inst,
307 struct brw_reg dst,
308 struct brw_reg src0,
309 struct brw_reg src1)
310 {
311 int op = brw_math_function(inst->opcode);
312 bool binop = src1.file != BRW_ARCHITECTURE_REGISTER_FILE;
313
314 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
315 gen6_math(p, dst, op, src0, src1);
316
317 if (dispatch_width == 16) {
318 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
319 gen6_math(p, sechalf(dst), op, sechalf(src0),
320 binop ? sechalf(src1) : brw_null_reg());
321 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
322 }
323 }
324
325 void
326 fs_generator::generate_math_gen4(fs_inst *inst,
327 struct brw_reg dst,
328 struct brw_reg src)
329 {
330 int op = brw_math_function(inst->opcode);
331
332 assert(inst->mlen >= 1);
333
334 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
335 gen4_math(p, dst,
336 op,
337 inst->base_mrf, src,
338 BRW_MATH_DATA_VECTOR,
339 BRW_MATH_PRECISION_FULL);
340
341 if (dispatch_width == 16) {
342 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
343 gen4_math(p, sechalf(dst),
344 op,
345 inst->base_mrf + 1, sechalf(src),
346 BRW_MATH_DATA_VECTOR,
347 BRW_MATH_PRECISION_FULL);
348
349 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
350 }
351 }
352
353 void
354 fs_generator::generate_math_g45(fs_inst *inst,
355 struct brw_reg dst,
356 struct brw_reg src)
357 {
358 if (inst->opcode == SHADER_OPCODE_POW ||
359 inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
360 inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
361 generate_math_gen4(inst, dst, src);
362 return;
363 }
364
365 int op = brw_math_function(inst->opcode);
366
367 assert(inst->mlen >= 1);
368
369 gen4_math(p, dst,
370 op,
371 inst->base_mrf, src,
372 BRW_MATH_DATA_VECTOR,
373 BRW_MATH_PRECISION_FULL);
374 }
375
376 void
377 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
378 struct brw_reg sampler_index)
379 {
380 int msg_type = -1;
381 int rlen = 4;
382 uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
383 uint32_t return_format;
384
385 switch (dst.type) {
386 case BRW_REGISTER_TYPE_D:
387 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
388 break;
389 case BRW_REGISTER_TYPE_UD:
390 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
391 break;
392 default:
393 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
394 break;
395 }
396
397 if (dispatch_width == 16 &&
398 !inst->force_uncompressed && !inst->force_sechalf)
399 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
400
401 if (brw->gen >= 5) {
402 switch (inst->opcode) {
403 case SHADER_OPCODE_TEX:
404 if (inst->shadow_compare) {
405 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
406 } else {
407 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
408 }
409 break;
410 case FS_OPCODE_TXB:
411 if (inst->shadow_compare) {
412 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
413 } else {
414 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
415 }
416 break;
417 case SHADER_OPCODE_TXL:
418 if (inst->shadow_compare) {
419 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
420 } else {
421 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
422 }
423 break;
424 case SHADER_OPCODE_TXS:
425 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
426 break;
427 case SHADER_OPCODE_TXD:
428 if (inst->shadow_compare) {
429 /* Gen7.5+. Otherwise, lowered by brw_lower_texture_gradients(). */
430 assert(brw->gen >= 8 || brw->is_haswell);
431 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
432 } else {
433 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
434 }
435 break;
436 case SHADER_OPCODE_TXF:
437 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
438 break;
439 case SHADER_OPCODE_TXF_CMS:
440 if (brw->gen >= 7)
441 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
442 else
443 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
444 break;
445 case SHADER_OPCODE_TXF_UMS:
446 assert(brw->gen >= 7);
447 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
448 break;
449 case SHADER_OPCODE_TXF_MCS:
450 assert(brw->gen >= 7);
451 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
452 break;
453 case SHADER_OPCODE_LOD:
454 msg_type = GEN5_SAMPLER_MESSAGE_LOD;
455 break;
456 case SHADER_OPCODE_TG4:
457 if (inst->shadow_compare) {
458 assert(brw->gen >= 7);
459 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
460 } else {
461 assert(brw->gen >= 6);
462 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
463 }
464 break;
465 case SHADER_OPCODE_TG4_OFFSET:
466 assert(brw->gen >= 7);
467 if (inst->shadow_compare) {
468 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
469 } else {
470 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
471 }
472 break;
473 default:
474 unreachable("not reached");
475 }
476 } else {
477 switch (inst->opcode) {
478 case SHADER_OPCODE_TEX:
479          /* Note that G45 and older determine shadow compare and dispatch width
480           * from the message length for most messages.
481 */
482 assert(dispatch_width == 8);
483 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
484 if (inst->shadow_compare) {
485 assert(inst->mlen == 6);
486 } else {
487 assert(inst->mlen <= 4);
488 }
489 break;
490 case FS_OPCODE_TXB:
491 if (inst->shadow_compare) {
492 assert(inst->mlen == 6);
493 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
494 } else {
495 assert(inst->mlen == 9);
496 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
497 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
498 }
499 break;
500 case SHADER_OPCODE_TXL:
501 if (inst->shadow_compare) {
502 assert(inst->mlen == 6);
503 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
504 } else {
505 assert(inst->mlen == 9);
506 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
507 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
508 }
509 break;
510 case SHADER_OPCODE_TXD:
511 /* There is no sample_d_c message; comparisons are done manually */
512 assert(inst->mlen == 7 || inst->mlen == 10);
513 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
514 break;
515 case SHADER_OPCODE_TXF:
516 assert(inst->mlen == 9);
517 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
518 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
519 break;
520 case SHADER_OPCODE_TXS:
521 assert(inst->mlen == 3);
522 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
523 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
524 break;
525 default:
526 unreachable("not reached");
527 }
528 }
529 assert(msg_type != -1);
530
531 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
532 rlen = 8;
533 dst = vec16(dst);
534 }
535
536 if (brw->gen >= 7 && inst->header_present && dispatch_width == 16) {
537 /* The send-from-GRF for SIMD16 texturing with a header has an extra
538 * hardware register allocated to it, which we need to skip over (since
539 * our coordinates in the payload are in the even-numbered registers,
540 * and the header comes right before the first one).
541 */
542 assert(src.file == BRW_GENERAL_REGISTER_FILE);
543 src.nr++;
544 }
545
546 assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
547
548 /* Load the message header if present. If there's a texture offset,
549 * we need to set it up explicitly and load the offset bitfield.
550 * Otherwise, we can use an implied move from g0 to the first message reg.
551 */
552 if (inst->header_present) {
553 if (brw->gen < 6 && !inst->texture_offset) {
554 /* Set up an implied move from g0 to the MRF. */
555 src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
556 } else {
557 struct brw_reg header_reg;
558
559 if (brw->gen >= 7) {
560 header_reg = src;
561 } else {
562 assert(inst->base_mrf != -1);
563 header_reg = brw_message_reg(inst->base_mrf);
564 }
565
566 brw_push_insn_state(p);
567 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
568 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
569 /* Explicitly set up the message header by copying g0 to the MRF. */
570 brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
571
572 if (inst->texture_offset) {
573 /* Set the offset bits in DWord 2. */
574 brw_MOV(p, get_element_ud(header_reg, 2),
575 brw_imm_ud(inst->texture_offset));
576 }
577
578 brw_adjust_sampler_state_pointer(p, header_reg, sampler_index, dst);
579 brw_pop_insn_state(p);
580 }
581 }
582
583 uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
584 inst->opcode == SHADER_OPCODE_TG4_OFFSET)
585 ? prog_data->base.binding_table.gather_texture_start
586 : prog_data->base.binding_table.texture_start;
587
588 if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
589 uint32_t sampler = sampler_index.dw1.ud;
590
591 brw_SAMPLE(p,
592 retype(dst, BRW_REGISTER_TYPE_UW),
593 inst->base_mrf,
594 src,
595 sampler + base_binding_table_index,
596 sampler % 16,
597 msg_type,
598 rlen,
599 inst->mlen,
600 inst->header_present,
601 simd_mode,
602 return_format);
603
604 brw_mark_surface_used(&prog_data->base, sampler + base_binding_table_index);
605 } else {
606 /* Non-const sampler index */
607 /* Note: this clobbers `dst` as a temporary before emitting the send */
608
609 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
610 struct brw_reg temp = vec1(retype(dst, BRW_REGISTER_TYPE_UD));
611
612 struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
613
614 brw_push_insn_state(p);
615 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
616 brw_set_default_access_mode(p, BRW_ALIGN_1);
617
618 /* Some care required: `sampler` and `temp` may alias:
619        *    addr = (sampler + base_binding_table_index) & 0xff
620 * temp = (sampler << 8) & 0xf00
621 * addr = addr | temp
622 */
623 brw_ADD(p, addr, sampler_reg, brw_imm_ud(base_binding_table_index));
624 brw_SHL(p, temp, sampler_reg, brw_imm_ud(8u));
625 brw_AND(p, temp, temp, brw_imm_ud(0x0f00));
626 brw_AND(p, addr, addr, brw_imm_ud(0x0ff));
627 brw_OR(p, addr, addr, temp);
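      /* Worked example of the descriptor math above, with hypothetical
       * values: for sampler_index == 3 and base_binding_table_index == 10,
       *    addr = (3 + 10) & 0xff  = 0x00d   (surface index, bits 7:0)
       *    temp = (3 << 8) & 0xf00 = 0x300   (sampler index, bits 11:8)
       *    addr = 0x00d | 0x300    = 0x30d
       * which is the dynamic half of the sampler message descriptor that the
       * OR below merges with the static fields.
       */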
628
629 /* a0.0 |= <descriptor> */
630 brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
631 brw_set_sampler_message(p, insn_or,
632 0 /* surface */,
633 0 /* sampler */,
634 msg_type,
635 rlen,
636 inst->mlen /* mlen */,
637 inst->header_present /* header */,
638 simd_mode,
639 return_format);
640 brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
641 brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
642 brw_set_src0(p, insn_or, addr);
643 brw_set_dest(p, insn_or, addr);
644
645
646 /* dst = send(offset, a0.0) */
647 brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
648 brw_set_dest(p, insn_send, dst);
649 brw_set_src0(p, insn_send, src);
650 brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
651
652 brw_pop_insn_state(p);
653
654       /* The visitor knows more than we do about the required surface limit,
655        * so it has already done the marking.
656 */
657 }
658 }
659
660
661 /* For FS_OPCODE_DDX and FS_OPCODE_DDY, per channel of output we've got input
662 * looking like:
663 *
664 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
665 *
666 * Ideally, we want to produce:
667 *
668 * DDX DDY
669 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
670 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
671 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
672 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
673 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
674 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
675 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
676 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
677 *
678 * and add another set of two more subspans if in 16-pixel dispatch mode.
679 *
680 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
681 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
682 * pair. But the ideal approximation may impose a huge performance cost on
683  * sample_d.  On at least Haswell, the sample_d instruction performs some
684 * optimizations if the same LOD is used for all pixels in the subspan.
685 *
686 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
687 * appropriate swizzling.
688 */
689 void
690 fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
691 struct brw_reg quality)
692 {
693 unsigned vstride, width;
694 assert(quality.file == BRW_IMMEDIATE_VALUE);
695 assert(quality.type == BRW_REGISTER_TYPE_D);
696
697 int quality_value = quality.dw1.d;
698
699 if (quality_value == BRW_DERIVATIVE_FINE ||
700 (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) {
701 /* produce accurate derivatives */
702 vstride = BRW_VERTICAL_STRIDE_2;
703 width = BRW_WIDTH_2;
704 }
705 else {
706 /* replicate the derivative at the top-left pixel to other pixels */
707 vstride = BRW_VERTICAL_STRIDE_4;
708 width = BRW_WIDTH_4;
709 }
710
711 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
712 BRW_REGISTER_TYPE_F,
713 vstride,
714 width,
715 BRW_HORIZONTAL_STRIDE_0,
716 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
717 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
718 BRW_REGISTER_TYPE_F,
719 vstride,
720 width,
721 BRW_HORIZONTAL_STRIDE_0,
722 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
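   /* Worked example (illustrative): in the fine-derivative case the <2;2,0>
    * regions make src0 read tr,tr,br,br,... (starting at subregister 1) and
    * src1 read tl,tl,bl,bl,..., so the ADD below produces
    * (tr - tl, tr - tl, br - bl, br - bl) per subspan, exactly the DDX
    * column in the comment above.  The coarse case's <4;4,0> regions instead
    * replicate the single top-left difference across all four pixels.
    */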
723 brw_ADD(p, dst, src0, negate(src1));
724 }
725
726 /* The negate_value boolean is used to negate the derivative computation for
727 * FBOs, since they place the origin at the upper left instead of the lower
728 * left.
729 */
730 void
731 fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
732 struct brw_reg quality, bool negate_value)
733 {
734 assert(quality.file == BRW_IMMEDIATE_VALUE);
735 assert(quality.type == BRW_REGISTER_TYPE_D);
736
737 int quality_value = quality.dw1.d;
738
739 if (quality_value == BRW_DERIVATIVE_FINE ||
740 (key->high_quality_derivatives && quality_value != BRW_DERIVATIVE_COARSE)) {
741 /* From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
742 * Region Restrictions):
743 *
744 * In Align16 access mode, SIMD16 is not allowed for DW operations
745 * and SIMD8 is not allowed for DF operations.
746 *
747 * In this context, "DW operations" means "operations acting on 32-bit
748 * values", so it includes operations on floats.
749 *
750 * Gen4 has a similar restriction. From the i965 PRM, section 11.5.3
751 * (Instruction Compression -> Rules and Restrictions):
752 *
753 * A compressed instruction must be in Align1 access mode. Align16
754 * mode instructions cannot be compressed.
755 *
756 * Similar text exists in the g45 PRM.
757 *
758 * On these platforms, if we're building a SIMD16 shader, we need to
759 * manually unroll to a pair of SIMD8 instructions.
760 */
761 bool unroll_to_simd8 =
762 (dispatch_width == 16 &&
763 (brw->gen == 4 || (brw->gen == 7 && !brw->is_haswell)));
764
765 /* produce accurate derivatives */
766 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
767 BRW_REGISTER_TYPE_F,
768 BRW_VERTICAL_STRIDE_4,
769 BRW_WIDTH_4,
770 BRW_HORIZONTAL_STRIDE_1,
771 BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
772 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
773 BRW_REGISTER_TYPE_F,
774 BRW_VERTICAL_STRIDE_4,
775 BRW_WIDTH_4,
776 BRW_HORIZONTAL_STRIDE_1,
777 BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
778 brw_push_insn_state(p);
779 brw_set_default_access_mode(p, BRW_ALIGN_16);
780 if (unroll_to_simd8)
781 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
782 if (negate_value)
783 brw_ADD(p, dst, src1, negate(src0));
784 else
785 brw_ADD(p, dst, src0, negate(src1));
786 if (unroll_to_simd8) {
787 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
788 src0 = sechalf(src0);
789 src1 = sechalf(src1);
790 dst = sechalf(dst);
791 if (negate_value)
792 brw_ADD(p, dst, src1, negate(src0));
793 else
794 brw_ADD(p, dst, src0, negate(src1));
795 }
796 brw_pop_insn_state(p);
797 } else {
798 /* replicate the derivative at the top-left pixel to other pixels */
799 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
800 BRW_REGISTER_TYPE_F,
801 BRW_VERTICAL_STRIDE_4,
802 BRW_WIDTH_4,
803 BRW_HORIZONTAL_STRIDE_0,
804 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
805 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
806 BRW_REGISTER_TYPE_F,
807 BRW_VERTICAL_STRIDE_4,
808 BRW_WIDTH_4,
809 BRW_HORIZONTAL_STRIDE_0,
810 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
811 if (negate_value)
812 brw_ADD(p, dst, src1, negate(src0));
813 else
814 brw_ADD(p, dst, src0, negate(src1));
815 }
816 }
817
818 void
819 fs_generator::generate_discard_jump(fs_inst *inst)
820 {
821 assert(brw->gen >= 6);
822
823 /* This HALT will be patched up at FB write time to point UIP at the end of
824 * the program, and at brw_uip_jip() JIP will be set to the end of the
825 * current block (or the program).
826 */
827 this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
828
829 brw_push_insn_state(p);
830 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
831 gen6_HALT(p);
832 brw_pop_insn_state(p);
833 }
834
835 void
836 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
837 {
838 assert(inst->mlen != 0);
839
840 brw_MOV(p,
841 retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
842 retype(src, BRW_REGISTER_TYPE_UD));
843 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
844 dispatch_width / 8, inst->offset);
845 }
846
847 void
848 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
849 {
850 assert(inst->mlen != 0);
851
852 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
853 dispatch_width / 8, inst->offset);
854 }
855
856 void
857 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
858 {
859 gen7_block_read_scratch(p, dst, dispatch_width / 8, inst->offset);
860 }
861
862 void
863 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
864 struct brw_reg dst,
865 struct brw_reg index,
866 struct brw_reg offset)
867 {
868 assert(inst->mlen != 0);
869
870 assert(index.file == BRW_IMMEDIATE_VALUE &&
871 index.type == BRW_REGISTER_TYPE_UD);
872 uint32_t surf_index = index.dw1.ud;
873
874 assert(offset.file == BRW_IMMEDIATE_VALUE &&
875 offset.type == BRW_REGISTER_TYPE_UD);
876 uint32_t read_offset = offset.dw1.ud;
877
878 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
879 read_offset, surf_index);
880
881 brw_mark_surface_used(&prog_data->base, surf_index);
882 }
883
884 void
885 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
886 struct brw_reg dst,
887 struct brw_reg index,
888 struct brw_reg offset)
889 {
890 assert(inst->mlen == 0);
891 assert(index.type == BRW_REGISTER_TYPE_UD);
892
893 assert(offset.file == BRW_GENERAL_REGISTER_FILE);
894 /* Reference just the dword we need, to avoid angering validate_reg(). */
895 offset = brw_vec1_grf(offset.nr, 0);
896
897 /* We use the SIMD4x2 mode because we want to end up with 4 components in
898 * the destination loaded consecutively from the same offset (which appears
899 * in the first component, and the rest are ignored).
900 */
901 dst.width = BRW_WIDTH_4;
902
903 if (index.file == BRW_IMMEDIATE_VALUE) {
904
905 uint32_t surf_index = index.dw1.ud;
906
907 brw_push_insn_state(p);
908 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
909 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
910 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
911 brw_pop_insn_state(p);
912
913 brw_set_dest(p, send, dst);
914 brw_set_src0(p, send, offset);
915 brw_set_sampler_message(p, send,
916 surf_index,
917 0, /* LD message ignores sampler unit */
918 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
919 1, /* rlen */
920 1, /* mlen */
921 false, /* no header */
922 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
923 0);
924
925 brw_mark_surface_used(&prog_data->base, surf_index);
926
927 } else {
928
929 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
930
931 brw_push_insn_state(p);
932 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
933 brw_set_default_access_mode(p, BRW_ALIGN_1);
934
935 /* a0.0 = surf_index & 0xff */
936 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
937 brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
938 brw_set_dest(p, insn_and, addr);
939 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
940 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
941
942
943 /* a0.0 |= <descriptor> */
944 brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
945 brw_set_sampler_message(p, insn_or,
946 0 /* surface */,
947 0 /* sampler */,
948 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
949 1 /* rlen */,
950 1 /* mlen */,
951 false /* header */,
952 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
953 0);
954 brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
955 brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
956 brw_set_src0(p, insn_or, addr);
957 brw_set_dest(p, insn_or, addr);
958
959
960 /* dst = send(offset, a0.0) */
961 brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
962 brw_set_dest(p, insn_send, dst);
963 brw_set_src0(p, insn_send, offset);
964 brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
965
966 brw_pop_insn_state(p);
967
968       /* The visitor knows more than we do about the required surface limit,
969        * so it has already done the marking.
970 */
971
972 }
973 }
974
975 void
976 fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
977 struct brw_reg dst,
978 struct brw_reg index,
979 struct brw_reg offset)
980 {
981 assert(brw->gen < 7); /* Should use the gen7 variant. */
982 assert(inst->header_present);
983 assert(inst->mlen);
984
985 assert(index.file == BRW_IMMEDIATE_VALUE &&
986 index.type == BRW_REGISTER_TYPE_UD);
987 uint32_t surf_index = index.dw1.ud;
988
989 uint32_t simd_mode, rlen, msg_type;
990 if (dispatch_width == 16) {
991 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
992 rlen = 8;
993 } else {
994 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
995 rlen = 4;
996 }
997
998 if (brw->gen >= 5)
999 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1000 else {
1001 /* We always use the SIMD16 message so that we only have to load U, and
1002 * not V or R.
1003 */
1004 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1005 assert(inst->mlen == 3);
1006 assert(inst->regs_written == 8);
1007 rlen = 8;
1008 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1009 }
1010
1011 struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
1012 BRW_REGISTER_TYPE_D);
1013 brw_MOV(p, offset_mrf, offset);
1014
1015 struct brw_reg header = brw_vec8_grf(0, 0);
1016 gen6_resolve_implied_move(p, &header, inst->base_mrf);
1017
1018 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1019 brw_inst_set_qtr_control(brw, send, BRW_COMPRESSION_NONE);
1020 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1021 brw_set_src0(p, send, header);
1022 if (brw->gen < 6)
1023 brw_inst_set_base_mrf(brw, send, inst->base_mrf);
1024
1025 /* Our surface is set up as floats, regardless of what actual data is
1026 * stored in it.
1027 */
1028 uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1029 brw_set_sampler_message(p, send,
1030 surf_index,
1031 0, /* sampler (unused) */
1032 msg_type,
1033 rlen,
1034 inst->mlen,
1035 inst->header_present,
1036 simd_mode,
1037 return_format);
1038
1039 brw_mark_surface_used(&prog_data->base, surf_index);
1040 }
1041
1042 void
1043 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
1044 struct brw_reg dst,
1045 struct brw_reg index,
1046 struct brw_reg offset)
1047 {
1048 assert(brw->gen >= 7);
1049    /* Varying-offset pull constant loads are treated as normal expressions on
1050     * gen7, so the fact that they are send messages is hidden at the IR level.
1051 */
1052 assert(!inst->header_present);
1053 assert(!inst->mlen);
1054 assert(index.type == BRW_REGISTER_TYPE_UD);
1055
1056 uint32_t simd_mode, rlen, mlen;
1057 if (dispatch_width == 16) {
1058 mlen = 2;
1059 rlen = 8;
1060 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1061 } else {
1062 mlen = 1;
1063 rlen = 4;
1064 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1065 }
1066
1067 if (index.file == BRW_IMMEDIATE_VALUE) {
1068
1069 uint32_t surf_index = index.dw1.ud;
1070
1071 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1072 brw_set_dest(p, send, dst);
1073 brw_set_src0(p, send, offset);
1074 brw_set_sampler_message(p, send,
1075 surf_index,
1076 0, /* LD message ignores sampler unit */
1077 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1078 rlen,
1079 mlen,
1080 false, /* no header */
1081 simd_mode,
1082 0);
1083
1084 brw_mark_surface_used(&prog_data->base, surf_index);
1085
1086 } else {
1087
1088 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1089
1090 brw_push_insn_state(p);
1091 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1092 brw_set_default_access_mode(p, BRW_ALIGN_1);
1093
1094 /* a0.0 = surf_index & 0xff */
1095 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1096 brw_inst_set_exec_size(p->brw, insn_and, BRW_EXECUTE_1);
1097 brw_set_dest(p, insn_and, addr);
1098 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1099 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1100
1101
1102 /* a0.0 |= <descriptor> */
1103 brw_inst *insn_or = brw_next_insn(p, BRW_OPCODE_OR);
1104 brw_set_sampler_message(p, insn_or,
1105 0 /* surface */,
1106 0 /* sampler */,
1107 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1108 rlen /* rlen */,
1109 mlen /* mlen */,
1110 false /* header */,
1111 simd_mode,
1112 0);
1113 brw_inst_set_exec_size(p->brw, insn_or, BRW_EXECUTE_1);
1114 brw_inst_set_src1_reg_type(p->brw, insn_or, BRW_REGISTER_TYPE_UD);
1115 brw_set_src0(p, insn_or, addr);
1116 brw_set_dest(p, insn_or, addr);
1117
1118
1119 /* dst = send(offset, a0.0) */
1120 brw_inst *insn_send = brw_next_insn(p, BRW_OPCODE_SEND);
1121 brw_set_dest(p, insn_send, dst);
1122 brw_set_src0(p, insn_send, offset);
1123 brw_set_indirect_send_descriptor(p, insn_send, BRW_SFID_SAMPLER, addr);
1124
1125 brw_pop_insn_state(p);
1126
1127       /* The visitor knows more than we do about the required surface limit,
1128        * so it has already done the marking.
1129 */
1130 }
1131 }
1132
1133 /**
1134  * Cause the current pixel/sample dispatch mask to be transferred into the
1135  * flag register (f0.0).
1136  *
1137  * On Gen6+ the mask comes from R1.7 bits 15:0; pre-gen6 it is read from g0.
1138 */
1139 void
1140 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
1141 {
1142 struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
1143 struct brw_reg dispatch_mask;
1144
1145 if (brw->gen >= 6)
1146 dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1147 else
1148 dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1149
1150 brw_push_insn_state(p);
1151 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1152 brw_MOV(p, flags, dispatch_mask);
1153 brw_pop_insn_state(p);
1154 }
1155
1156 void
1157 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1158 struct brw_reg dst,
1159 struct brw_reg src,
1160 struct brw_reg msg_data,
1161 unsigned msg_type)
1162 {
1163 assert(msg_data.file == BRW_IMMEDIATE_VALUE &&
1164 msg_data.type == BRW_REGISTER_TYPE_UD);
1165
1166 brw_pixel_interpolator_query(p,
1167 retype(dst, BRW_REGISTER_TYPE_UW),
1168 src,
1169 inst->pi_noperspective,
1170 msg_type,
1171 msg_data.dw1.ud,
1172 inst->mlen,
1173 inst->regs_written);
1174 }
1175
1176
1177 static uint32_t brw_file_from_reg(fs_reg *reg)
1178 {
1179 switch (reg->file) {
1180 case GRF:
1181 return BRW_GENERAL_REGISTER_FILE;
1182 case MRF:
1183 return BRW_MESSAGE_REGISTER_FILE;
1184 case IMM:
1185 return BRW_IMMEDIATE_VALUE;
1186 default:
1187 unreachable("not reached");
1188 }
1189 }
1190
1191 struct brw_reg
1192 brw_reg_from_fs_reg(fs_reg *reg)
1193 {
1194 struct brw_reg brw_reg;
1195
1196 switch (reg->file) {
1197 case GRF:
1198 case MRF:
1199 if (reg->stride == 0) {
1200 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, 0);
1201 } else {
1202 brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
1203 brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);
1204 }
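      /* For example (informal): a GRF with stride == 2 becomes a <16;8,2>
       * region, so consecutive logical channels land two elements apart,
       * while stride == 0 takes the vec1 path above and broadcasts a single
       * element as <0;1,0>.
       */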
1205
1206 brw_reg = retype(brw_reg, reg->type);
1207 brw_reg = byte_offset(brw_reg, reg->subreg_offset);
1208 break;
1209 case IMM:
1210 switch (reg->type) {
1211 case BRW_REGISTER_TYPE_F:
1212 brw_reg = brw_imm_f(reg->fixed_hw_reg.dw1.f);
1213 break;
1214 case BRW_REGISTER_TYPE_D:
1215 brw_reg = brw_imm_d(reg->fixed_hw_reg.dw1.d);
1216 break;
1217 case BRW_REGISTER_TYPE_UD:
1218 brw_reg = brw_imm_ud(reg->fixed_hw_reg.dw1.ud);
1219 break;
1220 default:
1221 unreachable("not reached");
1222 }
1223 break;
1224 case HW_REG:
1225 assert(reg->type == reg->fixed_hw_reg.type);
1226 brw_reg = reg->fixed_hw_reg;
1227 break;
1228 case BAD_FILE:
1229 /* Probably unused. */
1230 brw_reg = brw_null_reg();
1231 break;
1232 case UNIFORM:
1233 unreachable("not reached");
1234 default:
1235 unreachable("not reached");
1236 }
1237 if (reg->abs)
1238 brw_reg = brw_abs(brw_reg);
1239 if (reg->negate)
1240 brw_reg = negate(brw_reg);
1241
1242 return brw_reg;
1243 }
1244
1245 /**
1246 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
1247 * sampler LD messages.
1248 *
1249 * We don't want to bake it into the send message's code generation because
1250 * that means we don't get a chance to schedule the instructions.
1251 */
1252 void
1253 fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
1254 struct brw_reg dst,
1255 struct brw_reg value)
1256 {
1257 assert(value.file == BRW_IMMEDIATE_VALUE);
1258
1259 brw_push_insn_state(p);
1260 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1261 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1262 brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
1263 brw_pop_insn_state(p);
1264 }
1265
1266 /* Applies a vstride=16, width=8, hstride=2 region (or vstride=0, width=1,
1267  * hstride=0, when the mask is passed as a uniform) to register mask before
1268  * moving it to register dst.
1269 */
1270 void
1271 fs_generator::generate_set_omask(fs_inst *inst,
1272 struct brw_reg dst,
1273 struct brw_reg mask)
1274 {
1275 bool stride_8_8_1 =
1276 (mask.vstride == BRW_VERTICAL_STRIDE_8 &&
1277 mask.width == BRW_WIDTH_8 &&
1278 mask.hstride == BRW_HORIZONTAL_STRIDE_1);
1279
1280 bool stride_0_1_0 =
1281 (mask.vstride == BRW_VERTICAL_STRIDE_0 &&
1282 mask.width == BRW_WIDTH_1 &&
1283 mask.hstride == BRW_HORIZONTAL_STRIDE_0);
1284
1285 assert(stride_8_8_1 || stride_0_1_0);
1286 assert(dst.type == BRW_REGISTER_TYPE_UW);
1287
1288 if (dispatch_width == 16)
1289 dst = vec16(dst);
1290 brw_push_insn_state(p);
1291 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1292 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1293
1294 if (stride_8_8_1) {
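      /* Informal note: with the mask retyped to dst.type (UW), the <16;8,2>
       * region in the MOV below reads every other 16-bit word, presumably
       * the low word of each dword of the mask.
       */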
1295 brw_MOV(p, dst, retype(stride(mask, 16, 8, 2), dst.type));
1296 } else if (stride_0_1_0) {
1297 brw_MOV(p, dst, retype(mask, dst.type));
1298 }
1299 brw_pop_insn_state(p);
1300 }
1301
1302 /* Applies a vstride=1, width=4, hstride=0 region to register src1 during
1303  * the ADD instruction.
1304 */
1305 void
1306 fs_generator::generate_set_sample_id(fs_inst *inst,
1307 struct brw_reg dst,
1308 struct brw_reg src0,
1309 struct brw_reg src1)
1310 {
1311 assert(dst.type == BRW_REGISTER_TYPE_D ||
1312 dst.type == BRW_REGISTER_TYPE_UD);
1313 assert(src0.type == BRW_REGISTER_TYPE_D ||
1314 src0.type == BRW_REGISTER_TYPE_UD);
1315
1316 brw_push_insn_state(p);
1317 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1318 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1319 struct brw_reg reg = retype(stride(src1, 1, 4, 0), BRW_REGISTER_TYPE_UW);
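   /* Informal reading of the <1;4,0> region: each group of four channels
    * shares one element of src1 and successive groups advance by one
    * element, so a per-subspan sample ID is broadcast to the four pixels of
    * its 2x2 subspan; the SIMD16 second half then starts two elements
    * further in (the suboffset 2 below).
    */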
1320 brw_ADD(p, dst, src0, reg);
1321 if (dispatch_width == 16)
1322 brw_ADD(p, offset(dst, 1), offset(src0, 1), suboffset(reg, 2));
1323 brw_pop_insn_state(p);
1324 }
1325
1326 /**
1327 * Change the register's data type from UD to W, doubling the strides in order
1328 * to compensate for halving the data type width.
1329 */
1330 static struct brw_reg
1331 ud_reg_to_w(struct brw_reg r)
1332 {
1333 assert(r.type == BRW_REGISTER_TYPE_UD);
1334 r.type = BRW_REGISTER_TYPE_W;
1335
1336 /* The BRW_*_STRIDE enums are defined so that incrementing the field
1337 * doubles the real stride.
1338 */
1339 if (r.hstride != 0)
1340 ++r.hstride;
1341 if (r.vstride != 0)
1342 ++r.vstride;
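   /* For example: a <8;8,1> UD region becomes a <16;8,2> W region, which
    * covers exactly the same bytes because W elements are half the width of
    * UD elements.
    */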
1343
1344 return r;
1345 }
1346
1347 void
1348 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1349 struct brw_reg dst,
1350 struct brw_reg x,
1351 struct brw_reg y)
1352 {
1353 assert(brw->gen >= 7);
1354 assert(dst.type == BRW_REGISTER_TYPE_UD);
1355 assert(x.type == BRW_REGISTER_TYPE_F);
1356 assert(y.type == BRW_REGISTER_TYPE_F);
1357
1358 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1359 *
1360 * Because this instruction does not have a 16-bit floating-point type,
1361 * the destination data type must be Word (W).
1362 *
1363 * The destination must be DWord-aligned and specify a horizontal stride
1364 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
1365 * each destination channel and the upper word is not modified.
1366 */
1367 struct brw_reg dst_w = ud_reg_to_w(dst);
1368
1369    /* Give each 32-bit channel of dst the form below, where "." means
1370 * unchanged.
1371 * 0x....hhhh
1372 */
1373 brw_F32TO16(p, dst_w, y);
1374
1375 /* Now the form:
1376 * 0xhhhh0000
1377 */
1378 brw_SHL(p, dst, dst, brw_imm_ud(16u));
1379
1380 /* And, finally the form of packHalf2x16's output:
1381 * 0xhhhhllll
1382 */
1383 brw_F32TO16(p, dst_w, x);
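   /* Numeric example with hypothetical inputs: for x == 1.0f (half 0x3C00)
    * and y == 2.0f (half 0x4000), the three steps produce 0x....4000, then
    * 0x40000000, then 0x40003C00, matching packHalf2x16's y-in-high-word,
    * x-in-low-word layout.
    */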
1384 }
1385
1386 void
1387 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1388 struct brw_reg dst,
1389 struct brw_reg src)
1390 {
1391 assert(brw->gen >= 7);
1392 assert(dst.type == BRW_REGISTER_TYPE_F);
1393 assert(src.type == BRW_REGISTER_TYPE_UD);
1394
1395 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1396 *
1397 * Because this instruction does not have a 16-bit floating-point type,
1398 * the source data type must be Word (W). The destination type must be
1399 * F (Float).
1400 */
1401 struct brw_reg src_w = ud_reg_to_w(src);
1402
1403 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1404 * For the Y case, we wish to access only the upper word; therefore
1405 * a 16-bit subregister offset is needed.
1406 */
1407 assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1408 inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1409 if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1410 src_w.subnr += 2;
1411
1412 brw_F16TO32(p, dst, src_w);
1413 }
1414
1415 void
1416 fs_generator::generate_shader_time_add(fs_inst *inst,
1417 struct brw_reg payload,
1418 struct brw_reg offset,
1419 struct brw_reg value)
1420 {
1421 assert(brw->gen >= 7);
1422 brw_push_insn_state(p);
1423 brw_set_default_mask_control(p, true);
1424
1425 assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1426 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1427 offset.type);
1428 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1429 value.type);
1430
1431 assert(offset.file == BRW_IMMEDIATE_VALUE);
1432 if (value.file == BRW_GENERAL_REGISTER_FILE) {
1433 value.width = BRW_WIDTH_1;
1434 value.hstride = BRW_HORIZONTAL_STRIDE_0;
1435 value.vstride = BRW_VERTICAL_STRIDE_0;
1436 } else {
1437 assert(value.file == BRW_IMMEDIATE_VALUE);
1438 }
1439
1440 /* Trying to deal with setup of the params from the IR is crazy in the FS8
1441 * case, and we don't really care about squeezing every bit of performance
1442 * out of this path, so we just emit the MOVs from here.
1443 */
1444 brw_MOV(p, payload_offset, offset);
1445 brw_MOV(p, payload_value, value);
1446 brw_shader_time_add(p, payload,
1447 prog_data->base.binding_table.shader_time_start);
1448 brw_pop_insn_state(p);
1449
1450 brw_mark_surface_used(&prog_data->base,
1451 prog_data->base.binding_table.shader_time_start);
1452 }
1453
1454 void
1455 fs_generator::generate_untyped_atomic(fs_inst *inst, struct brw_reg dst,
1456 struct brw_reg atomic_op,
1457 struct brw_reg surf_index)
1458 {
1459 assert(atomic_op.file == BRW_IMMEDIATE_VALUE &&
1460 atomic_op.type == BRW_REGISTER_TYPE_UD &&
1461 surf_index.file == BRW_IMMEDIATE_VALUE &&
1462 surf_index.type == BRW_REGISTER_TYPE_UD);
1463
1464 brw_untyped_atomic(p, dst, brw_message_reg(inst->base_mrf),
1465 atomic_op.dw1.ud, surf_index.dw1.ud,
1466 inst->mlen, dispatch_width / 8);
1467
1468 brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
1469 }
1470
1471 void
1472 fs_generator::generate_untyped_surface_read(fs_inst *inst, struct brw_reg dst,
1473 struct brw_reg surf_index)
1474 {
1475 assert(surf_index.file == BRW_IMMEDIATE_VALUE &&
1476 surf_index.type == BRW_REGISTER_TYPE_UD);
1477
1478 brw_untyped_surface_read(p, dst, brw_message_reg(inst->base_mrf),
1479 surf_index.dw1.ud,
1480 inst->mlen, dispatch_width / 8);
1481
1482 brw_mark_surface_used(&prog_data->base, surf_index.dw1.ud);
1483 }
1484
1485 void
1486 fs_generator::generate_code(const cfg_t *cfg)
1487 {
1488 int start_offset = p->next_insn_offset;
1489 int loop_count = 0;
1490
1491 struct annotation_info annotation;
1492 memset(&annotation, 0, sizeof(annotation));
1493
1494 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1495 struct brw_reg src[3], dst;
1496 unsigned int last_insn_offset = p->next_insn_offset;
1497
1498 if (unlikely(debug_flag))
1499 annotate(brw, &annotation, cfg, inst, p->next_insn_offset);
1500
1501 for (unsigned int i = 0; i < inst->sources; i++) {
1502 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
1503
1504 /* The accumulator result appears to get used for the
1505 * conditional modifier generation. When negating a UD
1506 * value, there is a 33rd bit generated for the sign in the
1507 * accumulator value, so now you can't check, for example,
1508 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
1509 */
1510 assert(!inst->conditional_mod ||
1511 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1512 !inst->src[i].negate);
1513 }
1514 dst = brw_reg_from_fs_reg(&inst->dst);
1515
1516 brw_set_default_predicate_control(p, inst->predicate);
1517 brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1518 brw_set_default_flag_reg(p, 0, inst->flag_subreg);
1519 brw_set_default_saturate(p, inst->saturate);
1520 brw_set_default_mask_control(p, inst->force_writemask_all);
1521 brw_set_default_acc_write_control(p, inst->writes_accumulator);
1522
1523 if (inst->force_uncompressed || dispatch_width == 8) {
1524 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1525 } else if (inst->force_sechalf) {
1526 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1527 } else {
1528 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1529 }
1530
1531 switch (inst->opcode) {
1532 case BRW_OPCODE_MOV:
1533 brw_MOV(p, dst, src[0]);
1534 break;
1535 case BRW_OPCODE_ADD:
1536 brw_ADD(p, dst, src[0], src[1]);
1537 break;
1538 case BRW_OPCODE_MUL:
1539 brw_MUL(p, dst, src[0], src[1]);
1540 break;
1541 case BRW_OPCODE_AVG:
1542 brw_AVG(p, dst, src[0], src[1]);
1543 break;
1544 case BRW_OPCODE_MACH:
1545 brw_MACH(p, dst, src[0], src[1]);
1546 break;
1547
1548 case BRW_OPCODE_MAD:
1549 assert(brw->gen >= 6);
1550 brw_set_default_access_mode(p, BRW_ALIGN_16);
1551 if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
1552 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1553 brw_MAD(p, dst, src[0], src[1], src[2]);
1554 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1555 brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1556 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1557 } else {
1558 brw_MAD(p, dst, src[0], src[1], src[2]);
1559 }
1560 brw_set_default_access_mode(p, BRW_ALIGN_1);
1561 break;
1562
1563 case BRW_OPCODE_LRP:
1564 assert(brw->gen >= 6);
1565 brw_set_default_access_mode(p, BRW_ALIGN_16);
1566 if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
1567 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1568 brw_LRP(p, dst, src[0], src[1], src[2]);
1569 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1570 brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1571 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1572 } else {
1573 brw_LRP(p, dst, src[0], src[1], src[2]);
1574 }
1575 brw_set_default_access_mode(p, BRW_ALIGN_1);
1576 break;
1577
1578 case BRW_OPCODE_FRC:
1579 brw_FRC(p, dst, src[0]);
1580 break;
1581 case BRW_OPCODE_RNDD:
1582 brw_RNDD(p, dst, src[0]);
1583 break;
1584 case BRW_OPCODE_RNDE:
1585 brw_RNDE(p, dst, src[0]);
1586 break;
1587 case BRW_OPCODE_RNDZ:
1588 brw_RNDZ(p, dst, src[0]);
1589 break;
1590
1591 case BRW_OPCODE_AND:
1592 brw_AND(p, dst, src[0], src[1]);
1593 break;
1594 case BRW_OPCODE_OR:
1595 brw_OR(p, dst, src[0], src[1]);
1596 break;
1597 case BRW_OPCODE_XOR:
1598 brw_XOR(p, dst, src[0], src[1]);
1599 break;
1600 case BRW_OPCODE_NOT:
1601 brw_NOT(p, dst, src[0]);
1602 break;
1603 case BRW_OPCODE_ASR:
1604 brw_ASR(p, dst, src[0], src[1]);
1605 break;
1606 case BRW_OPCODE_SHR:
1607 brw_SHR(p, dst, src[0], src[1]);
1608 break;
1609 case BRW_OPCODE_SHL:
1610 brw_SHL(p, dst, src[0], src[1]);
1611 break;
1612 case BRW_OPCODE_F32TO16:
1613 assert(brw->gen >= 7);
1614 brw_F32TO16(p, dst, src[0]);
1615 break;
1616 case BRW_OPCODE_F16TO32:
1617 assert(brw->gen >= 7);
1618 brw_F16TO32(p, dst, src[0]);
1619 break;
1620 case BRW_OPCODE_CMP:
1621 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1622 break;
1623 case BRW_OPCODE_SEL:
1624 brw_SEL(p, dst, src[0], src[1]);
1625 break;
1626 case BRW_OPCODE_BFREV:
1627 assert(brw->gen >= 7);
1628 /* BFREV only supports UD type for src and dst. */
1629 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1630 retype(src[0], BRW_REGISTER_TYPE_UD));
1631 break;
1632 case BRW_OPCODE_FBH:
1633 assert(brw->gen >= 7);
1634 /* FBH only supports UD type for dst. */
1635 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1636 break;
1637 case BRW_OPCODE_FBL:
1638 assert(brw->gen >= 7);
1639 /* FBL only supports UD type for dst. */
1640 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1641 break;
1642 case BRW_OPCODE_CBIT:
1643 assert(brw->gen >= 7);
1644 /* CBIT only supports UD type for dst. */
1645 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1646 break;
1647 case BRW_OPCODE_ADDC:
1648 assert(brw->gen >= 7);
1649 brw_ADDC(p, dst, src[0], src[1]);
1650 break;
1651 case BRW_OPCODE_SUBB:
1652 assert(brw->gen >= 7);
1653 brw_SUBB(p, dst, src[0], src[1]);
1654 break;
1655 case BRW_OPCODE_MAC:
1656 brw_MAC(p, dst, src[0], src[1]);
1657 break;
1658
1659 case BRW_OPCODE_BFE:
1660 assert(brw->gen >= 7);
1661 brw_set_default_access_mode(p, BRW_ALIGN_16);
1662 if (dispatch_width == 16 && brw->gen < 8 && !brw->is_haswell) {
1663 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1664 brw_BFE(p, dst, src[0], src[1], src[2]);
1665 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1666 brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1667 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1668 } else {
1669 brw_BFE(p, dst, src[0], src[1], src[2]);
1670 }
1671 brw_set_default_access_mode(p, BRW_ALIGN_1);
1672 break;
1673
1674 case BRW_OPCODE_BFI1:
1675 assert(brw->gen >= 7);
1676 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1677 * should
1678 *
1679 * "Force BFI instructions to be executed always in SIMD8."
1680 */
1681 if (dispatch_width == 16 && brw->is_haswell) {
1682 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1683 brw_BFI1(p, dst, src[0], src[1]);
1684 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1685 brw_BFI1(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]));
1686 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1687 } else {
1688 brw_BFI1(p, dst, src[0], src[1]);
1689 }
1690 break;
1691 case BRW_OPCODE_BFI2:
1692 assert(brw->gen >= 7);
1693 brw_set_default_access_mode(p, BRW_ALIGN_16);
1694 /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
1695 * should
1696 *
1697 * "Force BFI instructions to be executed always in SIMD8."
1698 *
1699 * Otherwise we would be able to emit compressed instructions like we
1700 * do for the other three-source instructions.
1701 */
1702 if (dispatch_width == 16) {
1703 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1704 brw_BFI2(p, dst, src[0], src[1], src[2]);
1705 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1706 brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1707 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1708 } else {
1709 brw_BFI2(p, dst, src[0], src[1], src[2]);
1710 }
1711 brw_set_default_access_mode(p, BRW_ALIGN_1);
1712 break;
1713
1714 case BRW_OPCODE_IF:
1715 if (inst->src[0].file != BAD_FILE) {
1716 /* The instruction has an embedded compare (only allowed on gen6) */
1717 assert(brw->gen == 6);
1718 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1719 } else {
1720 brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1721 }
1722 break;
1723
1724 case BRW_OPCODE_ELSE:
1725 brw_ELSE(p);
1726 break;
1727 case BRW_OPCODE_ENDIF:
1728 brw_ENDIF(p);
1729 break;
1730
1731 case BRW_OPCODE_DO:
1732 brw_DO(p, BRW_EXECUTE_8);
1733 break;
1734
1735 case BRW_OPCODE_BREAK:
1736 brw_BREAK(p);
1737 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1738 break;
1739 case BRW_OPCODE_CONTINUE:
1740 brw_CONT(p);
1741 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
1742 break;
1743
1744 case BRW_OPCODE_WHILE:
1745 brw_WHILE(p);
1746 loop_count++;
1747 break;
1748
1749 case SHADER_OPCODE_RCP:
1750 case SHADER_OPCODE_RSQ:
1751 case SHADER_OPCODE_SQRT:
1752 case SHADER_OPCODE_EXP2:
1753 case SHADER_OPCODE_LOG2:
1754 case SHADER_OPCODE_SIN:
1755 case SHADER_OPCODE_COS:
1756 assert(brw->gen < 6 || inst->mlen == 0);
1757 if (brw->gen >= 7) {
1758 gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
1759 brw_null_reg());
1760 } else if (brw->gen == 6) {
1761 generate_math_gen6(inst, dst, src[0], brw_null_reg());
1762 } else if (brw->gen == 5 || brw->is_g4x) {
1763 generate_math_g45(inst, dst, src[0]);
1764 } else {
1765 generate_math_gen4(inst, dst, src[0]);
1766 }
1767 break;
1768 case SHADER_OPCODE_INT_QUOTIENT:
1769 case SHADER_OPCODE_INT_REMAINDER:
1770 case SHADER_OPCODE_POW:
1771 assert(brw->gen < 6 || inst->mlen == 0);
1772 if (brw->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) {
1773 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1774 } else if (brw->gen >= 6) {
1775 generate_math_gen6(inst, dst, src[0], src[1]);
1776 } else {
1777 generate_math_gen4(inst, dst, src[0]);
1778 }
1779 break;
1780 case FS_OPCODE_PIXEL_X:
1781 generate_pixel_xy(dst, true);
1782 break;
1783 case FS_OPCODE_PIXEL_Y:
1784 generate_pixel_xy(dst, false);
1785 break;
1786 case FS_OPCODE_CINTERP:
1787 brw_MOV(p, dst, src[0]);
1788 break;
1789 case FS_OPCODE_LINTERP:
1790 generate_linterp(inst, dst, src);
1791 break;
1792 case SHADER_OPCODE_TEX:
1793 case FS_OPCODE_TXB:
1794 case SHADER_OPCODE_TXD:
1795 case SHADER_OPCODE_TXF:
1796 case SHADER_OPCODE_TXF_CMS:
1797 case SHADER_OPCODE_TXF_UMS:
1798 case SHADER_OPCODE_TXF_MCS:
1799 case SHADER_OPCODE_TXL:
1800 case SHADER_OPCODE_TXS:
1801 case SHADER_OPCODE_LOD:
1802 case SHADER_OPCODE_TG4:
1803 case SHADER_OPCODE_TG4_OFFSET:
1804 generate_tex(inst, dst, src[0], src[1]);
1805 break;
1806 case FS_OPCODE_DDX:
1807 generate_ddx(inst, dst, src[0], src[1]);
1808 break;
1809 case FS_OPCODE_DDY:
1810          /* Make sure the fp->UsesDFdy flag got set (otherwise there is no
1811           * guarantee that key->render_to_fbo is set).
1812 */
1813 assert(stage == MESA_SHADER_FRAGMENT &&
1814 ((gl_fragment_program *) prog)->UsesDFdy);
1815 generate_ddy(inst, dst, src[0], src[1], key->render_to_fbo);
1816 break;
1817
1818 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1819 generate_scratch_write(inst, src[0]);
1820 break;
1821
1822 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1823 generate_scratch_read(inst, dst);
1824 break;
1825
1826 case SHADER_OPCODE_GEN7_SCRATCH_READ:
1827 generate_scratch_read_gen7(inst, dst);
1828 break;
1829
1830 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1831 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1832 break;
1833
1834 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1835 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1836 break;
1837
1838 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1839 generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
1840 break;
1841
1842 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1843 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1844 break;
1845
1846 case FS_OPCODE_REP_FB_WRITE:
1847 case FS_OPCODE_FB_WRITE:
1848 generate_fb_write(inst);
1849 break;
1850
1851 case FS_OPCODE_BLORP_FB_WRITE:
1852 generate_blorp_fb_write(inst);
1853 break;
1854
1855 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1856 generate_mov_dispatch_to_flags(inst);
1857 break;
1858
1859 case FS_OPCODE_DISCARD_JUMP:
1860 generate_discard_jump(inst);
1861 break;
1862
1863 case SHADER_OPCODE_SHADER_TIME_ADD:
1864 generate_shader_time_add(inst, src[0], src[1], src[2]);
1865 break;
1866
1867 case SHADER_OPCODE_UNTYPED_ATOMIC:
1868 generate_untyped_atomic(inst, dst, src[0], src[1]);
1869 break;
1870
1871 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1872 generate_untyped_surface_read(inst, dst, src[0]);
1873 break;
1874
1875 case FS_OPCODE_SET_SIMD4X2_OFFSET:
1876 generate_set_simd4x2_offset(inst, dst, src[0]);
1877 break;
1878
1879 case FS_OPCODE_SET_OMASK:
1880 generate_set_omask(inst, dst, src[0]);
1881 break;
1882
1883 case FS_OPCODE_SET_SAMPLE_ID:
1884 generate_set_sample_id(inst, dst, src[0], src[1]);
1885 break;
1886
1887 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
1888 generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
1889 break;
1890
1891 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
1892 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
1893 generate_unpack_half_2x16_split(inst, dst, src[0]);
1894 break;
1895
1896 case FS_OPCODE_PLACEHOLDER_HALT:
1897 /* This is the place where the final HALT needs to be inserted if
1898 * we've emitted any discards. If not, this will emit no code.
1899 */
1900 if (!patch_discard_jumps_to_fb_writes()) {
1901 if (unlikely(debug_flag)) {
1902 annotation.ann_count--;
1903 }
1904 }
1905 break;
1906
1907 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1908 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
1909 GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID);
1910 break;
1911
1912 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1913 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
1914 GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
1915 break;
1916
1917 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1918 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
1919 GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
1920 break;
1921
1922 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1923 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
1924 GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
1925 break;
1926
1927 default:
1928 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
1929 _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1930 opcode_descs[inst->opcode].name);
1931 } else {
1932 _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
1933 }
1934 abort();
1935
1936 case SHADER_OPCODE_LOAD_PAYLOAD:
1937 unreachable("Should be lowered by lower_load_payload()");
1938 }
1939
1940 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
1941 assert(p->next_insn_offset == last_insn_offset + 16 ||
1942 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
1943 "emitting more than 1 instruction");
1944
1945 brw_inst *last = &p->store[last_insn_offset / 16];
1946
1947 brw_inst_set_cond_modifier(brw, last, inst->conditional_mod);
1948 brw_inst_set_no_dd_clear(brw, last, inst->no_dd_clear);
1949 brw_inst_set_no_dd_check(brw, last, inst->no_dd_check);
1950 }
1951 }
1952
1953 brw_set_uip_jip(p);
1954 annotation_finalize(&annotation, p->next_insn_offset);
1955
1956 int before_size = p->next_insn_offset - start_offset;
1957 brw_compact_instructions(p, start_offset, annotation.ann_count,
1958 annotation.ann);
1959 int after_size = p->next_insn_offset - start_offset;
1960
1961 if (unlikely(debug_flag)) {
1962 if (shader_prog) {
1963 fprintf(stderr,
1964 "Native code for %s fragment shader %d (SIMD%d dispatch):\n",
1965 shader_prog->Label ? shader_prog->Label : "unnamed",
1966 shader_prog->Name, dispatch_width);
1967 } else if (prog) {
1968 fprintf(stderr,
1969 "Native code for fragment program %d (SIMD%d dispatch):\n",
1970 prog->Id, dispatch_width);
1971 } else {
1972 fprintf(stderr, "Native code for blorp program (SIMD%d dispatch):\n",
1973 dispatch_width);
1974 }
1975 fprintf(stderr, "SIMD%d shader: %d instructions. %d loops. Compacted %d to %d"
1976 " bytes (%.0f%%)\n",
1977 dispatch_width, before_size / 16, loop_count, before_size, after_size,
1978 100.0f * (before_size - after_size) / before_size);
1979
1980 dump_assembly(p->store, annotation.ann_count, annotation.ann, brw, prog);
1981 ralloc_free(annotation.ann);
1982 }
1983 }
1984
1985 const unsigned *
1986 fs_generator::generate_assembly(const cfg_t *simd8_cfg,
1987 const cfg_t *simd16_cfg,
1988 unsigned *assembly_size)
1989 {
1990 assert(simd8_cfg || simd16_cfg);
1991
1992 if (simd8_cfg) {
1993 dispatch_width = 8;
1994 generate_code(simd8_cfg);
1995 }
1996
1997 if (simd16_cfg) {
1998       /* Align to a 64-byte boundary. */
1999 while (p->next_insn_offset % 64) {
2000 brw_NOP(p);
2001 }
2002
2003 /* Save off the start of this SIMD16 program */
2004 prog_data->prog_offset_16 = p->next_insn_offset;
2005
2006 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
2007
2008 dispatch_width = 16;
2009 generate_code(simd16_cfg);
2010 }
2011
2012 return brw_get_program(p, assembly_size);
2013 }