i965: Delete the FS_OPCODE_INTERPOLATE_AT_CENTROID virtual opcode.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs_generator.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_program.h"

static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

static struct brw_reg
brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed)
{
   assert(reg->reg_offset == 0);
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
      /* Fallthrough */
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gen9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         const unsigned width = MIN2(reg_width, phys_width);
         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
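         /* Worked example (illustrative): a float source with stride 2 gives
          * reg_width = 32 / (2 * 4) = 4; in an uncompressed SIMD8 instruction
          * phys_width is 8, so width becomes 4 and the resulting region is
          * <8;4,2>.
          */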
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->subreg_offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->subreg_offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);
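   /* Note: brw_jump_scale() converts an instruction count into the units the
    * hardware counts jump distances in (roughly: whole instructions on Gen4,
    * 64-bit compacted halves on Gen5-7, bytes on Gen8+).
    */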

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   if (devinfo->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->group)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);

   brw_fb_WRITE(p,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* The header is two registers; g0 and g1 are its contents.  g0 is loaded
    * by the implied move, so here we set up g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         /* Indicate that the shader computes stencil for the render target. */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check the runtime bit to detect whether we have to send AA data. */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      brw_MOV(p, dst, reg);
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size == 8 || devinfo->gen >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      struct brw_reg ind_src;
      if (devinfo->gen < 8) {
         /* From the Haswell PRM section "Register Region Restrictions":
          *
          *    "The lower bits of the AddressImmediate must not overflow to
          *    change the register address.  The lower 5 bits of Address
          *    Immediate when added to lower 5 bits of address register gives
          *    the sub-register offset.  The upper bits of Address Immediate
          *    when added to upper bits of address register gives the register
          *    address.  Any overflow from sub-register offset is dropped."
          *
          * This restriction is only listed in the Haswell PRM but empirical
          * testing indicates that it applies on all older generations and is
          * lifted on Broadwell.
          *
          * Since the indirect may cause us to cross a register boundary, this
          * makes the base offset almost useless.  We could try and do
          * something clever where we use an actual base offset if
          * base_offset % 32 == 0 but that would mean we were generating
          * different code depending on the base offset.  Instead, for the
          * sake of consistency, we'll just do the add ourselves.
          */
         brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
         ind_src = brw_VxH_indirect(0, 0);
      } else {
         brw_MOV(p, addr, indirect_byte_offset);
         ind_src = brw_VxH_indirect(0, imm_byte_offset);
      }

      brw_inst *mov = brw_MOV(p, dst, retype(ind_src, dst.type));

      if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
          !inst->get_next()->is_tail_sentinel() &&
          ((fs_inst *)inst->get_next())->mlen > 0) {
         /* From the Sandybridge PRM:
          *
          *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
          *    instruction that “indexed/indirect” source AND is followed by a
          *    send, the instruction requires a “Switch”. This is to avoid
          *    race condition where send may dispatch before MRF is updated."
          */
         brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
      }
   }
}

void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->regs_written);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}

void
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
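   /* Per the layouts above, the Y deltas start exec_size / 8 GRFs after the
    * X deltas: one register later in SIMD8, two in SIMD16.
    */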
   struct brw_reg interp = src[1];

   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.ud);
}

void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
         if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determine shadow compare and dispatch
          * width from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                    brw_imm_ud(inst->offset));
         } else if (stage != MESA_SHADER_VERTEX &&
                    stage != MESA_SHADER_FRAGMENT) {
            /* The vertex and fragment stages have g0.2 set to 0, so
             * header0.2 is 0 when g0 is copied.  Other stages may not, so we
             * must set it to 0 to avoid setting undesirable bits in the
             * message.
             */
            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->regs_written,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
         brw_OR(p, addr, addr, surface_reg);
      }
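      /* At this point a0.0 holds (sampler << 8) | surface; when the two
       * indices are equal, the 0x101 multiply above is just a cheaper way of
       * replicating one value into both byte fields.
       */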
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->regs_written,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* The visitor knows more than we do about the surface limit required,
       * so it has already done the marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}

/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *           DDX                    DDY
 * dst: (ss0.tr - ss0.tl)    (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)    (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)    (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)    (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)    (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)    (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)    (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)    (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But the ideal approximation may impose a huge performance cost on
 * sample_d.  On at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}

/* The derivative computation is negated for FBOs, since they place the
 * origin at the upper left instead of the lower left.
 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      brw_ADD(p, dst, negate(src0), src1);
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      brw_ADD(p, dst, negate(src0), src1);
   }
}

void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   gen6_HALT(p);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
{
   /* The 32-wide messages only respect the first 16-wide half of the channel
    * enable signals which are replicated identically for the second group of
    * 16 channels, so we cannot use them unless the write is marked
    * force_writemask_all.
    */
   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
                               MIN2(16, inst->exec_size);
   const unsigned block_size = 4 * lower_size / REG_SIZE;
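   /* Illustrative numbers: a SIMD16 half writes 16 * 4 = 64 bytes of 32-bit
    * data, so block_size is 2 GRFs and each loop iteration below advances
    * the scratch offset by 2 * REG_SIZE bytes.
    */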
   assert(inst->mlen != 0);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, cvt(lower_size) - 1);
   brw_set_default_compression(p, lower_size > 8);

   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
      brw_set_default_group(p, inst->group + lower_size * i);

      brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
              retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));

      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
                                    block_size,
                                    inst->offset + block_size * REG_SIZE * i);
   }

   brw_pop_insn_state(p);
}

void
fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);
   assert(inst->mlen != 0);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
                                inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);

   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);
}

void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(index.type == BRW_REGISTER_TYPE_UD);

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;

   struct brw_reg src = offset;
   bool header_present = false;

   if (devinfo->gen >= 9) {
      /* Skylake requires a message header in order to use SIMD4x2 mode. */
      src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD);
      header_present = true;

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      brw_MOV(p, get_element_ud(src, 2),
              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
      brw_pop_insn_state(p);
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.ud;

      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_inst_set_exec_size(devinfo, send, BRW_EXECUTE_4);
      brw_pop_insn_state(p);

      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, send, src);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);
   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* dst = send(payload, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_pop_insn_state(p);
   }
}

void
fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index)
{
   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (inst->exec_size == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      assert(inst->exec_size == 8);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->regs_written == 8);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_compression(devinfo, send, false);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_size != 0,
                           simd_mode,
                           return_format);
}

void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(devinfo->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(inst->header_size == 0);
   assert(!inst->mlen);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   uint32_t simd_mode, rlen, mlen;
   if (inst->exec_size == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      assert(inst->exec_size == 8);
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.ud;

      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, send, offset);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen,
                              mlen,
                              false, /* no header */
                              simd_mode,
                              0);

   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
         offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen /* rlen */,
                              mlen /* mlen */,
                              false /* header */,
                              simd_mode,
                              0);
   }
}

/**
 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 * into the flags register (f0.0).
 *
 * Used only on Gen6 and above.
 */
void
fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
{
   struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
   struct brw_reg dispatch_mask;

   if (devinfo->gen >= 6)
      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   else
      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, flags, dispatch_mask);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                struct brw_reg dst,
                                                struct brw_reg src,
                                                struct brw_reg msg_data,
                                                unsigned msg_type)
{
   assert(msg_data.type == BRW_REGISTER_TYPE_UD);

   brw_pixel_interpolator_query(p,
                                retype(dst, BRW_REGISTER_TYPE_UW),
                                src,
                                inst->pi_noperspective,
                                msg_type,
                                msg_data,
                                inst->mlen,
                                inst->regs_written);
}

/**
 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
 * sampler LD messages.
 *
 * We don't want to bake it into the send message's code generation because
 * that means we don't get a chance to schedule the instructions.
 */
void
fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
                                          struct brw_reg dst,
                                          struct brw_reg value)
{
   assert(value.file == BRW_IMMEDIATE_VALUE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
   brw_pop_insn_state(p);
}

/* Sets vstride=1, width=4, hstride=0 of register src1 during
 * the ADD instruction.
 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   struct brw_reg reg = stride(src1, 1, 4, 0);
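   /* The <1;4,0> region reads each element of src1 four times in a row
    * (e0 e0 e0 e0 e1 e1 e1 e1 ...), i.e. one sample id is broadcast to all
    * four channels of a 2x2 subspan.
    */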
   if (devinfo->gen >= 8 || inst->exec_size == 8) {
      brw_ADD(p, dst, src0, reg);
   } else if (inst->exec_size == 16) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
      brw_pop_insn_state(p);
   }
}

void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}

void
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
                                              struct brw_reg dst,
                                              struct brw_reg src)
{
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    */
   struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
    * For the Y case, we wish to access only the upper word; therefore
    * a 16-bit subregister offset is needed.
    */
   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
      src_w.subnr += 2;

   brw_F16TO32(p, dst, src_w);
}

void
fs_generator::generate_shader_time_add(fs_inst *inst,
                                       struct brw_reg payload,
                                       struct brw_reg offset,
                                       struct brw_reg value)
{
   assert(devinfo->gen >= 7);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, true);

   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
                                          offset.type);
   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
                                         value.type);

   assert(offset.file == BRW_IMMEDIATE_VALUE);
   if (value.file == BRW_GENERAL_REGISTER_FILE) {
      value.width = BRW_WIDTH_1;
      value.hstride = BRW_HORIZONTAL_STRIDE_0;
      value.vstride = BRW_VERTICAL_STRIDE_0;
   } else {
      assert(value.file == BRW_IMMEDIATE_VALUE);
   }

   /* Trying to deal with setup of the params from the IR is crazy in the FS8
    * case, and we don't really care about squeezing every bit of performance
    * out of this path, so we just emit the MOVs from here.
    */
   brw_MOV(p, payload_offset, offset);
   brw_MOV(p, payload_value, value);
   brw_shader_time_add(p, payload,
                       prog_data->binding_table.shader_time_start);
   brw_pop_insn_state(p);

   brw_mark_surface_used(prog_data,
                         prog_data->binding_table.shader_time_start);
}

void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

int
fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
{
   /* align to 64 byte boundary. */
   while (p->next_insn_offset % 64)
      brw_NOP(p);
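   /* Each (uncompacted) native instruction is 16 bytes, so this emits at
    * most three NOPs.
    */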

   this->dispatch_width = dispatch_width;

   int start_offset = p->next_insn_offset;
   int spill_count = 0, fill_count = 0;
   int loop_count = 0;

   struct annotation_info annotation;
   memset(&annotation, 0, sizeof(annotation));

   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      struct brw_reg src[3], dst;
      unsigned int last_insn_offset = p->next_insn_offset;
      bool multiple_instructions_emitted = false;

      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
       * "Register Region Restrictions" section: for BDW, SKL:
       *
       *    "A POW/FDIV operation must not be followed by an instruction
       *     that requires two destination registers."
       *
       * The documentation is often lacking annotations for Atom parts,
       * and empirically this affects CHV as well.
       */
      if (devinfo->gen >= 8 &&
          p->nr_insn > 1 &&
          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         last_insn_offset = p->next_insn_offset;
      }

      if (unlikely(debug_flag))
         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);

      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed.  For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;
      brw_set_default_compression(p, compressed);
      brw_set_default_group(p, inst->group);

      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
                                      compressed);

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);

      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);
      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);

      assert(inst->force_writemask_all || inst->exec_size >= 4);
      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

1657 switch (inst->opcode) {
1658 case BRW_OPCODE_MOV:
1659 brw_MOV(p, dst, src[0]);
1660 break;
1661 case BRW_OPCODE_ADD:
1662 brw_ADD(p, dst, src[0], src[1]);
1663 break;
1664 case BRW_OPCODE_MUL:
1665 brw_MUL(p, dst, src[0], src[1]);
1666 break;
1667 case BRW_OPCODE_AVG:
1668 brw_AVG(p, dst, src[0], src[1]);
1669 break;
1670 case BRW_OPCODE_MACH:
1671 brw_MACH(p, dst, src[0], src[1]);
1672 break;
1673
1674 case BRW_OPCODE_LINE:
1675 brw_LINE(p, dst, src[0], src[1]);
1676 break;
1677
1678 case BRW_OPCODE_MAD:
1679 assert(devinfo->gen >= 6);
1680 brw_set_default_access_mode(p, BRW_ALIGN_16);
1681 brw_MAD(p, dst, src[0], src[1], src[2]);
1682 break;
1683
1684 case BRW_OPCODE_LRP:
1685 assert(devinfo->gen >= 6);
1686 brw_set_default_access_mode(p, BRW_ALIGN_16);
1687 brw_LRP(p, dst, src[0], src[1], src[2]);
1688 break;
1689
1690 case BRW_OPCODE_FRC:
1691 brw_FRC(p, dst, src[0]);
1692 break;
1693 case BRW_OPCODE_RNDD:
1694 brw_RNDD(p, dst, src[0]);
1695 break;
1696 case BRW_OPCODE_RNDE:
1697 brw_RNDE(p, dst, src[0]);
1698 break;
1699 case BRW_OPCODE_RNDZ:
1700 brw_RNDZ(p, dst, src[0]);
1701 break;
1702
1703 case BRW_OPCODE_AND:
1704 brw_AND(p, dst, src[0], src[1]);
1705 break;
1706 case BRW_OPCODE_OR:
1707 brw_OR(p, dst, src[0], src[1]);
1708 break;
1709 case BRW_OPCODE_XOR:
1710 brw_XOR(p, dst, src[0], src[1]);
1711 break;
1712 case BRW_OPCODE_NOT:
1713 brw_NOT(p, dst, src[0]);
1714 break;
1715 case BRW_OPCODE_ASR:
1716 brw_ASR(p, dst, src[0], src[1]);
1717 break;
1718 case BRW_OPCODE_SHR:
1719 brw_SHR(p, dst, src[0], src[1]);
1720 break;
1721 case BRW_OPCODE_SHL:
1722 brw_SHL(p, dst, src[0], src[1]);
1723 break;
1724 case BRW_OPCODE_F32TO16:
1725 assert(devinfo->gen >= 7);
1726 brw_F32TO16(p, dst, src[0]);
1727 break;
1728 case BRW_OPCODE_F16TO32:
1729 assert(devinfo->gen >= 7);
1730 brw_F16TO32(p, dst, src[0]);
1731 break;
1732 case BRW_OPCODE_CMP:
1733 if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
1734 dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
1735 /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
1736 * implemented in the compiler is not sufficient on its own:
1737 * overriding the type when the destination is the null register
1738 * is also necessary, though not sufficient by itself either.
1739 */
1740 assert(dst.nr == BRW_ARF_NULL);
1741 dst.type = BRW_REGISTER_TYPE_D;
1742 }
1743 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1744 break;
1745 case BRW_OPCODE_SEL:
1746 brw_SEL(p, dst, src[0], src[1]);
1747 break;
1748 case BRW_OPCODE_BFREV:
1749 assert(devinfo->gen >= 7);
1750 /* BFREV only supports UD type for src and dst. */
1751 brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1752 retype(src[0], BRW_REGISTER_TYPE_UD));
1753 break;
1754 case BRW_OPCODE_FBH:
1755 assert(devinfo->gen >= 7);
1756 /* FBH only supports UD type for dst. */
1757 brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1758 break;
1759 case BRW_OPCODE_FBL:
1760 assert(devinfo->gen >= 7);
1761 /* FBL only supports UD type for dst. */
1762 brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1763 break;
1764 case BRW_OPCODE_LZD:
1765 brw_LZD(p, dst, src[0]);
1766 break;
1767 case BRW_OPCODE_CBIT:
1768 assert(devinfo->gen >= 7);
1769 /* CBIT only supports UD type for dst. */
1770 brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1771 break;
1772 case BRW_OPCODE_ADDC:
1773 assert(devinfo->gen >= 7);
1774 brw_ADDC(p, dst, src[0], src[1]);
1775 break;
1776 case BRW_OPCODE_SUBB:
1777 assert(devinfo->gen >= 7);
1778 brw_SUBB(p, dst, src[0], src[1]);
1779 break;
1780 case BRW_OPCODE_MAC:
1781 brw_MAC(p, dst, src[0], src[1]);
1782 break;
1783
1784 case BRW_OPCODE_BFE:
1785 assert(devinfo->gen >= 7);
1786 brw_set_default_access_mode(p, BRW_ALIGN_16);
1787 brw_BFE(p, dst, src[0], src[1], src[2]);
1788 break;
1789
1790 case BRW_OPCODE_BFI1:
1791 assert(devinfo->gen >= 7);
1792 brw_BFI1(p, dst, src[0], src[1]);
1793 break;
1794 case BRW_OPCODE_BFI2:
1795 assert(devinfo->gen >= 7);
1796 brw_set_default_access_mode(p, BRW_ALIGN_16);
1797 brw_BFI2(p, dst, src[0], src[1], src[2]);
1798 break;
1799
1800 case BRW_OPCODE_IF:
1801 if (inst->src[0].file != BAD_FILE) {
1802 /* The instruction has an embedded compare (only allowed on gen6) */
1803 assert(devinfo->gen == 6);
1804 gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1805 } else {
1806 brw_IF(p, brw_inst_exec_size(devinfo, p->current));
1807 }
1808 break;
1809
1810 case BRW_OPCODE_ELSE:
1811 brw_ELSE(p);
1812 break;
1813 case BRW_OPCODE_ENDIF:
1814 brw_ENDIF(p);
1815 break;
1816
1817 case BRW_OPCODE_DO:
1818 brw_DO(p, brw_inst_exec_size(devinfo, p->current));
1819 break;
1820
1821 case BRW_OPCODE_BREAK:
1822 brw_BREAK(p);
1823 break;
1824 case BRW_OPCODE_CONTINUE:
1825 brw_CONT(p);
1826 break;
1827
1828 case BRW_OPCODE_WHILE:
1829 brw_WHILE(p);
1830 loop_count++;
1831 break;
1832
1833 case SHADER_OPCODE_RCP:
1834 case SHADER_OPCODE_RSQ:
1835 case SHADER_OPCODE_SQRT:
1836 case SHADER_OPCODE_EXP2:
1837 case SHADER_OPCODE_LOG2:
1838 case SHADER_OPCODE_SIN:
1839 case SHADER_OPCODE_COS:
1840 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
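/* On Gen6+ math is an ordinary instruction that takes its operand
 * directly; on older hardware it is a message sent to the shared
 * math function unit through the MRF.
 */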
1841 if (devinfo->gen >= 6) {
1842 assert(inst->mlen == 0);
1843 assert(devinfo->gen >= 7 || inst->exec_size == 8);
1844 gen6_math(p, dst, brw_math_function(inst->opcode),
1845 src[0], brw_null_reg());
1846 } else {
1847 assert(inst->mlen >= 1);
1848 assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
1849 gen4_math(p, dst,
1850 brw_math_function(inst->opcode),
1851 inst->base_mrf, src[0],
1852 BRW_MATH_PRECISION_FULL);
1853 }
1854 break;
1855 case SHADER_OPCODE_INT_QUOTIENT:
1856 case SHADER_OPCODE_INT_REMAINDER:
1857 case SHADER_OPCODE_POW:
1858 assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
1859 if (devinfo->gen >= 6) {
1860 assert(inst->mlen == 0);
1861 assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
1862 inst->exec_size == 8);
1863 gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
1864 } else {
1865 assert(inst->mlen >= 1);
1866 assert(inst->exec_size == 8);
1867 gen4_math(p, dst, brw_math_function(inst->opcode),
1868 inst->base_mrf, src[0],
1869 BRW_MATH_PRECISION_FULL);
1870 }
1871 break;
1872 case FS_OPCODE_CINTERP:
1873 brw_MOV(p, dst, src[0]);
1874 break;
1875 case FS_OPCODE_LINTERP:
1876 generate_linterp(inst, dst, src);
1877 break;
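/* The PS thread payload interleaves the per-pixel X and Y
 * coordinates as UW values: four X values followed by four Y
 * values for each 2x2 subspan. A <8;4,1> region starting at the
 * right subregister picks out just the X or just the Y halves.
 */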
1878 case FS_OPCODE_PIXEL_X:
1879 assert(src[0].type == BRW_REGISTER_TYPE_UW);
1880 src[0].subnr = 0 * type_sz(src[0].type);
1881 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1882 break;
1883 case FS_OPCODE_PIXEL_Y:
1884 assert(src[0].type == BRW_REGISTER_TYPE_UW);
1885 src[0].subnr = 4 * type_sz(src[0].type);
1886 brw_MOV(p, dst, stride(src[0], 8, 4, 1));
1887 break;
1888 case FS_OPCODE_GET_BUFFER_SIZE:
1889 generate_get_buffer_size(inst, dst, src[0], src[1]);
1890 break;
1891 case SHADER_OPCODE_TEX:
1892 case FS_OPCODE_TXB:
1893 case SHADER_OPCODE_TXD:
1894 case SHADER_OPCODE_TXF:
1895 case SHADER_OPCODE_TXF_LZ:
1896 case SHADER_OPCODE_TXF_CMS:
1897 case SHADER_OPCODE_TXF_CMS_W:
1898 case SHADER_OPCODE_TXF_UMS:
1899 case SHADER_OPCODE_TXF_MCS:
1900 case SHADER_OPCODE_TXL:
1901 case SHADER_OPCODE_TXL_LZ:
1902 case SHADER_OPCODE_TXS:
1903 case SHADER_OPCODE_LOD:
1904 case SHADER_OPCODE_TG4:
1905 case SHADER_OPCODE_TG4_OFFSET:
1906 case SHADER_OPCODE_SAMPLEINFO:
1907 generate_tex(inst, dst, src[0], src[1], src[2]);
1908 break;
1909 case FS_OPCODE_DDX_COARSE:
1910 case FS_OPCODE_DDX_FINE:
1911 generate_ddx(inst->opcode, dst, src[0]);
1912 break;
1913 case FS_OPCODE_DDY_COARSE:
1914 case FS_OPCODE_DDY_FINE:
1915 generate_ddy(inst->opcode, dst, src[0]);
1916 break;
1917
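/* Scratch reads and writes are emitted by the register allocator
 * for spills and fills; count them for the statistics reported
 * below.
 */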
1918 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1919 generate_scratch_write(inst, src[0]);
1920 spill_count++;
1921 break;
1922
1923 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1924 generate_scratch_read(inst, dst);
1925 fill_count++;
1926 break;
1927
1928 case SHADER_OPCODE_GEN7_SCRATCH_READ:
1929 generate_scratch_read_gen7(inst, dst);
1930 fill_count++;
1931 break;
1932
1933 case SHADER_OPCODE_MOV_INDIRECT:
1934 generate_mov_indirect(inst, dst, src[0], src[1]);
1935 break;
1936
1937 case SHADER_OPCODE_URB_READ_SIMD8:
1938 case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
1939 generate_urb_read(inst, dst, src[0]);
1940 break;
1941
1942 case SHADER_OPCODE_URB_WRITE_SIMD8:
1943 case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
1944 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
1945 case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
1946 generate_urb_write(inst, src[0]);
1947 break;
1948
1949 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1950 assert(inst->force_writemask_all);
1951 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1952 break;
1953
1954 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1955 assert(inst->force_writemask_all);
1956 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1957 break;
1958
1959 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
1960 generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
1961 break;
1962
1963 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1964 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1965 break;
1966
1967 case FS_OPCODE_REP_FB_WRITE:
1968 case FS_OPCODE_FB_WRITE:
1969 generate_fb_write(inst, src[0]);
1970 break;
1971
1972 case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1973 generate_mov_dispatch_to_flags(inst);
1974 break;
1975
1976 case FS_OPCODE_DISCARD_JUMP:
1977 generate_discard_jump(inst);
1978 break;
1979
1980 case SHADER_OPCODE_SHADER_TIME_ADD:
1981 generate_shader_time_add(inst, src[0], src[1], src[2]);
1982 break;
1983
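/* For the untyped and typed surface messages below, src[0] is the
 * message payload, src[1] the surface index, and src[2] an
 * immediate holding either the atomic operation or the number of
 * channels accessed.
 */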
1984 case SHADER_OPCODE_UNTYPED_ATOMIC:
1985 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1986 brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
1987 inst->mlen, !inst->dst.is_null());
1988 break;
1989
1990 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1991 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1992 brw_untyped_surface_read(p, dst, src[0], src[1],
1993 inst->mlen, src[2].ud);
1994 break;
1995
1996 case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
1997 assert(src[2].file == BRW_IMMEDIATE_VALUE);
1998 brw_untyped_surface_write(p, src[0], src[1],
1999 inst->mlen, src[2].ud);
2000 break;
2001
2002 case SHADER_OPCODE_TYPED_ATOMIC:
2003 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2004 brw_typed_atomic(p, dst, src[0], src[1],
2005 src[2].ud, inst->mlen, !inst->dst.is_null());
2006 break;
2007
2008 case SHADER_OPCODE_TYPED_SURFACE_READ:
2009 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2010 brw_typed_surface_read(p, dst, src[0], src[1],
2011 inst->mlen, src[2].ud);
2012 break;
2013
2014 case SHADER_OPCODE_TYPED_SURFACE_WRITE:
2015 assert(src[2].file == BRW_IMMEDIATE_VALUE);
2016 brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
2017 break;
2018
2019 case SHADER_OPCODE_MEMORY_FENCE:
2020 brw_memory_fence(p, dst);
2021 break;
2022
2023 case FS_OPCODE_SET_SIMD4X2_OFFSET:
2024 generate_set_simd4x2_offset(inst, dst, src[0]);
2025 break;
2026
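/* FIND_LIVE_CHANNEL writes the index of the lowest enabled channel
 * of the current execution mask into dst.
 */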
2027 case SHADER_OPCODE_FIND_LIVE_CHANNEL:
2028 brw_find_live_channel(p, dst);
2029 break;
2030
2031 case SHADER_OPCODE_BROADCAST:
2032 assert(inst->force_writemask_all);
2033 brw_broadcast(p, dst, src[0], src[1]);
2034 break;
2035
2036 case FS_OPCODE_SET_SAMPLE_ID:
2037 generate_set_sample_id(inst, dst, src[0], src[1]);
2038 break;
2039
2040 case FS_OPCODE_PACK_HALF_2x16_SPLIT:
2041 generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
2042 break;
2043
2044 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
2045 case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
2046 generate_unpack_half_2x16_split(inst, dst, src[0]);
2047 break;
2048
2049 case FS_OPCODE_PLACEHOLDER_HALT:
2050 /* This is the place where the final HALT needs to be inserted if
2051 * we've emitted any discards. If we haven't, this emits no code.
2052 */
2053 if (!patch_discard_jumps_to_fb_writes()) {
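/* No HALT was emitted, so drop the annotation entry that was
 * reserved for this IR instruction.
 */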
2054 if (unlikely(debug_flag)) {
2055 annotation.ann_count--;
2056 }
2057 }
2058 break;
2059
2060 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
2061 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2062 GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
2063 break;
2064
2065 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
2066 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2067 GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
2068 break;
2069
2070 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
2071 generate_pixel_interpolator_query(inst, dst, src[0], src[1],
2072 GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
2073 break;
2074
2075 case CS_OPCODE_CS_TERMINATE:
2076 generate_cs_terminate(inst, src[0]);
2077 break;
2078
2079 case SHADER_OPCODE_BARRIER:
2080 generate_barrier(inst, src[0]);
2081 break;
2082
2083 case BRW_OPCODE_DIM:
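/* DIM is Haswell-only and is used here to load DF immediates,
 * which HSW cannot encode directly. The instruction expects its
 * source to be typed F even though the payload is really a DF
 * value, hence the retype below.
 */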
2084 assert(devinfo->is_haswell);
2085 assert(src[0].type == BRW_REGISTER_TYPE_DF);
2086 assert(dst.type == BRW_REGISTER_TYPE_DF);
2087 brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
2088 break;
2089
2090 default:
2091 unreachable("Unsupported opcode");
2092
2093 case SHADER_OPCODE_LOAD_PAYLOAD:
2094 unreachable("Should be lowered by lower_load_payload()");
2095 }
2096
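/* IR instructions that expanded to more than one native instruction
 * cannot use the single-instruction patching below.
 */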
2097 if (multiple_instructions_emitted)
2098 continue;
2099
2100 if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
2101 assert(p->next_insn_offset == last_insn_offset + 16 ||
2102 !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
2103 "emitting more than 1 instruction");
2104
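/* Native (uncompacted) instructions are 16 bytes, so the offset
 * divided by 16 indexes the instruction store.
 */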
2105 brw_inst *last = &p->store[last_insn_offset / 16];
2106
2107 if (inst->conditional_mod)
2108 brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
2109 brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
2110 brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
2111 }
2112 }
2113
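/* All instructions are emitted; resolve the JIP/UIP jump targets of
 * the flow control instructions.
 */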
2114 brw_set_uip_jip(p);
2115 annotation_finalize(&annotation, p->next_insn_offset);
2116
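/* Debug builds always validate the generated code; release builds
 * validate only when the debug flag is set, and the assert on
 * validated below compiles away under NDEBUG.
 */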
2117 #ifndef NDEBUG
2118 bool validated = brw_validate_instructions(p, start_offset, &annotation);
2119 #else
2120 if (unlikely(debug_flag))
2121 brw_validate_instructions(p, start_offset, &annotation);
2122 #endif
2123
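/* Compaction rewrites eligible instructions into 8-byte compacted
 * encodings; measure the size before and after to report the
 * savings.
 */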
2124 int before_size = p->next_insn_offset - start_offset;
2125 brw_compact_instructions(p, start_offset, annotation.ann_count,
2126 annotation.ann);
2127 int after_size = p->next_insn_offset - start_offset;
2128
2129 if (unlikely(debug_flag)) {
2130 fprintf(stderr, "Native code for %s\n"
2131 "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
2132 " bytes (%.0f%%)\n",
2133 shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
2134 spill_count, fill_count, promoted_constants, before_size, after_size,
2135 100.0f * (before_size - after_size) / before_size);
2136
2137 dump_assembly(p->store, annotation.ann_count, annotation.ann,
2138 p->devinfo);
2139 ralloc_free(annotation.mem_ctx);
2140 }
2141 assert(validated);
2142
2143 compiler->shader_debug_log(log_data,
2144 "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
2145 "%d:%d spills:fills, Promoted %u constants, "
2146 "compacted %d to %d bytes.",
2147 _mesa_shader_stage_to_abbrev(stage),
2148 dispatch_width, before_size / 16,
2149 loop_count, cfg->cycle_count, spill_count,
2150 fill_count, promoted_constants, before_size,
2151 after_size);
2152
2153 return start_offset;
2154 }
2155
2156 const unsigned *
2157 fs_generator::get_assembly(unsigned int *assembly_size)
2158 {
2159 return brw_get_program(p, assembly_size);
2160 }