/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs_generator.cpp
 *
 * This file supports generating code from the FS LIR to the actual
 * native instructions.
 */

#include "brw_eu.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_program.h"

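/**
 * Map an fs_reg's register file to the corresponding hardware register file
 * enum.
 */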
static enum brw_reg_file
brw_file_from_reg(fs_reg *reg)
{
   switch (reg->file) {
   case ARF:
      return BRW_ARCHITECTURE_REGISTER_FILE;
   case FIXED_GRF:
   case VGRF:
      return BRW_GENERAL_REGISTER_FILE;
   case MRF:
      return BRW_MESSAGE_REGISTER_FILE;
   case IMM:
      return BRW_IMMEDIATE_VALUE;
   case BAD_FILE:
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }
   return BRW_ARCHITECTURE_REGISTER_FILE;
}

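/**
 * Translate an fs_reg into a hardware brw_reg, computing a register region
 * that is legal for the instruction's execution size and compression state.
 */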
static struct brw_reg
brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed)
{
   struct brw_reg brw_reg;

   switch (reg->file) {
   case MRF:
      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
      /* Fallthrough */
   case VGRF:
      if (reg->stride == 0) {
         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
      } else {
         /* From the Haswell PRM:
          *
          *  "VertStride must be used to cross GRF register boundaries. This
          *   rule implies that elements within a 'Width' cannot cross GRF
          *   boundaries."
          *
          * The maximum width value that could satisfy this restriction is:
          */
         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));

         /* Because the hardware can only split source regions at a whole
          * multiple of width during decompression (i.e. vertically), clamp
          * the value obtained above to the physical execution size of a
          * single decompressed chunk of the instruction:
          */
         const unsigned phys_width = compressed ? inst->exec_size / 2 :
                                     inst->exec_size;

         /* XXX - The equation above is strictly speaking not correct on
          *       hardware that supports unbalanced GRF writes -- On Gen9+
          *       each decompressed chunk of the instruction may have a
          *       different execution size when the number of components
          *       written to each destination GRF is not the same.
          */
         const unsigned width = MIN2(reg_width, phys_width);
         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
      }

      brw_reg = retype(brw_reg, reg->type);
      brw_reg = byte_offset(brw_reg, reg->offset);
      brw_reg.abs = reg->abs;
      brw_reg.negate = reg->negate;
      break;
   case ARF:
   case FIXED_GRF:
   case IMM:
      assert(reg->offset == 0);
      brw_reg = reg->as_brw_reg();
      break;
   case BAD_FILE:
      /* Probably unused. */
      brw_reg = brw_null_reg();
      break;
   case ATTR:
   case UNIFORM:
      unreachable("not reached");
   }

   return brw_reg;
}

fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
                           void *mem_ctx,
                           const void *key,
                           struct brw_stage_prog_data *prog_data,
                           unsigned promoted_constants,
                           bool runtime_check_aads_emit,
                           gl_shader_stage stage)

   : compiler(compiler), log_data(log_data),
     devinfo(compiler->devinfo), key(key),
     prog_data(prog_data),
     promoted_constants(promoted_constants),
     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
     stage(stage), mem_ctx(mem_ctx)
{
   p = rzalloc(mem_ctx, struct brw_codegen);
   brw_init_codegen(devinfo, p, mem_ctx);
}

fs_generator::~fs_generator()
{
}

class ip_record : public exec_node {
public:
   DECLARE_RALLOC_CXX_OPERATORS(ip_record)

   ip_record(int ip)
   {
      this->ip = ip;
   }

   int ip;
};

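/**
 * Patch the UIP of every HALT recorded by generate_discard_jump() so that
 * all channels halt to a final HALT emitted here, right before the FB
 * write.  Returns true if any patching was done.
 */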
bool
fs_generator::patch_discard_jumps_to_fb_writes()
{
   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
      return false;

   int scale = brw_jump_scale(p->devinfo);

   /* There is a somewhat strange undocumented requirement of using
    * HALT, according to the simulator.  If some channel has HALTed to
    * a particular UIP, then by the end of the program, every channel
    * must have HALTed to that UIP.  Furthermore, the tracking is a
    * stack, so you can't do the final halt of a UIP after starting
    * halting to a new UIP.
    *
    * Symptoms of not emitting this instruction on actual hardware
    * included GPU hangs and sparkly rendering on the piglit discard
    * tests.
    */
   brw_inst *last_halt = gen6_HALT(p);
   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);

   int ip = p->nr_insn;

   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
      brw_inst *patch = &p->store[patch_ip->ip];

      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
   }

   this->discard_halt_patches.make_empty();
   return true;
}

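/**
 * Emit a single render target write message, picking the message control
 * from the dispatch width, dual-source blending and replicated-data state.
 */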
void
fs_generator::fire_fb_write(fs_inst *inst,
                            struct brw_reg payload,
                            struct brw_reg implied_header,
                            GLuint nr)
{
   uint32_t msg_control;

   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   if (devinfo->gen < 6) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
      brw_pop_insn_state(p);
   }

   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
   else if (prog_data->dual_src_blend) {
      if (!inst->group)
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
      else
         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
   } else if (inst->exec_size == 16)
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
   else
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;

   uint32_t surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   bool last_render_target = inst->eot ||
                             (prog_data->dual_src_blend && dispatch_width == 16);


   brw_fb_WRITE(p,
                payload,
                implied_header,
                msg_control,
                surf_index,
                nr,
                0,
                inst->eot,
                last_render_target,
                inst->header_size != 0);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

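/**
 * Generate the framebuffer write, including the message header setup and,
 * on pre-gen6 hardware, the runtime check for antialiasing data.
 */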
void
fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
   struct brw_reg implied_header;

   if (devinfo->gen < 8 && !devinfo->is_haswell) {
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
   }

   if (inst->base_mrf >= 0)
      payload = brw_message_reg(inst->base_mrf);

   /* The header is 2 regs, g0 and g1 are the contents.  g0 will be the
    * implied move; here's g1.
    */
   if (inst->header_size != 0) {
      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_flag_reg(p, 0, 0);

      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
       * present.
       */
      if (prog_data->uses_kill) {
         struct brw_reg pixel_mask;

         if (devinfo->gen >= 6)
            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
         else
            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
      }

      if (devinfo->gen >= 6) {
         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_16);
         brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
         brw_MOV(p,
                 retype(payload, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
         brw_pop_insn_state(p);

         if (inst->target > 0 && key->replicate_alpha) {
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
             * header.
             */
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 11));
         }

         if (inst->target > 0) {
            /* Set the render target index for choosing BLEND_STATE. */
            brw_MOV(p, retype(vec1(suboffset(payload, 2)),
                              BRW_REGISTER_TYPE_UD),
                    brw_imm_ud(inst->target));
         }

         /* Set computes stencil to render target */
         if (prog_data->computed_stencil) {
            brw_OR(p,
                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
                   brw_imm_ud(0x1 << 14));
         }

         implied_header = brw_null_reg();
      } else {
         implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      }

      brw_pop_insn_state(p);
   } else {
      implied_header = brw_null_reg();
   }

   if (!runtime_check_aads_emit) {
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   } else {
      /* This can only happen in gen < 6 */
      assert(devinfo->gen < 6);

      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));

      /* Check runtime bit to detect if we have to send AA data or not */
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_AND(p,
              v1_null_ud,
              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(1<<26));
      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);

      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
      {
         /* Don't send AA data */
         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
      }
      brw_land_fwd_jump(p, jmp);
      fire_fb_write(inst, payload, implied_header, inst->mlen);
   }
}

void
fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
                               struct brw_reg payload)
{
   assert(inst->size_written % REG_SIZE == 0);
   brw_wm_prog_data *prog_data =
      reinterpret_cast<brw_wm_prog_data *>(this->prog_data);
   const unsigned surf_index =
      prog_data->binding_table.render_target_start + inst->target;

   gen9_fb_READ(p, dst, payload, surf_index,
                inst->header_size, inst->size_written / REG_SIZE,
                prog_data->persample_dispatch);

   brw_mark_surface_used(&prog_data->base, surf_index);
}

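/**
 * Generate a MOV from an indirectly addressed source: fold an immediate
 * byte offset directly into the region, or set up the address register and
 * use VxH indirect addressing.
 */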
void
fs_generator::generate_mov_indirect(fs_inst *inst,
                                    struct brw_reg dst,
                                    struct brw_reg reg,
                                    struct brw_reg indirect_byte_offset)
{
   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);

   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;

   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
      imm_byte_offset += indirect_byte_offset.ud;

      reg.nr = imm_byte_offset / REG_SIZE;
      reg.subnr = imm_byte_offset % REG_SIZE;
      brw_MOV(p, dst, reg);
   } else {
      /* Prior to Broadwell, there are only 8 address registers. */
      assert(inst->exec_size == 8 || devinfo->gen >= 8);

      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
      struct brw_reg addr = vec8(brw_address_reg(0));

      /* The destination stride of an instruction (in bytes) must be greater
       * than or equal to the size of the rest of the instruction.  Since the
       * address register is of type UW, we can't use a D-type instruction.
       * In order to get around this, we retype to UW and use a stride.
       */
      indirect_byte_offset =
         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);

      struct brw_reg ind_src;
      if (devinfo->gen < 8) {
         /* From the Haswell PRM section "Register Region Restrictions":
          *
          *    "The lower bits of the AddressImmediate must not overflow to
          *    change the register address.  The lower 5 bits of Address
          *    Immediate when added to lower 5 bits of address register gives
          *    the sub-register offset. The upper bits of Address Immediate
          *    when added to upper bits of address register gives the register
          *    address. Any overflow from sub-register offset is dropped."
          *
          * This restriction is only listed in the Haswell PRM but empirical
          * testing indicates that it applies on all older generations and is
          * lifted on Broadwell.
          *
          * Since the indirect may cause us to cross a register boundary, this
          * makes the base offset almost useless.  We could try and do
          * something clever where we use an actual base offset if
          * base_offset % 32 == 0 but that would mean we were generating
          * different code depending on the base offset.  Instead, for the
          * sake of consistency, we'll just do the add ourselves.
          */
         brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
         ind_src = brw_VxH_indirect(0, 0);
      } else {
         brw_MOV(p, addr, indirect_byte_offset);
         ind_src = brw_VxH_indirect(0, imm_byte_offset);
      }

      brw_inst *mov = brw_MOV(p, dst, retype(ind_src, dst.type));

      if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
          !inst->get_next()->is_tail_sentinel() &&
          ((fs_inst *)inst->get_next())->mlen > 0) {
         /* From the Sandybridge PRM:
          *
          *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
          *    instruction that “indexed/indirect” source AND is followed by a
          *    send, the instruction requires a “Switch”. This is to avoid
          *    race condition where send may dispatch before MRF is updated."
          */
         brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
      }
   }
}

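/**
 * Generate a SIMD8 URB read message.
 */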
void
fs_generator::generate_urb_read(fs_inst *inst,
                                struct brw_reg dst,
                                struct brw_reg header)
{
   assert(inst->size_written % REG_SIZE == 0);
   assert(header.file == BRW_GENERAL_REGISTER_FILE);
   assert(header.type == BRW_REGISTER_TYPE_UD);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, send, header);
   brw_set_src1(p, send, brw_imm_ud(0u));

   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);

   if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);

   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
   brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
   brw_inst_set_header_present(p->devinfo, send, true);
   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
}

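/**
 * Generate a SIMD8 URB write message, enabling the per-slot offset and
 * channel mask bits as the opcode requires.
 */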
void
fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
{
   brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, brw_null_reg());
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
   brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);

   if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
       inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
      brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);

   brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
   brw_inst_set_rlen(p->devinfo, insn, 0);
   brw_inst_set_eot(p->devinfo, insn, inst->eot);
   brw_inst_set_header_present(p->devinfo, insn, true);
   brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
}

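/**
 * Generate the thread spawner message that terminates a compute shader
 * thread.
 */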
void
fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
{
   struct brw_inst *insn;

   insn = brw_next_insn(p, BRW_OPCODE_SEND);

   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, insn, payload);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Terminate a compute shader by sending a message to the thread spawner.
    */
   brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
   brw_inst_set_mlen(devinfo, insn, 1);
   brw_inst_set_rlen(devinfo, insn, 0);
   brw_inst_set_eot(devinfo, insn, inst->eot);
   brw_inst_set_header_present(devinfo, insn, false);

   brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
   brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */

   /* Note that even though the thread has a URB resource associated with it,
    * we set the "do not dereference URB" bit, because the URB resource is
    * managed by the fixed-function unit, so it will free it automatically.
    */
   brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */

   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
}

void
fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
{
   brw_barrier(p, src);
   brw_WAIT(p);
}

void
fs_generator::generate_linterp(fs_inst *inst,
                               struct brw_reg dst, struct brw_reg *src)
{
   /* PLN reads:
    *                      /   in SIMD16   \
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
    *    -----------------------------------
    *
    * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
    *
    *    -----------------------------------
    *   | src1+0 | src1+1 | src1+2 | src1+3 |
    *   |-----------------------------------|
    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
    *   |-----------------------------------|
    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
    *    -----------------------------------
    *
    * See also: emit_interpolation_setup_gen4().
    */
   struct brw_reg delta_x = src[0];
   struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
   struct brw_reg interp = src[1];

   if (devinfo->has_pln &&
       (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
      brw_PLN(p, dst, interp, delta_x);
   } else {
      brw_LINE(p, brw_null_reg(), interp, delta_x);
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
   }
}

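/**
 * Generate a resinfo sampler message to query the buffer size.
 */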
void
fs_generator::generate_get_buffer_size(fs_inst *inst,
                                       struct brw_reg dst,
                                       struct brw_reg src,
                                       struct brw_reg surf_index)
{
   assert(devinfo->gen >= 7);
   assert(surf_index.file == BRW_IMMEDIATE_VALUE);

   uint32_t simd_mode;
   int rlen = 4;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      rlen = 8;
      dst = vec16(dst);
   }

   brw_SAMPLE(p,
              retype(dst, BRW_REGISTER_TYPE_UW),
              inst->base_mrf,
              src,
              surf_index.ud,
              0,
              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
              rlen, /* response length */
              inst->mlen,
              inst->header_size > 0,
              simd_mode,
              BRW_SAMPLER_RETURN_FORMAT_SINT32);

   brw_mark_surface_used(prog_data, surf_index.ud);
}

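/**
 * Generate a sampler message for a texturing opcode, selecting the message
 * type based on the opcode and hardware generation and building the message
 * header when one is needed.
 */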
void
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
                           struct brw_reg surface_index,
                           struct brw_reg sampler_index)
{
   assert(inst->size_written % REG_SIZE == 0);
   int msg_type = -1;
   uint32_t simd_mode;
   uint32_t return_format;
   bool is_combined_send = inst->eot;

   switch (dst.type) {
   case BRW_REGISTER_TYPE_D:
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
      break;
   case BRW_REGISTER_TYPE_UD:
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
      break;
   default:
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
      break;
   }

   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
    * is set as part of the message descriptor.  On gen4, the PRM seems to
    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
    * gone from the message descriptor entirely and you just get UINT32 all
    * the time regardless.  Since we can really only do non-UINT32 on gen4,
    * just stomp it to UINT32 all the time.
    */
   if (inst->opcode == SHADER_OPCODE_TXS)
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;

   switch (inst->exec_size) {
   case 8:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      break;
   case 16:
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      break;
   default:
      unreachable("Invalid width for texture instruction");
   }

   if (devinfo->gen >= 5) {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
         }
         break;
      case SHADER_OPCODE_TXL_LZ:
         assert(devinfo->gen >= 9);
         if (inst->shadow_compare) {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
         } else {
            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
         }
         break;
      case SHADER_OPCODE_TXS:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
         break;
      case SHADER_OPCODE_TXD:
         if (inst->shadow_compare) {
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
            assert(devinfo->gen >= 8 || devinfo->is_haswell);
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
         } else {
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
         }
         break;
      case SHADER_OPCODE_TXF:
         msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_LZ:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
         break;
      case SHADER_OPCODE_TXF_CMS_W:
         assert(devinfo->gen >= 9);
         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
         break;
      case SHADER_OPCODE_TXF_CMS:
         if (devinfo->gen >= 7)
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
         else
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
         break;
      case SHADER_OPCODE_TXF_UMS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
         break;
      case SHADER_OPCODE_TXF_MCS:
         assert(devinfo->gen >= 7);
         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
         break;
      case SHADER_OPCODE_LOD:
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
         break;
      case SHADER_OPCODE_TG4:
         if (inst->shadow_compare) {
            assert(devinfo->gen >= 7);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
         } else {
            assert(devinfo->gen >= 6);
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
         }
         break;
      case SHADER_OPCODE_TG4_OFFSET:
         assert(devinfo->gen >= 7);
         if (inst->shadow_compare) {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
         } else {
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
         }
         break;
      case SHADER_OPCODE_SAMPLEINFO:
         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
         break;
      default:
         unreachable("not reached");
      }
   } else {
      switch (inst->opcode) {
      case SHADER_OPCODE_TEX:
         /* Note that G45 and older determine shadow compare and dispatch
          * width from message length for most messages.
          */
         if (inst->exec_size == 8) {
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
            if (inst->shadow_compare) {
               assert(inst->mlen == 6);
            } else {
               assert(inst->mlen <= 4);
            }
         } else {
            if (inst->shadow_compare) {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
               assert(inst->mlen == 9);
            } else {
               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
            }
         }
         break;
      case FS_OPCODE_TXB:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXL:
         if (inst->shadow_compare) {
            assert(inst->exec_size == 8);
            assert(inst->mlen == 6);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
         } else {
            assert(inst->mlen == 9);
            msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
            simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         }
         break;
      case SHADER_OPCODE_TXD:
         /* There is no sample_d_c message; comparisons are done manually */
         assert(inst->exec_size == 8);
         assert(inst->mlen == 7 || inst->mlen == 10);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
         break;
      case SHADER_OPCODE_TXF:
         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      case SHADER_OPCODE_TXS:
         assert(inst->mlen == 3);
         msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
         simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
         break;
      default:
         unreachable("not reached");
      }
   }
   assert(msg_type != -1);

   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
      dst = vec16(dst);
   }

   assert(devinfo->gen < 7 || inst->header_size == 0 ||
          src.file == BRW_GENERAL_REGISTER_FILE);

   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);

   /* Load the message header if present.  If there's a texture offset,
    * we need to set it up explicitly and load the offset bitfield.
    * Otherwise, we can use an implied move from g0 to the first message reg.
    */
   if (inst->header_size != 0) {
      if (devinfo->gen < 6 && !inst->offset) {
         /* Set up an implied move from g0 to the MRF. */
         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
      } else {
         struct brw_reg header_reg;

         if (devinfo->gen >= 7) {
            header_reg = src;
         } else {
            assert(inst->base_mrf != -1);
            header_reg = brw_message_reg(inst->base_mrf);
         }

         brw_push_insn_state(p);
         brw_set_default_exec_size(p, BRW_EXECUTE_8);
         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
         /* Explicitly set up the message header by copying g0 to the MRF. */
         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));

         if (inst->offset) {
            /* Set the offset bits in DWord 2. */
            brw_MOV(p, get_element_ud(header_reg, 2),
                    brw_imm_ud(inst->offset));
         } else if (stage != MESA_SHADER_VERTEX &&
                    stage != MESA_SHADER_FRAGMENT) {
            /* The vertex and fragment stages have g0.2 set to 0, so
             * header0.2 is 0 when g0 is copied.  Other stages may not, so we
             * must set it to 0 to avoid setting undesirable bits in the
             * message.
             */
            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
         }

         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
         brw_pop_insn_state(p);
      }
   }

   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
         ? prog_data->binding_table.gather_texture_start
         : prog_data->binding_table.texture_start;

   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
       sampler_index.file == BRW_IMMEDIATE_VALUE) {
      uint32_t surface = surface_index.ud;
      uint32_t sampler = sampler_index.ud;

      brw_SAMPLE(p,
                 retype(dst, BRW_REGISTER_TYPE_UW),
                 inst->base_mrf,
                 src,
                 surface + base_binding_table_index,
                 sampler % 16,
                 msg_type,
                 inst->size_written / REG_SIZE,
                 inst->mlen,
                 inst->header_size != 0,
                 simd_mode,
                 return_format);

      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
   } else {
      /* Non-const sampler index */

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
      } else {
         brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
         brw_OR(p, addr, addr, surface_reg);
      }
      if (base_binding_table_index)
         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
      brw_AND(p, addr, addr, brw_imm_ud(0xfff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              msg_type,
                              inst->size_written / REG_SIZE,
                              inst->mlen /* mlen */,
                              inst->header_size != 0 /* header */,
                              simd_mode,
                              return_format);

      /* The visitor knows more than we do about the surface limit required,
       * so it has already done the marking.
       */
   }

   if (is_combined_send) {
      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
   }
}


/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
 * looking like:
 *
 *  arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
 *
 * Ideally, we want to produce:
 *
 *            DDX                    DDY
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
 *
 * and add another set of two more subspans if in 16-pixel dispatch mode.
 *
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
 * pair.  But this ideal approximation may impose a huge performance cost on
 * sample_d: on at least Haswell, the sample_d instruction does some
 * optimizations if the same LOD is used for all pixels in the subspan.
 *
 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
 * appropriate swizzling.
 */
void
fs_generator::generate_ddx(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   unsigned vstride, width;

   if (opcode == FS_OPCODE_DDX_FINE) {
      /* produce accurate derivatives */
      vstride = BRW_VERTICAL_STRIDE_2;
      width = BRW_WIDTH_2;
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      vstride = BRW_VERTICAL_STRIDE_4;
      width = BRW_WIDTH_4;
   }

   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                 src.negate, src.abs,
                                 BRW_REGISTER_TYPE_F,
                                 vstride,
                                 width,
                                 BRW_HORIZONTAL_STRIDE_0,
                                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
   brw_ADD(p, dst, src0, negate(src1));
}

/* The negate_value boolean is used to negate the derivative computation for
 * FBOs, since they place the origin at the upper left instead of the lower
 * left.
 */
void
fs_generator::generate_ddy(enum opcode opcode,
                           struct brw_reg dst, struct brw_reg src)
{
   if (opcode == FS_OPCODE_DDY_FINE) {
      /* produce accurate derivatives */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_1,
                                    BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
      brw_push_insn_state(p);
      brw_set_default_access_mode(p, BRW_ALIGN_16);
      brw_ADD(p, dst, negate(src0), src1);
      brw_pop_insn_state(p);
   } else {
      /* replicate the derivative at the top-left pixel to other pixels */
      struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
                                    src.negate, src.abs,
                                    BRW_REGISTER_TYPE_F,
                                    BRW_VERTICAL_STRIDE_4,
                                    BRW_WIDTH_4,
                                    BRW_HORIZONTAL_STRIDE_0,
                                    BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
      brw_ADD(p, dst, negate(src0), src1);
   }
}

void
fs_generator::generate_discard_jump(fs_inst *inst)
{
   assert(devinfo->gen >= 6);

   /* This HALT will be patched up at FB write time to point UIP at the end of
    * the program, and at brw_uip_jip() JIP will be set to the end of the
    * current block (or the program).
    */
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
   gen6_HALT(p);
}

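/**
 * Generate an OWord block write to scratch space, splitting the payload
 * into chunks of at most 16 channels.
 */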
void
fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
{
   /* The 32-wide messages only respect the first 16-wide half of the channel
    * enable signals which are replicated identically for the second group of
    * 16 channels, so we cannot use them unless the write is marked
    * force_writemask_all.
    */
   const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
                               MIN2(16, inst->exec_size);
   const unsigned block_size = 4 * lower_size / REG_SIZE;
   assert(inst->mlen != 0);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, cvt(lower_size) - 1);
   brw_set_default_compression(p, lower_size > 8);

   for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
      brw_set_default_group(p, inst->group + lower_size * i);

      brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
              retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));

      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
                                    block_size,
                                    inst->offset + block_size * REG_SIZE * i);
   }

   brw_pop_insn_state(p);
}

void
fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);
   assert(inst->mlen != 0);

   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
                                inst->exec_size / 8, inst->offset);
}

void
fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
{
   assert(inst->exec_size <= 16 || inst->force_writemask_all);

   gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
}

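/**
 * Generate an OWord block read from a constant buffer at an immediate
 * offset (a uniform pull constant load).
 */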
void
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
                                                  struct brw_reg dst,
                                                  struct brw_reg index,
                                                  struct brw_reg offset)
{
   assert(inst->mlen != 0);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   assert(offset.file == BRW_IMMEDIATE_VALUE &&
          offset.type == BRW_REGISTER_TYPE_UD);
   uint32_t read_offset = offset.ud;

   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
                        read_offset, surf_index);
}

void
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(index.type == BRW_REGISTER_TYPE_UD);

   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
   /* Reference just the dword we need, to avoid angering validate_reg(). */
   offset = brw_vec1_grf(offset.nr, 0);

   /* We use the SIMD4x2 mode because we want to end up with 4 components in
    * the destination loaded consecutively from the same offset (which appears
    * in the first component, and the rest are ignored).
    */
   dst.width = BRW_WIDTH_4;

   struct brw_reg src = offset;
   bool header_present = false;

   if (devinfo->gen >= 9) {
      /* Skylake requires a message header in order to use SIMD4x2 mode. */
      src = retype(brw_vec4_grf(offset.nr, 0), BRW_REGISTER_TYPE_UD);
      header_present = true;

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_MOV(p, vec8(src), retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      brw_MOV(p, get_element_ud(src, 2),
              brw_imm_ud(GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2));
      brw_pop_insn_state(p);
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.ud;

      brw_push_insn_state(p);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_inst_set_exec_size(devinfo, send, BRW_EXECUTE_4);
      brw_pop_insn_state(p);

      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, send, src);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);
   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      /* dst = send(payload, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, dst, src, addr);
      brw_set_sampler_message(p, insn,
                              0,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              1, /* rlen */
                              inst->mlen,
                              header_present,
                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
                              0);

      brw_pop_insn_state(p);
   }
}

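/**
 * Generate a pre-gen7 varying-offset pull constant load using a sampler LD
 * message.
 */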
void
fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index)
{
   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
   assert(inst->header_size != 0);
   assert(inst->mlen);

   assert(index.file == BRW_IMMEDIATE_VALUE &&
          index.type == BRW_REGISTER_TYPE_UD);
   uint32_t surf_index = index.ud;

   uint32_t simd_mode, rlen, msg_type;
   if (inst->exec_size == 16) {
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
      rlen = 8;
   } else {
      assert(inst->exec_size == 8);
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
      rlen = 4;
   }

   if (devinfo->gen >= 5)
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
   else {
      /* We always use the SIMD16 message so that we only have to load U, and
       * not V or R.
       */
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
      assert(inst->mlen == 3);
      assert(inst->size_written == 8 * REG_SIZE);
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   }

   struct brw_reg header = brw_vec8_grf(0, 0);
   gen6_resolve_implied_move(p, &header, inst->base_mrf);

   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_inst_set_compression(devinfo, send, false);
   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
   brw_set_src0(p, send, header);
   if (devinfo->gen < 6)
      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);

   /* Our surface is set up as floats, regardless of what actual data is
    * stored in it.
    */
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
   brw_set_sampler_message(p, send,
                           surf_index,
                           0, /* sampler (unused) */
                           msg_type,
                           rlen,
                           inst->mlen,
                           inst->header_size != 0,
                           simd_mode,
                           return_format);
}

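/**
 * Generate a gen7+ varying-offset pull constant load using a headerless
 * sampler LD message.
 */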
void
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
                                                       struct brw_reg dst,
                                                       struct brw_reg index,
                                                       struct brw_reg offset)
{
   assert(devinfo->gen >= 7);
   /* Varying-offset pull constant loads are treated as a normal expression on
    * gen7, so the fact that it's a send message is hidden at the IR level.
    */
   assert(inst->header_size == 0);
   assert(!inst->mlen);
   assert(index.type == BRW_REGISTER_TYPE_UD);

   uint32_t simd_mode, rlen, mlen;
   if (inst->exec_size == 16) {
      mlen = 2;
      rlen = 8;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
   } else {
      assert(inst->exec_size == 8);
      mlen = 1;
      rlen = 4;
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
   }

   if (index.file == BRW_IMMEDIATE_VALUE) {

      uint32_t surf_index = index.ud;

      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
      brw_set_src0(p, send, offset);
      brw_set_sampler_message(p, send,
                              surf_index,
                              0, /* LD message ignores sampler unit */
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen,
                              mlen,
                              false, /* no header */
                              simd_mode,
                              0);

   } else {

      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));

      brw_push_insn_state(p);
      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
      brw_set_default_access_mode(p, BRW_ALIGN_1);

      /* a0.0 = surf_index & 0xff */
      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
      brw_set_dest(p, insn_and, addr);
      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));

      brw_pop_insn_state(p);

      /* dst = send(offset, a0.0 | <descriptor>) */
      brw_inst *insn = brw_send_indirect_message(
         p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
         offset, addr);
      brw_set_sampler_message(p, insn,
                              0 /* surface */,
                              0 /* sampler */,
                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
                              rlen /* rlen */,
                              mlen /* mlen */,
                              false /* header */,
                              simd_mode,
                              0);
   }
}

/**
 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
 * into the flags register (f0.0).
 *
 * Used only on Gen6 and above.
 */
void
fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
{
   struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
   struct brw_reg dispatch_mask;

   if (devinfo->gen >= 6)
      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
   else
      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);

   brw_push_insn_state(p);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, flags, dispatch_mask);
   brw_pop_insn_state(p);
}

void
fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
                                                struct brw_reg dst,
                                                struct brw_reg src,
                                                struct brw_reg msg_data,
                                                unsigned msg_type)
{
   assert(inst->size_written % REG_SIZE == 0);
   assert(msg_data.type == BRW_REGISTER_TYPE_UD);

   brw_pixel_interpolator_query(p,
                                retype(dst, BRW_REGISTER_TYPE_UW),
                                src,
                                inst->pi_noperspective,
                                msg_type,
                                msg_data,
                                inst->mlen,
                                inst->size_written / REG_SIZE);
}


/**
 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
 * sampler LD messages.
 *
 * We don't want to bake it into the send message's code generation because
 * that means we don't get a chance to schedule the instructions.
 */
void
fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
                                          struct brw_reg dst,
                                          struct brw_reg value)
{
   assert(value.file == BRW_IMMEDIATE_VALUE);

   brw_push_insn_state(p);
   brw_set_default_exec_size(p, BRW_EXECUTE_8);
   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
   brw_pop_insn_state(p);
}

/* Sets vstride=1, width=4, hstride=0 of register src1 during
 * the ADD instruction.
 */
void
fs_generator::generate_set_sample_id(fs_inst *inst,
                                     struct brw_reg dst,
                                     struct brw_reg src0,
                                     struct brw_reg src1)
{
   assert(dst.type == BRW_REGISTER_TYPE_D ||
          dst.type == BRW_REGISTER_TYPE_UD);
   assert(src0.type == BRW_REGISTER_TYPE_D ||
          src0.type == BRW_REGISTER_TYPE_UD);

   struct brw_reg reg = stride(src1, 1, 4, 0);
   if (devinfo->gen >= 8 || inst->exec_size == 8) {
      brw_ADD(p, dst, src0, reg);
   } else if (inst->exec_size == 16) {
      brw_push_insn_state(p);
      brw_set_default_exec_size(p, BRW_EXECUTE_8);
      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
      brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
      brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
      brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
      brw_pop_insn_state(p);
   }
}

void
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
                                            struct brw_reg dst,
                                            struct brw_reg x,
                                            struct brw_reg y)
{
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_UD);
   assert(x.type == BRW_REGISTER_TYPE_F);
   assert(y.type == BRW_REGISTER_TYPE_F);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the destination data type must be Word (W).
    *
    *   The destination must be DWord-aligned and specify a horizontal stride
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
    *   each destination channel and the upper word is not modified.
    */
   struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);

   /* Give each 32-bit channel of dst the form below, where "." means
    * unchanged.
    *   0x....hhhh
    */
   brw_F32TO16(p, dst_w, y);

   /* Now the form:
    *   0xhhhh0000
    */
   brw_SHL(p, dst, dst, brw_imm_ud(16u));

   /* And, finally the form of packHalf2x16's output:
    *   0xhhhhllll
    */
   brw_F32TO16(p, dst_w, x);
}

void
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
                                              struct brw_reg dst,
                                              struct brw_reg src)
{
   assert(devinfo->gen >= 7);
   assert(dst.type == BRW_REGISTER_TYPE_F);
   assert(src.type == BRW_REGISTER_TYPE_UD);

   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
    *
    *   Because this instruction does not have a 16-bit floating-point type,
    *   the source data type must be Word (W). The destination type must be
    *   F (Float).
    */
   struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);

   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
    * For the Y case, we wish to access only the upper word; therefore
    * a 16-bit subregister offset is needed.
    */
   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
      src_w.subnr += 2;

   brw_F16TO32(p, dst, src_w);
}

void
fs_generator::generate_shader_time_add(fs_inst *inst,
                                       struct brw_reg payload,
                                       struct brw_reg offset,
                                       struct brw_reg value)
{
   assert(devinfo->gen >= 7);
   brw_push_insn_state(p);
   brw_set_default_mask_control(p, true);

   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
                                          offset.type);
   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
                                         value.type);

   assert(offset.file == BRW_IMMEDIATE_VALUE);
   if (value.file == BRW_GENERAL_REGISTER_FILE) {
      value.width = BRW_WIDTH_1;
      value.hstride = BRW_HORIZONTAL_STRIDE_0;
      value.vstride = BRW_VERTICAL_STRIDE_0;
   } else {
      assert(value.file == BRW_IMMEDIATE_VALUE);
   }

   /* Trying to deal with setup of the params from the IR is crazy in the FS8
    * case, and we don't really care about squeezing every bit of performance
    * out of this path, so we just emit the MOVs from here.
    */
   brw_MOV(p, payload_offset, offset);
   brw_MOV(p, payload_value, value);
   brw_shader_time_add(p, payload,
                       prog_data->binding_table.shader_time_start);
   brw_pop_insn_state(p);

   brw_mark_surface_used(prog_data,
                         prog_data->binding_table.shader_time_start);
}

void
fs_generator::enable_debug(const char *shader_name)
{
   debug_flag = true;
   this->shader_name = shader_name;
}

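/**
 * Walk the control flow graph and emit native code for each instruction,
 * applying hardware workarounds and collecting statistics along the way.
 */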
1582 int
1583 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
1584 {
1585 /* align to 64 byte boundary. */
1586 while (p->next_insn_offset % 64)
1587 brw_NOP(p);
1588
1589 this->dispatch_width = dispatch_width;
1590
1591 int start_offset = p->next_insn_offset;
1592 int spill_count = 0, fill_count = 0;
1593 int loop_count = 0;
1594
1595 struct annotation_info annotation;
1596 memset(&annotation, 0, sizeof(annotation));
1597
1598 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1599 struct brw_reg src[3], dst;
1600 unsigned int last_insn_offset = p->next_insn_offset;
1601 bool multiple_instructions_emitted = false;
1602
1603 /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
1604 * "Register Region Restrictions" section: for BDW, SKL:
1605 *
1606 * "A POW/FDIV operation must not be followed by an instruction
1607 * that requires two destination registers."
1608 *
1609 * The documentation is often lacking annotations for Atom parts,
1610 * and empirically this affects CHV as well.
1611 */
1612 if (devinfo->gen >= 8 &&
1613 p->nr_insn > 1 &&
1614 brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
1615 brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
1616 inst->dst.component_size(inst->exec_size) > REG_SIZE) {
1617 brw_NOP(p);
1618 last_insn_offset = p->next_insn_offset;
1619 }
1620
1621 if (unlikely(debug_flag))
1622 annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
1623
1624 /* If the instruction writes to more than one register, it needs to be
1625 * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
1626 * hardware figures out by itself what the right compression mode is,
1627 * but we still need to know whether the instruction is compressed to
1628 * set up the source register regions appropriately.
1629 *
1630 * XXX - This is wrong for instructions that write a single register but
1631 * read more than one which should strictly speaking be treated as
1632 * compressed. For instructions that don't write any registers it
1633 * relies on the destination being a null register of the correct
1634 * type and regioning so the instruction is considered compressed
1635 * or not accordingly.
1636 */
1637 const bool compressed =
1638 inst->dst.component_size(inst->exec_size) > REG_SIZE;
1639 brw_set_default_compression(p, compressed);
1640 brw_set_default_group(p, inst->group);
1641
1642 for (unsigned int i = 0; i < inst->sources; i++) {
1643 src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
1644 compressed);
1645
1646 /* The accumulator result appears to get used for the
1647 * conditional modifier generation. When negating a UD
1648 * value, there is a 33rd bit generated for the sign in the
1649 * accumulator value, so now you can't check, for example,
1650 * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
1651 */
1652 assert(!inst->conditional_mod ||
1653 inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1654 !inst->src[i].negate);
1655 }
1656 dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);
1657
1658 brw_set_default_access_mode(p, BRW_ALIGN_1);
1659 brw_set_default_predicate_control(p, inst->predicate);
1660 brw_set_default_predicate_inverse(p, inst->predicate_inverse);
1661 brw_set_default_flag_reg(p, 0, inst->flag_subreg);
1662 brw_set_default_saturate(p, inst->saturate);
1663 brw_set_default_mask_control(p, inst->force_writemask_all);
1664 brw_set_default_acc_write_control(p, inst->writes_accumulator);
1665 brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
1666
1667 assert(inst->force_writemask_all || inst->exec_size >= 4);
1668 assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
1669 assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
1670 assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
1671
1672 switch (inst->opcode) {
1673 case BRW_OPCODE_MOV:
1674 brw_MOV(p, dst, src[0]);
1675 break;
1676 case BRW_OPCODE_ADD:
1677 brw_ADD(p, dst, src[0], src[1]);
1678 break;
1679 case BRW_OPCODE_MUL:
1680 brw_MUL(p, dst, src[0], src[1]);
1681 break;
1682 case BRW_OPCODE_AVG:
1683 brw_AVG(p, dst, src[0], src[1]);
1684 break;
1685 case BRW_OPCODE_MACH:
1686 brw_MACH(p, dst, src[0], src[1]);
1687 break;
1688
1689 case BRW_OPCODE_LINE:
1690 brw_LINE(p, dst, src[0], src[1]);
1691 break;
1692
1693 case BRW_OPCODE_MAD:
1694 assert(devinfo->gen >= 6);
1695 brw_set_default_access_mode(p, BRW_ALIGN_16);
1696 brw_MAD(p, dst, src[0], src[1], src[2]);
1697 break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_LRP(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
             * implemented in the compiler is not sufficient. Overriding the
             * type when the destination is the null register is necessary but
             * not sufficient by itself.
             */
            assert(dst.nr == BRW_ARF_NULL);
            dst.type = BRW_REGISTER_TYPE_D;
         }
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         /* BFREV only supports UD type for src and dst. */
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         /* FBH only supports UD type for dst. */
         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         /* FBL only supports UD type for dst. */
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         /* CBIT only supports UD type for dst. */
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(devinfo->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, brw_inst_exec_size(devinfo, p->current));
         }
         break;
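      /* On gen6 IF can embed the comparison of src[0] and src[1]
       * directly, saving a separate CMP; on the other generations the
       * branch is predicated on a flag register written by an earlier
       * CMP.
       */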

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, brw_inst_exec_size(devinfo, p->current));
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert(devinfo->gen >= 7 || inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode),
                      src[0], brw_null_reg());
         } else {
            assert(inst->mlen >= 1);
            assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
            gen4_math(p, dst,
                      brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
         }
         break;
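      /* Before gen6 the math unit is driven by a message whose payload
       * starts at base_mrf (hence mlen >= 1); from gen6 on, math is an
       * ordinary execution-unit instruction taking register sources, so
       * no message registers are involved.
       */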
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
                   inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else {
            assert(inst->mlen >= 1);
            assert(inst->exec_size == 8);
            gen4_math(p, dst, brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
         }
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
      case FS_OPCODE_PIXEL_X:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 0 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_PIXEL_Y:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 4 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
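      /* The payload appears to hold the pixel coordinates as interleaved
       * UW groups, four Xs followed by four Ys per subspan, so the
       * <8;4,1> region with a subregister offset of 0 or 4 elements
       * gathers all the Xs or all the Ys respectively.
       */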
      case FS_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(inst, dst, src[0], src[1]);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_LZ:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_CMS_W:
      case SHADER_OPCODE_TXF_UMS:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXL_LZ:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
      case SHADER_OPCODE_SAMPLEINFO:
         generate_tex(inst, dst, src[0], src[1], src[2]);
         break;
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst->opcode, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         generate_ddy(inst->opcode, dst, src[0]);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(inst, src[0]);
         spill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
         generate_scratch_read_gen7(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_URB_READ_SIMD8:
      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
         generate_urb_read(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
         generate_urb_write(inst, src[0]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
         generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst, src[0]);
         break;

      case FS_OPCODE_FB_READ:
         generate_fb_read(inst, dst, src[0]);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
                            inst->mlen, !inst->dst.is_null());
         break;
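      /* For these surface messages, src[0] is the message payload,
       * src[1] the surface index, and the immediate src[2] the atomic
       * opcode (or the channel count for the surface reads and writes
       * below); a response is only requested when the destination is a
       * real register, since atomics whose old value is unused can skip
       * the return message.
       */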

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1],
                                  inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1],
                                   inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1],
                          src[2].ud, inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1],
                                inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

      case FS_OPCODE_SET_SIMD4X2_OFFSET:
         generate_set_simd4x2_offset(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
                                          prog_data) ? brw_imm_ud(~0u) :
            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }
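      /* The channel-enable mask is the vector mask (VMask) for fragment
       * shaders and the dispatch mask (DMask) for the other stages;
       * stages guaranteed to use packed dispatch can treat every channel
       * as live and use an all-ones immediate instead.
       */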

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards.  If not, this will emit no code.
          */
         if (!patch_discard_jumps_to_fb_writes()) {
            if (unlikely(debug_flag)) {
               annotation.ann_count--;
            }
         }
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         break;

      case CS_OPCODE_CS_TERMINATE:
         generate_cs_terminate(inst, src[0]);
         break;

      case SHADER_OPCODE_BARRIER:
         generate_barrier(inst, src[0]);
         break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      default:
         unreachable("Unsupported opcode");

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      if (multiple_instructions_emitted)
         continue;

      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

         brw_inst *last = &p->store[last_insn_offset / 16];
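         /* Instructions are still uncompacted at this point, 16 bytes
          * each, so dividing the byte offset by 16 indexes the store
          * directly; the assert above relies on the same fact to check
          * that exactly one instruction was emitted.
          */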

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

   brw_set_uip_jip(p, start_offset);
   annotation_finalize(&annotation, p->next_insn_offset);

#ifndef NDEBUG
   bool validated = brw_validate_instructions(p, start_offset, &annotation);
#else
   if (unlikely(debug_flag))
      brw_validate_instructions(p, start_offset, &annotation);
#endif

   int before_size = p->next_insn_offset - start_offset;
   brw_compact_instructions(p, start_offset, annotation.ann_count,
                            annotation.ann);
   int after_size = p->next_insn_offset - start_offset;
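   /* before_size / 16 below gives the instruction count, since every
    * instruction was emitted at the full 16 bytes; compaction rewrites
    * eligible instructions into 8-byte compact forms, so the
    * before/after difference measures the space it saved.
    */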

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s\n"
              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
              " bytes (%.0f%%)\n",
              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, promoted_constants, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, annotation.ann_count, annotation.ann,
                    p->devinfo);
      ralloc_free(annotation.mem_ctx);
   }
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, Promoted %u constants, "
                              "compacted %d to %d bytes.",
                              _mesa_shader_stage_to_abbrev(stage),
                              dispatch_width, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, promoted_constants, before_size,
                              after_size);

   return start_offset;
}

const unsigned *
fs_generator::get_assembly(unsigned int *assembly_size)
{
   return brw_get_program(p, assembly_size);
}