i965: Move common code out of #ifdef
[mesa.git] / src / intel / compiler / brw_fs_generator.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs_generator.cpp
25 *
26 * This file supports generating code from the FS LIR to the actual
27 * native instructions.
28 */
29
30 #include "brw_eu.h"
31 #include "brw_fs.h"
32 #include "brw_cfg.h"
33
34 static enum brw_reg_file
35 brw_file_from_reg(fs_reg *reg)
36 {
37 switch (reg->file) {
38 case ARF:
39 return BRW_ARCHITECTURE_REGISTER_FILE;
40 case FIXED_GRF:
41 case VGRF:
42 return BRW_GENERAL_REGISTER_FILE;
43 case MRF:
44 return BRW_MESSAGE_REGISTER_FILE;
45 case IMM:
46 return BRW_IMMEDIATE_VALUE;
47 case BAD_FILE:
48 case ATTR:
49 case UNIFORM:
50 unreachable("not reached");
51 }
52 return BRW_ARCHITECTURE_REGISTER_FILE;
53 }
54
55 static struct brw_reg
56 brw_reg_from_fs_reg(const struct gen_device_info *devinfo, fs_inst *inst,
57 fs_reg *reg, bool compressed)
58 {
59 struct brw_reg brw_reg;
60
61 switch (reg->file) {
62 case MRF:
63 assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
64 /* Fallthrough */
65 case VGRF:
66 if (reg->stride == 0) {
67 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
68 } else {
69 /* From the Haswell PRM:
70 *
71 * "VertStride must be used to cross GRF register boundaries. This
72 * rule implies that elements within a 'Width' cannot cross GRF
73 * boundaries."
74 *
75 * The maximum width value that could satisfy this restriction is:
76 */
77 const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
78
79 /* Because the hardware can only split source regions at a whole
80 * multiple of width during decompression (i.e. vertically), clamp
81 * the value obtained above to the physical execution size of a
82 * single decompressed chunk of the instruction:
83 */
84 const unsigned phys_width = compressed ? inst->exec_size / 2 :
85 inst->exec_size;
86
87 /* XXX - The equation above is strictly speaking not correct on
88 * hardware that supports unbalanced GRF writes -- On Gen9+
89 * each decompressed chunk of the instruction may have a
90 * different execution size when the number of components
91 * written to each destination GRF is not the same.
92 */
93 const unsigned width = MIN2(reg_width, phys_width);
94 brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
95 brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
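         /* A quick worked example of the math above (illustrative, not from
          * the PRM): a compressed SIMD16 MOV of packed doubles (stride 1,
          * type_sz 8) gives reg_width = 32 / (1 * 8) = 4 and phys_width =
          * 16 / 2 = 8, so width = MIN2(4, 8) = 4 and the source region
          * becomes <4;4,1>, i.e. four doubles per row, one GRF per row.
          */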
96
97 if (devinfo->gen == 7 && !devinfo->is_haswell) {
98 /* From the IvyBridge PRM (EU Changes by Processor Generation, page 13):
99 * "Each DF (Double Float) operand uses an element size of 4 rather
100 * than 8 and all regioning parameters are twice what the values
101 * would be based on the true element size: ExecSize, Width,
102 * HorzStride, and VertStride. Each DF operand uses a pair of
103              * channels and all masking and swizzling should be adjusted
104 * appropriately."
105 *
106 * From the IvyBridge PRM (Special Requirements for Handling Double
107 * Precision Data Types, page 71):
108 * "In Align1 mode, all regioning parameters like stride, execution
109 * size, and width must use the syntax of a pair of packed
110 * floats. The offsets for these data types must be 64-bit
111 * aligned. The execution size and regioning parameters are in terms
112 * of floats."
113 *
114 * Summarized: when handling DF-typed arguments, ExecSize,
115 * VertStride, and Width must be doubled.
116 *
117 * It applies to BayTrail too.
118 */
119 if (type_sz(reg->type) == 8) {
120 brw_reg.width++;
121 if (brw_reg.vstride > 0)
122 brw_reg.vstride++;
123 assert(brw_reg.hstride == BRW_HORIZONTAL_STRIDE_1);
124 }
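               /* For example (illustrative): a <4;4,1> DF region programmed
                * for IVB/BYT becomes <8;8,1> here, since width and vstride
                * are log2-encoded and incrementing each field doubles its
                * value, expressing the region in packed-float terms.
                */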
125
126 /* When converting from DF->F, we set the destination stride to 2
127              * because each d2f conversion implicitly writes 2 floats, the
128              * first one being the converted value. IVB/BYT actually writes two
129 * F components per SIMD channel, and every other component is
130 * filled with garbage.
131 */
132 if (reg == &inst->dst && get_exec_type_size(inst) == 8 &&
133 type_sz(inst->dst.type) < 8) {
134 assert(brw_reg.hstride > BRW_HORIZONTAL_STRIDE_1);
135 brw_reg.hstride--;
136 }
137 }
138 }
139
140 brw_reg = retype(brw_reg, reg->type);
141 brw_reg = byte_offset(brw_reg, reg->offset);
142 brw_reg.abs = reg->abs;
143 brw_reg.negate = reg->negate;
144 break;
145 case ARF:
146 case FIXED_GRF:
147 case IMM:
148 assert(reg->offset == 0);
149 brw_reg = reg->as_brw_reg();
150 break;
151 case BAD_FILE:
152 /* Probably unused. */
153 brw_reg = brw_null_reg();
154 break;
155 case ATTR:
156 case UNIFORM:
157 unreachable("not reached");
158 }
159
160 /* On HSW+, scalar DF sources can be accessed using the normal <0,1,0>
161 * region, but on IVB and BYT DF regions must be programmed in terms of
162 * floats. A <0,2,1> region accomplishes this.
163 */
164 if (devinfo->gen == 7 && !devinfo->is_haswell &&
165 type_sz(reg->type) == 8 &&
166 brw_reg.vstride == BRW_VERTICAL_STRIDE_0 &&
167 brw_reg.width == BRW_WIDTH_1 &&
168 brw_reg.hstride == BRW_HORIZONTAL_STRIDE_0) {
169 brw_reg.width = BRW_WIDTH_2;
170 brw_reg.hstride = BRW_HORIZONTAL_STRIDE_1;
171 }
172
173 return brw_reg;
174 }
175
176 fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
177 void *mem_ctx,
178 const void *key,
179 struct brw_stage_prog_data *prog_data,
180 unsigned promoted_constants,
181 bool runtime_check_aads_emit,
182 gl_shader_stage stage)
183
184 : compiler(compiler), log_data(log_data),
185 devinfo(compiler->devinfo), key(key),
186 prog_data(prog_data),
187 promoted_constants(promoted_constants),
188 runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
189 stage(stage), mem_ctx(mem_ctx)
190 {
191 p = rzalloc(mem_ctx, struct brw_codegen);
192 brw_init_codegen(devinfo, p, mem_ctx);
193
194 /* In the FS code generator, we are very careful to ensure that we always
195 * set the right execution size so we don't need the EU code to "help" us
196 * by trying to infer it. Sometimes, it infers the wrong thing.
197 */
198 p->automatic_exec_sizes = false;
199 }
200
201 fs_generator::~fs_generator()
202 {
203 }
204
205 class ip_record : public exec_node {
206 public:
207 DECLARE_RALLOC_CXX_OPERATORS(ip_record)
208
209 ip_record(int ip)
210 {
211 this->ip = ip;
212 }
213
214 int ip;
215 };
216
217 bool
218 fs_generator::patch_discard_jumps_to_fb_writes()
219 {
220 if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
221 return false;
222
223 int scale = brw_jump_scale(p->devinfo);
224
225 /* There is a somewhat strange undocumented requirement of using
226 * HALT, according to the simulator. If some channel has HALTed to
227 * a particular UIP, then by the end of the program, every channel
228 * must have HALTed to that UIP. Furthermore, the tracking is a
229 * stack, so you can't do the final halt of a UIP after starting
230 * halting to a new UIP.
231 *
232 * Symptoms of not emitting this instruction on actual hardware
233 * included GPU hangs and sparkly rendering on the piglit discard
234 * tests.
235 */
236 brw_inst *last_halt = gen6_HALT(p);
237 brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
238 brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
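   /* A UIP/JIP of 1 * scale means "the next instruction", so this final
    * HALT simply falls through while satisfying the requirement above.
    */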
239
240 int ip = p->nr_insn;
241
242 foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
243 brw_inst *patch = &p->store[patch_ip->ip];
244
245 assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
246 /* HALT takes a half-instruction distance from the pre-incremented IP. */
247 brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
248 }
249
250 this->discard_halt_patches.make_empty();
251 return true;
252 }
253
254 void
255 fs_generator::fire_fb_write(fs_inst *inst,
256 struct brw_reg payload,
257 struct brw_reg implied_header,
258 GLuint nr)
259 {
260 uint32_t msg_control;
261
262 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
263
264 if (devinfo->gen < 6) {
265 brw_push_insn_state(p);
266 brw_set_default_exec_size(p, BRW_EXECUTE_8);
267 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
268 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
269 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
270 brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
271 brw_pop_insn_state(p);
272 }
273
274 if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
275 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
276 else if (prog_data->dual_src_blend) {
277 if (!inst->group)
278 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
279 else
280 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
281 } else if (inst->exec_size == 16)
282 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
283 else
284 msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
285
286 /* We assume render targets start at 0, because headerless FB write
287 * messages set "Render Target Index" to 0. Using a different binding
288 * table index would make it impossible to use headerless messages.
289 */
290 assert(prog_data->binding_table.render_target_start == 0);
291
292 const uint32_t surf_index = inst->target;
293
294 bool last_render_target = inst->eot ||
295 (prog_data->dual_src_blend && dispatch_width == 16);
296
297
298 brw_fb_WRITE(p,
299 payload,
300 implied_header,
301 msg_control,
302 surf_index,
303 nr,
304 0,
305 inst->eot,
306 last_render_target,
307 inst->header_size != 0);
308
309 brw_mark_surface_used(&prog_data->base, surf_index);
310 }
311
312 void
313 fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
314 {
315 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
316 const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
317 struct brw_reg implied_header;
318
319 if (devinfo->gen < 8 && !devinfo->is_haswell) {
320 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
321 }
322
323 if (inst->base_mrf >= 0)
324 payload = brw_message_reg(inst->base_mrf);
325
326    /* The header is 2 regs; g0 and g1 are the contents. g0 is loaded by the
327     * implied move, so we set up g1 here.
328 */
329 if (inst->header_size != 0) {
330 brw_push_insn_state(p);
331 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
332 brw_set_default_exec_size(p, BRW_EXECUTE_1);
333 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
334 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
335 brw_set_default_flag_reg(p, 0, 0);
336
337 /* On HSW, the GPU will use the predicate on SENDC, unless the header is
338 * present.
339 */
340 if (prog_data->uses_kill) {
341 struct brw_reg pixel_mask;
342
343 if (devinfo->gen >= 6)
344 pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
345 else
346 pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
347
348 brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
349 }
350
351 if (devinfo->gen >= 6) {
352 brw_push_insn_state(p);
353 brw_set_default_exec_size(p, BRW_EXECUTE_16);
354 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
355 brw_MOV(p,
356 retype(payload, BRW_REGISTER_TYPE_UD),
357 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
358 brw_pop_insn_state(p);
359
360 if (inst->target > 0 && key->replicate_alpha) {
361 /* Set "Source0 Alpha Present to RenderTarget" bit in message
362 * header.
363 */
364 brw_OR(p,
365 vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
366 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
367 brw_imm_ud(0x1 << 11));
368 }
369
370 if (inst->target > 0) {
371 /* Set the render target index for choosing BLEND_STATE. */
372 brw_MOV(p, retype(vec1(suboffset(payload, 2)),
373 BRW_REGISTER_TYPE_UD),
374 brw_imm_ud(inst->target));
375 }
376
377          /* Set the "computes stencil to render target" bit in the message header. */
378 if (prog_data->computed_stencil) {
379 brw_OR(p,
380 vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
381 vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
382 brw_imm_ud(0x1 << 14));
383 }
384
385 implied_header = brw_null_reg();
386 } else {
387 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
388 }
389
390 brw_pop_insn_state(p);
391 } else {
392 implied_header = brw_null_reg();
393 }
394
395 if (!runtime_check_aads_emit) {
396 fire_fb_write(inst, payload, implied_header, inst->mlen);
397 } else {
398 /* This can only happen in gen < 6 */
399 assert(devinfo->gen < 6);
400
401 struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
402
403 /* Check runtime bit to detect if we have to send AA data or not */
404 brw_push_insn_state(p);
405 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
406 brw_set_default_exec_size(p, BRW_EXECUTE_1);
407 brw_AND(p,
408 v1_null_ud,
409 retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
410 brw_imm_ud(1<<26));
411 brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
412
413 int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
414 brw_pop_insn_state(p);
415 {
416 /* Don't send AA data */
417 fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
418 }
419 brw_land_fwd_jump(p, jmp);
420 fire_fb_write(inst, payload, implied_header, inst->mlen);
421 }
422 }
423
424 void
425 fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
426 struct brw_reg payload)
427 {
428 assert(inst->size_written % REG_SIZE == 0);
429 struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
430 const unsigned surf_index =
431 prog_data->binding_table.render_target_start + inst->target;
432
433 gen9_fb_READ(p, dst, payload, surf_index,
434 inst->header_size, inst->size_written / REG_SIZE,
435 prog_data->persample_dispatch);
436
437 brw_mark_surface_used(&prog_data->base, surf_index);
438 }
439
440 void
441 fs_generator::generate_mov_indirect(fs_inst *inst,
442 struct brw_reg dst,
443 struct brw_reg reg,
444 struct brw_reg indirect_byte_offset)
445 {
446 assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
447 assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
448 assert(!reg.abs && !reg.negate);
449 assert(reg.type == dst.type);
450
451 unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
452
453 if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
454 imm_byte_offset += indirect_byte_offset.ud;
455
456 reg.nr = imm_byte_offset / REG_SIZE;
457 reg.subnr = imm_byte_offset % REG_SIZE;
458 brw_MOV(p, dst, reg);
459 } else {
460 /* Prior to Broadwell, there are only 8 address registers. */
461 assert(inst->exec_size <= 8 || devinfo->gen >= 8);
462
463 /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
464 struct brw_reg addr = vec8(brw_address_reg(0));
465
466 /* The destination stride of an instruction (in bytes) must be greater
467 * than or equal to the size of the rest of the instruction. Since the
468 * address register is of type UW, we can't use a D-type instruction.
469        * In order to get around this, we retype to UW and use a stride.
470 */
471 indirect_byte_offset =
472 retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
473
474 /* There are a number of reasons why we don't use the base offset here.
475 * One reason is that the field is only 9 bits which means we can only
476 * use it to access the first 16 GRFs. Also, from the Haswell PRM
477 * section "Register Region Restrictions":
478 *
479 * "The lower bits of the AddressImmediate must not overflow to
480 * change the register address. The lower 5 bits of Address
481 * Immediate when added to lower 5 bits of address register gives
482 * the sub-register offset. The upper bits of Address Immediate
483 * when added to upper bits of address register gives the register
484 * address. Any overflow from sub-register offset is dropped."
485 *
486 * Since the indirect may cause us to cross a register boundary, this
487 * makes the base offset almost useless. We could try and do something
488        * clever where we use an actual base offset if base_offset % 32 == 0 but
489 * that would mean we were generating different code depending on the
490 * base offset. Instead, for the sake of consistency, we'll just do the
491 * add ourselves. This restriction is only listed in the Haswell PRM
492 * but empirical testing indicates that it applies on all older
493 * generations and is lifted on Broadwell.
494 *
495 * In the end, while base_offset is nice to look at in the generated
496 * code, using it saves us 0 instructions and would require quite a bit
497 * of case-by-case work. It's just not worth it.
498 */
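      /* As a concrete (illustrative) example of the dropped carry: with the
       * address register pointing at g1.28 (byte address 60), an
       * AddressImmediate of 8 yields sub-register (28 + 8) & 31 = 4 while
       * the register address stays g1, rather than advancing to g2.4 as a
       * full add would.
       */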
499 brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
500
501 if (type_sz(reg.type) > 4 &&
502 ((devinfo->gen == 7 && !devinfo->is_haswell) ||
503 devinfo->is_cherryview || gen_device_info_is_9lp(devinfo))) {
504 /* IVB has an issue (which we found empirically) where it reads two
505 * address register components per channel for indirectly addressed
506 * 64-bit sources.
507 *
508 * From the Cherryview PRM Vol 7. "Register Region Restrictions":
509 *
510 * "When source or destination datatype is 64b or operation is
511 * integer DWord multiply, indirect addressing must not be used."
512 *
513        * To work around both of these, we do two integer MOVs instead of one
514 * 64-bit MOV. Because no double value should ever cross a register
515 * boundary, it's safe to use the immediate offset in the indirect
516 * here to handle adding 4 bytes to the offset and avoid the extra
517 * ADD to the register file.
518 */
519 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 0),
520 retype(brw_VxH_indirect(0, 0), BRW_REGISTER_TYPE_D));
521 brw_MOV(p, subscript(dst, BRW_REGISTER_TYPE_D, 1),
522 retype(brw_VxH_indirect(0, 4), BRW_REGISTER_TYPE_D));
523 } else {
524 struct brw_reg ind_src = brw_VxH_indirect(0, 0);
525
526 brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
527
528 if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
529 !inst->get_next()->is_tail_sentinel() &&
530 ((fs_inst *)inst->get_next())->mlen > 0) {
531 /* From the Sandybridge PRM:
532 *
533 * "[Errata: DevSNB(SNB)] If MRF register is updated by any
534 * instruction that “indexed/indirect” source AND is followed
535 * by a send, the instruction requires a “Switch”. This is to
536 * avoid race condition where send may dispatch before MRF is
537 * updated."
538 */
539 brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
540 }
541 }
542 }
543 }
544
545 void
546 fs_generator::generate_urb_read(fs_inst *inst,
547 struct brw_reg dst,
548 struct brw_reg header)
549 {
550 assert(inst->size_written % REG_SIZE == 0);
551 assert(header.file == BRW_GENERAL_REGISTER_FILE);
552 assert(header.type == BRW_REGISTER_TYPE_UD);
553
554 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
555 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
556 brw_set_src0(p, send, header);
557 brw_set_src1(p, send, brw_imm_ud(0u));
558
559 brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
560 brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_READ);
561
562 if (inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT)
563 brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
564
565 brw_inst_set_mlen(p->devinfo, send, inst->mlen);
566 brw_inst_set_rlen(p->devinfo, send, inst->size_written / REG_SIZE);
567 brw_inst_set_header_present(p->devinfo, send, true);
568 brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
569 }
570
571 void
572 fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
573 {
574 brw_inst *insn;
575
576 insn = brw_next_insn(p, BRW_OPCODE_SEND);
577
578 brw_set_dest(p, insn, brw_null_reg());
579 brw_set_src0(p, insn, payload);
580 brw_set_src1(p, insn, brw_imm_d(0));
581
582 brw_inst_set_sfid(p->devinfo, insn, BRW_SFID_URB);
583 brw_inst_set_urb_opcode(p->devinfo, insn, GEN8_URB_OPCODE_SIMD8_WRITE);
584
585 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
586 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
587 brw_inst_set_urb_per_slot_offset(p->devinfo, insn, true);
588
589 if (inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
590 inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT)
591 brw_inst_set_urb_channel_mask_present(p->devinfo, insn, true);
592
593 brw_inst_set_mlen(p->devinfo, insn, inst->mlen);
594 brw_inst_set_rlen(p->devinfo, insn, 0);
595 brw_inst_set_eot(p->devinfo, insn, inst->eot);
596 brw_inst_set_header_present(p->devinfo, insn, true);
597 brw_inst_set_urb_global_offset(p->devinfo, insn, inst->offset);
598 }
599
600 void
601 fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
602 {
603 struct brw_inst *insn;
604
605 insn = brw_next_insn(p, BRW_OPCODE_SEND);
606
607 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
608 brw_set_src0(p, insn, retype(payload, BRW_REGISTER_TYPE_UW));
609 brw_set_src1(p, insn, brw_imm_d(0));
610
611 /* Terminate a compute shader by sending a message to the thread spawner.
612 */
613 brw_inst_set_sfid(devinfo, insn, BRW_SFID_THREAD_SPAWNER);
614 brw_inst_set_mlen(devinfo, insn, 1);
615 brw_inst_set_rlen(devinfo, insn, 0);
616 brw_inst_set_eot(devinfo, insn, inst->eot);
617 brw_inst_set_header_present(devinfo, insn, false);
618
619 brw_inst_set_ts_opcode(devinfo, insn, 0); /* Dereference resource */
620 brw_inst_set_ts_request_type(devinfo, insn, 0); /* Root thread */
621
622 /* Note that even though the thread has a URB resource associated with it,
623 * we set the "do not dereference URB" bit, because the URB resource is
624 * managed by the fixed-function unit, so it will free it automatically.
625 */
626 brw_inst_set_ts_resource_select(devinfo, insn, 1); /* Do not dereference URB */
627
628 brw_inst_set_mask_control(devinfo, insn, BRW_MASK_DISABLE);
629 }
630
631 void
632 fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
633 {
634 brw_barrier(p, src);
635 brw_WAIT(p);
636 }
637
638 void
639 fs_generator::generate_linterp(fs_inst *inst,
640 struct brw_reg dst, struct brw_reg *src)
641 {
642 /* PLN reads:
643 * / in SIMD16 \
644 * -----------------------------------
645 * | src1+0 | src1+1 | src1+2 | src1+3 |
646 * |-----------------------------------|
647 * |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
648 * -----------------------------------
649 *
650 * but for the LINE/MAC pair, the LINE reads Xs and the MAC reads Ys:
651 *
652 * -----------------------------------
653 * | src1+0 | src1+1 | src1+2 | src1+3 |
654 * |-----------------------------------|
655 * |(x0, x1)|(y0, y1)| | | in SIMD8
656 * |-----------------------------------|
657 * |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
658 * -----------------------------------
659 *
660 * See also: emit_interpolation_setup_gen4().
661 */
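   /* Conceptually both paths evaluate the plane equation
    *
    *    dst = a * delta_x + b * delta_y + c
    *
    * where (a, b, c) live in src[1]: PLN does it in one instruction, while
    * LINE computes a * delta_x + c and MAC accumulates b * delta_y.
    */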
662 struct brw_reg delta_x = src[0];
663 struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
664 struct brw_reg interp = src[1];
665
666 if (devinfo->has_pln &&
667 (devinfo->gen >= 7 || (delta_x.nr & 1) == 0)) {
668 brw_PLN(p, dst, interp, delta_x);
669 } else {
670 brw_LINE(p, brw_null_reg(), interp, delta_x);
671 brw_MAC(p, dst, suboffset(interp, 1), delta_y);
672 }
673 }
674
675 void
676 fs_generator::generate_get_buffer_size(fs_inst *inst,
677 struct brw_reg dst,
678 struct brw_reg src,
679 struct brw_reg surf_index)
680 {
681 assert(devinfo->gen >= 7);
682 assert(surf_index.file == BRW_IMMEDIATE_VALUE);
683
684 uint32_t simd_mode;
685 int rlen = 4;
686
687 switch (inst->exec_size) {
688 case 8:
689 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
690 break;
691 case 16:
692 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
693 break;
694 default:
695 unreachable("Invalid width for texture instruction");
696 }
697
698 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
699 rlen = 8;
700 dst = vec16(dst);
701 }
702
703 brw_SAMPLE(p,
704 retype(dst, BRW_REGISTER_TYPE_UW),
705 inst->base_mrf,
706 src,
707 surf_index.ud,
708 0,
709 GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
710 rlen, /* response length */
711 inst->mlen,
712 inst->header_size > 0,
713 simd_mode,
714 BRW_SAMPLER_RETURN_FORMAT_SINT32);
715
716 brw_mark_surface_used(prog_data, surf_index.ud);
717 }
718
719 void
720 fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
721 struct brw_reg surface_index,
722 struct brw_reg sampler_index)
723 {
724 assert(inst->size_written % REG_SIZE == 0);
725 int msg_type = -1;
726 uint32_t simd_mode;
727 uint32_t return_format;
728 bool is_combined_send = inst->eot;
729
730 switch (dst.type) {
731 case BRW_REGISTER_TYPE_D:
732 return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
733 break;
734 case BRW_REGISTER_TYPE_UD:
735 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
736 break;
737 default:
738 return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
739 break;
740 }
741
742 /* Stomp the resinfo output type to UINT32. On gens 4-5, the output type
743 * is set as part of the message descriptor. On gen4, the PRM seems to
744 * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
745 * later gens UINT32 is required. Once you hit Sandy Bridge, the bit is
746 * gone from the message descriptor entirely and you just get UINT32 all
747     * the time regardless. Since we can really only do non-UINT32 on gen4,
748 * just stomp it to UINT32 all the time.
749 */
750 if (inst->opcode == SHADER_OPCODE_TXS)
751 return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
752
753 switch (inst->exec_size) {
754 case 8:
755 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
756 break;
757 case 16:
758 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
759 break;
760 default:
761 unreachable("Invalid width for texture instruction");
762 }
763
764 if (devinfo->gen >= 5) {
765 switch (inst->opcode) {
766 case SHADER_OPCODE_TEX:
767 if (inst->shadow_compare) {
768 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
769 } else {
770 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
771 }
772 break;
773 case FS_OPCODE_TXB:
774 if (inst->shadow_compare) {
775 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
776 } else {
777 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
778 }
779 break;
780 case SHADER_OPCODE_TXL:
781 if (inst->shadow_compare) {
782 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
783 } else {
784 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
785 }
786 break;
787 case SHADER_OPCODE_TXL_LZ:
788 assert(devinfo->gen >= 9);
789 if (inst->shadow_compare) {
790 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
791 } else {
792 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
793 }
794 break;
795 case SHADER_OPCODE_TXS:
796 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
797 break;
798 case SHADER_OPCODE_TXD:
799 if (inst->shadow_compare) {
800 /* Gen7.5+. Otherwise, lowered in NIR */
801 assert(devinfo->gen >= 8 || devinfo->is_haswell);
802 msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
803 } else {
804 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
805 }
806 break;
807 case SHADER_OPCODE_TXF:
808 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
809 break;
810 case SHADER_OPCODE_TXF_LZ:
811 assert(devinfo->gen >= 9);
812 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
813 break;
814 case SHADER_OPCODE_TXF_CMS_W:
815 assert(devinfo->gen >= 9);
816 msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
817 break;
818 case SHADER_OPCODE_TXF_CMS:
819 if (devinfo->gen >= 7)
820 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
821 else
822 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
823 break;
824 case SHADER_OPCODE_TXF_UMS:
825 assert(devinfo->gen >= 7);
826 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
827 break;
828 case SHADER_OPCODE_TXF_MCS:
829 assert(devinfo->gen >= 7);
830 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
831 break;
832 case SHADER_OPCODE_LOD:
833 msg_type = GEN5_SAMPLER_MESSAGE_LOD;
834 break;
835 case SHADER_OPCODE_TG4:
836 if (inst->shadow_compare) {
837 assert(devinfo->gen >= 7);
838 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
839 } else {
840 assert(devinfo->gen >= 6);
841 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
842 }
843 break;
844 case SHADER_OPCODE_TG4_OFFSET:
845 assert(devinfo->gen >= 7);
846 if (inst->shadow_compare) {
847 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
848 } else {
849 msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
850 }
851 break;
852 case SHADER_OPCODE_SAMPLEINFO:
853 msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
854 break;
855 default:
856 unreachable("not reached");
857 }
858 } else {
859 switch (inst->opcode) {
860 case SHADER_OPCODE_TEX:
861          /* Note that G45 and older determine shadow compare and dispatch width
862 * from message length for most messages.
863 */
864 if (inst->exec_size == 8) {
865 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
866 if (inst->shadow_compare) {
867 assert(inst->mlen == 6);
868 } else {
869 assert(inst->mlen <= 4);
870 }
871 } else {
872 if (inst->shadow_compare) {
873 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
874 assert(inst->mlen == 9);
875 } else {
876 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
877 assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
878 }
879 }
880 break;
881 case FS_OPCODE_TXB:
882 if (inst->shadow_compare) {
883 assert(inst->exec_size == 8);
884 assert(inst->mlen == 6);
885 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
886 } else {
887 assert(inst->mlen == 9);
888 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
889 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
890 }
891 break;
892 case SHADER_OPCODE_TXL:
893 if (inst->shadow_compare) {
894 assert(inst->exec_size == 8);
895 assert(inst->mlen == 6);
896 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
897 } else {
898 assert(inst->mlen == 9);
899 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
900 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
901 }
902 break;
903 case SHADER_OPCODE_TXD:
904 /* There is no sample_d_c message; comparisons are done manually */
905 assert(inst->exec_size == 8);
906 assert(inst->mlen == 7 || inst->mlen == 10);
907 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
908 break;
909 case SHADER_OPCODE_TXF:
910 assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
911 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
912 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
913 break;
914 case SHADER_OPCODE_TXS:
915 assert(inst->mlen == 3);
916 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
917 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
918 break;
919 default:
920 unreachable("not reached");
921 }
922 }
923 assert(msg_type != -1);
924
925 if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
926 dst = vec16(dst);
927 }
928
929 assert(devinfo->gen < 7 || inst->header_size == 0 ||
930 src.file == BRW_GENERAL_REGISTER_FILE);
931
932 assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
933
934 /* Load the message header if present. If there's a texture offset,
935 * we need to set it up explicitly and load the offset bitfield.
936 * Otherwise, we can use an implied move from g0 to the first message reg.
937 */
938 if (inst->header_size != 0) {
939 if (devinfo->gen < 6 && !inst->offset) {
940 /* Set up an implied move from g0 to the MRF. */
941 src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
942 } else {
943 struct brw_reg header_reg;
944
945 if (devinfo->gen >= 7) {
946 header_reg = src;
947 } else {
948 assert(inst->base_mrf != -1);
949 header_reg = brw_message_reg(inst->base_mrf);
950 }
951
952 brw_push_insn_state(p);
953 brw_set_default_exec_size(p, BRW_EXECUTE_8);
954 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
955 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
956 /* Explicitly set up the message header by copying g0 to the MRF. */
957 brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
958
959 brw_set_default_exec_size(p, BRW_EXECUTE_1);
960 if (inst->offset) {
961 /* Set the offset bits in DWord 2. */
962 brw_MOV(p, get_element_ud(header_reg, 2),
963 brw_imm_ud(inst->offset));
964 } else if (stage != MESA_SHADER_VERTEX &&
965 stage != MESA_SHADER_FRAGMENT) {
966 /* The vertex and fragment stages have g0.2 set to 0, so
967 * header0.2 is 0 when g0 is copied. Other stages may not, so we
968 * must set it to 0 to avoid setting undesirable bits in the
969 * message.
970 */
971 brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
972 }
973
974 brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
975 brw_pop_insn_state(p);
976 }
977 }
978
979 uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
980 inst->opcode == SHADER_OPCODE_TG4_OFFSET)
981 ? prog_data->binding_table.gather_texture_start
982 : prog_data->binding_table.texture_start;
983
984 if (surface_index.file == BRW_IMMEDIATE_VALUE &&
985 sampler_index.file == BRW_IMMEDIATE_VALUE) {
986 uint32_t surface = surface_index.ud;
987 uint32_t sampler = sampler_index.ud;
988
989 brw_SAMPLE(p,
990 retype(dst, BRW_REGISTER_TYPE_UW),
991 inst->base_mrf,
992 src,
993 surface + base_binding_table_index,
994 sampler % 16,
995 msg_type,
996 inst->size_written / REG_SIZE,
997 inst->mlen,
998 inst->header_size != 0,
999 simd_mode,
1000 return_format);
1001
1002 brw_mark_surface_used(prog_data, surface + base_binding_table_index);
1003 } else {
1004 /* Non-const sampler index */
1005
1006 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1007 struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
1008 struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
1009
1010 brw_push_insn_state(p);
1011 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1012 brw_set_default_access_mode(p, BRW_ALIGN_1);
1013 brw_set_default_exec_size(p, BRW_EXECUTE_1);
1014
1015 if (brw_regs_equal(&surface_reg, &sampler_reg)) {
1016 brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
1017 } else {
1018 if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
1019 brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
1020 } else {
1021 brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
1022 brw_OR(p, addr, addr, surface_reg);
1023 }
1024 }
1025 if (base_binding_table_index)
1026 brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
1027 brw_AND(p, addr, addr, brw_imm_ud(0xfff));
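      /* The dynamic descriptor ends up as (sampler << 8) | surface in a0.0.
       * E.g. (illustrative) surface 5 with sampler 3 gives 0x305; when both
       * indices come from the same register, the single MUL by 0x101
       * computes x | (x << 8) in one instruction.
       */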
1028
1029 brw_pop_insn_state(p);
1030
1031 /* dst = send(offset, a0.0 | <descriptor>) */
1032 brw_inst *insn = brw_send_indirect_message(
1033 p, BRW_SFID_SAMPLER, dst, src, addr);
1034 brw_set_sampler_message(p, insn,
1035 0 /* surface */,
1036 0 /* sampler */,
1037 msg_type,
1038 inst->size_written / REG_SIZE,
1039 inst->mlen /* mlen */,
1040 inst->header_size != 0 /* header */,
1041 simd_mode,
1042 return_format);
1043
1044       /* The visitor knows more than we do about the surface limit required,
1045        * so it has already done the marking.
1046 */
1047 }
1048
1049 if (is_combined_send) {
1050 brw_inst_set_eot(p->devinfo, brw_last_inst, true);
1051 brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
1052 }
1053 }
1054
1055
1056 /* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
1057 * looking like:
1058 *
1059 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
1060 *
1061 * Ideally, we want to produce:
1062 *
1063 * DDX DDY
1064 * dst: (ss0.tr - ss0.tl) (ss0.tl - ss0.bl)
1065 * (ss0.tr - ss0.tl) (ss0.tr - ss0.br)
1066 * (ss0.br - ss0.bl) (ss0.tl - ss0.bl)
1067 * (ss0.br - ss0.bl) (ss0.tr - ss0.br)
1068 * (ss1.tr - ss1.tl) (ss1.tl - ss1.bl)
1069 * (ss1.tr - ss1.tl) (ss1.tr - ss1.br)
1070 * (ss1.br - ss1.bl) (ss1.tl - ss1.bl)
1071 * (ss1.br - ss1.bl) (ss1.tr - ss1.br)
1072 *
1073 * and add another set of two more subspans if in 16-pixel dispatch mode.
1074 *
1075 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
1076 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
1077 * pair. But the ideal approximation may impose a huge performance cost on
1078  * sample_d. On at least Haswell, the sample_d instruction does some
1079 * optimizations if the same LOD is used for all pixels in the subspan.
1080 *
1081 * For DDY, we need to use ALIGN16 mode since it's capable of doing the
1082 * appropriate swizzling.
1083 */
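/* As an illustrative sketch of the regioning that implements the table
 * above, for FS_OPCODE_DDX_FINE the two sources read the same pixels with a
 * one-element skew:
 *
 *    src0 = <2;2,0> starting at subreg 1:  tr tr br br ...
 *    src1 = <2;2,0> starting at subreg 0:  tl tl bl bl ...
 *
 * so ADD dst, src0, -src1 produces (tr - tl) replicated across each pixel
 * pair.
 */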
1084 void
1085 fs_generator::generate_ddx(enum opcode opcode,
1086 struct brw_reg dst, struct brw_reg src)
1087 {
1088 unsigned vstride, width;
1089
1090 if (opcode == FS_OPCODE_DDX_FINE) {
1091 /* produce accurate derivatives */
1092 vstride = BRW_VERTICAL_STRIDE_2;
1093 width = BRW_WIDTH_2;
1094 } else {
1095 /* replicate the derivative at the top-left pixel to other pixels */
1096 vstride = BRW_VERTICAL_STRIDE_4;
1097 width = BRW_WIDTH_4;
1098 }
1099
1100 struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
1101 src.negate, src.abs,
1102 BRW_REGISTER_TYPE_F,
1103 vstride,
1104 width,
1105 BRW_HORIZONTAL_STRIDE_0,
1106 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1107 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1108 src.negate, src.abs,
1109 BRW_REGISTER_TYPE_F,
1110 vstride,
1111 width,
1112 BRW_HORIZONTAL_STRIDE_0,
1113 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1114 brw_ADD(p, dst, src0, negate(src1));
1115 }
1116
1117 /* The derivative computation is negated here for FBOs, since they place
1118  * the origin at the upper left instead of the lower left of the
1119  * window.
1120  */
1121 void
1122 fs_generator::generate_ddy(enum opcode opcode,
1123 struct brw_reg dst, struct brw_reg src)
1124 {
1125 if (opcode == FS_OPCODE_DDY_FINE) {
1126 /* produce accurate derivatives */
1127 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1128 src.negate, src.abs,
1129 BRW_REGISTER_TYPE_F,
1130 BRW_VERTICAL_STRIDE_4,
1131 BRW_WIDTH_4,
1132 BRW_HORIZONTAL_STRIDE_1,
1133 BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
1134 struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
1135 src.negate, src.abs,
1136 BRW_REGISTER_TYPE_F,
1137 BRW_VERTICAL_STRIDE_4,
1138 BRW_WIDTH_4,
1139 BRW_HORIZONTAL_STRIDE_1,
1140 BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
1141 brw_push_insn_state(p);
1142 brw_set_default_access_mode(p, BRW_ALIGN_16);
1143 brw_ADD(p, dst, negate(src0), src1);
1144 brw_pop_insn_state(p);
1145 } else {
1146 /* replicate the derivative at the top-left pixel to other pixels */
1147 struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
1148 src.negate, src.abs,
1149 BRW_REGISTER_TYPE_F,
1150 BRW_VERTICAL_STRIDE_4,
1151 BRW_WIDTH_4,
1152 BRW_HORIZONTAL_STRIDE_0,
1153 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1154 struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
1155 src.negate, src.abs,
1156 BRW_REGISTER_TYPE_F,
1157 BRW_VERTICAL_STRIDE_4,
1158 BRW_WIDTH_4,
1159 BRW_HORIZONTAL_STRIDE_0,
1160 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
1161 brw_ADD(p, dst, negate(src0), src1);
1162 }
1163 }
1164
1165 void
1166 fs_generator::generate_discard_jump(fs_inst *inst)
1167 {
1168 assert(devinfo->gen >= 6);
1169
1170 /* This HALT will be patched up at FB write time to point UIP at the end of
1171 * the program, and at brw_uip_jip() JIP will be set to the end of the
1172 * current block (or the program).
1173 */
1174 this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
1175 gen6_HALT(p);
1176 }
1177
1178 void
1179 fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
1180 {
1181 /* The 32-wide messages only respect the first 16-wide half of the channel
1182 * enable signals which are replicated identically for the second group of
1183 * 16 channels, so we cannot use them unless the write is marked
1184 * force_writemask_all.
1185 */
1186 const unsigned lower_size = inst->force_writemask_all ? inst->exec_size :
1187 MIN2(16, inst->exec_size);
1188 const unsigned block_size = 4 * lower_size / REG_SIZE;
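   /* lower_size channels of 4 bytes each, expressed in registers; e.g. a
    * SIMD16 half writes 16 * 4 / 32 = 2 GRFs per oword block message.
    */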
1189 assert(inst->mlen != 0);
1190
1191 brw_push_insn_state(p);
1192 brw_set_default_exec_size(p, cvt(lower_size) - 1);
1193 brw_set_default_compression(p, lower_size > 8);
1194
1195 for (unsigned i = 0; i < inst->exec_size / lower_size; i++) {
1196 brw_set_default_group(p, inst->group + lower_size * i);
1197
1198 brw_MOV(p, brw_uvec_mrf(lower_size, inst->base_mrf + 1, 0),
1199 retype(offset(src, block_size * i), BRW_REGISTER_TYPE_UD));
1200
1201 brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
1202 block_size,
1203 inst->offset + block_size * REG_SIZE * i);
1204 }
1205
1206 brw_pop_insn_state(p);
1207 }
1208
1209 void
1210 fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
1211 {
1212 assert(inst->exec_size <= 16 || inst->force_writemask_all);
1213 assert(inst->mlen != 0);
1214
1215 brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
1216 inst->exec_size / 8, inst->offset);
1217 }
1218
1219 void
1220 fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
1221 {
1222 assert(inst->exec_size <= 16 || inst->force_writemask_all);
1223
1224 gen7_block_read_scratch(p, dst, inst->exec_size / 8, inst->offset);
1225 }
1226
1227 void
1228 fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
1229 struct brw_reg dst,
1230 struct brw_reg index,
1231 struct brw_reg offset)
1232 {
1233 assert(type_sz(dst.type) == 4);
1234 assert(inst->mlen != 0);
1235
1236 assert(index.file == BRW_IMMEDIATE_VALUE &&
1237 index.type == BRW_REGISTER_TYPE_UD);
1238 uint32_t surf_index = index.ud;
1239
1240 assert(offset.file == BRW_IMMEDIATE_VALUE &&
1241 offset.type == BRW_REGISTER_TYPE_UD);
1242 uint32_t read_offset = offset.ud;
1243
1244 brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
1245 read_offset, surf_index);
1246 }
1247
1248 void
1249 fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
1250 struct brw_reg dst,
1251 struct brw_reg index,
1252 struct brw_reg payload)
1253 {
1254 assert(index.type == BRW_REGISTER_TYPE_UD);
1255 assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1256 assert(type_sz(dst.type) == 4);
1257
1258 if (index.file == BRW_IMMEDIATE_VALUE) {
1259 const uint32_t surf_index = index.ud;
1260
1261 brw_push_insn_state(p);
1262 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1263 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1264 brw_pop_insn_state(p);
1265
1266 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
1267 brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
1268 brw_set_dp_read_message(p, send, surf_index,
1269 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1270 GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1271 GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1272 1, /* mlen */
1273 true, /* header */
1274 DIV_ROUND_UP(inst->size_written, REG_SIZE));
1275
1276 } else {
1277 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1278
1279 brw_push_insn_state(p);
1280 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1281
1282 /* a0.0 = surf_index & 0xff */
1283 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1284 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1285 brw_set_dest(p, insn_and, addr);
1286 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1287 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1288
1289 /* dst = send(payload, a0.0 | <descriptor>) */
1290 brw_inst *insn = brw_send_indirect_message(
1291 p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1292 retype(dst, BRW_REGISTER_TYPE_UD),
1293 retype(payload, BRW_REGISTER_TYPE_UD), addr);
1294 brw_set_dp_read_message(p, insn, 0 /* surface */,
1295 BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
1296 GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
1297 GEN6_SFID_DATAPORT_CONSTANT_CACHE,
1298 1, /* mlen */
1299 true, /* header */
1300 DIV_ROUND_UP(inst->size_written, REG_SIZE));
1301
1302 brw_pop_insn_state(p);
1303 }
1304 }
1305
1306 void
1307 fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
1308 struct brw_reg dst,
1309 struct brw_reg index)
1310 {
1311 assert(devinfo->gen < 7); /* Should use the gen7 variant. */
1312 assert(inst->header_size != 0);
1313 assert(inst->mlen);
1314
1315 assert(index.file == BRW_IMMEDIATE_VALUE &&
1316 index.type == BRW_REGISTER_TYPE_UD);
1317 uint32_t surf_index = index.ud;
1318
1319 uint32_t simd_mode, rlen, msg_type;
1320 if (inst->exec_size == 16) {
1321 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1322 rlen = 8;
1323 } else {
1324 assert(inst->exec_size == 8);
1325 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1326 rlen = 4;
1327 }
1328
1329 if (devinfo->gen >= 5)
1330 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
1331 else {
1332 /* We always use the SIMD16 message so that we only have to load U, and
1333 * not V or R.
1334 */
1335 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
1336 assert(inst->mlen == 3);
1337 assert(inst->size_written == 8 * REG_SIZE);
1338 rlen = 8;
1339 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1340 }
1341
1342 struct brw_reg header = brw_vec8_grf(0, 0);
1343 gen6_resolve_implied_move(p, &header, inst->base_mrf);
1344
1345 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1346 brw_inst_set_compression(devinfo, send, false);
1347 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1348 brw_set_src0(p, send, header);
1349 if (devinfo->gen < 6)
1350 brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
1351
1352 /* Our surface is set up as floats, regardless of what actual data is
1353 * stored in it.
1354 */
1355 uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
1356 brw_set_sampler_message(p, send,
1357 surf_index,
1358 0, /* sampler (unused) */
1359 msg_type,
1360 rlen,
1361 inst->mlen,
1362 inst->header_size != 0,
1363 simd_mode,
1364 return_format);
1365 }
1366
1367 void
1368 fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
1369 struct brw_reg dst,
1370 struct brw_reg index,
1371 struct brw_reg offset)
1372 {
1373 assert(devinfo->gen >= 7);
1374 /* Varying-offset pull constant loads are treated as a normal expression on
1375 * gen7, so the fact that it's a send message is hidden at the IR level.
1376 */
1377 assert(inst->header_size == 0);
1378 assert(!inst->mlen);
1379 assert(index.type == BRW_REGISTER_TYPE_UD);
1380
1381 uint32_t simd_mode, rlen, mlen;
1382 if (inst->exec_size == 16) {
1383 mlen = 2;
1384 rlen = 8;
1385 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
1386 } else {
1387 assert(inst->exec_size == 8);
1388 mlen = 1;
1389 rlen = 4;
1390 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
1391 }
1392
1393 if (index.file == BRW_IMMEDIATE_VALUE) {
1394
1395 uint32_t surf_index = index.ud;
1396
1397 brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
1398 brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
1399 brw_set_src0(p, send, offset);
1400 brw_set_sampler_message(p, send,
1401 surf_index,
1402 0, /* LD message ignores sampler unit */
1403 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1404 rlen,
1405 mlen,
1406 false, /* no header */
1407 simd_mode,
1408 0);
1409
1410 } else {
1411
1412 struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
1413
1414 brw_push_insn_state(p);
1415 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1416
1417 /* a0.0 = surf_index & 0xff */
1418 brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
1419 brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
1420 brw_set_dest(p, insn_and, addr);
1421 brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
1422 brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
1423
1424 brw_pop_insn_state(p);
1425
1426 /* dst = send(offset, a0.0 | <descriptor>) */
1427 brw_inst *insn = brw_send_indirect_message(
1428 p, BRW_SFID_SAMPLER, retype(dst, BRW_REGISTER_TYPE_UW),
1429 offset, addr);
1430 brw_set_sampler_message(p, insn,
1431 0 /* surface */,
1432 0 /* sampler */,
1433 GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
1434 rlen /* rlen */,
1435 mlen /* mlen */,
1436 false /* header */,
1437 simd_mode,
1438 0);
1439 }
1440 }
1441
1442 /**
1443 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
1444 * into the flags register (f0.0).
1445 *
1446 * Used only on Gen6 and above.
1447 */
1448 void
1449 fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
1450 {
1451 struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
1452 struct brw_reg dispatch_mask;
1453
1454 if (devinfo->gen >= 6)
1455 dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
1456 else
1457 dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
1458
1459 brw_push_insn_state(p);
1460 brw_set_default_mask_control(p, BRW_MASK_DISABLE);
1461 brw_set_default_exec_size(p, BRW_EXECUTE_1);
1462 brw_MOV(p, flags, dispatch_mask);
1463 brw_pop_insn_state(p);
1464 }
1465
1466 void
1467 fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
1468 struct brw_reg dst,
1469 struct brw_reg src,
1470 struct brw_reg msg_data,
1471 unsigned msg_type)
1472 {
1473 assert(inst->size_written % REG_SIZE == 0);
1474 assert(msg_data.type == BRW_REGISTER_TYPE_UD);
1475
1476 brw_pixel_interpolator_query(p,
1477 retype(dst, BRW_REGISTER_TYPE_UW),
1478 src,
1479 inst->pi_noperspective,
1480 msg_type,
1481 msg_data,
1482 inst->mlen,
1483 inst->size_written / REG_SIZE);
1484 }
1485
1486 /* Sets vstride=1, width=4, hstride=0 of register src1 during
1487 * the ADD instruction.
1488 */
1489 void
1490 fs_generator::generate_set_sample_id(fs_inst *inst,
1491 struct brw_reg dst,
1492 struct brw_reg src0,
1493 struct brw_reg src1)
1494 {
1495 assert(dst.type == BRW_REGISTER_TYPE_D ||
1496 dst.type == BRW_REGISTER_TYPE_UD);
1497 assert(src0.type == BRW_REGISTER_TYPE_D ||
1498 src0.type == BRW_REGISTER_TYPE_UD);
1499
1500 struct brw_reg reg = stride(src1, 1, 4, 0);
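   /* <1;4,0> replicates each element four times before advancing, so
    * (illustratively) a payload of sample ids s0 s1 ... is read back as
    * s0 s0 s0 s0 s1 s1 s1 s1, one id per 2x2 subspan.
    */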
1501 if (devinfo->gen >= 8 || inst->exec_size == 8) {
1502 brw_ADD(p, dst, src0, reg);
1503 } else if (inst->exec_size == 16) {
1504 brw_push_insn_state(p);
1505 brw_set_default_exec_size(p, BRW_EXECUTE_8);
1506 brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
1507 brw_ADD(p, firsthalf(dst), firsthalf(src0), reg);
1508 brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
1509 brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(reg, 2));
1510 brw_pop_insn_state(p);
1511 }
1512 }
1513
1514 void
1515 fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
1516 struct brw_reg dst,
1517 struct brw_reg x,
1518 struct brw_reg y)
1519 {
1520 assert(devinfo->gen >= 7);
1521 assert(dst.type == BRW_REGISTER_TYPE_UD);
1522 assert(x.type == BRW_REGISTER_TYPE_F);
1523 assert(y.type == BRW_REGISTER_TYPE_F);
1524
1525 /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
1526 *
1527 * Because this instruction does not have a 16-bit floating-point type,
1528 * the destination data type must be Word (W).
1529 *
1530 * The destination must be DWord-aligned and specify a horizontal stride
1531 * (HorzStride) of 2. The 16-bit result is stored in the lower word of
1532 * each destination channel and the upper word is not modified.
1533 */
1534 struct brw_reg dst_w = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
1535
1536 /* Give each 32-bit channel of dst the form below, where "." means
1537 * unchanged.
1538 * 0x....hhhh
1539 */
1540 brw_F32TO16(p, dst_w, y);
1541
1542 /* Now the form:
1543 * 0xhhhh0000
1544 */
1545 brw_SHL(p, dst, dst, brw_imm_ud(16u));
1546
1547 /* And, finally the form of packHalf2x16's output:
1548 * 0xhhhhllll
1549 */
1550 brw_F32TO16(p, dst_w, x);
1551 }
1552
1553 void
1554 fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
1555 struct brw_reg dst,
1556 struct brw_reg src)
1557 {
1558 assert(devinfo->gen >= 7);
1559 assert(dst.type == BRW_REGISTER_TYPE_F);
1560 assert(src.type == BRW_REGISTER_TYPE_UD);
1561
1562 /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
1563 *
1564 * Because this instruction does not have a 16-bit floating-point type,
1565 * the source data type must be Word (W). The destination type must be
1566 * F (Float).
1567 */
1568 struct brw_reg src_w = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
1569
1570 /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
1571 * For the Y case, we wish to access only the upper word; therefore
1572 * a 16-bit subregister offset is needed.
1573 */
1574 assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1575 inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1576 if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1577 src_w.subnr += 2;
1578
1579 brw_F16TO32(p, dst, src_w);
1580 }
1581
1582 void
1583 fs_generator::generate_shader_time_add(fs_inst *inst,
1584 struct brw_reg payload,
1585 struct brw_reg offset,
1586 struct brw_reg value)
1587 {
1588 assert(devinfo->gen >= 7);
1589 brw_push_insn_state(p);
1590 brw_set_default_mask_control(p, true);
1591
1592 assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1593 struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1594 offset.type);
1595 struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1596 value.type);
1597
1598 assert(offset.file == BRW_IMMEDIATE_VALUE);
1599 if (value.file == BRW_GENERAL_REGISTER_FILE) {
1600 value.width = BRW_WIDTH_1;
1601 value.hstride = BRW_HORIZONTAL_STRIDE_0;
1602 value.vstride = BRW_VERTICAL_STRIDE_0;
1603 } else {
1604 assert(value.file == BRW_IMMEDIATE_VALUE);
1605 }
1606
1607 /* Trying to deal with setup of the params from the IR is crazy in the FS8
1608 * case, and we don't really care about squeezing every bit of performance
1609 * out of this path, so we just emit the MOVs from here.
1610 */
1611 brw_MOV(p, payload_offset, offset);
1612 brw_MOV(p, payload_value, value);
1613 brw_shader_time_add(p, payload,
1614 prog_data->binding_table.shader_time_start);
1615 brw_pop_insn_state(p);
1616
1617 brw_mark_surface_used(prog_data,
1618 prog_data->binding_table.shader_time_start);
1619 }
1620
1621 void
1622 fs_generator::enable_debug(const char *shader_name)
1623 {
1624 debug_flag = true;
1625 this->shader_name = shader_name;
1626 }
1627
1628 int
1629 fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
1630 {
1631    /* Align to a 64-byte boundary. */
1632 while (p->next_insn_offset % 64)
1633 brw_NOP(p);
1634
1635 this->dispatch_width = dispatch_width;
1636
1637 int start_offset = p->next_insn_offset;
1638 int spill_count = 0, fill_count = 0;
1639 int loop_count = 0;
1640
1641 struct annotation_info annotation;
1642 memset(&annotation, 0, sizeof(annotation));
1643
1644 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1645 struct brw_reg src[3], dst;
1646 unsigned int last_insn_offset = p->next_insn_offset;
1647 bool multiple_instructions_emitted = false;
1648
1649 /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
1650 * "Register Region Restrictions" section: for BDW, SKL:
1651 *
1652 * "A POW/FDIV operation must not be followed by an instruction
1653 * that requires two destination registers."
1654 *
1655 * The documentation is often lacking annotations for Atom parts,
1656 * and empirically this affects CHV as well.
1657 */
1658 if (devinfo->gen >= 8 &&
1659 devinfo->gen <= 9 &&
1660 p->nr_insn > 1 &&
1661 brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
1662 brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
1663 inst->dst.component_size(inst->exec_size) > REG_SIZE) {
         brw_NOP(p);
         last_insn_offset = p->next_insn_offset;
      }

      if (unlikely(debug_flag))
         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);

      /* If the instruction writes to more than one register, it needs to be
       * explicitly marked as compressed on Gen <= 5. On Gen >= 6 the
       * hardware figures out by itself what the right compression mode is,
       * but we still need to know whether the instruction is compressed to
       * set up the source register regions appropriately.
       *
       * XXX - This is wrong for instructions that write a single register but
       *       read more than one which should strictly speaking be treated as
       *       compressed. For instructions that don't write any registers it
       *       relies on the destination being a null register of the correct
       *       type and regioning so the instruction is considered compressed
       *       or not accordingly.
       */
      const bool compressed =
         inst->dst.component_size(inst->exec_size) > REG_SIZE;
      brw_set_default_compression(p, compressed);
      brw_set_default_group(p, inst->group);

      for (unsigned int i = 0; i < inst->sources; i++) {
         src[i] = brw_reg_from_fs_reg(devinfo, inst,
                                      &inst->src[i], compressed);
         /* The accumulator result appears to get used for the
          * conditional modifier generation. When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value. See piglit fs-op-neg-uvec4.
          */
         assert(!inst->conditional_mod ||
                inst->src[i].type != BRW_REGISTER_TYPE_UD ||
                !inst->src[i].negate);
      }
      dst = brw_reg_from_fs_reg(devinfo, inst,
                                &inst->dst, compressed);

      brw_set_default_access_mode(p, BRW_ALIGN_1);
      brw_set_default_predicate_control(p, inst->predicate);
      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
      brw_set_default_saturate(p, inst->saturate);
      brw_set_default_mask_control(p, inst->force_writemask_all);
      brw_set_default_acc_write_control(p, inst->writes_accumulator);

      unsigned exec_size = inst->exec_size;
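      /* Double the execution size here, apparently because Gen7 hardware
       * other than Haswell counts the channels of instructions operating
       * on 64-bit data in 32-bit units.
       */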
      if (devinfo->gen == 7 && !devinfo->is_haswell &&
          (get_exec_type_size(inst) == 8 || type_sz(inst->dst.type) == 8)) {
         exec_size *= 2;
      }

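      /* cvt() maps a power of two n to log2(n) + 1 (and 0 to 0), so
       * subtracting one yields the hardware's logarithmic execution-size
       * encoding, e.g. SIMD8 becomes BRW_EXECUTE_8 == 3.
       */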
      brw_set_default_exec_size(p, cvt(exec_size) - 1);

      assert(inst->force_writemask_all || inst->exec_size >= 4);
      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);

      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         brw_MOV(p, dst, src[0]);
         break;
      case BRW_OPCODE_ADD:
         brw_ADD(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MUL:
         brw_MUL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_AVG:
         brw_AVG(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MACH:
         brw_MACH(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_LINE:
         brw_LINE(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_MAD:
         assert(devinfo->gen >= 6);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_MAD(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_LRP:
         assert(devinfo->gen >= 6);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_LRP(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_FRC:
         brw_FRC(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDD:
         brw_RNDD(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDE:
         brw_RNDE(p, dst, src[0]);
         break;
      case BRW_OPCODE_RNDZ:
         brw_RNDZ(p, dst, src[0]);
         break;

      case BRW_OPCODE_AND:
         brw_AND(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_OR:
         brw_OR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_XOR:
         brw_XOR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_NOT:
         brw_NOT(p, dst, src[0]);
         break;
      case BRW_OPCODE_ASR:
         brw_ASR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHR:
         brw_SHR(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SHL:
         brw_SHL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_F32TO16:
         assert(devinfo->gen >= 7);
         brw_F32TO16(p, dst, src[0]);
         break;
      case BRW_OPCODE_F16TO32:
         assert(devinfo->gen >= 7);
         brw_F16TO32(p, dst, src[0]);
         break;
      case BRW_OPCODE_CMP:
         if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
             * implemented in the compiler is not sufficient. Overriding the
             * type when the destination is the null register is necessary but
             * not sufficient by itself.
             */
            assert(dst.nr == BRW_ARF_NULL);
            dst.type = BRW_REGISTER_TYPE_D;
         }
         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
         break;
      case BRW_OPCODE_SEL:
         brw_SEL(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFREV:
         assert(devinfo->gen >= 7);
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
                   retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_FBH:
         assert(devinfo->gen >= 7);
         brw_FBH(p, retype(dst, src[0].type), src[0]);
         break;
      case BRW_OPCODE_FBL:
         assert(devinfo->gen >= 7);
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD),
                 retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_LZD:
         brw_LZD(p, dst, src[0]);
         break;
      case BRW_OPCODE_CBIT:
         assert(devinfo->gen >= 7);
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD),
                  retype(src[0], BRW_REGISTER_TYPE_UD));
         break;
      case BRW_OPCODE_ADDC:
         assert(devinfo->gen >= 7);
         brw_ADDC(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_SUBB:
         assert(devinfo->gen >= 7);
         brw_SUBB(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_MAC:
         brw_MAC(p, dst, src[0], src[1]);
         break;

      case BRW_OPCODE_BFE:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFE(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_BFI1:
         assert(devinfo->gen >= 7);
         brw_BFI1(p, dst, src[0], src[1]);
         break;
      case BRW_OPCODE_BFI2:
         assert(devinfo->gen >= 7);
         if (devinfo->gen < 10)
            brw_set_default_access_mode(p, BRW_ALIGN_16);
         brw_BFI2(p, dst, src[0], src[1], src[2]);
         break;

      case BRW_OPCODE_IF:
         if (inst->src[0].file != BAD_FILE) {
            /* The instruction has an embedded compare (only allowed on gen6) */
            assert(devinfo->gen == 6);
            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
         } else {
            brw_IF(p, brw_inst_exec_size(devinfo, p->current));
         }
         break;

      case BRW_OPCODE_ELSE:
         brw_ELSE(p);
         break;
      case BRW_OPCODE_ENDIF:
         brw_ENDIF(p);
         break;

      case BRW_OPCODE_DO:
         brw_DO(p, brw_inst_exec_size(devinfo, p->current));
         break;

      case BRW_OPCODE_BREAK:
         brw_BREAK(p);
         break;
      case BRW_OPCODE_CONTINUE:
         brw_CONT(p);
         break;

      case BRW_OPCODE_WHILE:
         brw_WHILE(p);
         loop_count++;
         break;

      case SHADER_OPCODE_RCP:
      case SHADER_OPCODE_RSQ:
      case SHADER_OPCODE_SQRT:
      case SHADER_OPCODE_EXP2:
      case SHADER_OPCODE_LOG2:
      case SHADER_OPCODE_SIN:
      case SHADER_OPCODE_COS:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
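         /* On Gen6+ the extended math functions are native execution-unit
          * instructions taking plain register operands, hence mlen == 0; on
          * Gen4/5 they are messages to the shared math unit with operands
          * staged in MRFs, which is why a base MRF and a nonzero message
          * length are required there.
          */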
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert(devinfo->gen >= 7 || inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode),
                      src[0], brw_null_reg());
         } else {
            assert(inst->mlen >= 1);
            assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
            gen4_math(p, dst,
                      brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
         }
         break;
      case SHADER_OPCODE_INT_QUOTIENT:
      case SHADER_OPCODE_INT_REMAINDER:
      case SHADER_OPCODE_POW:
         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
         if (devinfo->gen >= 6) {
            assert(inst->mlen == 0);
            assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
                   inst->exec_size == 8);
            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
         } else {
            assert(inst->mlen >= 1);
            assert(inst->exec_size == 8);
            gen4_math(p, dst, brw_math_function(inst->opcode),
                      inst->base_mrf, src[0],
                      BRW_MATH_PRECISION_FULL);
         }
         break;
      case FS_OPCODE_CINTERP:
         brw_MOV(p, dst, src[0]);
         break;
      case FS_OPCODE_LINTERP:
         generate_linterp(inst, dst, src);
         break;
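      /* The per-channel pixel X/Y coordinates are delivered in the payload
       * as UW values, with each run of four X coordinates followed by the
       * corresponding four Y coordinates, so a <8;4,1> region starting at
       * the appropriate subregister extracts just the X (or just the Y)
       * values of every payload row.
       */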
      case FS_OPCODE_PIXEL_X:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 0 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_PIXEL_Y:
         assert(src[0].type == BRW_REGISTER_TYPE_UW);
         src[0].subnr = 4 * type_sz(src[0].type);
         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
         break;
      case FS_OPCODE_GET_BUFFER_SIZE:
         generate_get_buffer_size(inst, dst, src[0], src[1]);
         break;
      case SHADER_OPCODE_TEX:
      case FS_OPCODE_TXB:
      case SHADER_OPCODE_TXD:
      case SHADER_OPCODE_TXF:
      case SHADER_OPCODE_TXF_LZ:
      case SHADER_OPCODE_TXF_CMS:
      case SHADER_OPCODE_TXF_CMS_W:
      case SHADER_OPCODE_TXF_UMS:
      case SHADER_OPCODE_TXF_MCS:
      case SHADER_OPCODE_TXL:
      case SHADER_OPCODE_TXL_LZ:
      case SHADER_OPCODE_TXS:
      case SHADER_OPCODE_LOD:
      case SHADER_OPCODE_TG4:
      case SHADER_OPCODE_TG4_OFFSET:
      case SHADER_OPCODE_SAMPLEINFO:
         generate_tex(inst, dst, src[0], src[1], src[2]);
         break;
      case FS_OPCODE_DDX_COARSE:
      case FS_OPCODE_DDX_FINE:
         generate_ddx(inst->opcode, dst, src[0]);
         break;
      case FS_OPCODE_DDY_COARSE:
      case FS_OPCODE_DDY_FINE:
         generate_ddy(inst->opcode, dst, src[0]);
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         generate_scratch_write(inst, src[0]);
         spill_count++;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
         generate_scratch_read(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_GEN7_SCRATCH_READ:
         generate_scratch_read_gen7(inst, dst);
         fill_count++;
         break;

      case SHADER_OPCODE_MOV_INDIRECT:
         generate_mov_indirect(inst, dst, src[0], src[1]);
         break;

      case SHADER_OPCODE_URB_READ_SIMD8:
      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
         generate_urb_read(inst, dst, src[0]);
         break;

      case SHADER_OPCODE_URB_WRITE_SIMD8:
      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
         generate_urb_write(inst, src[0]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         assert(inst->force_writemask_all);
         generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
         generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
         break;

      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
         generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_REP_FB_WRITE:
      case FS_OPCODE_FB_WRITE:
         generate_fb_write(inst, src[0]);
         break;

      case FS_OPCODE_FB_READ:
         generate_fb_read(inst, dst, src[0]);
         break;

      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
         generate_mov_dispatch_to_flags(inst);
         break;

      case FS_OPCODE_DISCARD_JUMP:
         generate_discard_jump(inst);
         break;

      case SHADER_OPCODE_SHADER_TIME_ADD:
         generate_shader_time_add(inst, src[0], src[1], src[2]);
         break;

      case SHADER_OPCODE_UNTYPED_ATOMIC:
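         /* src[2] is the atomic operation encoded as an immediate; the last
          * argument requests a reply with the pre-operation value only when
          * the destination register is actually used.
          */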
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
                            inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_read(p, dst, src[0], src[1],
                                  inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_untyped_surface_write(p, src[0], src[1],
                                   inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_ATOMIC:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_atomic(p, dst, src[0], src[1],
                          src[2].ud, inst->mlen, !inst->dst.is_null());
         break;

      case SHADER_OPCODE_TYPED_SURFACE_READ:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_read(p, dst, src[0], src[1],
                                inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
         assert(src[2].file == BRW_IMMEDIATE_VALUE);
         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
         break;

      case SHADER_OPCODE_MEMORY_FENCE:
         brw_memory_fence(p, dst);
         break;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
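         /* Packed dispatch guarantees that every channel is enabled, so an
          * all-ones mask suffices; otherwise use the hardware execution
          * mask appropriate to the stage: the vector mask for fragment
          * shaders and the dispatch mask elsewhere.
          */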
         const struct brw_reg mask =
            brw_stage_has_packed_dispatch(devinfo, stage,
                                          prog_data) ? brw_imm_ud(~0u) :
            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
            brw_dmask_reg();
         brw_find_live_channel(p, dst, mask);
         break;
      }

      case SHADER_OPCODE_BROADCAST:
         assert(inst->force_writemask_all);
         brw_broadcast(p, dst, src[0], src[1]);
         break;

      case FS_OPCODE_SET_SAMPLE_ID:
         generate_set_sample_id(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
         generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
         break;

      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
         generate_unpack_half_2x16_split(inst, dst, src[0]);
         break;

      case FS_OPCODE_PLACEHOLDER_HALT:
         /* This is the place where the final HALT needs to be inserted if
          * we've emitted any discards. If not, this will emit no code.
          */
         if (!patch_discard_jumps_to_fb_writes()) {
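            /* No HALT was emitted for this IR instruction, so drop the
             * annotation that was created for it above to keep the
             * annotations in sync with the generated code.
             */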
            if (unlikely(debug_flag)) {
               annotation.ann_count--;
            }
         }
         break;

      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
         break;

      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
         break;

      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
         break;

      case CS_OPCODE_CS_TERMINATE:
         generate_cs_terminate(inst, src[0]);
         break;

      case SHADER_OPCODE_BARRIER:
         generate_barrier(inst, src[0]);
         break;

      case BRW_OPCODE_DIM:
         assert(devinfo->is_haswell);
         assert(src[0].type == BRW_REGISTER_TYPE_DF);
         assert(dst.type == BRW_REGISTER_TYPE_DF);
         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
         break;

      default:
         unreachable("Unsupported opcode");

      case SHADER_OPCODE_LOAD_PAYLOAD:
         unreachable("Should be lowered by lower_load_payload()");
      }

      if (multiple_instructions_emitted)
         continue;

      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
         assert(p->next_insn_offset == last_insn_offset + 16 ||
                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
                 "emitting more than 1 instruction");

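         /* Uncompacted Gen instructions are 16 bytes each (compaction only
          * happens further below), so dividing the byte offset by 16 indexes
          * the instruction store directly.
          */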
         brw_inst *last = &p->store[last_insn_offset / 16];

         if (inst->conditional_mod)
            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
      }
   }

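   /* Now that the offset of every instruction is final, fix up the JIP/UIP
    * jump targets of the flow-control instructions emitted above.
    */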
   brw_set_uip_jip(p, start_offset);
   annotation_finalize(&annotation, p->next_insn_offset);

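   /* In debug builds the result of validation is captured so the assert
    * below can fire; in release builds validation only runs when debugging
    * output is enabled and its result is discarded.
    */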
#ifndef NDEBUG
   bool validated =
#else
   if (unlikely(debug_flag))
#endif
      brw_validate_instructions(devinfo, p->store,
                                start_offset,
                                p->next_insn_offset,
                                &annotation);

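   /* brw_compact_instructions() rewrites eligible instructions into the
    * 8-byte compacted encoding, so the difference between these two sizes
    * is the space saved, as reported in the debug output below.
    */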
   int before_size = p->next_insn_offset - start_offset;
   brw_compact_instructions(p, start_offset, annotation.ann_count,
                            annotation.ann);
   int after_size = p->next_insn_offset - start_offset;

   if (unlikely(debug_flag)) {
      fprintf(stderr, "Native code for %s\n"
              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
              " bytes (%.0f%%)\n",
              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
              spill_count, fill_count, promoted_constants, before_size, after_size,
              100.0f * (before_size - after_size) / before_size);

      dump_assembly(p->store, annotation.ann_count, annotation.ann,
                    p->devinfo);
      ralloc_free(annotation.mem_ctx);
   }
   assert(validated);

   compiler->shader_debug_log(log_data,
                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
                              "%d:%d spills:fills, Promoted %u constants, "
                              "compacted %d to %d bytes.",
                              _mesa_shader_stage_to_abbrev(stage),
                              dispatch_width, before_size / 16,
                              loop_count, cfg->cycle_count, spill_count,
                              fill_count, promoted_constants, before_size,
                              after_size);

   return start_offset;
}

const unsigned *
fs_generator::get_assembly(unsigned int *assembly_size)
{
   return brw_get_program(p, assembly_size);
}