i965: Add HiZ operation state to brw_context
[mesa.git] / src / mesa / drivers / dri / i965 / brw_optimize.c
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 *
23 * Authors:
24 * Eric Anholt <eric@anholt.net>
25 *
26 */
27
28 #include "main/macros.h"
29 #include "program/program.h"
30 #include "program/prog_print.h"
31 #include "brw_context.h"
32 #include "brw_defines.h"
33 #include "brw_eu.h"
34
35 const struct brw_instruction_info brw_opcodes[128] = {
36 [BRW_OPCODE_MOV] = { .name = "mov", .nsrc = 1, .ndst = 1, .is_arith = 1 },
37 [BRW_OPCODE_FRC] = { .name = "frc", .nsrc = 1, .ndst = 1, .is_arith = 1 },
38 [BRW_OPCODE_RNDU] = { .name = "rndu", .nsrc = 1, .ndst = 1, .is_arith = 1 },
39 [BRW_OPCODE_RNDD] = { .name = "rndd", .nsrc = 1, .ndst = 1, .is_arith = 1 },
40 [BRW_OPCODE_RNDE] = { .name = "rnde", .nsrc = 1, .ndst = 1, .is_arith = 1 },
41 [BRW_OPCODE_RNDZ] = { .name = "rndz", .nsrc = 1, .ndst = 1, .is_arith = 1 },
42 [BRW_OPCODE_NOT] = { .name = "not", .nsrc = 1, .ndst = 1, .is_arith = 1 },
43 [BRW_OPCODE_LZD] = { .name = "lzd", .nsrc = 1, .ndst = 1 },
44
45 [BRW_OPCODE_MUL] = { .name = "mul", .nsrc = 2, .ndst = 1, .is_arith = 1 },
46 [BRW_OPCODE_MAC] = { .name = "mac", .nsrc = 2, .ndst = 1, .is_arith = 1 },
47 [BRW_OPCODE_MACH] = { .name = "mach", .nsrc = 2, .ndst = 1, .is_arith = 1 },
48 [BRW_OPCODE_LINE] = { .name = "line", .nsrc = 2, .ndst = 1, .is_arith = 1 },
49 [BRW_OPCODE_PLN] = { .name = "pln", .nsrc = 2, .ndst = 1 },
50 [BRW_OPCODE_SAD2] = { .name = "sad2", .nsrc = 2, .ndst = 1 },
51 [BRW_OPCODE_SADA2] = { .name = "sada2", .nsrc = 2, .ndst = 1 },
52 [BRW_OPCODE_DP4] = { .name = "dp4", .nsrc = 2, .ndst = 1 },
53 [BRW_OPCODE_DPH] = { .name = "dph", .nsrc = 2, .ndst = 1 },
54 [BRW_OPCODE_DP3] = { .name = "dp3", .nsrc = 2, .ndst = 1 },
55 [BRW_OPCODE_DP2] = { .name = "dp2", .nsrc = 2, .ndst = 1 },
56 [BRW_OPCODE_MATH] = { .name = "math", .nsrc = 2, .ndst = 1 },
57
58 [BRW_OPCODE_AVG] = { .name = "avg", .nsrc = 2, .ndst = 1, .is_arith = 1 },
59 [BRW_OPCODE_ADD] = { .name = "add", .nsrc = 2, .ndst = 1, .is_arith = 1 },
60 [BRW_OPCODE_SEL] = { .name = "sel", .nsrc = 2, .ndst = 1, .is_arith = 1 },
61 [BRW_OPCODE_AND] = { .name = "and", .nsrc = 2, .ndst = 1, .is_arith = 1 },
62 [BRW_OPCODE_OR] = { .name = "or", .nsrc = 2, .ndst = 1, .is_arith = 1 },
63 [BRW_OPCODE_XOR] = { .name = "xor", .nsrc = 2, .ndst = 1, .is_arith = 1 },
64 [BRW_OPCODE_SHR] = { .name = "shr", .nsrc = 2, .ndst = 1, .is_arith = 1 },
65 [BRW_OPCODE_SHL] = { .name = "shl", .nsrc = 2, .ndst = 1, .is_arith = 1 },
66 [BRW_OPCODE_ASR] = { .name = "asr", .nsrc = 2, .ndst = 1 },
67 [BRW_OPCODE_CMP] = { .name = "cmp", .nsrc = 2, .ndst = 1 },
68 [BRW_OPCODE_CMPN] = { .name = "cmpn", .nsrc = 2, .ndst = 1 },
69
70 [BRW_OPCODE_SEND] = { .name = "send", .nsrc = 1, .ndst = 1 },
71 [BRW_OPCODE_NOP] = { .name = "nop", .nsrc = 0, .ndst = 0 },
72 [BRW_OPCODE_JMPI] = { .name = "jmpi", .nsrc = 1, .ndst = 0 },
73 [BRW_OPCODE_IF] = { .name = "if", .nsrc = 2, .ndst = 0 },
74 [BRW_OPCODE_IFF] = { .name = "iff", .nsrc = 2, .ndst = 1 },
75 [BRW_OPCODE_WHILE] = { .name = "while", .nsrc = 2, .ndst = 0 },
76 [BRW_OPCODE_ELSE] = { .name = "else", .nsrc = 2, .ndst = 0 },
77 [BRW_OPCODE_BREAK] = { .name = "break", .nsrc = 2, .ndst = 0 },
78 [BRW_OPCODE_CONTINUE] = { .name = "cont", .nsrc = 1, .ndst = 0 },
79 [BRW_OPCODE_HALT] = { .name = "halt", .nsrc = 1, .ndst = 0 },
80 [BRW_OPCODE_MSAVE] = { .name = "msave", .nsrc = 1, .ndst = 1 },
81 [BRW_OPCODE_PUSH] = { .name = "push", .nsrc = 1, .ndst = 1 },
82 [BRW_OPCODE_MRESTORE] = { .name = "mrest", .nsrc = 1, .ndst = 1 },
83 [BRW_OPCODE_POP] = { .name = "pop", .nsrc = 2, .ndst = 0 },
84 [BRW_OPCODE_WAIT] = { .name = "wait", .nsrc = 1, .ndst = 0 },
85 [BRW_OPCODE_DO] = { .name = "do", .nsrc = 0, .ndst = 0 },
86 [BRW_OPCODE_ENDIF] = { .name = "endif", .nsrc = 2, .ndst = 0 },
87 };
88
89 static INLINE
90 bool brw_is_arithmetic_inst(const struct brw_instruction *inst)
91 {
92 return brw_opcodes[inst->header.opcode].is_arith;
93 }
94
95 static const GLuint inst_stride[7] = {
96 [0] = 0,
97 [1] = 1,
98 [2] = 2,
99 [3] = 4,
100 [4] = 8,
101 [5] = 16,
102 [6] = 32
103 };
104
105 static const GLuint inst_type_size[8] = {
106 [BRW_REGISTER_TYPE_UD] = 4,
107 [BRW_REGISTER_TYPE_D] = 4,
108 [BRW_REGISTER_TYPE_UW] = 2,
109 [BRW_REGISTER_TYPE_W] = 2,
110 [BRW_REGISTER_TYPE_UB] = 1,
111 [BRW_REGISTER_TYPE_B] = 1,
112 [BRW_REGISTER_TYPE_F] = 4
113 };
114
115 static INLINE bool
116 brw_is_grf_written(const struct brw_instruction *inst,
117 int reg_index, int size,
118 int gen)
119 {
120 if (brw_opcodes[inst->header.opcode].ndst == 0)
121 return false;
122
123 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
124 if (inst->bits1.ia1.dest_reg_file == BRW_GENERAL_REGISTER_FILE)
125 return true;
126
127 if (inst->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
128 return false;
129
130 const int reg_start = reg_index * REG_SIZE;
131 const int reg_end = reg_start + size;
132
133 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
134 const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
135 + inst->bits1.da1.dest_subreg_nr;
136 int length, write_end;
137
138 /* SEND is specific */
139 if (inst->header.opcode == BRW_OPCODE_SEND) {
140 if (gen >= 5)
141 length = inst->bits3.generic_gen5.response_length*REG_SIZE;
142 else
143 length = inst->bits3.generic.response_length*REG_SIZE;
144 }
145 else {
146 length = 1 << inst->header.execution_size;
147 length *= type_size;
148 length *= inst->bits1.da1.dest_horiz_stride;
149 }
150
151 /* If the two intervals intersect, we overwrite the register */
152 write_end = write_start + length;
153 const int left = MAX2(write_start, reg_start);
154 const int right = MIN2(write_end, reg_end);
155
156 return left < right;
157 }
158
159 static bool
160 brw_is_mrf_written_alu(const struct brw_instruction *inst,
161 int reg_index, int size)
162 {
163 if (brw_opcodes[inst->header.opcode].ndst == 0)
164 return false;
165
166 if (inst->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE)
167 return false;
168
169 if (inst->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT)
170 return true;
171
172 const int reg_start = reg_index * REG_SIZE;
173 const int reg_end = reg_start + size;
174
175 const int mrf_index = inst->bits1.da1.dest_reg_nr & 0x0f;
176 const int is_compr4 = inst->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4;
177 const int type_size = inst_type_size[inst->bits1.da1.dest_reg_type];
178
179 /* We use compr4 with a size != 16 elements. Strange, we conservatively
180 * consider that we are writing the register.
181 */
182 if (is_compr4 && inst->header.execution_size != BRW_EXECUTE_16)
183 return true;
184
185 /* Here we write mrf_{i} and mrf_{i+4}. So we read two times 8 elements */
186 if (is_compr4) {
187 const int length = 8 * type_size * inst->bits1.da1.dest_horiz_stride;
188
189 /* First 8-way register */
190 const int write_start0 = mrf_index*REG_SIZE
191 + inst->bits1.da1.dest_subreg_nr;
192 const int write_end0 = write_start0 + length;
193
194 /* Second 8-way register */
195 const int write_start1 = (mrf_index+4)*REG_SIZE
196 + inst->bits1.da1.dest_subreg_nr;
197 const int write_end1 = write_start1 + length;
198
199 /* If the two intervals intersect, we overwrite the register */
200 const int left0 = MAX2(write_start0, reg_start);
201 const int right0 = MIN2(write_end0, reg_end);
202 const int left1 = MAX2(write_start1, reg_start);
203 const int right1 = MIN2(write_end1, reg_end);
204
205 if (left0 < right0 || left1 < right1)
206 return true;
207 }
208 else {
209 int length;
210 length = 1 << inst->header.execution_size;
211 length *= type_size;
212 length *= inst->bits1.da1.dest_horiz_stride;
213
214 /* If the two intervals intersect, we write into the register */
215 const int write_start = inst->bits1.da1.dest_reg_nr*REG_SIZE
216 + inst->bits1.da1.dest_subreg_nr;
217 const int write_end = write_start + length;
218 const int left = MAX2(write_start, reg_start);
219 const int right = MIN2(write_end, reg_end);
220
221 if (left < right)
222 return true;
223 }
224
225 return false;
226 }
227
228 /* SEND may perform an implicit mov to a mrf register */
229 static bool
230 brw_is_mrf_written_send(const struct brw_instruction *inst,
231 int reg_index, int size)
232 {
233
234 const int reg_start = reg_index * REG_SIZE;
235 const int reg_end = reg_start + size;
236 const int mrf_start = inst->header.destreg__conditionalmod;
237 const int write_start = mrf_start * REG_SIZE;
238 const int write_end = write_start + REG_SIZE;
239 const int left = MAX2(write_start, reg_start);
240 const int right = MIN2(write_end, reg_end);
241
242 if (inst->header.opcode != BRW_OPCODE_SEND ||
243 inst->bits1.da1.src0_reg_file == 0)
244 return false;
245
246 return left < right;
247 }
248
249 /* Specific path for message register since we need to handle the compr4 case */
250 static INLINE bool
251 brw_is_mrf_written(const struct brw_instruction *inst, int reg_index, int size)
252 {
253 return (brw_is_mrf_written_alu(inst, reg_index, size) ||
254 brw_is_mrf_written_send(inst, reg_index, size));
255 }
256
257 static INLINE bool
258 brw_is_mrf_read(const struct brw_instruction *inst,
259 int reg_index, int size, int gen)
260 {
261 if (inst->header.opcode != BRW_OPCODE_SEND)
262 return false;
263 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
264 return true;
265
266 const int reg_start = reg_index*REG_SIZE;
267 const int reg_end = reg_start + size;
268
269 int length, read_start, read_end;
270 if (gen >= 5)
271 length = inst->bits3.generic_gen5.msg_length*REG_SIZE;
272 else
273 length = inst->bits3.generic.msg_length*REG_SIZE;
274
275 /* Look if SEND uses an implicit mov. In that case, we read one less register
276 * (but we write it)
277 */
278 if (inst->bits1.da1.src0_reg_file != 0)
279 read_start = inst->header.destreg__conditionalmod;
280 else {
281 length--;
282 read_start = inst->header.destreg__conditionalmod + 1;
283 }
284 read_start *= REG_SIZE;
285 read_end = read_start + length;
286
287 const int left = MAX2(read_start, reg_start);
288 const int right = MIN2(read_end, reg_end);
289
290 return left < right;
291 }
292
293 static INLINE bool
294 brw_is_grf_read(const struct brw_instruction *inst, int reg_index, int size)
295 {
296 int i, j;
297 if (brw_opcodes[inst->header.opcode].nsrc == 0)
298 return false;
299
300 /* Look at first source. We must take into account register regions to
301 * monitor carefully the read. Note that we are a bit too conservative here
302 * since we do not take into account the fact that some complete registers
303 * may be skipped
304 */
305 if (brw_opcodes[inst->header.opcode].nsrc >= 1) {
306
307 if (inst->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT)
308 if (inst->bits1.ia1.src0_reg_file == BRW_GENERAL_REGISTER_FILE)
309 return true;
310 if (inst->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE)
311 return false;
312
313 const int reg_start = reg_index*REG_SIZE;
314 const int reg_end = reg_start + size;
315
316 /* See if at least one of this element intersects the interval */
317 const int type_size = inst_type_size[inst->bits1.da1.src0_reg_type];
318 const int elem_num = 1 << inst->header.execution_size;
319 const int width = 1 << inst->bits2.da1.src0_width;
320 const int row_num = elem_num >> inst->bits2.da1.src0_width;
321 const int hs = type_size*inst_stride[inst->bits2.da1.src0_horiz_stride];
322 const int vs = type_size*inst_stride[inst->bits2.da1.src0_vert_stride];
323 int row_start = inst->bits2.da1.src0_reg_nr*REG_SIZE
324 + inst->bits2.da1.src0_subreg_nr;
325 for (j = 0; j < row_num; ++j) {
326 int write_start = row_start;
327 for (i = 0; i < width; ++i) {
328 const int write_end = write_start + type_size;
329 const int left = write_start > reg_start ? write_start : reg_start;
330 const int right = write_end < reg_end ? write_end : reg_end;
331 if (left < right)
332 return true;
333 write_start += hs;
334 }
335 row_start += vs;
336 }
337 }
338
339 /* Second src register */
340 if (brw_opcodes[inst->header.opcode].nsrc >= 2) {
341
342 if (inst->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT)
343 if (inst->bits1.ia1.src1_reg_file == BRW_GENERAL_REGISTER_FILE)
344 return true;
345 if (inst->bits1.da1.src1_reg_file != BRW_GENERAL_REGISTER_FILE)
346 return false;
347
348 const int reg_start = reg_index*REG_SIZE;
349 const int reg_end = reg_start + size;
350
351 /* See if at least one of this element intersects the interval */
352 const int type_size = inst_type_size[inst->bits1.da1.src1_reg_type];
353 const int elem_num = 1 << inst->header.execution_size;
354 const int width = 1 << inst->bits3.da1.src1_width;
355 const int row_num = elem_num >> inst->bits3.da1.src1_width;
356 const int hs = type_size*inst_stride[inst->bits3.da1.src1_horiz_stride];
357 const int vs = type_size*inst_stride[inst->bits3.da1.src1_vert_stride];
358 int row_start = inst->bits3.da1.src1_reg_nr*REG_SIZE
359 + inst->bits3.da1.src1_subreg_nr;
360 for (j = 0; j < row_num; ++j) {
361 int write_start = row_start;
362 for (i = 0; i < width; ++i) {
363 const int write_end = write_start + type_size;
364 const int left = write_start > reg_start ? write_start : reg_start;
365 const int right = write_end < reg_end ? write_end : reg_end;
366 if (left < right)
367 return true;
368 write_start += hs;
369 }
370 row_start += vs;
371 }
372 }
373
374 return false;
375 }
376
377 static INLINE bool
378 brw_is_control_done(const struct brw_instruction *mov) {
379 return
380 mov->header.dependency_control != 0 ||
381 mov->header.thread_control != 0 ||
382 mov->header.mask_control != 0 ||
383 mov->header.saturate != 0 ||
384 mov->header.debug_control != 0;
385 }
386
387 static INLINE bool
388 brw_is_predicated(const struct brw_instruction *mov) {
389 return mov->header.predicate_control != 0;
390 }
391
392 static INLINE bool
393 brw_is_grf_to_mrf_mov(const struct brw_instruction *mov,
394 int *mrf_index,
395 int *grf_index,
396 bool *is_compr4)
397 {
398 if (brw_is_predicated(mov) ||
399 brw_is_control_done(mov) ||
400 mov->header.debug_control != 0)
401 return false;
402
403 if (mov->bits1.da1.dest_address_mode != BRW_ADDRESS_DIRECT ||
404 mov->bits1.da1.dest_reg_file != BRW_MESSAGE_REGISTER_FILE ||
405 mov->bits1.da1.dest_reg_type != BRW_REGISTER_TYPE_F ||
406 mov->bits1.da1.dest_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
407 mov->bits1.da1.dest_subreg_nr != 0)
408 return false;
409
410 if (mov->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
411 mov->bits1.da1.src0_reg_file != BRW_GENERAL_REGISTER_FILE ||
412 mov->bits1.da1.src0_reg_type != BRW_REGISTER_TYPE_F ||
413 mov->bits2.da1.src0_width != BRW_WIDTH_8 ||
414 mov->bits2.da1.src0_horiz_stride != BRW_HORIZONTAL_STRIDE_1 ||
415 mov->bits2.da1.src0_vert_stride != BRW_VERTICAL_STRIDE_8 ||
416 mov->bits2.da1.src0_subreg_nr != 0 ||
417 mov->bits2.da1.src0_abs != 0 ||
418 mov->bits2.da1.src0_negate != 0)
419 return false;
420
421 *grf_index = mov->bits2.da1.src0_reg_nr;
422 *mrf_index = mov->bits1.da1.dest_reg_nr & 0x0f;
423 *is_compr4 = (mov->bits1.da1.dest_reg_nr & BRW_MRF_COMPR4) != 0;
424 return true;
425 }
426
427 static INLINE bool
428 brw_is_grf_straight_write(const struct brw_instruction *inst, int grf_index)
429 {
430 /* remark: no problem to predicate a SEL instruction */
431 if ((!brw_is_predicated(inst) || inst->header.opcode == BRW_OPCODE_SEL) &&
432 brw_is_control_done(inst) == false &&
433 inst->header.execution_size == 4 &&
434 inst->header.access_mode == BRW_ALIGN_1 &&
435 inst->bits1.da1.dest_address_mode == BRW_ADDRESS_DIRECT &&
436 inst->bits1.da1.dest_reg_file == BRW_GENERAL_REGISTER_FILE &&
437 inst->bits1.da1.dest_reg_type == BRW_REGISTER_TYPE_F &&
438 inst->bits1.da1.dest_horiz_stride == BRW_HORIZONTAL_STRIDE_1 &&
439 inst->bits1.da1.dest_reg_nr == grf_index &&
440 inst->bits1.da1.dest_subreg_nr == 0 &&
441 brw_is_arithmetic_inst(inst))
442 return true;
443
444 return false;
445 }
446
447 static INLINE bool
448 brw_inst_are_equal(const struct brw_instruction *src0,
449 const struct brw_instruction *src1)
450 {
451 const GLuint *field0 = (GLuint *) src0;
452 const GLuint *field1 = (GLuint *) src1;
453 return field0[0] == field1[0] &&
454 field0[1] == field1[1] &&
455 field0[2] == field1[2] &&
456 field0[3] == field1[3];
457 }
458
459 static INLINE void
460 brw_inst_copy(struct brw_instruction *dst,
461 const struct brw_instruction *src)
462 {
463 GLuint *field_dst = (GLuint *) dst;
464 const GLuint *field_src = (GLuint *) src;
465 field_dst[0] = field_src[0];
466 field_dst[1] = field_src[1];
467 field_dst[2] = field_src[2];
468 field_dst[3] = field_src[3];
469 }
470
471 static void brw_remove_inst(struct brw_compile *p, const bool *removeInst)
472 {
473 int i, nr_insn = 0, to = 0, from = 0;
474
475 for (from = 0; from < p->nr_insn; ++from) {
476 if (removeInst[from])
477 continue;
478 if(to != from)
479 brw_inst_copy(p->store + to, p->store + from);
480 to++;
481 }
482
483 for (i = 0; i < p->nr_insn; ++i)
484 if (removeInst[i] == false)
485 nr_insn++;
486 p->nr_insn = nr_insn;
487 }
488
489 /* The gen code emitter generates a lot of duplications in the
490 * grf-to-mrf moves, for example when texture sampling with the same
491 * coordinates from multiple textures.. Here, we monitor same mov
492 * grf-to-mrf instrutions and remove repeated ones where the operands
493 * and dst ahven't changed in between.
494 */
495 void brw_remove_duplicate_mrf_moves(struct brw_compile *p)
496 {
497 const int gen = p->brw->intel.gen;
498 int i, j;
499
500 bool *removeInst = calloc(sizeof(bool), p->nr_insn);
501 for (i = 0; i < p->nr_insn; i++) {
502 if (removeInst[i])
503 continue;
504
505 const struct brw_instruction *mov = p->store + i;
506 int mrf_index, grf_index;
507 bool is_compr4;
508
509 /* Only consider _straight_ grf-to-mrf moves */
510 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
511 continue;
512
513 const int mrf_index0 = mrf_index;
514 const int mrf_index1 = is_compr4 ? mrf_index0+4 : mrf_index0+1;
515 const int simd16_size = 2 * REG_SIZE;
516
517 for (j = i + 1; j < p->nr_insn; j++) {
518 const struct brw_instruction *inst = p->store + j;
519
520 if (brw_inst_are_equal(mov, inst)) {
521 removeInst[j] = true;
522 continue;
523 }
524
525 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
526 brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
527 brw_is_mrf_written(inst, mrf_index1, REG_SIZE))
528 break;
529 }
530 }
531
532 brw_remove_inst(p, removeInst);
533 free(removeInst);
534 }
535
536 /* Replace moves to MRFs where the value moved is the result of a
537 * normal arithmetic operation with computation right into the MRF.
538 */
539 void brw_remove_grf_to_mrf_moves(struct brw_compile *p)
540 {
541 int i, j, prev;
542 struct brw_context *brw = p->brw;
543 const int gen = brw->intel.gen;
544 const int simd16_size = 2*REG_SIZE;
545
546 bool *removeInst = calloc(sizeof(bool), p->nr_insn);
547 assert(removeInst);
548
549 for (i = 0; i < p->nr_insn; i++) {
550 if (removeInst[i])
551 continue;
552
553 struct brw_instruction *grf_inst = NULL;
554 const struct brw_instruction *mov = p->store + i;
555 int mrf_index, grf_index;
556 bool is_compr4;
557
558 /* Only consider _straight_ grf-to-mrf moves */
559 if (!brw_is_grf_to_mrf_mov(mov, &mrf_index, &grf_index, &is_compr4))
560 continue;
561
562 /* Using comp4 enables a stride of 4 for this instruction */
563 const int mrf_index0 = mrf_index;
564 const int mrf_index1 = is_compr4 ? mrf_index+4 : mrf_index+1;
565
566 /* Look where the register has been set */
567 prev = i;
568 bool potential_remove = false;
569 while (prev--) {
570
571 /* If _one_ instruction writes the grf, we try to remove the mov */
572 struct brw_instruction *inst = p->store + prev;
573 if (brw_is_grf_straight_write(inst, grf_index)) {
574 potential_remove = true;
575 grf_inst = inst;
576 break;
577 }
578
579 }
580
581 if (potential_remove == false)
582 continue;
583 removeInst[i] = true;
584
585 /* Monitor first the section of code between the grf computation and the
586 * mov. Here we cannot read or write both mrf and grf register
587 */
588 for (j = prev + 1; j < i; ++j) {
589 struct brw_instruction *inst = p->store + j;
590 if (removeInst[j])
591 continue;
592 if (brw_is_grf_written(inst, grf_index, simd16_size, gen) ||
593 brw_is_grf_read(inst, grf_index, simd16_size) ||
594 brw_is_mrf_written(inst, mrf_index0, REG_SIZE) ||
595 brw_is_mrf_written(inst, mrf_index1, REG_SIZE) ||
596 brw_is_mrf_read(inst, mrf_index0, REG_SIZE, gen) ||
597 brw_is_mrf_read(inst, mrf_index1, REG_SIZE, gen)) {
598 removeInst[i] = false;
599 break;
600 }
601 }
602
603 /* After the mov, we can read or write the mrf. If the grf is overwritten,
604 * we are done
605 */
606 for (j = i + 1; j < p->nr_insn; ++j) {
607 struct brw_instruction *inst = p->store + j;
608 if (removeInst[j])
609 continue;
610
611 if (brw_is_grf_read(inst, grf_index, simd16_size)) {
612 removeInst[i] = false;
613 break;
614 }
615
616 if (brw_is_grf_straight_write(inst, grf_index))
617 break;
618 }
619
620 /* Note that with the top down traversal, we can safely pacth the mov
621 * instruction
622 */
623 if (removeInst[i]) {
624 grf_inst->bits1.da1.dest_reg_file = mov->bits1.da1.dest_reg_file;
625 grf_inst->bits1.da1.dest_reg_nr = mov->bits1.da1.dest_reg_nr;
626 }
627 }
628
629 brw_remove_inst(p, removeInst);
630 free(removeInst);
631 }
632
633 static bool
634 is_single_channel_dp4(struct brw_instruction *insn)
635 {
636 if (insn->header.opcode != BRW_OPCODE_DP4 ||
637 insn->header.execution_size != BRW_EXECUTE_8 ||
638 insn->header.access_mode != BRW_ALIGN_16 ||
639 insn->bits1.da1.dest_reg_file != BRW_GENERAL_REGISTER_FILE)
640 return false;
641
642 if (!is_power_of_two(insn->bits1.da16.dest_writemask))
643 return false;
644
645 return true;
646 }
647
648 /**
649 * Sets the dependency control fields on DP4 instructions.
650 *
651 * The hardware only tracks dependencies on a register basis, so when
652 * you do:
653 *
654 * DP4 dst.x src1 src2
655 * DP4 dst.y src1 src3
656 * DP4 dst.z src1 src4
657 * DP4 dst.w src1 src5
658 *
659 * It will wait to do the DP4 dst.y until the dst.x is resolved, etc.
660 * We can examine our instruction stream and set the dependency
661 * control fields to tell the hardware when to do it.
662 *
663 * We may want to extend this to other instructions that are used to
664 * fill in a channel at a time of the destination register.
665 */
666 static void
667 brw_set_dp4_dependency_control(struct brw_compile *p)
668 {
669 int i;
670
671 for (i = 1; i < p->nr_insn; i++) {
672 struct brw_instruction *insn = &p->store[i];
673 struct brw_instruction *prev = &p->store[i - 1];
674
675 if (!is_single_channel_dp4(prev))
676 continue;
677
678 if (!is_single_channel_dp4(insn)) {
679 i++;
680 continue;
681 }
682
683 /* Only avoid hw dep control if the write masks are different
684 * channels of one reg.
685 */
686 if (insn->bits1.da16.dest_writemask == prev->bits1.da16.dest_writemask)
687 continue;
688 if (insn->bits1.da16.dest_reg_nr != prev->bits1.da16.dest_reg_nr)
689 continue;
690
691 /* Check if the second instruction depends on the previous one
692 * for a src.
693 */
694 if (insn->bits1.da1.src0_reg_file == BRW_GENERAL_REGISTER_FILE &&
695 (insn->bits2.da1.src0_address_mode != BRW_ADDRESS_DIRECT ||
696 insn->bits2.da1.src0_reg_nr == insn->bits1.da16.dest_reg_nr))
697 continue;
698 if (insn->bits1.da1.src1_reg_file == BRW_GENERAL_REGISTER_FILE &&
699 (insn->bits3.da1.src1_address_mode != BRW_ADDRESS_DIRECT ||
700 insn->bits3.da1.src1_reg_nr == insn->bits1.da16.dest_reg_nr))
701 continue;
702
703 prev->header.dependency_control |= BRW_DEPENDENCY_NOTCLEARED;
704 insn->header.dependency_control |= BRW_DEPENDENCY_NOTCHECKED;
705 }
706 }
707
/* Entry point of the post-emit optimizations.  Only the DP4 dependency
 * control pass is run from here.
 */
void
brw_optimize(struct brw_compile *p)
{
   struct brw_compile *compile = p;

   brw_set_dp4_dependency_control(compile);
}