re PR other/35094 (RTL dump file letters hosed and partly undocumented)
[gcc.git] / gcc / tree-vect-generic.c
1 /* Lower vector operations to scalar operations.
2 Copyright (C) 2004, 2005, 2006, 2007 Free Software Foundation, Inc.
3
4 This file is part of GCC.
5
6 GCC is free software; you can redistribute it and/or modify it
7 under the terms of the GNU General Public License as published by the
8 Free Software Foundation; either version 3, or (at your option) any
9 later version.
10
11 GCC is distributed in the hope that it will be useful, but WITHOUT
12 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
13 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
14 for more details.
15
16 You should have received a copy of the GNU General Public License
17 along with GCC; see the file COPYING3. If not see
18 <http://www.gnu.org/licenses/>. */
19
20 #include "config.h"
21 #include "system.h"
22 #include "coretypes.h"
23 #include "tree.h"
24 #include "tm.h"
25 #include "rtl.h"
26 #include "expr.h"
27 #include "insn-codes.h"
28 #include "diagnostic.h"
29 #include "optabs.h"
30 #include "machmode.h"
31 #include "langhooks.h"
32 #include "tree-flow.h"
33 #include "tree-gimple.h"
34 #include "tree-iterator.h"
35 #include "tree-pass.h"
36 #include "flags.h"
37 #include "ggc.h"
38
39
40 /* Build a constant of type TYPE, made of VALUE's bits replicated
41 every TYPE_SIZE (INNER_TYPE) bits to fit TYPE's precision. */
42 static tree
43 build_replicated_const (tree type, tree inner_type, HOST_WIDE_INT value)
44 {
45 int width = tree_low_cst (TYPE_SIZE (inner_type), 1);
46 int n = HOST_BITS_PER_WIDE_INT / width;
47 unsigned HOST_WIDE_INT low, high, mask;
48 tree ret;
49
50 gcc_assert (n);
51
52 if (width == HOST_BITS_PER_WIDE_INT)
53 low = value;
54 else
55 {
56 mask = ((HOST_WIDE_INT)1 << width) - 1;
57 low = (unsigned HOST_WIDE_INT) ~0 / mask * (value & mask);
58 }
59
60 if (TYPE_PRECISION (type) < HOST_BITS_PER_WIDE_INT)
61 low &= ((HOST_WIDE_INT)1 << TYPE_PRECISION (type)) - 1, high = 0;
62 else if (TYPE_PRECISION (type) == HOST_BITS_PER_WIDE_INT)
63 high = 0;
64 else if (TYPE_PRECISION (type) == 2 * HOST_BITS_PER_WIDE_INT)
65 high = low;
66 else
67 gcc_unreachable ();
68
69 ret = build_int_cst_wide (type, low, high);
70 return ret;
71 }
72
73 static GTY(()) tree vector_inner_type;
74 static GTY(()) tree vector_last_type;
75 static GTY(()) int vector_last_nunits;
76
77 /* Return a suitable vector types made of SUBPARTS units each of mode
78 "word_mode" (the global variable). */
79 static tree
80 build_word_mode_vector_type (int nunits)
81 {
82 if (!vector_inner_type)
83 vector_inner_type = lang_hooks.types.type_for_mode (word_mode, 1);
84 else if (vector_last_nunits == nunits)
85 {
86 gcc_assert (TREE_CODE (vector_last_type) == VECTOR_TYPE);
87 return vector_last_type;
88 }
89
90 /* We build a new type, but we canonicalize it nevertheless,
91 because it still saves some memory. */
92 vector_last_nunits = nunits;
93 vector_last_type = type_hash_canon (nunits,
94 build_vector_type (vector_inner_type,
95 nunits));
96 return vector_last_type;
97 }
98
99 typedef tree (*elem_op_func) (block_stmt_iterator *,
100 tree, tree, tree, tree, tree, enum tree_code);
101
102 static inline tree
103 tree_vec_extract (block_stmt_iterator *bsi, tree type,
104 tree t, tree bitsize, tree bitpos)
105 {
106 if (bitpos)
107 return gimplify_build3 (bsi, BIT_FIELD_REF, type, t, bitsize, bitpos);
108 else
109 return gimplify_build1 (bsi, VIEW_CONVERT_EXPR, type, t);
110 }
111
112 static tree
113 do_unop (block_stmt_iterator *bsi, tree inner_type, tree a,
114 tree b ATTRIBUTE_UNUSED, tree bitpos, tree bitsize,
115 enum tree_code code)
116 {
117 a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
118 return gimplify_build1 (bsi, code, inner_type, a);
119 }
120
121 static tree
122 do_binop (block_stmt_iterator *bsi, tree inner_type, tree a, tree b,
123 tree bitpos, tree bitsize, enum tree_code code)
124 {
125 a = tree_vec_extract (bsi, inner_type, a, bitsize, bitpos);
126 b = tree_vec_extract (bsi, inner_type, b, bitsize, bitpos);
127 return gimplify_build2 (bsi, code, inner_type, a, b);
128 }
129
130 /* Expand vector addition to scalars. This does bit twiddling
131 in order to increase parallelism:
132
133 a + b = (((int) a & 0x7f7f7f7f) + ((int) b & 0x7f7f7f7f)) ^
134 (a ^ b) & 0x80808080
135
136 a - b = (((int) a | 0x80808080) - ((int) b & 0x7f7f7f7f)) ^
137 (a ^ ~b) & 0x80808080
138
139 -b = (0x80808080 - ((int) b & 0x7f7f7f7f)) ^ (~b & 0x80808080)
140
141 This optimization should be done only if 4 vector items or more
142 fit into a word. */
143 static tree
144 do_plus_minus (block_stmt_iterator *bsi, tree word_type, tree a, tree b,
145 tree bitpos ATTRIBUTE_UNUSED, tree bitsize ATTRIBUTE_UNUSED,
146 enum tree_code code)
147 {
148 tree inner_type = TREE_TYPE (TREE_TYPE (a));
149 unsigned HOST_WIDE_INT max;
150 tree low_bits, high_bits, a_low, b_low, result_low, signs;
151
152 max = GET_MODE_MASK (TYPE_MODE (inner_type));
153 low_bits = build_replicated_const (word_type, inner_type, max >> 1);
154 high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1));
155
156 a = tree_vec_extract (bsi, word_type, a, bitsize, bitpos);
157 b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos);
158
159 signs = gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, a, b);
160 b_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, b, low_bits);
161 if (code == PLUS_EXPR)
162 a_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, a, low_bits);
163 else
164 {
165 a_low = gimplify_build2 (bsi, BIT_IOR_EXPR, word_type, a, high_bits);
166 signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, signs);
167 }
168
169 signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits);
170 result_low = gimplify_build2 (bsi, code, word_type, a_low, b_low);
171 return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs);
172 }
173
174 static tree
175 do_negate (block_stmt_iterator *bsi, tree word_type, tree b,
176 tree unused ATTRIBUTE_UNUSED, tree bitpos ATTRIBUTE_UNUSED,
177 tree bitsize ATTRIBUTE_UNUSED,
178 enum tree_code code ATTRIBUTE_UNUSED)
179 {
180 tree inner_type = TREE_TYPE (TREE_TYPE (b));
181 HOST_WIDE_INT max;
182 tree low_bits, high_bits, b_low, result_low, signs;
183
184 max = GET_MODE_MASK (TYPE_MODE (inner_type));
185 low_bits = build_replicated_const (word_type, inner_type, max >> 1);
186 high_bits = build_replicated_const (word_type, inner_type, max & ~(max >> 1));
187
188 b = tree_vec_extract (bsi, word_type, b, bitsize, bitpos);
189
190 b_low = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, b, low_bits);
191 signs = gimplify_build1 (bsi, BIT_NOT_EXPR, word_type, b);
192 signs = gimplify_build2 (bsi, BIT_AND_EXPR, word_type, signs, high_bits);
193 result_low = gimplify_build2 (bsi, MINUS_EXPR, word_type, high_bits, b_low);
194 return gimplify_build2 (bsi, BIT_XOR_EXPR, word_type, result_low, signs);
195 }
196
197 /* Expand a vector operation to scalars, by using many operations
198 whose type is the vector type's inner type. */
199 static tree
200 expand_vector_piecewise (block_stmt_iterator *bsi, elem_op_func f,
201 tree type, tree inner_type,
202 tree a, tree b, enum tree_code code)
203 {
204 VEC(constructor_elt,gc) *v;
205 tree part_width = TYPE_SIZE (inner_type);
206 tree index = bitsize_int (0);
207 int nunits = TYPE_VECTOR_SUBPARTS (type);
208 int delta = tree_low_cst (part_width, 1)
209 / tree_low_cst (TYPE_SIZE (TREE_TYPE (type)), 1);
210 int i;
211
212 v = VEC_alloc(constructor_elt, gc, (nunits + delta - 1) / delta);
213 for (i = 0; i < nunits;
214 i += delta, index = int_const_binop (PLUS_EXPR, index, part_width, 0))
215 {
216 tree result = f (bsi, inner_type, a, b, index, part_width, code);
217 constructor_elt *ce = VEC_quick_push (constructor_elt, v, NULL);
218 ce->index = NULL_TREE;
219 ce->value = result;
220 }
221
222 return build_constructor (type, v);
223 }
224
225 /* Expand a vector operation to scalars with the freedom to use
226 a scalar integer type, or to use a different size for the items
227 in the vector type. */
228 static tree
229 expand_vector_parallel (block_stmt_iterator *bsi, elem_op_func f, tree type,
230 tree a, tree b,
231 enum tree_code code)
232 {
233 tree result, compute_type;
234 enum machine_mode mode;
235 int n_words = tree_low_cst (TYPE_SIZE_UNIT (type), 1) / UNITS_PER_WORD;
236
237 /* We have three strategies. If the type is already correct, just do
238 the operation an element at a time. Else, if the vector is wider than
239 one word, do it a word at a time; finally, if the vector is smaller
240 than one word, do it as a scalar. */
241 if (TYPE_MODE (TREE_TYPE (type)) == word_mode)
242 return expand_vector_piecewise (bsi, f,
243 type, TREE_TYPE (type),
244 a, b, code);
245 else if (n_words > 1)
246 {
247 tree word_type = build_word_mode_vector_type (n_words);
248 result = expand_vector_piecewise (bsi, f,
249 word_type, TREE_TYPE (word_type),
250 a, b, code);
251 result = gimplify_val (bsi, word_type, result);
252 }
253 else
254 {
255 /* Use a single scalar operation with a mode no wider than word_mode. */
256 mode = mode_for_size (tree_low_cst (TYPE_SIZE (type), 1), MODE_INT, 0);
257 compute_type = lang_hooks.types.type_for_mode (mode, 1);
258 result = f (bsi, compute_type, a, b, NULL_TREE, NULL_TREE, code);
259 }
260
261 return result;
262 }
263
264 /* Expand a vector operation to scalars; for integer types we can use
265 special bit twiddling tricks to do the sums a word at a time, using
266 function F_PARALLEL instead of F. These tricks are done only if
267 they can process at least four items, that is, only if the vector
268 holds at least four items and if a word can hold four items. */
269 static tree
270 expand_vector_addition (block_stmt_iterator *bsi,
271 elem_op_func f, elem_op_func f_parallel,
272 tree type, tree a, tree b, enum tree_code code)
273 {
274 int parts_per_word = UNITS_PER_WORD
275 / tree_low_cst (TYPE_SIZE_UNIT (TREE_TYPE (type)), 1);
276
277 if (INTEGRAL_TYPE_P (TREE_TYPE (type))
278 && parts_per_word >= 4
279 && TYPE_VECTOR_SUBPARTS (type) >= 4)
280 return expand_vector_parallel (bsi, f_parallel,
281 type, a, b, code);
282 else
283 return expand_vector_piecewise (bsi, f,
284 type, TREE_TYPE (type),
285 a, b, code);
286 }
287
288 static tree
289 expand_vector_operation (block_stmt_iterator *bsi, tree type, tree compute_type,
290 tree rhs, enum tree_code code)
291 {
292 enum machine_mode compute_mode = TYPE_MODE (compute_type);
293
294 /* If the compute mode is not a vector mode (hence we are not decomposing
295 a BLKmode vector to smaller, hardware-supported vectors), we may want
296 to expand the operations in parallel. */
297 if (GET_MODE_CLASS (compute_mode) != MODE_VECTOR_INT
298 && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FLOAT
299 && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_FRACT
300 && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UFRACT
301 && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_ACCUM
302 && GET_MODE_CLASS (compute_mode) != MODE_VECTOR_UACCUM)
303 switch (code)
304 {
305 case PLUS_EXPR:
306 case MINUS_EXPR:
307 if (!TYPE_OVERFLOW_TRAPS (type))
308 return expand_vector_addition (bsi, do_binop, do_plus_minus, type,
309 TREE_OPERAND (rhs, 0),
310 TREE_OPERAND (rhs, 1), code);
311 break;
312
313 case NEGATE_EXPR:
314 if (!TYPE_OVERFLOW_TRAPS (type))
315 return expand_vector_addition (bsi, do_unop, do_negate, type,
316 TREE_OPERAND (rhs, 0),
317 NULL_TREE, code);
318 break;
319
320 case BIT_AND_EXPR:
321 case BIT_IOR_EXPR:
322 case BIT_XOR_EXPR:
323 return expand_vector_parallel (bsi, do_binop, type,
324 TREE_OPERAND (rhs, 0),
325 TREE_OPERAND (rhs, 1), code);
326
327 case BIT_NOT_EXPR:
328 return expand_vector_parallel (bsi, do_unop, type,
329 TREE_OPERAND (rhs, 0),
330 NULL_TREE, code);
331
332 default:
333 break;
334 }
335
336 if (TREE_CODE_CLASS (code) == tcc_unary)
337 return expand_vector_piecewise (bsi, do_unop, type, compute_type,
338 TREE_OPERAND (rhs, 0),
339 NULL_TREE, code);
340 else
341 return expand_vector_piecewise (bsi, do_binop, type, compute_type,
342 TREE_OPERAND (rhs, 0),
343 TREE_OPERAND (rhs, 1), code);
344 }
345 \f
346 /* Return a type for the widest vector mode whose components are of mode
347 INNER_MODE, or NULL_TREE if none is found.
348 SATP is true for saturating fixed-point types. */
349
350 static tree
351 type_for_widest_vector_mode (enum machine_mode inner_mode, optab op, int satp)
352 {
353 enum machine_mode best_mode = VOIDmode, mode;
354 int best_nunits = 0;
355
356 if (SCALAR_FLOAT_MODE_P (inner_mode))
357 mode = MIN_MODE_VECTOR_FLOAT;
358 else if (SCALAR_FRACT_MODE_P (inner_mode))
359 mode = MIN_MODE_VECTOR_FRACT;
360 else if (SCALAR_UFRACT_MODE_P (inner_mode))
361 mode = MIN_MODE_VECTOR_UFRACT;
362 else if (SCALAR_ACCUM_MODE_P (inner_mode))
363 mode = MIN_MODE_VECTOR_ACCUM;
364 else if (SCALAR_UACCUM_MODE_P (inner_mode))
365 mode = MIN_MODE_VECTOR_UACCUM;
366 else
367 mode = MIN_MODE_VECTOR_INT;
368
369 for (; mode != VOIDmode; mode = GET_MODE_WIDER_MODE (mode))
370 if (GET_MODE_INNER (mode) == inner_mode
371 && GET_MODE_NUNITS (mode) > best_nunits
372 && optab_handler (op, mode)->insn_code != CODE_FOR_nothing)
373 best_mode = mode, best_nunits = GET_MODE_NUNITS (mode);
374
375 if (best_mode == VOIDmode)
376 return NULL_TREE;
377 else
378 {
379 /* For fixed-point modes, we need to pass satp as the 2nd parameter. */
380 if (ALL_FIXED_POINT_MODE_P (best_mode))
381 return lang_hooks.types.type_for_mode (best_mode, satp);
382
383 return lang_hooks.types.type_for_mode (best_mode, 1);
384 }
385 }
386
387 /* Process one statement. If we identify a vector operation, expand it. */
388
389 static void
390 expand_vector_operations_1 (block_stmt_iterator *bsi)
391 {
392 tree stmt = bsi_stmt (*bsi);
393 tree *p_lhs, *p_rhs, lhs, rhs, type, compute_type;
394 enum tree_code code;
395 enum machine_mode compute_mode;
396 optab op;
397
398 switch (TREE_CODE (stmt))
399 {
400 case RETURN_EXPR:
401 stmt = TREE_OPERAND (stmt, 0);
402 if (!stmt || TREE_CODE (stmt) != GIMPLE_MODIFY_STMT)
403 return;
404
405 /* FALLTHRU */
406
407 case GIMPLE_MODIFY_STMT:
408 p_lhs = &GIMPLE_STMT_OPERAND (stmt, 0);
409 p_rhs = &GIMPLE_STMT_OPERAND (stmt, 1);
410 lhs = *p_lhs;
411 rhs = *p_rhs;
412 break;
413
414 default:
415 return;
416 }
417
418 type = TREE_TYPE (rhs);
419 if (TREE_CODE (type) != VECTOR_TYPE)
420 return;
421
422 code = TREE_CODE (rhs);
423 if (TREE_CODE_CLASS (code) != tcc_unary
424 && TREE_CODE_CLASS (code) != tcc_binary)
425 return;
426
427 if (code == NOP_EXPR
428 || code == FLOAT_EXPR
429 || code == FIX_TRUNC_EXPR
430 || code == VIEW_CONVERT_EXPR)
431 return;
432
433 gcc_assert (code != CONVERT_EXPR);
434
435 /* The signedness is determined from input argument. */
436 if (code == VEC_UNPACK_FLOAT_HI_EXPR
437 || code == VEC_UNPACK_FLOAT_LO_EXPR)
438 type = TREE_TYPE (TREE_OPERAND (rhs, 0));
439
440 op = optab_for_tree_code (code, type);
441
442 /* For widening/narrowing vector operations, the relevant type is of the
443 arguments, not the widened result. VEC_UNPACK_FLOAT_*_EXPR is
444 calculated in the same way above. */
445 if (code == WIDEN_SUM_EXPR
446 || code == VEC_WIDEN_MULT_HI_EXPR
447 || code == VEC_WIDEN_MULT_LO_EXPR
448 || code == VEC_UNPACK_HI_EXPR
449 || code == VEC_UNPACK_LO_EXPR
450 || code == VEC_PACK_TRUNC_EXPR
451 || code == VEC_PACK_SAT_EXPR
452 || code == VEC_PACK_FIX_TRUNC_EXPR)
453 type = TREE_TYPE (TREE_OPERAND (rhs, 0));
454
455 /* Optabs will try converting a negation into a subtraction, so
456 look for it as well. TODO: negation of floating-point vectors
457 might be turned into an exclusive OR toggling the sign bit. */
458 if (op == NULL
459 && code == NEGATE_EXPR
460 && INTEGRAL_TYPE_P (TREE_TYPE (type)))
461 op = optab_for_tree_code (MINUS_EXPR, type);
462
463 /* For very wide vectors, try using a smaller vector mode. */
464 compute_type = type;
465 if (TYPE_MODE (type) == BLKmode && op)
466 {
467 tree vector_compute_type
468 = type_for_widest_vector_mode (TYPE_MODE (TREE_TYPE (type)), op,
469 TYPE_SATURATING (TREE_TYPE (type)));
470 if (vector_compute_type != NULL_TREE)
471 compute_type = vector_compute_type;
472 }
473
474 /* If we are breaking a BLKmode vector into smaller pieces,
475 type_for_widest_vector_mode has already looked into the optab,
476 so skip these checks. */
477 if (compute_type == type)
478 {
479 compute_mode = TYPE_MODE (compute_type);
480 if ((GET_MODE_CLASS (compute_mode) == MODE_VECTOR_INT
481 || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_FLOAT
482 || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_FRACT
483 || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_UFRACT
484 || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_ACCUM
485 || GET_MODE_CLASS (compute_mode) == MODE_VECTOR_UACCUM)
486 && op != NULL
487 && optab_handler (op, compute_mode)->insn_code != CODE_FOR_nothing)
488 return;
489 else
490 /* There is no operation in hardware, so fall back to scalars. */
491 compute_type = TREE_TYPE (type);
492 }
493
494 gcc_assert (code != VEC_LSHIFT_EXPR && code != VEC_RSHIFT_EXPR);
495 rhs = expand_vector_operation (bsi, type, compute_type, rhs, code);
496 if (useless_type_conversion_p (TREE_TYPE (lhs), TREE_TYPE (rhs)))
497 *p_rhs = rhs;
498 else
499 *p_rhs = gimplify_build1 (bsi, VIEW_CONVERT_EXPR, TREE_TYPE (lhs), rhs);
500
501 mark_stmt_modified (bsi_stmt (*bsi));
502 }
503 \f
504 /* Use this to lower vector operations introduced by the vectorizer,
505 if it may need the bit-twiddling tricks implemented in this file. */
506
507 static bool
508 gate_expand_vector_operations (void)
509 {
510 return flag_tree_vectorize != 0;
511 }
512
513 static unsigned int
514 expand_vector_operations (void)
515 {
516 block_stmt_iterator bsi;
517 basic_block bb;
518
519 FOR_EACH_BB (bb)
520 {
521 for (bsi = bsi_start (bb); !bsi_end_p (bsi); bsi_next (&bsi))
522 {
523 expand_vector_operations_1 (&bsi);
524 update_stmt_if_modified (bsi_stmt (bsi));
525 }
526 }
527 return 0;
528 }
529
530 struct gimple_opt_pass pass_lower_vector =
531 {
532 {
533 GIMPLE_PASS,
534 "veclower", /* name */
535 0, /* gate */
536 expand_vector_operations, /* execute */
537 NULL, /* sub */
538 NULL, /* next */
539 0, /* static_pass_number */
540 0, /* tv_id */
541 PROP_cfg, /* properties_required */
542 0, /* properties_provided */
543 0, /* properties_destroyed */
544 0, /* todo_flags_start */
545 TODO_dump_func | TODO_ggc_collect
546 | TODO_verify_stmts /* todo_flags_finish */
547 }
548 };
549
550 struct gimple_opt_pass pass_lower_vector_ssa =
551 {
552 {
553 GIMPLE_PASS,
554 "veclower2", /* name */
555 gate_expand_vector_operations, /* gate */
556 expand_vector_operations, /* execute */
557 NULL, /* sub */
558 NULL, /* next */
559 0, /* static_pass_number */
560 0, /* tv_id */
561 PROP_cfg, /* properties_required */
562 0, /* properties_provided */
563 0, /* properties_destroyed */
564 0, /* todo_flags_start */
565 TODO_dump_func | TODO_update_ssa /* todo_flags_finish */
566 | TODO_verify_ssa
567 | TODO_verify_stmts | TODO_verify_flow
568 }
569 };
570
571 #include "gt-tree-vect-generic.h"