From f24f4c15884bf1ee65a10e2f959842eec4198876 Mon Sep 17 00:00:00 2001
From: Richard Sandiford
Date: Thu, 19 Sep 2019 09:04:28 +0000
Subject: [PATCH] Rework constant subreg folds and handle more variable-length
 cases

This patch rewrites the way simplify_subreg handles constants.
It uses native_encode/native_decode routines similar to those used
by the tree-level handling of VIEW_CONVERT_EXPR, meaning that we can
move between rtx constants and their target memory image.

The main point of this patch is to support subregs of constant-length
vectors for VLA vectors, beyond the very simple cases that were already
handled.  For variable-length vectors, many of the new tests failed
before the patch.  The boolean side is tested more by the upcoming
SVE ACLE work.

2019-09-19  Richard Sandiford

gcc/
	* defaults.h (TARGET_UNIT): New macro.
	(target_unit): New type.
	* rtl.h (native_encode_rtx, native_decode_rtx)
	(native_decode_vector_rtx, subreg_size_lsb): Declare.
	(subreg_lsb_1): Turn into an inline wrapper around subreg_size_lsb.
	* rtlanal.c (subreg_lsb_1): Delete.
	(subreg_size_lsb): New function.
	* simplify-rtx.c: Include rtx-vector-builder.h.
	(simplify_immed_subreg): Delete.
	(native_encode_rtx, native_decode_vector_rtx, native_decode_rtx)
	(simplify_const_vector_byte_offset, simplify_const_vector_subreg): New
	functions.
	(simplify_subreg): Use them.
	(test_vector_subregs_modes, test_vector_subregs_repeating)
	(test_vector_subregs_fore_back, test_vector_subregs_stepped)
	(test_vector_subregs): New functions.
	(test_vector_ops): Call test_vector_subregs for integer vector
	modes with at least 2 elements.

From-SVN: r275959
---
 gcc/ChangeLog      |  21 ++
 gcc/defaults.h     |  14 +
 gcc/rtl.h          |  20 +-
 gcc/rtlanal.c      |  28 +-
 gcc/simplify-rtx.c | 902 +++++++++++++++++++++++++++++----------------
 5 files changed, 663 insertions(+), 322 deletions(-)

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index 92fdc63efc6..c51b6f65720 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,24 @@
+2019-09-19  Richard Sandiford
+
+	* defaults.h (TARGET_UNIT): New macro.
+	(target_unit): New type.
+	* rtl.h (native_encode_rtx, native_decode_rtx)
+	(native_decode_vector_rtx, subreg_size_lsb): Declare.
+	(subreg_lsb_1): Turn into an inline wrapper around subreg_size_lsb.
+	* rtlanal.c (subreg_lsb_1): Delete.
+	(subreg_size_lsb): New function.
+	* simplify-rtx.c: Include rtx-vector-builder.h.
+	(simplify_immed_subreg): Delete.
+	(native_encode_rtx, native_decode_vector_rtx, native_decode_rtx)
+	(simplify_const_vector_byte_offset, simplify_const_vector_subreg): New
+	functions.
+	(simplify_subreg): Use them.
+	(test_vector_subregs_modes, test_vector_subregs_repeating)
+	(test_vector_subregs_fore_back, test_vector_subregs_stepped)
+	(test_vector_subregs): New functions.
+	(test_vector_ops): Call test_vector_subregs for integer vector
+	modes with at least 2 elements.
+
 2019-09-19  Richard Biener
 
 	* tree-parloops.c (parloops_is_slp_reduction): Do not set
diff --git a/gcc/defaults.h b/gcc/defaults.h
index af7ea185f1e..72d4fba11a6 100644
--- a/gcc/defaults.h
+++ b/gcc/defaults.h
@@ -1459,4 +1459,18 @@ see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
 #define DWARF_GNAT_ENCODINGS_DEFAULT DWARF_GNAT_ENCODINGS_GDB
 #endif
 
+#ifndef USED_FOR_TARGET
+/* Done this way to keep gengtype happy.  */
+#if BITS_PER_UNIT == 8
+#define TARGET_UNIT uint8_t
+#elif BITS_PER_UNIT == 16
+#define TARGET_UNIT uint16_t
+#elif BITS_PER_UNIT == 32
+#define TARGET_UNIT uint32_t
+#else
+#error Unknown BITS_PER_UNIT
+#endif
+typedef TARGET_UNIT target_unit;
+#endif
+
 #endif /* ! GCC_DEFAULTS_H */
diff --git a/gcc/rtl.h b/gcc/rtl.h
index c054861f896..9cadac7a970 100644
--- a/gcc/rtl.h
+++ b/gcc/rtl.h
@@ -2406,12 +2406,30 @@ extern int rtx_cost (rtx, machine_mode, enum rtx_code, int, bool);
 extern int address_cost (rtx, machine_mode, addr_space_t, bool);
 extern void get_full_rtx_cost (rtx, machine_mode, enum rtx_code, int,
                                struct full_rtx_costs *);
+extern bool native_encode_rtx (machine_mode, rtx, vec<target_unit> &,
+                               unsigned int, unsigned int);
+extern rtx native_decode_rtx (machine_mode, vec<target_unit>,
+                              unsigned int);
+extern rtx native_decode_vector_rtx (machine_mode, vec<target_unit>,
+                                     unsigned int, unsigned int, unsigned int);
 extern poly_uint64 subreg_lsb (const_rtx);
-extern poly_uint64 subreg_lsb_1 (machine_mode, machine_mode, poly_uint64);
+extern poly_uint64 subreg_size_lsb (poly_uint64, poly_uint64, poly_uint64);
 extern poly_uint64 subreg_size_offset_from_lsb (poly_uint64, poly_uint64,
                                                 poly_uint64);
 extern bool read_modify_subreg_p (const_rtx);
 
+/* Given a subreg's OUTER_MODE, INNER_MODE, and SUBREG_BYTE, return the
+   bit offset at which the subreg begins (counting from the least significant
+   bit of the operand).  */
+
+inline poly_uint64
+subreg_lsb_1 (machine_mode outer_mode, machine_mode inner_mode,
+              poly_uint64 subreg_byte)
+{
+  return subreg_size_lsb (GET_MODE_SIZE (outer_mode),
+                          GET_MODE_SIZE (inner_mode), subreg_byte);
+}
+
 /* Return the subreg byte offset for a subreg whose outer mode is
    OUTER_MODE, whose inner mode is INNER_MODE, and where there are
    LSB_SHIFT *bits* between the lsb of the outer value and the lsb of
diff --git a/gcc/rtlanal.c b/gcc/rtlanal.c
index 3a72db7572d..28b399cfc44 100644
--- a/gcc/rtlanal.c
+++ b/gcc/rtlanal.c
@@ -3637,23 +3637,31 @@ loc_mentioned_in_p (rtx *loc, const_rtx in)
   return 0;
 }
 
-/* Helper function for subreg_lsb.  Given a subreg's OUTER_MODE, INNER_MODE,
-   and SUBREG_BYTE, return the bit offset where the subreg begins
-   (counting from the least significant bit of the operand).  */
+/* Reinterpret a subreg as a bit extraction from an integer and return
+   the position of the least significant bit of the extracted value.
+   In other words, if the extraction were performed as a shift right
+   and mask, return the number of bits to shift right.
+
+   The outer value of the subreg has OUTER_BYTES bytes and starts at
+   byte offset SUBREG_BYTE within an inner value of INNER_BYTES bytes.  */
 
 poly_uint64
-subreg_lsb_1 (machine_mode outer_mode,
-              machine_mode inner_mode,
-              poly_uint64 subreg_byte)
+subreg_size_lsb (poly_uint64 outer_bytes,
+                 poly_uint64 inner_bytes,
+                 poly_uint64 subreg_byte)
 {
   poly_uint64 subreg_end, trailing_bytes, byte_pos;
 
   /* A paradoxical subreg begins at bit position 0.
*/ - if (paradoxical_subreg_p (outer_mode, inner_mode)) - return 0; + gcc_checking_assert (ordered_p (outer_bytes, inner_bytes)); + if (maybe_gt (outer_bytes, inner_bytes)) + { + gcc_checking_assert (known_eq (subreg_byte, 0U)); + return 0; + } - subreg_end = subreg_byte + GET_MODE_SIZE (outer_mode); - trailing_bytes = GET_MODE_SIZE (inner_mode) - subreg_end; + subreg_end = subreg_byte + outer_bytes; + trailing_bytes = inner_bytes - subreg_end; if (WORDS_BIG_ENDIAN && BYTES_BIG_ENDIAN) byte_pos = trailing_bytes; else if (!WORDS_BIG_ENDIAN && !BYTES_BIG_ENDIAN) diff --git a/gcc/simplify-rtx.c b/gcc/simplify-rtx.c index 9359a3cdb4d..87ba337725b 100644 --- a/gcc/simplify-rtx.c +++ b/gcc/simplify-rtx.c @@ -6130,342 +6130,466 @@ simplify_ternary_operation (enum rtx_code code, machine_mode mode, return 0; } -/* Evaluate a SUBREG of a CONST_INT or CONST_WIDE_INT or CONST_DOUBLE - or CONST_FIXED or CONST_VECTOR, returning another CONST_INT or - CONST_WIDE_INT or CONST_DOUBLE or CONST_FIXED or CONST_VECTOR. +/* Try to calculate NUM_BYTES bytes of the target memory image of X, + starting at byte FIRST_BYTE. Return true on success and add the + bytes to BYTES, such that each byte has BITS_PER_UNIT bits and such + that the bytes follow target memory order. Leave BYTES unmodified + on failure. - Works by unpacking INNER_BYTES bytes of OP into a collection of 8-bit values - represented as a little-endian array of 'unsigned char', selecting by BYTE, - and then repacking them again for OUTERMODE. If OP is a CONST_VECTOR, - FIRST_ELEM is the number of the first element to extract, otherwise - FIRST_ELEM is ignored. */ + MODE is the mode of X. The caller must reserve NUM_BYTES bytes in + BYTES before calling this function. */ -static rtx -simplify_immed_subreg (fixed_size_mode outermode, rtx op, - machine_mode innermode, unsigned int byte, - unsigned int first_elem, unsigned int inner_bytes) +bool +native_encode_rtx (machine_mode mode, rtx x, vec &bytes, + unsigned int first_byte, unsigned int num_bytes) { - enum { - value_bit = 8, - value_mask = (1 << value_bit) - 1 - }; - unsigned char value[MAX_BITSIZE_MODE_ANY_MODE / value_bit]; - int value_start; - int i; - int elem; - - int num_elem; - rtx * elems; - int elem_bitsize; - rtx result_s = NULL; - rtvec result_v = NULL; - enum mode_class outer_class; - scalar_mode outer_submode; - int max_bitsize; + /* Check the mode is sensible. */ + gcc_assert (GET_MODE (x) == VOIDmode + ? is_a (mode) + : mode == GET_MODE (x)); - /* Some ports misuse CCmode. */ - if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (op)) - return op; + if (GET_CODE (x) == CONST_VECTOR) + { + /* CONST_VECTOR_ELT follows target memory order, so no shuffling + is necessary. The only complication is that MODE_VECTOR_BOOL + vectors can have several elements per byte. */ + unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), + GET_MODE_NUNITS (mode)); + unsigned int elt = first_byte * BITS_PER_UNIT / elt_bits; + if (elt_bits < BITS_PER_UNIT) + { + /* This is the only case in which elements can be smaller than + a byte. */ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); + for (unsigned int i = 0; i < num_bytes; ++i) + { + target_unit value = 0; + for (unsigned int j = 0; j < BITS_PER_UNIT; j += elt_bits) + { + value |= (INTVAL (CONST_VECTOR_ELT (x, elt)) & 1) << j; + elt += 1; + } + bytes.quick_push (value); + } + return true; + } - /* We have no way to represent a complex constant at the rtl level. 
*/ - if (COMPLEX_MODE_P (outermode)) - return NULL_RTX; + unsigned int start = bytes.length (); + unsigned int elt_bytes = GET_MODE_UNIT_SIZE (mode); + /* Make FIRST_BYTE relative to ELT. */ + first_byte %= elt_bytes; + while (num_bytes > 0) + { + /* Work out how many bytes we want from element ELT. */ + unsigned int chunk_bytes = MIN (num_bytes, elt_bytes - first_byte); + if (!native_encode_rtx (GET_MODE_INNER (mode), + CONST_VECTOR_ELT (x, elt), bytes, + first_byte, chunk_bytes)) + { + bytes.truncate (start); + return false; + } + elt += 1; + first_byte = 0; + num_bytes -= chunk_bytes; + } + return true; + } - /* We support any size mode. */ - max_bitsize = MAX (GET_MODE_BITSIZE (outermode), - inner_bytes * BITS_PER_UNIT); + /* All subsequent cases are limited to scalars. */ + scalar_mode smode; + if (!is_a (mode, &smode)) + return false; - /* Unpack the value. */ + /* Make sure that the region is in range. */ + unsigned int end_byte = first_byte + num_bytes; + unsigned int mode_bytes = GET_MODE_SIZE (smode); + gcc_assert (end_byte <= mode_bytes); - if (GET_CODE (op) == CONST_VECTOR) + if (CONST_SCALAR_INT_P (x)) { - num_elem = CEIL (inner_bytes, GET_MODE_UNIT_SIZE (innermode)); - elem_bitsize = GET_MODE_UNIT_BITSIZE (innermode); + /* The target memory layout is affected by both BYTES_BIG_ENDIAN + and WORDS_BIG_ENDIAN. Use the subreg machinery to get the lsb + position of each byte. */ + rtx_mode_t value (x, smode); + wide_int_ref value_wi (value); + for (unsigned int byte = first_byte; byte < end_byte; ++byte) + { + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, mode_bytes, byte).to_constant (); + /* Operate directly on the encoding rather than using + wi::extract_uhwi, so that we preserve the sign or zero + extension for modes that are not a whole number of bits in + size. (Zero extension is only used for the combination of + innermode == BImode && STORE_FLAG_VALUE == 1). */ + unsigned int elt = lsb / HOST_BITS_PER_WIDE_INT; + unsigned int shift = lsb % HOST_BITS_PER_WIDE_INT; + unsigned HOST_WIDE_INT uhwi = value_wi.elt (elt); + bytes.quick_push (uhwi >> shift); + } + return true; } - else + + if (CONST_DOUBLE_P (x)) { - num_elem = 1; - elem_bitsize = max_bitsize; + /* real_to_target produces an array of integers in target memory order. + All integers before the last one have 32 bits; the last one may + have 32 bits or fewer, depending on whether the mode bitsize + is divisible by 32. Each of these integers is then laid out + in target memory as any other integer would be. */ + long el32[MAX_BITSIZE_MODE_ANY_MODE / 32]; + real_to_target (el32, CONST_DOUBLE_REAL_VALUE (x), smode); + + /* The (maximum) number of target bytes per element of el32. */ + unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT; + gcc_assert (bytes_per_el32 != 0); + + /* Build up the integers in a similar way to the CONST_SCALAR_INT_P + handling above. */ + for (unsigned int byte = first_byte; byte < end_byte; ++byte) + { + unsigned int index = byte / bytes_per_el32; + unsigned int subbyte = byte % bytes_per_el32; + unsigned int int_bytes = MIN (bytes_per_el32, + mode_bytes - index * bytes_per_el32); + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, int_bytes, subbyte).to_constant (); + bytes.quick_push ((unsigned long) el32[index] >> lsb); + } + return true; } - /* If this asserts, it is too complicated; reducing value_bit may help. 
*/ - gcc_assert (BITS_PER_UNIT % value_bit == 0); - /* I don't know how to handle endianness of sub-units. */ - gcc_assert (elem_bitsize % BITS_PER_UNIT == 0); - for (elem = 0; elem < num_elem; elem++) + if (GET_CODE (x) == CONST_FIXED) { - unsigned char * vp; - rtx el = (GET_CODE (op) == CONST_VECTOR - ? CONST_VECTOR_ELT (op, first_elem + elem) - : op); + for (unsigned int byte = first_byte; byte < end_byte; ++byte) + { + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, mode_bytes, byte).to_constant (); + unsigned HOST_WIDE_INT piece = CONST_FIXED_VALUE_LOW (x); + if (lsb >= HOST_BITS_PER_WIDE_INT) + { + lsb -= HOST_BITS_PER_WIDE_INT; + piece = CONST_FIXED_VALUE_HIGH (x); + } + bytes.quick_push (piece >> lsb); + } + return true; + } - /* Vectors are kept in target memory order. (This is probably - a mistake.) */ - { - unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT; - unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize) - / BITS_PER_UNIT); - unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; - unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte; - unsigned bytele = (subword_byte % UNITS_PER_WORD - + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); - vp = value + (bytele * BITS_PER_UNIT) / value_bit; - } + return false; +} - switch (GET_CODE (el)) - { - case CONST_INT: - for (i = 0; - i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize; - i += value_bit) - *vp++ = INTVAL (el) >> i; - /* CONST_INTs are always logically sign-extended. */ - for (; i < elem_bitsize; i += value_bit) - *vp++ = INTVAL (el) < 0 ? -1 : 0; - break; +/* Read a vector of mode MODE from the target memory image given by BYTES, + starting at byte FIRST_BYTE. The vector is known to be encodable using + NPATTERNS interleaved patterns with NELTS_PER_PATTERN elements each, + and BYTES is known to have enough bytes to supply NPATTERNS * + NELTS_PER_PATTERN vector elements. Each element of BYTES contains + BITS_PER_UNIT bits and the bytes are in target memory order. - case CONST_WIDE_INT: - { - rtx_mode_t val = rtx_mode_t (el, GET_MODE_INNER (innermode)); - unsigned char extend = wi::sign_mask (val); - int prec = wi::get_precision (val); - - for (i = 0; i < prec && i < elem_bitsize; i += value_bit) - *vp++ = wi::extract_uhwi (val, i, value_bit); - for (; i < elem_bitsize; i += value_bit) - *vp++ = extend; - } - break; + Return the vector on success, otherwise return NULL_RTX. */ - case CONST_DOUBLE: - if (TARGET_SUPPORTS_WIDE_INT == 0 && GET_MODE (el) == VOIDmode) - { - unsigned char extend = 0; - /* If this triggers, someone should have generated a - CONST_INT instead. */ - gcc_assert (elem_bitsize > HOST_BITS_PER_WIDE_INT); - - for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit) - *vp++ = CONST_DOUBLE_LOW (el) >> i; - while (i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize) - { - *vp++ - = CONST_DOUBLE_HIGH (el) >> (i - HOST_BITS_PER_WIDE_INT); - i += value_bit; - } +rtx +native_decode_vector_rtx (machine_mode mode, vec bytes, + unsigned int first_byte, unsigned int npatterns, + unsigned int nelts_per_pattern) +{ + rtx_vector_builder builder (mode, npatterns, nelts_per_pattern); - if (CONST_DOUBLE_HIGH (el) >> (HOST_BITS_PER_WIDE_INT - 1)) - extend = -1; - for (; i < elem_bitsize; i += value_bit) - *vp++ = extend; - } - else - { - /* This is big enough for anything on the platform. 
*/ - long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32]; - scalar_float_mode el_mode; + unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), + GET_MODE_NUNITS (mode)); + if (elt_bits < BITS_PER_UNIT) + { + /* This is the only case in which elements can be smaller than a byte. + Element 0 is always in the lsb of the containing byte. */ + gcc_assert (GET_MODE_CLASS (mode) == MODE_VECTOR_BOOL); + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + { + unsigned int bit_index = first_byte * BITS_PER_UNIT + i * elt_bits; + unsigned int byte_index = bit_index / BITS_PER_UNIT; + unsigned int lsb = bit_index % BITS_PER_UNIT; + builder.quick_push (bytes[byte_index] & (1 << lsb) + ? CONST1_RTX (BImode) + : CONST0_RTX (BImode)); + } + } + else + { + for (unsigned int i = 0; i < builder.encoded_nelts (); ++i) + { + rtx x = native_decode_rtx (GET_MODE_INNER (mode), bytes, first_byte); + if (!x) + return NULL_RTX; + builder.quick_push (x); + first_byte += elt_bits / BITS_PER_UNIT; + } + } + return builder.build (); +} - el_mode = as_a (GET_MODE (el)); - int bitsize = GET_MODE_BITSIZE (el_mode); +/* Read an rtx of mode MODE from the target memory image given by BYTES, + starting at byte FIRST_BYTE. Each element of BYTES contains BITS_PER_UNIT + bits and the bytes are in target memory order. The image has enough + values to specify all bytes of MODE. - gcc_assert (bitsize <= elem_bitsize); - gcc_assert (bitsize % value_bit == 0); + Return the rtx on success, otherwise return NULL_RTX. */ - real_to_target (tmp, CONST_DOUBLE_REAL_VALUE (el), - GET_MODE (el)); +rtx +native_decode_rtx (machine_mode mode, vec bytes, + unsigned int first_byte) +{ + if (VECTOR_MODE_P (mode)) + { + /* If we know at compile time how many elements there are, + pull each element directly from BYTES. */ + unsigned int nelts; + if (GET_MODE_NUNITS (mode).is_constant (&nelts)) + return native_decode_vector_rtx (mode, bytes, first_byte, nelts, 1); + return NULL_RTX; + } - /* real_to_target produces its result in words affected by - FLOAT_WORDS_BIG_ENDIAN. However, we ignore this, - and use WORDS_BIG_ENDIAN instead; see the documentation - of SUBREG in rtl.texi. */ - for (i = 0; i < bitsize; i += value_bit) - { - int ibase; - if (WORDS_BIG_ENDIAN) - ibase = bitsize - 1 - i; - else - ibase = i; - *vp++ = tmp[ibase / 32] >> i % 32; - } + scalar_int_mode imode; + if (is_a (mode, &imode) + && GET_MODE_PRECISION (imode) <= MAX_BITSIZE_MODE_ANY_INT) + { + /* Pull the bytes msb first, so that we can use simple + shift-and-insert wide_int operations. */ + unsigned int size = GET_MODE_SIZE (imode); + wide_int result (wi::zero (GET_MODE_PRECISION (imode))); + for (unsigned int i = 0; i < size; ++i) + { + unsigned int lsb = (size - i - 1) * BITS_PER_UNIT; + /* Always constant because the inputs are. */ + unsigned int subbyte + = subreg_size_offset_from_lsb (1, size, lsb).to_constant (); + result <<= BITS_PER_UNIT; + result |= bytes[first_byte + subbyte]; + } + return immed_wide_int_const (result, imode); + } - /* It shouldn't matter what's done here, so fill it with - zero. */ - for (; i < elem_bitsize; i += value_bit) - *vp++ = 0; - } - break; + scalar_float_mode fmode; + if (is_a (mode, &fmode)) + { + /* We need to build an array of integers in target memory order. + All integers before the last one have 32 bits; the last one may + have 32 bits or fewer, depending on whether the mode bitsize + is divisible by 32. 
*/ + long el32[MAX_BITSIZE_MODE_ANY_MODE / 32]; + unsigned int num_el32 = CEIL (GET_MODE_BITSIZE (fmode), 32); + memset (el32, 0, num_el32 * sizeof (long)); + + /* The (maximum) number of target bytes per element of el32. */ + unsigned int bytes_per_el32 = 32 / BITS_PER_UNIT; + gcc_assert (bytes_per_el32 != 0); + + unsigned int mode_bytes = GET_MODE_SIZE (fmode); + for (unsigned int byte = 0; byte < mode_bytes; ++byte) + { + unsigned int index = byte / bytes_per_el32; + unsigned int subbyte = byte % bytes_per_el32; + unsigned int int_bytes = MIN (bytes_per_el32, + mode_bytes - index * bytes_per_el32); + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, int_bytes, subbyte).to_constant (); + el32[index] |= (unsigned long) bytes[first_byte + byte] << lsb; + } + REAL_VALUE_TYPE r; + real_from_target (&r, el32, fmode); + return const_double_from_real_value (r, fmode); + } - case CONST_FIXED: - if (elem_bitsize <= HOST_BITS_PER_WIDE_INT) - { - for (i = 0; i < elem_bitsize; i += value_bit) - *vp++ = CONST_FIXED_VALUE_LOW (el) >> i; - } + if (ALL_SCALAR_FIXED_POINT_MODE_P (mode)) + { + scalar_mode smode = as_a (mode); + FIXED_VALUE_TYPE f; + f.data.low = 0; + f.data.high = 0; + f.mode = smode; + + unsigned int mode_bytes = GET_MODE_SIZE (smode); + for (unsigned int byte = 0; byte < mode_bytes; ++byte) + { + /* Always constant because the inputs are. */ + unsigned int lsb + = subreg_size_lsb (1, mode_bytes, byte).to_constant (); + unsigned HOST_WIDE_INT unit = bytes[first_byte + byte]; + if (lsb >= HOST_BITS_PER_WIDE_INT) + f.data.high |= unit << (lsb - HOST_BITS_PER_WIDE_INT); else - { - for (i = 0; i < HOST_BITS_PER_WIDE_INT; i += value_bit) - *vp++ = CONST_FIXED_VALUE_LOW (el) >> i; - for (; i < HOST_BITS_PER_DOUBLE_INT && i < elem_bitsize; - i += value_bit) - *vp++ = CONST_FIXED_VALUE_HIGH (el) - >> (i - HOST_BITS_PER_WIDE_INT); - for (; i < elem_bitsize; i += value_bit) - *vp++ = 0; - } - break; - - default: - gcc_unreachable (); + f.data.low |= unit << lsb; } + return CONST_FIXED_FROM_FIXED_VALUE (f, mode); } - /* Now, pick the right byte to start with. */ - /* Renumber BYTE so that the least-significant byte is byte 0. A special - case is paradoxical SUBREGs, which shouldn't be adjusted since they - will already have offset 0. */ - if (inner_bytes >= GET_MODE_SIZE (outermode)) + return NULL_RTX; +} + +/* Simplify a byte offset BYTE into CONST_VECTOR X. The main purpose + is to convert a runtime BYTE value into a constant one. */ + +static poly_uint64 +simplify_const_vector_byte_offset (rtx x, poly_uint64 byte) +{ + /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */ + machine_mode mode = GET_MODE (x); + unsigned int elt_bits = vector_element_size (GET_MODE_BITSIZE (mode), + GET_MODE_NUNITS (mode)); + /* The number of bits needed to encode one element from each pattern. */ + unsigned int sequence_bits = CONST_VECTOR_NPATTERNS (x) * elt_bits; + + /* Identify the start point in terms of a sequence number and a byte offset + within that sequence. */ + poly_uint64 first_sequence; + unsigned HOST_WIDE_INT subbit; + if (can_div_trunc_p (byte * BITS_PER_UNIT, sequence_bits, + &first_sequence, &subbit)) { - unsigned ibyte = inner_bytes - GET_MODE_SIZE (outermode) - byte; - unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; - unsigned subword_byte = BYTES_BIG_ENDIAN ? 
ibyte : byte; - byte = (subword_byte % UNITS_PER_WORD - + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); + unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); + if (nelts_per_pattern == 1) + /* This is a duplicated vector, so the value of FIRST_SEQUENCE + doesn't matter. */ + byte = subbit / BITS_PER_UNIT; + else if (nelts_per_pattern == 2 && known_gt (first_sequence, 0U)) + { + /* The subreg drops the first element from each pattern and + only uses the second element. Find the first sequence + that starts on a byte boundary. */ + subbit += least_common_multiple (sequence_bits, BITS_PER_UNIT); + byte = subbit / BITS_PER_UNIT; + } } + return byte; +} + +/* Subroutine of simplify_subreg in which: + + - X is known to be a CONST_VECTOR + - OUTERMODE is known to be a vector mode - /* BYTE should still be inside OP. (Note that BYTE is unsigned, - so if it's become negative it will instead be very large.) */ - gcc_assert (byte < inner_bytes); + Try to handle the subreg by operating on the CONST_VECTOR encoding + rather than on each individual element of the CONST_VECTOR. - /* Convert from bytes to chunks of size value_bit. */ - value_start = byte * (BITS_PER_UNIT / value_bit); + Return the simplified subreg on success, otherwise return NULL_RTX. */ + +static rtx +simplify_const_vector_subreg (machine_mode outermode, rtx x, + machine_mode innermode, unsigned int first_byte) +{ + /* Paradoxical subregs of vectors have dubious semantics. */ + if (paradoxical_subreg_p (outermode, innermode)) + return NULL_RTX; - /* Re-pack the value. */ - num_elem = GET_MODE_NUNITS (outermode); + /* We can only preserve the semantics of a stepped pattern if the new + vector element is the same as the original one. */ + if (CONST_VECTOR_STEPPED_P (x) + && GET_MODE_INNER (outermode) != GET_MODE_INNER (innermode)) + return NULL_RTX; - if (VECTOR_MODE_P (outermode)) + /* Cope with MODE_VECTOR_BOOL by operating on bits rather than bytes. */ + unsigned int x_elt_bits + = vector_element_size (GET_MODE_BITSIZE (innermode), + GET_MODE_NUNITS (innermode)); + unsigned int out_elt_bits + = vector_element_size (GET_MODE_BITSIZE (outermode), + GET_MODE_NUNITS (outermode)); + + /* The number of bits needed to encode one element from every pattern + of the original vector. */ + unsigned int x_sequence_bits = CONST_VECTOR_NPATTERNS (x) * x_elt_bits; + + /* The number of bits needed to encode one element from every pattern + of the result. */ + unsigned int out_sequence_bits + = least_common_multiple (x_sequence_bits, out_elt_bits); + + /* Work out the number of interleaved patterns in the output vector + and the number of encoded elements per pattern. */ + unsigned int out_npatterns = out_sequence_bits / out_elt_bits; + unsigned int nelts_per_pattern = CONST_VECTOR_NELTS_PER_PATTERN (x); + + /* The encoding scheme requires the number of elements to be a multiple + of the number of patterns, so that each pattern appears at least once + and so that the same number of elements appear from each pattern. */ + bool ok_p = multiple_p (GET_MODE_NUNITS (outermode), out_npatterns); + unsigned int const_nunits; + if (GET_MODE_NUNITS (outermode).is_constant (&const_nunits) + && (!ok_p || out_npatterns * nelts_per_pattern > const_nunits)) { - result_v = rtvec_alloc (num_elem); - elems = &RTVEC_ELT (result_v, 0); + /* Either the encoding is invalid, or applying it would give us + more elements than we need. Just encode each element directly. 
*/ + out_npatterns = const_nunits; + nelts_per_pattern = 1; } - else - elems = &result_s; + else if (!ok_p) + return NULL_RTX; - outer_submode = GET_MODE_INNER (outermode); - outer_class = GET_MODE_CLASS (outer_submode); - elem_bitsize = GET_MODE_BITSIZE (outer_submode); + /* Get enough bytes of X to form the new encoding. */ + unsigned int buffer_bits = out_npatterns * nelts_per_pattern * out_elt_bits; + unsigned int buffer_bytes = CEIL (buffer_bits, BITS_PER_UNIT); + auto_vec buffer (buffer_bytes); + if (!native_encode_rtx (innermode, x, buffer, first_byte, buffer_bytes)) + return NULL_RTX; - gcc_assert (elem_bitsize % value_bit == 0); - gcc_assert (elem_bitsize + value_start * value_bit <= max_bitsize); + /* Reencode the bytes as OUTERMODE. */ + return native_decode_vector_rtx (outermode, buffer, 0, out_npatterns, + nelts_per_pattern); +} - for (elem = 0; elem < num_elem; elem++) - { - unsigned char *vp; +/* Try to simplify a subreg of a constant by encoding the subreg region + as a sequence of target bytes and reading them back in the new mode. + Return the new value on success, otherwise return null. - /* Vectors are stored in target memory order. (This is probably - a mistake.) */ - { - unsigned byte = (elem * elem_bitsize) / BITS_PER_UNIT; - unsigned ibyte = (((num_elem - 1 - elem) * elem_bitsize) - / BITS_PER_UNIT); - unsigned word_byte = WORDS_BIG_ENDIAN ? ibyte : byte; - unsigned subword_byte = BYTES_BIG_ENDIAN ? ibyte : byte; - unsigned bytele = (subword_byte % UNITS_PER_WORD - + (word_byte / UNITS_PER_WORD) * UNITS_PER_WORD); - vp = value + value_start + (bytele * BITS_PER_UNIT) / value_bit; - } + The subreg has outer mode OUTERMODE, inner mode INNERMODE, inner value X + and byte offset FIRST_BYTE. */ - switch (outer_class) - { - case MODE_INT: - case MODE_PARTIAL_INT: - { - int u; - int base = 0; - int units - = (GET_MODE_BITSIZE (outer_submode) + HOST_BITS_PER_WIDE_INT - 1) - / HOST_BITS_PER_WIDE_INT; - HOST_WIDE_INT tmp[MAX_BITSIZE_MODE_ANY_INT / HOST_BITS_PER_WIDE_INT]; - wide_int r; - - if (GET_MODE_PRECISION (outer_submode) > MAX_BITSIZE_MODE_ANY_INT) - return NULL_RTX; - for (u = 0; u < units; u++) - { - unsigned HOST_WIDE_INT buf = 0; - for (i = 0; - i < HOST_BITS_PER_WIDE_INT && base + i < elem_bitsize; - i += value_bit) - buf |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i; - - tmp[u] = buf; - base += HOST_BITS_PER_WIDE_INT; - } - r = wide_int::from_array (tmp, units, - GET_MODE_PRECISION (outer_submode)); -#if TARGET_SUPPORTS_WIDE_INT == 0 - /* Make sure r will fit into CONST_INT or CONST_DOUBLE. */ - if (wi::min_precision (r, SIGNED) > HOST_BITS_PER_DOUBLE_INT) - return NULL_RTX; -#endif - elems[elem] = immed_wide_int_const (r, outer_submode); - } - break; +static rtx +simplify_immed_subreg (fixed_size_mode outermode, rtx x, + machine_mode innermode, unsigned int first_byte) +{ + unsigned int buffer_bytes = GET_MODE_SIZE (outermode); + auto_vec buffer (buffer_bytes); - case MODE_FLOAT: - case MODE_DECIMAL_FLOAT: - { - REAL_VALUE_TYPE r; - long tmp[MAX_BITSIZE_MODE_ANY_MODE / 32] = { 0 }; - - /* real_from_target wants its input in words affected by - FLOAT_WORDS_BIG_ENDIAN. However, we ignore this, - and use WORDS_BIG_ENDIAN instead; see the documentation - of SUBREG in rtl.texi. */ - for (i = 0; i < elem_bitsize; i += value_bit) - { - int ibase; - if (WORDS_BIG_ENDIAN) - ibase = elem_bitsize - 1 - i; - else - ibase = i; - tmp[ibase / 32] |= (*vp++ & value_mask) << i % 32; - } + /* Some ports misuse CCmode. 
*/ + if (GET_MODE_CLASS (outermode) == MODE_CC && CONST_INT_P (x)) + return x; - real_from_target (&r, tmp, outer_submode); - elems[elem] = const_double_from_real_value (r, outer_submode); - } - break; + /* Paradoxical subregs read undefined values for bytes outside of the + inner value. However, we have traditionally always sign-extended + integer constants and zero-extended others. */ + unsigned int inner_bytes = buffer_bytes; + if (paradoxical_subreg_p (outermode, innermode)) + { + if (!GET_MODE_SIZE (innermode).is_constant (&inner_bytes)) + return NULL_RTX; - case MODE_FRACT: - case MODE_UFRACT: - case MODE_ACCUM: - case MODE_UACCUM: - { - FIXED_VALUE_TYPE f; - f.data.low = 0; - f.data.high = 0; - f.mode = outer_submode; - - for (i = 0; - i < HOST_BITS_PER_WIDE_INT && i < elem_bitsize; - i += value_bit) - f.data.low |= (unsigned HOST_WIDE_INT)(*vp++ & value_mask) << i; - for (; i < elem_bitsize; i += value_bit) - f.data.high |= ((unsigned HOST_WIDE_INT)(*vp++ & value_mask) - << (i - HOST_BITS_PER_WIDE_INT)); - - elems[elem] = CONST_FIXED_FROM_FIXED_VALUE (f, outer_submode); - } - break; + target_unit filler = 0; + if (CONST_SCALAR_INT_P (x) && wi::neg_p (rtx_mode_t (x, innermode))) + filler = -1; - default: - gcc_unreachable (); - } + /* Add any leading bytes due to big-endian layout. The number of + bytes must be constant because both modes have constant size. */ + unsigned int leading_bytes + = -byte_lowpart_offset (outermode, innermode).to_constant (); + for (unsigned int i = 0; i < leading_bytes; ++i) + buffer.quick_push (filler); + + if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes)) + return NULL_RTX; + + /* Add any trailing bytes due to little-endian layout. */ + while (buffer.length () < buffer_bytes) + buffer.quick_push (filler); } - if (VECTOR_MODE_P (outermode)) - return gen_rtx_CONST_VECTOR (outermode, result_v); else - return result_s; + { + if (!native_encode_rtx (innermode, x, buffer, first_byte, inner_bytes)) + return NULL_RTX; + } + return native_decode_rtx (outermode, buffer, 0); } /* Simplify SUBREG:OUTERMODE(OP:INNERMODE, BYTE) @@ -6494,6 +6618,9 @@ simplify_subreg (machine_mode outermode, rtx op, if (outermode == innermode && known_eq (byte, 0U)) return op; + if (GET_CODE (op) == CONST_VECTOR) + byte = simplify_const_vector_byte_offset (op, byte); + if (multiple_p (byte, GET_MODE_UNIT_SIZE (innermode))) { rtx elt; @@ -6513,30 +6640,21 @@ simplify_subreg (machine_mode outermode, rtx op, || CONST_FIXED_P (op) || GET_CODE (op) == CONST_VECTOR) { - /* simplify_immed_subreg deconstructs OP into bytes and constructs - the result from bytes, so it only works if the sizes of the modes - and the value of the offset are known at compile time. Cases that - that apply to general modes and offsets should be handled here - before calling simplify_immed_subreg. */ - fixed_size_mode fs_outermode, fs_innermode; unsigned HOST_WIDE_INT cbyte; - if (is_a (outermode, &fs_outermode) - && is_a (innermode, &fs_innermode) - && byte.is_constant (&cbyte)) - return simplify_immed_subreg (fs_outermode, op, fs_innermode, cbyte, - 0, GET_MODE_SIZE (fs_innermode)); - - /* Handle constant-sized outer modes and variable-sized inner modes. 
*/ - unsigned HOST_WIDE_INT first_elem; - if (GET_CODE (op) == CONST_VECTOR - && is_a (outermode, &fs_outermode) - && constant_multiple_p (byte, GET_MODE_UNIT_SIZE (innermode), - &first_elem)) - return simplify_immed_subreg (fs_outermode, op, innermode, 0, - first_elem, - GET_MODE_SIZE (fs_outermode)); + if (byte.is_constant (&cbyte)) + { + if (GET_CODE (op) == CONST_VECTOR && VECTOR_MODE_P (outermode)) + { + rtx tmp = simplify_const_vector_subreg (outermode, op, + innermode, cbyte); + if (tmp) + return tmp; + } - return NULL_RTX; + fixed_size_mode fs_outermode; + if (is_a (outermode, &fs_outermode)) + return simplify_immed_subreg (fs_outermode, op, innermode, cbyte); + } } /* Changing mode twice with SUBREG => just change it once, @@ -7179,6 +7297,165 @@ test_vec_merge (machine_mode mode) simplify_rtx (nvm)); } +/* Test subregs of integer vector constant X, trying elements in + the range [ELT_BIAS, ELT_BIAS + constant_lower_bound (NELTS)), + where NELTS is the number of elements in X. Subregs involving + elements [ELT_BIAS, ELT_BIAS + FIRST_VALID) are expected to fail. */ + +static void +test_vector_subregs_modes (rtx x, poly_uint64 elt_bias = 0, + unsigned int first_valid = 0) +{ + machine_mode inner_mode = GET_MODE (x); + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + + for (unsigned int modei = 0; modei < NUM_MACHINE_MODES; ++modei) + { + machine_mode outer_mode = (machine_mode) modei; + if (!VECTOR_MODE_P (outer_mode)) + continue; + + unsigned int outer_nunits; + if (GET_MODE_INNER (outer_mode) == int_mode + && GET_MODE_NUNITS (outer_mode).is_constant (&outer_nunits) + && multiple_p (GET_MODE_NUNITS (inner_mode), outer_nunits)) + { + /* Test subregs in which the outer mode is a smaller, + constant-sized vector of the same element type. */ + unsigned int limit + = constant_lower_bound (GET_MODE_NUNITS (inner_mode)); + for (unsigned int elt = 0; elt < limit; elt += outer_nunits) + { + rtx expected = NULL_RTX; + if (elt >= first_valid) + { + rtx_vector_builder builder (outer_mode, outer_nunits, 1); + for (unsigned int i = 0; i < outer_nunits; ++i) + builder.quick_push (CONST_VECTOR_ELT (x, elt + i)); + expected = builder.build (); + } + poly_uint64 byte = (elt_bias + elt) * GET_MODE_SIZE (int_mode); + ASSERT_RTX_EQ (expected, + simplify_subreg (outer_mode, x, + inner_mode, byte)); + } + } + else if (known_eq (GET_MODE_SIZE (outer_mode), + GET_MODE_SIZE (inner_mode)) + && known_eq (elt_bias, 0U) + && (GET_MODE_CLASS (outer_mode) != MODE_VECTOR_BOOL + || known_eq (GET_MODE_BITSIZE (outer_mode), + GET_MODE_NUNITS (outer_mode))) + && (!FLOAT_MODE_P (outer_mode) + || (FLOAT_MODE_FORMAT (outer_mode)->ieee_bits + == GET_MODE_UNIT_PRECISION (outer_mode))) + && (GET_MODE_SIZE (inner_mode).is_constant () + || !CONST_VECTOR_STEPPED_P (x))) + { + /* Try converting to OUTER_MODE and back. */ + rtx outer_x = simplify_subreg (outer_mode, x, inner_mode, 0); + ASSERT_TRUE (outer_x != NULL_RTX); + ASSERT_RTX_EQ (x, simplify_subreg (inner_mode, outer_x, + outer_mode, 0)); + } + } + + if (BYTES_BIG_ENDIAN == WORDS_BIG_ENDIAN) + { + /* Test each byte in the element range. 
*/ + unsigned int limit + = constant_lower_bound (GET_MODE_SIZE (inner_mode)); + for (unsigned int i = 0; i < limit; ++i) + { + unsigned int elt = i / GET_MODE_SIZE (int_mode); + rtx expected = NULL_RTX; + if (elt >= first_valid) + { + unsigned int byte_shift = i % GET_MODE_SIZE (int_mode); + if (BYTES_BIG_ENDIAN) + byte_shift = GET_MODE_SIZE (int_mode) - byte_shift - 1; + rtx_mode_t vec_elt (CONST_VECTOR_ELT (x, elt), int_mode); + wide_int shifted_elt + = wi::lrshift (vec_elt, byte_shift * BITS_PER_UNIT); + expected = immed_wide_int_const (shifted_elt, QImode); + } + poly_uint64 byte = elt_bias * GET_MODE_SIZE (int_mode) + i; + ASSERT_RTX_EQ (expected, + simplify_subreg (QImode, x, inner_mode, byte)); + } + } +} + +/* Test constant subregs of integer vector mode INNER_MODE, using 1 + element per pattern. */ + +static void +test_vector_subregs_repeating (machine_mode inner_mode) +{ + poly_uint64 nunits = GET_MODE_NUNITS (inner_mode); + unsigned int min_nunits = constant_lower_bound (nunits); + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + unsigned int count = gcd (min_nunits, 8); + + rtx_vector_builder builder (inner_mode, count, 1); + for (unsigned int i = 0; i < count; ++i) + builder.quick_push (gen_int_mode (8 - i, int_mode)); + rtx x = builder.build (); + + test_vector_subregs_modes (x); + if (!nunits.is_constant ()) + test_vector_subregs_modes (x, nunits - min_nunits); +} + +/* Test constant subregs of integer vector mode INNER_MODE, using 2 + elements per pattern. */ + +static void +test_vector_subregs_fore_back (machine_mode inner_mode) +{ + poly_uint64 nunits = GET_MODE_NUNITS (inner_mode); + unsigned int min_nunits = constant_lower_bound (nunits); + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + unsigned int count = gcd (min_nunits, 4); + + rtx_vector_builder builder (inner_mode, count, 2); + for (unsigned int i = 0; i < count; ++i) + builder.quick_push (gen_int_mode (i, int_mode)); + for (unsigned int i = 0; i < count; ++i) + builder.quick_push (gen_int_mode (-(int) i, int_mode)); + rtx x = builder.build (); + + test_vector_subregs_modes (x); + if (!nunits.is_constant ()) + test_vector_subregs_modes (x, nunits - min_nunits, count); +} + +/* Test constant subregs of integer vector mode INNER_MODE, using 3 + elements per pattern. */ + +static void +test_vector_subregs_stepped (machine_mode inner_mode) +{ + /* Build { 0, 1, 2, 3, ... }. */ + scalar_mode int_mode = GET_MODE_INNER (inner_mode); + rtx_vector_builder builder (inner_mode, 1, 3); + for (unsigned int i = 0; i < 3; ++i) + builder.quick_push (gen_int_mode (i, int_mode)); + rtx x = builder.build (); + + test_vector_subregs_modes (x); +} + +/* Test constant subregs of integer vector mode INNER_MODE. */ + +static void +test_vector_subregs (machine_mode inner_mode) +{ + test_vector_subregs_repeating (inner_mode); + test_vector_subregs_fore_back (inner_mode); + test_vector_subregs_stepped (inner_mode); +} + /* Verify some simplifications involving vectors. */ static void @@ -7193,7 +7470,10 @@ test_vector_ops () test_vector_ops_duplicate (mode, scalar_reg); if (GET_MODE_CLASS (mode) == MODE_VECTOR_INT && maybe_gt (GET_MODE_NUNITS (mode), 2)) - test_vector_ops_series (mode, scalar_reg); + { + test_vector_ops_series (mode, scalar_reg); + test_vector_subregs (mode); + } test_vec_merge (mode); } } -- 2.30.2
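
As a rough, self-contained illustration (not part of the patch itself) of the
byte numbering that subreg_size_lsb encodes, and which native_encode_rtx and
native_decode_rtx rely on when building target memory images, the sketch below
models only the two pure-endian cases visible in the rtlanal.c hunk.  It uses
plain unsigned ints in place of poly_uint64 and a single big_endian_p flag in
place of BYTES_BIG_ENDIAN/WORDS_BIG_ENDIAN; the mixed-endian path is omitted,
so treat it as an illustration of the arithmetic rather than GCC code.

#include <assert.h>
#include <stdio.h>

#define BITS_PER_UNIT 8

/* Illustrative stand-in for subreg_size_lsb: given an INNER_BYTES-byte
   inner value and an OUTER_BYTES-byte subreg starting at SUBREG_BYTE,
   return the bit position of the subreg's least significant bit.  */
static unsigned int
sketch_subreg_lsb (int big_endian_p, unsigned int outer_bytes,
                   unsigned int inner_bytes, unsigned int subreg_byte)
{
  /* A paradoxical subreg (outer wider than inner) begins at bit 0.  */
  if (outer_bytes > inner_bytes)
    {
      assert (subreg_byte == 0);
      return 0;
    }

  unsigned int subreg_end = subreg_byte + outer_bytes;
  unsigned int trailing_bytes = inner_bytes - subreg_end;
  /* On a fully big-endian target the lowest-numbered bytes hold the most
     significant bits, so count from the trailing end instead.  */
  unsigned int byte_pos = big_endian_p ? trailing_bytes : subreg_byte;
  return byte_pos * BITS_PER_UNIT;
}

int
main (void)
{
  /* A 4-byte subreg at byte offset 4 of a 16-byte inner value starts at
     bit 32 on a little-endian target but at bit 64 on a big-endian one.  */
  printf ("little-endian lsb: %u\n", sketch_subreg_lsb (0, 4, 16, 4));
  printf ("big-endian lsb:    %u\n", sketch_subreg_lsb (1, 4, 16, 4));
  return 0;
}

The same numbering is what lets native_encode_rtx push bytes in target memory
order and lets simplify_immed_subreg read them back with native_decode_rtx
without any endian-specific shuffling in the callers.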