From 299456f3c2978c03a18fc419dc7e29b65e5844f0 Mon Sep 17 00:00:00 2001
From: Ben Elliston
Date: Mon, 26 Oct 2009 21:59:17 +0000
Subject: [PATCH] config.gcc (spu-*-elf*): Add spu_cache.h to extra_headers.

2009-10-26  Ben Elliston
	    Michael Meissner
	    Ulrich Weigand

	* config.gcc (spu-*-elf*): Add spu_cache.h to extra_headers.
	* config/spu/spu_cache.h: New file.
	* config/spu/cachemgr.c: New file.
	* config/spu/cache.S: New file.
	* config/spu/spu.h (ASM_OUTPUT_SYMBOL_REF): Define.
	(ADDR_SPACE_EA): Define.
	(TARGET_ADDR_SPACE_KEYWORDS): Define.
	* config/spu/spu.c (EAmode): New macro.
	(TARGET_ADDR_SPACE_POINTER_MODE): Define.
	(TARGET_ADDR_SPACE_ADDRESS_MODE): Likewise.
	(TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P): Likewise.
	(TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS): Likewise.
	(TARGET_ADDR_SPACE_SUBSET_P): Likewise.
	(TARGET_ADDR_SPACE_CONVERT): Likewise.
	(TARGET_ASM_SELECT_SECTION): Likewise.
	(TARGET_ASM_UNIQUE_SECTION): Likewise.
	(TARGET_ASM_UNALIGNED_SI_OP): Likewise.
	(TARGET_ASM_ALIGNED_DI_OP): Likewise.
	(ea_symbol_ref): New function.
	(spu_legitimate_constant_p): Handle __ea qualified addresses.
	(spu_addr_space_legitimate_address_p): New function.
	(spu_addr_space_legitimize_address): Likewise.
	(cache_fetch): New global.
	(cache_fetch_dirty): Likewise.
	(ea_alias_set): Likewise.
	(ea_load_store): New function.
	(ea_load_store_inline): Likewise.
	(expand_ea_mem): Likewise.
	(spu_expand_mov): Handle __ea qualified memory references.
	(spu_addr_space_pointer_mode): New function.
	(spu_addr_space_address_mode): Likewise.
	(spu_addr_space_subset_p): Likewise.
	(spu_addr_space_convert): Likewise.
	(spu_section_type_flags): Handle "._ea" section.
	(spu_select_section): New function.
	(spu_unique_section): Likewise.
	* config/spu/spu-c.c (spu_cpu_cpp_builtins): Support __EA32__ and
	__EA64__ predefined macros.
	* config/spu/spu-elf.h (LIB_SPEC): Handle -mcache-size= and
	-matomic-updates switches.
	* config/spu/t-spu-elf (MULTILIB_OPTIONS): Define.
	(EXTRA_MULTILIB_PARTS): Add libgcc_cachemgr.a,
	libgcc_cachemgr_nonatomic.a, libgcc_cache8k.a, libgcc_cache16k.a,
	libgcc_cache32k.a, libgcc_cache64k.a, libgcc_cache128k.a.
	($(T)cachemgr.o, $(T)cachemgr_nonatomic.o): New target.
	($(T)cache8k.o, $(T)cache16k.o, $(T)cache32k.o, $(T)cache64k.o,
	$(T)cache128k.o): Likewise.
	($(T)libgcc_%.a): Likewise.
	* config/spu/spu.h (TARGET_DEFAULT): Add
	MASK_ADDRESS_SPACE_CONVERSION.
	* config/spu/spu.opt (-mea32/-mea64): Add switches.
	(-maddress-space-conversion): Likewise.
	(-mcache-size=): Likewise.
	(-matomic-updates): Likewise.
	* doc/invoke.texi (-mea32/-mea64): Document.
	(-maddress-space-conversion): Likewise.
	(-mcache-size=): Likewise.
	(-matomic-updates): Likewise.
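[Editorial note — not part of the patch] For orientation, this is the source-level feature the commit enables. A minimal sketch; the variable name is hypothetical, and -mea32/-mea64 selects the width of __ea pointers:

    #include <spu_cache.h>

    /* An __ea-qualified object lives in PPU (effective-address) memory;
       the compiler routes accesses through the software cache in libgcc.  */
    extern __ea int shared_counter;

    void bump (void)
    {
      shared_counter++;   /* load and store both go through the cache */
      cache_flush ();     /* force any dirty lines back to PPU memory */
    }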
Co-Authored-By: Michael Meissner
Co-Authored-By: Ulrich Weigand

From-SVN: r153575
---
 gcc/ChangeLog              |  66 +++++
 gcc/config.gcc             |   2 +-
 gcc/config/spu/cache.S     |  43 ++++
 gcc/config/spu/cachemgr.c  | 438 +++++++++++++++++++++++++++++++++
 gcc/config/spu/spu-c.c     |  11 +
 gcc/config/spu/spu-elf.h   |  10 +-
 gcc/config/spu/spu.c       | 491 ++++++++++++++++++++++++++++++++++++-
 gcc/config/spu/spu.h       |  20 +-
 gcc/config/spu/spu.opt     |  21 ++
 gcc/config/spu/spu_cache.h |  39 +++
 gcc/config/spu/t-spu-elf   |  33 ++-
 gcc/doc/invoke.texi        |  46 +++-
 12 files changed, 1209 insertions(+), 11 deletions(-)
 create mode 100644 gcc/config/spu/cache.S
 create mode 100644 gcc/config/spu/cachemgr.c
 create mode 100644 gcc/config/spu/spu_cache.h

diff --git a/gcc/ChangeLog b/gcc/ChangeLog
index db375c7fb92..3014db71c12 100644
--- a/gcc/ChangeLog
+++ b/gcc/ChangeLog
@@ -1,3 +1,69 @@
+2009-10-26  Ben Elliston
+	    Michael Meissner
+	    Ulrich Weigand
+
+	* config.gcc (spu-*-elf*): Add spu_cache.h to extra_headers.
+	* config/spu/spu_cache.h: New file.
+
+	* config/spu/cachemgr.c: New file.
+	* config/spu/cache.S: New file.
+
+	* config/spu/spu.h (ASM_OUTPUT_SYMBOL_REF): Define.
+	(ADDR_SPACE_EA): Define.
+	(TARGET_ADDR_SPACE_KEYWORDS): Define.
+	* config/spu/spu.c (EAmode): New macro.
+	(TARGET_ADDR_SPACE_POINTER_MODE): Define.
+	(TARGET_ADDR_SPACE_ADDRESS_MODE): Likewise.
+	(TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P): Likewise.
+	(TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS): Likewise.
+	(TARGET_ADDR_SPACE_SUBSET_P): Likewise.
+	(TARGET_ADDR_SPACE_CONVERT): Likewise.
+	(TARGET_ASM_SELECT_SECTION): Likewise.
+	(TARGET_ASM_UNIQUE_SECTION): Likewise.
+	(TARGET_ASM_UNALIGNED_SI_OP): Likewise.
+	(TARGET_ASM_ALIGNED_DI_OP): Likewise.
+	(ea_symbol_ref): New function.
+	(spu_legitimate_constant_p): Handle __ea qualified addresses.
+	(spu_addr_space_legitimate_address_p): New function.
+	(spu_addr_space_legitimize_address): Likewise.
+	(cache_fetch): New global.
+	(cache_fetch_dirty): Likewise.
+	(ea_alias_set): Likewise.
+	(ea_load_store): New function.
+	(ea_load_store_inline): Likewise.
+	(expand_ea_mem): Likewise.
+	(spu_expand_mov): Handle __ea qualified memory references.
+	(spu_addr_space_pointer_mode): New function.
+	(spu_addr_space_address_mode): Likewise.
+	(spu_addr_space_subset_p): Likewise.
+	(spu_addr_space_convert): Likewise.
+	(spu_section_type_flags): Handle "._ea" section.
+	(spu_select_section): New function.
+	(spu_unique_section): Likewise.
+	* config/spu/spu-c.c (spu_cpu_cpp_builtins): Support __EA32__
+	and __EA64__ predefined macros.
+	* config/spu/spu-elf.h (LIB_SPEC): Handle -mcache-size= and
+	-matomic-updates switches.
+
+	* config/spu/t-spu-elf (MULTILIB_OPTIONS): Define.
+	(EXTRA_MULTILIB_PARTS): Add libgcc_cachemgr.a,
+	libgcc_cachemgr_nonatomic.a, libgcc_cache8k.a, libgcc_cache16k.a,
+	libgcc_cache32k.a, libgcc_cache64k.a, libgcc_cache128k.a.
+	($(T)cachemgr.o, $(T)cachemgr_nonatomic.o): New target.
+	($(T)cache8k.o, $(T)cache16k.o, $(T)cache32k.o, $(T)cache64k.o,
+	$(T)cache128k.o): Likewise.
+	($(T)libgcc_%.a): Likewise.
+
+	* config/spu/spu.h (TARGET_DEFAULT): Add MASK_ADDRESS_SPACE_CONVERSION.
+	* config/spu/spu.opt (-mea32/-mea64): Add switches.
+	(-maddress-space-conversion): Likewise.
+	(-mcache-size=): Likewise.
+	(-matomic-updates): Likewise.
+	* doc/invoke.texi (-mea32/-mea64): Document.
+	(-maddress-space-conversion): Likewise.
+	(-mcache-size=): Likewise.
+	(-matomic-updates): Likewise.
+
 2009-10-26  Ben Elliston
 	    Michael Meissner
 	    Ulrich Weigand
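[Editorial note — not part of the patch] A quick map of what the new LIB_SPEC in spu-elf.h (further down) links for a given command line; the driver name is an assumption:

    spu-gcc foo.c                       ->  -lgcc_cachemgr -lgcc_cache64k
    spu-gcc -mcache-size=32 foo.c       ->  -lgcc_cachemgr -lgcc_cache32k
    spu-gcc -mno-atomic-updates foo.c   ->  -lgcc_cachemgr_nonatomic -lgcc_cache64k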
diff --git a/gcc/config.gcc b/gcc/config.gcc
index 7dac3004cd8..4f388babe6a 100644
--- a/gcc/config.gcc
+++ b/gcc/config.gcc
@@ -2449,7 +2449,7 @@ sparc64-*-netbsd*)
 spu-*-elf*)
 	tm_file="dbxelf.h elfos.h spu/spu-elf.h spu/spu.h newlib-stdint.h"
 	tmake_file="spu/t-spu-elf"
-	extra_headers="spu_intrinsics.h spu_internals.h vmx2spu.h spu_mfcio.h vec_types.h"
+	extra_headers="spu_intrinsics.h spu_internals.h vmx2spu.h spu_mfcio.h vec_types.h spu_cache.h"
	extra_modes=spu/spu-modes.def
 	c_target_objs="${c_target_objs} spu-c.o"
 	cxx_target_objs="${cxx_target_objs} spu-c.o"
diff --git a/gcc/config/spu/cache.S b/gcc/config/spu/cache.S
new file mode 100644
index 00000000000..9ffb6a0d194
--- /dev/null
+++ b/gcc/config/spu/cache.S
@@ -0,0 +1,43 @@
+/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+	.data
+	.p2align 7
+	.global __cache
+__cache:
+	.rept __CACHE_SIZE__ * 8
+	.fill 128
+	.endr
+
+	.p2align 7
+	.global __cache_tag_array
+__cache_tag_array:
+	.rept __CACHE_SIZE__ * 2
+	.long 1, 1, 1, 1
+	.fill 128-16
+	.endr
+__end_cache_tag_array:
+
+	.globl __cache_tag_array_size
+	.set __cache_tag_array_size, __end_cache_tag_array-__cache_tag_array
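[Editorial note — not part of the patch] A sanity check, in C, of the geometry cache.S lays out: __CACHE_SIZE__ is in kilobytes, so the data array is 8 * __CACHE_SIZE__ lines of 128 bytes, and the tag array provides one 128-byte descriptor per 4-way set:

    #include <assert.h>

    int main (void)
    {
      const int kb = 64;                 /* e.g. the 64 KB default */
      const int lines = kb * 8;          /* .rept __CACHE_SIZE__ * 8 */
      const int sets = kb * 2;           /* .rept __CACHE_SIZE__ * 2 */
      assert (lines * 128 == kb * 1024); /* data array really is kb KB */
      assert (lines == sets * 4);        /* four ways per set */
      return 0;
    }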
diff --git a/gcc/config/spu/cachemgr.c b/gcc/config/spu/cachemgr.c
new file mode 100644
index 00000000000..e7abd5e62db
--- /dev/null
+++ b/gcc/config/spu/cachemgr.c
@@ -0,0 +1,438 @@
+/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
+
+This file is part of GCC.
+
+GCC is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free
+Software Foundation; either version 3, or (at your option) any later
+version.
+
+GCC is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or
+FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+Under Section 7 of GPL version 3, you are granted additional
+permissions described in the GCC Runtime Library Exception, version
+3.1, as published by the Free Software Foundation.
+
+You should have received a copy of the GNU General Public License and
+a copy of the GCC Runtime Library Exception along with this program;
+see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+<http://www.gnu.org/licenses/>.  */
+
+#include <spu_mfcio.h>
+#include <spu_internals.h>
+#include <spu_intrinsics.h>
+#include <spu_cache.h>
+
+extern unsigned long long __ea_local_store;
+extern char __cache_tag_array_size;
+
+#define LINE_SIZE 128
+#define TAG_MASK (LINE_SIZE - 1)
+
+#define WAYS 4
+#define SET_MASK ((int) &__cache_tag_array_size - LINE_SIZE)
+
+#define CACHE_LINES ((int) &__cache_tag_array_size / \
+		     sizeof (struct __cache_tag_array) * WAYS)
+
+struct __cache_tag_array
+{
+  unsigned int tag_lo[WAYS];
+  unsigned int tag_hi[WAYS];
+  void *base[WAYS];
+  int reserved[WAYS];
+  vector unsigned short dirty_bits[WAYS];
+};
+
+extern struct __cache_tag_array __cache_tag_array[];
+extern char __cache[];
+
+/* In order to make the code seem a little cleaner, and to avoid having
+   64/32 bit ifdefs all over the place, we use macros.  */
+
+#ifdef __EA64__
+typedef unsigned long long addr;
+
+#define CHECK_TAG(_entry, _way, _tag) \
+  ((_entry)->tag_lo[(_way)] == ((_tag) & 0xFFFFFFFF) \
+   && (_entry)->tag_hi[(_way)] == ((_tag) >> 32))
+
+#define GET_TAG(_entry, _way) \
+  ((unsigned long long)(_entry)->tag_hi[(_way)] << 32 \
+   | (unsigned long long)(_entry)->tag_lo[(_way)])
+
+#define SET_TAG(_entry, _way, _tag) \
+  (_entry)->tag_lo[(_way)] = (_tag) & 0xFFFFFFFF; \
+  (_entry)->tag_hi[(_way)] = (_tag) >> 32
+
+#else /*__EA32__*/
+typedef unsigned long addr;
+
+#define CHECK_TAG(_entry, _way, _tag) \
+  ((_entry)->tag_lo[(_way)] == (_tag))
+
+#define GET_TAG(_entry, _way) \
+  ((_entry)->tag_lo[(_way)])
+
+#define SET_TAG(_entry, _way, _tag) \
+  (_entry)->tag_lo[(_way)] = (_tag)
+
+#endif
+
+/* In GET_ENTRY, we cast away the high 32 bits,
+   as the tag is only in the low 32.  */
+
+#define GET_ENTRY(_addr) \
+  ((struct __cache_tag_array *) \
+   si_to_uint (si_a (si_and (si_from_uint ((unsigned int) (addr) (_addr)), \
+			     si_from_uint (SET_MASK)), \
+		     si_from_uint ((unsigned int) __cache_tag_array))))
+
+#define GET_CACHE_LINE(_addr, _way) \
+  ((void *) (__cache + ((_addr) & SET_MASK) * WAYS) + ((_way) * LINE_SIZE));
+
+#define CHECK_DIRTY(_vec) (si_to_uint (si_orx ((qword) (_vec))))
+#define SET_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] = 1)
+#define CHECK_EMPTY(_entry, _way) ((_entry)->tag_lo[(_way)] == 1)
+
+#define LS_FLAG 0x80000000
+#define SET_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] |= LS_FLAG)
+#define CHECK_IS_LS(_entry, _way) ((_entry)->reserved[(_way)] & LS_FLAG)
+#define GET_LRU(_entry, _way) ((_entry)->reserved[(_way)] & ~LS_FLAG)
+
+static int dma_tag = 32;
+
+static void
+__cache_evict_entry (struct __cache_tag_array *entry, int way)
+{
+  addr tag = GET_TAG (entry, way);
+
+  if (CHECK_DIRTY (entry->dirty_bits[way]) && !CHECK_IS_LS (entry, way))
+    {
+#ifdef NONATOMIC
+      /* Non-atomic writes.  */
+      unsigned int oldmask, mach_stat;
+      char *line = ((void *) 0);
+
+      /* Enter critical section.  */
+      mach_stat = spu_readch (SPU_RdMachStat);
+      spu_idisable ();
+
+      /* Issue DMA request.  */
+      line = GET_CACHE_LINE (entry->tag_lo[way], way);
+      mfc_put (line, tag, LINE_SIZE, dma_tag, 0, 0);
+
+      /* Wait for DMA completion.  */
+      oldmask = mfc_read_tag_mask ();
+      mfc_write_tag_mask (1 << dma_tag);
+      mfc_read_tag_status_all ();
+      mfc_write_tag_mask (oldmask);
+
+      /* Leave critical section.  */
+      if (__builtin_expect (mach_stat & 1, 0))
+	spu_ienable ();
+#else
+      /* Allocate a buffer large enough that we know it has 128 bytes
+	 that are 128 byte aligned (for DMA).  */
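      /* [Editorial note — not part of the patch]  The round-up idiom just
	 below, (p + 127) & ~127, advances p to the next 128-byte boundary.
	 E.g. p = 0x1005: 0x1005 + 127 = 0x1084, and 0x1084 & ~127 = 0x1080,
	 which is 128-byte aligned and still inside buffer[LINE_SIZE + 127].  */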
+      char buffer[LINE_SIZE + 127];
+      qword *buf_ptr = (qword *) (((unsigned int) (buffer) + 127) & ~127);
+      qword *line = GET_CACHE_LINE (entry->tag_lo[way], way);
+      qword bits;
+      unsigned int mach_stat;
+
+      /* Enter critical section.  */
+      mach_stat = spu_readch (SPU_RdMachStat);
+      spu_idisable ();
+
+      do
+	{
+	  /* We atomically read the current memory into a buffer,
+	     modify the dirty bytes in the buffer, and write it
+	     back.  If the writeback fails, loop and try again.  */
+
+	  mfc_getllar (buf_ptr, tag, 0, 0);
+	  mfc_read_atomic_status ();
+
+	  /* The method we're using to write 16 dirty bytes into
+	     the buffer at a time uses fsmb, which in turn uses
+	     the least significant 16 bits of word 0, so we
+	     load the bits and rotate so that the first bit of
+	     the bitmap is in the first bit that fsmb will use.  */
+
+	  bits = (qword) entry->dirty_bits[way];
+	  bits = si_rotqbyi (bits, -2);
+
+	  /* si_fsmb creates the mask of dirty bytes; use selb to
+	     pick the appropriate bytes.  */
+	  buf_ptr[0] = si_selb (buf_ptr[0], line[0], si_fsmb (bits));
+
+	  /* Rotate to the next 16-byte section of the cache line.  */
+	  bits = si_rotqbyi (bits, 2);
+
+	  buf_ptr[1] = si_selb (buf_ptr[1], line[1], si_fsmb (bits));
+	  bits = si_rotqbyi (bits, 2);
+	  buf_ptr[2] = si_selb (buf_ptr[2], line[2], si_fsmb (bits));
+	  bits = si_rotqbyi (bits, 2);
+	  buf_ptr[3] = si_selb (buf_ptr[3], line[3], si_fsmb (bits));
+	  bits = si_rotqbyi (bits, 2);
+	  buf_ptr[4] = si_selb (buf_ptr[4], line[4], si_fsmb (bits));
+	  bits = si_rotqbyi (bits, 2);
+	  buf_ptr[5] = si_selb (buf_ptr[5], line[5], si_fsmb (bits));
+	  bits = si_rotqbyi (bits, 2);
+	  buf_ptr[6] = si_selb (buf_ptr[6], line[6], si_fsmb (bits));
+	  bits = si_rotqbyi (bits, 2);
+	  buf_ptr[7] = si_selb (buf_ptr[7], line[7], si_fsmb (bits));
+	  bits = si_rotqbyi (bits, 2);
+
+	  mfc_putllc (buf_ptr, tag, 0, 0);
+	}
+      while (mfc_read_atomic_status ());
+
+      /* Leave critical section.  */
+      if (__builtin_expect (mach_stat & 1, 0))
+	spu_ienable ();
+#endif
+    }
+
+  /* In any case, mark the entry empty by setting its lo tag to 1.  */
+  SET_EMPTY (entry, way);
+  entry->dirty_bits[way] = (vector unsigned short) si_from_uint (0);
+}
+
+void
+__cache_evict (__ea void *ea)
+{
+  addr tag = (addr) ea & ~TAG_MASK;
+  struct __cache_tag_array *entry = GET_ENTRY (ea);
+  int i = 0;
+
+  /* Cycle through all the ways this address could occupy and evict
+     any way whose tag matches.  */
+
+  for (i = 0; i < WAYS; i++)
+    if (CHECK_TAG (entry, i, tag))
+      __cache_evict_entry (entry, i);
+}
+
+static void *
+__cache_fill (int way, addr tag)
+{
+  unsigned int oldmask, mach_stat;
+  char *line = ((void *) 0);
+
+  /* Reserve our DMA tag.  */
+  if (dma_tag == 32)
+    dma_tag = mfc_tag_reserve ();
+
+  /* Enter critical section.  */
+  mach_stat = spu_readch (SPU_RdMachStat);
+  spu_idisable ();
+
+  /* Issue DMA request.  */
+  line = GET_CACHE_LINE (tag, way);
+  mfc_get (line, tag, LINE_SIZE, dma_tag, 0, 0);
+
+  /* Wait for DMA completion.  */
+  oldmask = mfc_read_tag_mask ();
+  mfc_write_tag_mask (1 << dma_tag);
+  mfc_read_tag_status_all ();
+  mfc_write_tag_mask (oldmask);
+
+  /* Leave critical section.  */
+  if (__builtin_expect (mach_stat & 1, 0))
+    spu_ienable ();
+
+  return (void *) line;
+}
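[Editorial note — not part of the patch] In the writeback loop above, si_fsmb expands the 16 dirty bits into a byte mask and si_selb merges the dirty cached bytes over the freshly read PPU data. A plain-C model of one 16-byte chunk (a sketch only):

    static void
    merge_chunk (unsigned char out[16], const unsigned char ppu[16],
                 const unsigned char line[16], unsigned short dirty)
    {
      /* Bit 15-i of 'dirty' guards byte i; set means keep the cached
         (dirty) byte, clear means keep the byte read from PPU memory.  */
      for (int i = 0; i < 16; i++)
        out[i] = ((dirty >> (15 - i)) & 1) ? line[i] : ppu[i];
    }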
+static void
+__cache_miss (__ea void *ea, struct __cache_tag_array *entry, int way)
+{
+
+  addr tag = (addr) ea & ~TAG_MASK;
+  unsigned int lru = 0;
+  int i = 0;
+  int idx = 0;
+
+  /* If way >= 4, then there are no empty slots, so we must evict
+     the least recently used entry.  */
+  if (way >= 4)
+    {
+      for (i = 0; i < WAYS; i++)
+	{
+	  if (GET_LRU (entry, i) > lru)
+	    {
+	      lru = GET_LRU (entry, i);
+	      idx = i;
+	    }
+	}
+      __cache_evict_entry (entry, idx);
+      way = idx;
+    }
+
+  /* Set the empty entry's tag and fill its cache line.  */
+
+  SET_TAG (entry, way, tag);
+  entry->reserved[way] = 0;
+
+  /* Check if the address is just an effective address within the
+     SPU's local store.  */
+
+  /* Because the LS is not 256k aligned, we can't do a simple AND-mask
+     compare here, so we must check the whole range.  */
+
+  if ((addr) ea >= (addr) __ea_local_store
+      && (addr) ea < (addr) (__ea_local_store + 0x40000))
+    {
+      SET_IS_LS (entry, way);
+      entry->base[way] =
+	(void *) ((unsigned int) ((addr) ea -
+				  (addr) __ea_local_store) & ~0x7f);
+    }
+  else
+    {
+      entry->base[way] = __cache_fill (way, tag);
+    }
+}
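[Editorial note — not part of the patch] The vector arithmetic in GET_ENTRY above is a masked add in disguise; a scalar model of the set lookup, assuming the default 64 KB cache (tag array of 16384 bytes, so SET_MASK = 0x3f80):

    /* The EA's bits 7..13 pick one of 128 four-way sets; the offset is
       already scaled because each tag descriptor is 128 bytes.  */
    struct __cache_tag_array *
    get_entry_model (unsigned int ea_low32)
    {
      unsigned int set_offset = ea_low32 & 0x3f80;
      return (struct __cache_tag_array *)
             ((char *) __cache_tag_array + set_offset);
    }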
+void *
+__cache_fetch_dirty (__ea void *ea, int n_bytes_dirty)
+{
+#ifdef __EA64__
+  unsigned int tag_hi;
+  qword etag_hi;
+#endif
+  unsigned int tag_lo;
+  struct __cache_tag_array *entry;
+
+  qword etag_lo;
+  qword equal;
+  qword bit_mask;
+  qword way;
+
+  /* In this first chunk, we merely fill the pointer and tag.  */
+
+  entry = GET_ENTRY (ea);
+
+#ifndef __EA64__
+  tag_lo =
+    si_to_uint (si_andc
+		(si_shufb
+		 (si_from_uint ((addr) ea), si_from_uint (0),
+		  si_from_uint (0x00010203)), si_from_uint (TAG_MASK)));
+#else
+  tag_lo =
+    si_to_uint (si_andc
+		(si_shufb
+		 (si_from_ullong ((addr) ea), si_from_uint (0),
+		  si_from_uint (0x04050607)), si_from_uint (TAG_MASK)));
+
+  tag_hi =
+    si_to_uint (si_shufb
+		(si_from_ullong ((addr) ea), si_from_uint (0),
+		 si_from_uint (0x00010203)));
+#endif
+
+  /* Increment LRU in reserved bytes.  */
+  si_stqd (si_ai (si_lqd (si_from_ptr (entry), 48), 1),
+	   si_from_ptr (entry), 48);
+
+missreturn:
+  /* Check if the entry's lo_tag is equal to the address' lo_tag.  */
+  etag_lo = si_lqd (si_from_ptr (entry), 0);
+  equal = si_ceq (etag_lo, si_from_uint (tag_lo));
+#ifdef __EA64__
+  /* And the high tag too.  */
+  etag_hi = si_lqd (si_from_ptr (entry), 16);
+  equal = si_and (equal, (si_ceq (etag_hi, si_from_uint (tag_hi))));
+#endif
+
+  if ((si_to_uint (si_orx (equal)) == 0))
+    goto misshandler;
+
+  if (n_bytes_dirty)
+    {
+      /* way = 0x40,0x50,0x60,0x70 for each way, which is also the
+	 offset of the appropriate dirty bits.  */
+      way = si_shli (si_clz (si_gbb (equal)), 2);
+
+      /* To create the bit_mask, we set it to all 1s (uint -1), then we
+	 shift it over (128 - n_bytes_dirty) times.  */
+
+      bit_mask = si_from_uint (-1);
+
+      bit_mask =
+	si_shlqby (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) / 8));
+
+      bit_mask =
+	si_shlqbi (bit_mask, si_from_uint ((LINE_SIZE - n_bytes_dirty) % 8));
+
+      /* Rotate it around to the correct offset.  */
+      bit_mask =
+	si_rotqby (bit_mask,
+		   si_from_uint (-1 * ((addr) ea & TAG_MASK) / 8));
+
+      bit_mask =
+	si_rotqbi (bit_mask,
+		   si_from_uint (-1 * ((addr) ea & TAG_MASK) % 8));
+
+      /* Update the dirty bits.  */
+      si_stqx (si_or (si_lqx (si_from_ptr (entry), way), bit_mask),
+	       si_from_ptr (entry), way);
+    };
+
+  /* We've definitely found the right entry; set LRU (reserved) to 0,
+     maintaining the LS flag (MSB).  */
+
+  si_stqd (si_andc
+	   (si_lqd (si_from_ptr (entry), 48),
+	    si_and (equal, si_from_uint (~(LS_FLAG)))),
+	   si_from_ptr (entry), 48);
+
+  return (void *)
+    si_to_uint (si_a
+		(si_orx
+		 (si_and (si_lqd (si_from_ptr (entry), 32), equal)),
+		 si_from_uint (((unsigned int) (addr) ea) & TAG_MASK)));
+
+misshandler:
+  equal = si_ceqi (etag_lo, 1);
+  __cache_miss (ea, entry, (si_to_uint (si_clz (si_gbb (equal))) - 16) >> 2);
+  goto missreturn;
+}
+
+void *
+__cache_fetch (__ea void *ea)
+{
+  return __cache_fetch_dirty (ea, 0);
+}
+
+void
+__cache_touch (__ea void *ea __attribute__ ((unused)))
+{
+  /* NO-OP for now.  */
+}
+
+void __cache_flush (void) __attribute__ ((destructor));
+void
+__cache_flush (void)
+{
+  struct __cache_tag_array *entry = __cache_tag_array;
+  unsigned int i;
+  int j;
+
+  /* Cycle through each cache entry and evict all used ways.  */
+
+  for (i = 0; i < CACHE_LINES / WAYS; i++)
+    {
+      for (j = 0; j < WAYS; j++)
+	if (!CHECK_EMPTY (entry, j))
+	  __cache_evict_entry (entry, j);
+
+      entry++;
+    }
+}
diff --git a/gcc/config/spu/spu-c.c b/gcc/config/spu/spu-c.c
index fbbbf32e157..380af402c48 100644
--- a/gcc/config/spu/spu-c.c
+++ b/gcc/config/spu/spu-c.c
@@ -201,6 +201,17 @@ spu_cpu_cpp_builtins (struct cpp_reader *pfile)
   if (spu_arch == PROCESSOR_CELLEDP)
     builtin_define_std ("__SPU_EDP__");
   builtin_define_std ("__vector=__attribute__((__spu_vector__))");
+  switch (spu_ea_model)
+    {
+    case 32:
+      builtin_define_std ("__EA32__");
+      break;
+    case 64:
+      builtin_define_std ("__EA64__");
+      break;
+    default:
+      gcc_unreachable ();
+    }
 
   if (!flag_iso)
     {
diff --git a/gcc/config/spu/spu-elf.h b/gcc/config/spu/spu-elf.h
index 532313119cb..68982002103 100644
--- a/gcc/config/spu/spu-elf.h
+++ b/gcc/config/spu/spu-elf.h
@@ -68,8 +68,14 @@
 
 #define LINK_SPEC "%{mlarge-mem: --defsym __stack=0xfffffff0 }"
 
-#define LIB_SPEC \
-	"-( %{!shared:%{g*:-lg}} -lc -lgloss -)"
+#define LIB_SPEC "-( %{!shared:%{g*:-lg}} -lc -lgloss -) \
+   %{mno-atomic-updates:-lgcc_cachemgr_nonatomic; :-lgcc_cachemgr} \
+   %{mcache-size=128:-lgcc_cache128k; \
+     mcache-size=64 :-lgcc_cache64k; \
+     mcache-size=32 :-lgcc_cache32k; \
+     mcache-size=16 :-lgcc_cache16k; \
+     mcache-size=8  :-lgcc_cache8k; \
+     :-lgcc_cache64k}"
 
 /* Turn off warnings in the assembler too.  */
 #undef ASM_SPEC
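[Editorial note — not part of the patch] The __EA32__/__EA64__ predefines added in spu-c.c are what let a single runtime source adapt to the pointer width, exactly as cachemgr.c does with its addr typedef; for example:

    #ifdef __EA64__
    typedef unsigned long long ea_addr_t;   /* compiled with -mea64 */
    #else
    typedef unsigned long ea_addr_t;        /* -mea32, the default */
    #endif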
diff --git a/gcc/config/spu/spu.c b/gcc/config/spu/spu.c
index ed5d6c59345..2888da67281 100644
--- a/gcc/config/spu/spu.c
+++ b/gcc/config/spu/spu.c
@@ -154,6 +154,8 @@ static tree spu_builtin_decl (unsigned, bool);
 static unsigned char spu_scalar_mode_supported_p (enum machine_mode mode);
 static unsigned char spu_vector_mode_supported_p (enum machine_mode mode);
 static bool spu_legitimate_address_p (enum machine_mode, rtx, bool);
+static bool spu_addr_space_legitimate_address_p (enum machine_mode, rtx,
+						 bool, addr_space_t);
 static rtx adjust_operand (rtx op, HOST_WIDE_INT * start);
 static rtx get_pic_reg (void);
 static int need_to_save_reg (int regno, int saving);
@@ -203,15 +205,23 @@ static bool spu_return_in_memory (const_tree type, const_tree fntype);
 static void fix_range (const char *);
 static void spu_encode_section_info (tree, rtx, int);
 static rtx spu_legitimize_address (rtx, rtx, enum machine_mode);
+static rtx spu_addr_space_legitimize_address (rtx, rtx, enum machine_mode,
+					      addr_space_t);
 static tree spu_builtin_mul_widen_even (tree);
 static tree spu_builtin_mul_widen_odd (tree);
 static tree spu_builtin_mask_for_load (void);
 static int spu_builtin_vectorization_cost (bool);
 static bool spu_vector_alignment_reachable (const_tree, bool);
 static tree spu_builtin_vec_perm (tree, tree *);
+static enum machine_mode spu_addr_space_pointer_mode (addr_space_t);
+static enum machine_mode spu_addr_space_address_mode (addr_space_t);
+static bool spu_addr_space_subset_p (addr_space_t, addr_space_t);
+static rtx spu_addr_space_convert (rtx, tree, tree);
 static int spu_sms_res_mii (struct ddg *g);
 static void asm_file_start (void);
 static unsigned int spu_section_type_flags (tree, const char *, int);
+static section *spu_select_section (tree, int, unsigned HOST_WIDE_INT);
+static void spu_unique_section (tree, int);
 static rtx spu_expand_load (rtx, rtx, rtx, int);
 static void spu_trampoline_init (rtx, tree, rtx);
 
@@ -270,6 +280,10 @@ spu_libgcc_cmp_return_mode (void);
 static enum machine_mode
 spu_libgcc_shift_count_mode (void);
 
+
+/* Pointer mode for __ea references.  */
+#define EAmode (spu_ea_model != 32 ? DImode : SImode)
+
 
 /*  Table of machine attributes.  */
 static const struct attribute_spec spu_attribute_table[] =
@@ -282,6 +296,25 @@ static const struct attribute_spec spu_attribute_table[] =
 
 /*  TARGET overrides.  */
 
+#undef TARGET_ADDR_SPACE_POINTER_MODE
+#define TARGET_ADDR_SPACE_POINTER_MODE spu_addr_space_pointer_mode
+
+#undef TARGET_ADDR_SPACE_ADDRESS_MODE
+#define TARGET_ADDR_SPACE_ADDRESS_MODE spu_addr_space_address_mode
+
+#undef TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P
+#define TARGET_ADDR_SPACE_LEGITIMATE_ADDRESS_P \
+  spu_addr_space_legitimate_address_p
+
+#undef TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS
+#define TARGET_ADDR_SPACE_LEGITIMIZE_ADDRESS spu_addr_space_legitimize_address
+
+#undef TARGET_ADDR_SPACE_SUBSET_P
+#define TARGET_ADDR_SPACE_SUBSET_P spu_addr_space_subset_p
+
+#undef TARGET_ADDR_SPACE_CONVERT
+#define TARGET_ADDR_SPACE_CONVERT spu_addr_space_convert
+
 #undef TARGET_INIT_BUILTINS
 #define TARGET_INIT_BUILTINS spu_init_builtins
 #undef TARGET_BUILTIN_DECL
@@ -296,6 +329,15 @@ static const struct attribute_spec spu_attribute_table[] =
 #undef TARGET_LEGITIMIZE_ADDRESS
 #define TARGET_LEGITIMIZE_ADDRESS spu_legitimize_address
 
+/* The current assembler doesn't like .4byte foo@ppu, so use the normal .long
+   and .quad for the debugger.  When it is known that the assembler is fixed,
+   these can be removed.  */
+#undef TARGET_ASM_UNALIGNED_SI_OP
+#define TARGET_ASM_UNALIGNED_SI_OP "\t.long\t"
+
+#undef TARGET_ASM_ALIGNED_DI_OP
+#define TARGET_ASM_ALIGNED_DI_OP "\t.quad\t"
+
 /* The .8byte directive doesn't seem to work well for a 32 bit
    architecture.  */
 #undef TARGET_ASM_UNALIGNED_DI_OP
@@ -412,6 +454,12 @@ static const struct attribute_spec spu_attribute_table[] =
 #undef TARGET_SECTION_TYPE_FLAGS
 #define TARGET_SECTION_TYPE_FLAGS spu_section_type_flags
 
+#undef TARGET_ASM_SELECT_SECTION
+#define TARGET_ASM_SELECT_SECTION spu_select_section
+
+#undef TARGET_ASM_UNIQUE_SECTION
+#define TARGET_ASM_UNIQUE_SECTION spu_unique_section
+
 #undef TARGET_LEGITIMATE_ADDRESS_P
 #define TARGET_LEGITIMATE_ADDRESS_P spu_legitimate_address_p
 
@@ -3613,6 +3661,29 @@ exp2_immediate_p (rtx op, enum machine_mode mode, int low, int high)
   return FALSE;
 }
 
+/* Return true if X is a SYMBOL_REF to an __ea qualified variable.  */
+
+static int
+ea_symbol_ref (rtx *px, void *data ATTRIBUTE_UNUSED)
+{
+  rtx x = *px;
+  tree decl;
+
+  if (GET_CODE (x) == CONST && GET_CODE (XEXP (x, 0)) == PLUS)
+    {
+      rtx plus = XEXP (x, 0);
+      rtx op0 = XEXP (plus, 0);
+      rtx op1 = XEXP (plus, 1);
+      if (GET_CODE (op1) == CONST_INT)
+	x = op0;
+    }
+
+  return (GET_CODE (x) == SYMBOL_REF
+	  && (decl = SYMBOL_REF_DECL (x)) != 0
+	  && TREE_CODE (decl) == VAR_DECL
+	  && TYPE_ADDR_SPACE (TREE_TYPE (decl)));
+}
+
 /* We accept:
    - any 32-bit constant (SImode, SFmode)
    - any constant that can be generated with fsmbi (any mode)
@@ -3624,6 +3695,12 @@ spu_legitimate_constant_p (rtx x)
 {
   if (GET_CODE (x) == HIGH)
     x = XEXP (x, 0);
+
+  /* Reject any __ea qualified reference.  These can't appear in
+     instructions but must be forced to the constant pool.  */
+  if (for_each_rtx (&x, ea_symbol_ref, 0))
+    return 0;
+
   /* V4SI with all identical symbols is valid.  */
   if (!flag_pic
       && GET_MODE (x) == V4SImode
@@ -3662,8 +3739,14 @@ spu_legitimate_address_p (enum machine_mode mode,
   switch (GET_CODE (x))
     {
     case LABEL_REF:
+      return !TARGET_LARGE_MEM;
+
     case SYMBOL_REF:
     case CONST:
+      /* Keep __ea references until reload so that spu_expand_mov can see them
+	 in MEMs.  */
+      if (ea_symbol_ref (&x, 0))
+	return !reload_in_progress && !reload_completed;
       return !TARGET_LARGE_MEM;
 
     case CONST_INT:
@@ -3707,6 +3790,20 @@ spu_legitimate_address_p (enum machine_mode mode,
   return FALSE;
 }
 
+/* Like spu_legitimate_address_p, except with named addresses.  */
+static bool
+spu_addr_space_legitimate_address_p (enum machine_mode mode, rtx x,
+				     bool reg_ok_strict, addr_space_t as)
+{
+  if (as == ADDR_SPACE_EA)
+    return (REG_P (x) && (GET_MODE (x) == EAmode));
+
+  else if (as != ADDR_SPACE_GENERIC)
+    gcc_unreachable ();
+
+  return spu_legitimate_address_p (mode, x, reg_ok_strict);
+}
+
 /* When the address is reg + const_int, force the const_int into a
    register.  */
 rtx
@@ -3738,6 +3835,17 @@ spu_legitimize_address (rtx x, rtx oldx ATTRIBUTE_UNUSED,
   return x;
 }
 
+/* Like spu_legitimize_address, except with named address support.  */
+static rtx
+spu_addr_space_legitimize_address (rtx x, rtx oldx, enum machine_mode mode,
+				   addr_space_t as)
+{
+  if (as != ADDR_SPACE_GENERIC)
+    return x;
+
+  return spu_legitimize_address (x, oldx, mode);
+}
+
 /* Handle an attribute requiring a FUNCTION_DECL; arguments as in
    struct attribute_spec.handler.  */
 static tree
@@ -4241,6 +4349,233 @@ address_needs_split (rtx mem)
   return 0;
 }
 
+static GTY(()) rtx cache_fetch;		  /* __cache_fetch function */
+static GTY(()) rtx cache_fetch_dirty;	  /* __cache_fetch_dirty function */
+static alias_set_type ea_alias_set = -1;  /* alias set for __ea memory */
+
+/* MEM is known to be an __ea qualified memory access.  Emit a call to
+   fetch the PPU memory to local store, and return its address in local
+   store.  */
+
+static void
+ea_load_store (rtx mem, bool is_store, rtx ea_addr, rtx data_addr)
+{
+  if (is_store)
+    {
+      rtx ndirty = GEN_INT (GET_MODE_SIZE (GET_MODE (mem)));
+      if (!cache_fetch_dirty)
+	cache_fetch_dirty = init_one_libfunc ("__cache_fetch_dirty");
+      emit_library_call_value (cache_fetch_dirty, data_addr, LCT_NORMAL, Pmode,
+			       2, ea_addr, EAmode, ndirty, SImode);
+    }
+  else
+    {
+      if (!cache_fetch)
+	cache_fetch = init_one_libfunc ("__cache_fetch");
+      emit_library_call_value (cache_fetch, data_addr, LCT_NORMAL, Pmode,
+			       1, ea_addr, EAmode);
+    }
+}
+
+/* Like ea_load_store, but do the cache tag comparison and, for stores,
+   dirty bit marking, inline.
+
+   The cache control data structure is an array of
+
+   struct __cache_tag_array
+     {
+	unsigned int tag_lo[4];
+	unsigned int tag_hi[4];
+	void *data_pointer[4];
+	int reserved[4];
+	vector unsigned short dirty_bits[4];
+     }  */
+
+static void
+ea_load_store_inline (rtx mem, bool is_store, rtx ea_addr, rtx data_addr)
+{
+  rtx ea_addr_si;
+  HOST_WIDE_INT v;
+  rtx tag_size_sym = gen_rtx_SYMBOL_REF (Pmode, "__cache_tag_array_size");
+  rtx tag_arr_sym = gen_rtx_SYMBOL_REF (Pmode, "__cache_tag_array");
+  rtx index_mask = gen_reg_rtx (SImode);
+  rtx tag_arr = gen_reg_rtx (Pmode);
+  rtx splat_mask = gen_reg_rtx (TImode);
+  rtx splat = gen_reg_rtx (V4SImode);
+  rtx splat_hi = NULL_RTX;
+  rtx tag_index = gen_reg_rtx (Pmode);
+  rtx block_off = gen_reg_rtx (SImode);
+  rtx tag_addr = gen_reg_rtx (Pmode);
+  rtx tag = gen_reg_rtx (V4SImode);
+  rtx cache_tag = gen_reg_rtx (V4SImode);
+  rtx cache_tag_hi = NULL_RTX;
+  rtx cache_ptrs = gen_reg_rtx (TImode);
+  rtx cache_ptrs_si = gen_reg_rtx (SImode);
+  rtx tag_equal = gen_reg_rtx (V4SImode);
+  rtx tag_equal_hi = NULL_RTX;
+  rtx tag_eq_pack = gen_reg_rtx (V4SImode);
+  rtx tag_eq_pack_si = gen_reg_rtx (SImode);
+  rtx eq_index = gen_reg_rtx (SImode);
+  rtx bcomp, hit_label, hit_ref, cont_label, insn;
+
+  if (spu_ea_model != 32)
+    {
+      splat_hi = gen_reg_rtx (V4SImode);
+      cache_tag_hi = gen_reg_rtx (V4SImode);
+      tag_equal_hi = gen_reg_rtx (V4SImode);
+    }
+
+  emit_move_insn (index_mask, plus_constant (tag_size_sym, -128));
+  emit_move_insn (tag_arr, tag_arr_sym);
+  v = 0x0001020300010203LL;
+  emit_move_insn (splat_mask, immed_double_const (v, v, TImode));
+  ea_addr_si = ea_addr;
+  if (spu_ea_model != 32)
+    ea_addr_si = convert_to_mode (SImode, ea_addr, 1);
+
+  /* tag_index = ea_addr & (tag_array_size - 128)  */
+  emit_insn (gen_andsi3 (tag_index, ea_addr_si, index_mask));
+
+  /* splat ea_addr to all 4 slots.  */
+  emit_insn (gen_shufb (splat, ea_addr_si, ea_addr_si, splat_mask));
+  /* Similarly for high 32 bits of ea_addr.  */
+  if (spu_ea_model != 32)
+    emit_insn (gen_shufb (splat_hi, ea_addr, ea_addr, splat_mask));
+
+  /* block_off = ea_addr & 127  */
+  emit_insn (gen_andsi3 (block_off, ea_addr_si, spu_const (SImode, 127)));
+
+  /* tag_addr = tag_arr + tag_index  */
+  emit_insn (gen_addsi3 (tag_addr, tag_arr, tag_index));
+
+  /* Read cache tags.  */
+  emit_move_insn (cache_tag, gen_rtx_MEM (V4SImode, tag_addr));
+  if (spu_ea_model != 32)
+    emit_move_insn (cache_tag_hi, gen_rtx_MEM (V4SImode,
+					       plus_constant (tag_addr, 16)));
+
+  /* tag = ea_addr & -128  */
+  emit_insn (gen_andv4si3 (tag, splat, spu_const (V4SImode, -128)));
+
+  /* Read all four cache data pointers.  */
+  emit_move_insn (cache_ptrs, gen_rtx_MEM (TImode,
+					   plus_constant (tag_addr, 32)));
+
+  /* Compare tags.  */
+  emit_insn (gen_ceq_v4si (tag_equal, tag, cache_tag));
+  if (spu_ea_model != 32)
+    {
+      emit_insn (gen_ceq_v4si (tag_equal_hi, splat_hi, cache_tag_hi));
+      emit_insn (gen_andv4si3 (tag_equal, tag_equal, tag_equal_hi));
+    }
+
+  /* At most one of the tags compares equal, so tag_equal has one
+     32-bit slot set to all 1's, with the other slots all zero.
+     gbb picks off low bit from each byte in the 128-bit registers,
+     so tag_eq_pack is one of 0xf000, 0x0f00, 0x00f0, 0x000f, assuming
+     we have a hit.  */
+  emit_insn (gen_spu_gbb (tag_eq_pack, spu_gen_subreg (V16QImode, tag_equal)));
+  emit_insn (gen_spu_convert (tag_eq_pack_si, tag_eq_pack));
+
+  /* So counting leading zeros will set eq_index to 16, 20, 24 or 28.  */
+  emit_insn (gen_clzsi2 (eq_index, tag_eq_pack_si));
+
+  /* This allows us to rotate the corresponding cache data pointer to
+     slot 0 (rotating by eq_index mod 16 bytes).  */
+  emit_insn (gen_rotqby_ti (cache_ptrs, cache_ptrs, eq_index));
+  emit_insn (gen_spu_convert (cache_ptrs_si, cache_ptrs));
+
+  /* Add block offset to form final data address.  */
+  emit_insn (gen_addsi3 (data_addr, cache_ptrs_si, block_off));
+
+  /* Check that we did hit.  */
+  hit_label = gen_label_rtx ();
+  hit_ref = gen_rtx_LABEL_REF (VOIDmode, hit_label);
+  bcomp = gen_rtx_NE (SImode, tag_eq_pack_si, const0_rtx);
+  insn = emit_jump_insn (gen_rtx_SET (VOIDmode, pc_rtx,
+				      gen_rtx_IF_THEN_ELSE (VOIDmode, bcomp,
+							    hit_ref, pc_rtx)));
+  /* Say that this branch is very likely to happen.  */
+  v = REG_BR_PROB_BASE - REG_BR_PROB_BASE / 100 - 1;
+  REG_NOTES (insn)
+    = gen_rtx_EXPR_LIST (REG_BR_PROB, GEN_INT (v), REG_NOTES (insn));
+
+  ea_load_store (mem, is_store, ea_addr, data_addr);
+  cont_label = gen_label_rtx ();
+  emit_jump_insn (gen_jump (cont_label));
+  emit_barrier ();
+
+  emit_label (hit_label);
+
+  if (is_store)
+    {
+      HOST_WIDE_INT v_hi;
+      rtx dirty_bits = gen_reg_rtx (TImode);
+      rtx dirty_off = gen_reg_rtx (SImode);
+      rtx dirty_128 = gen_reg_rtx (TImode);
+      rtx neg_block_off = gen_reg_rtx (SImode);
+
+      /* Set up mask with one dirty bit per byte of the mem we are
+	 writing, starting from top bit.  */
+      v_hi = v = -1;
+      v <<= (128 - GET_MODE_SIZE (GET_MODE (mem))) & 63;
+      if ((128 - GET_MODE_SIZE (GET_MODE (mem))) >= 64)
+	{
+	  v_hi = v;
+	  v = 0;
+	}
+      emit_move_insn (dirty_bits, immed_double_const (v, v_hi, TImode));
+
+      /* Form index into cache dirty_bits.  eq_index is one of
+	 0x10, 0x14, 0x18 or 0x1c.  Multiplying by 4 gives us
+	 0x40, 0x50, 0x60 or 0x70 which just happens to be the
+	 offset to each of the four dirty_bits elements.  */
+      emit_insn (gen_ashlsi3 (dirty_off, eq_index, spu_const (SImode, 2)));
+
+      emit_insn (gen_spu_lqx (dirty_128, tag_addr, dirty_off));
+
+      /* Rotate bit mask to proper bit.  */
+      emit_insn (gen_negsi2 (neg_block_off, block_off));
+      emit_insn (gen_rotqbybi_ti (dirty_bits, dirty_bits, neg_block_off));
+      emit_insn (gen_rotqbi_ti (dirty_bits, dirty_bits, neg_block_off));
+
+      /* Or in the new dirty bits.  */
+      emit_insn (gen_iorti3 (dirty_128, dirty_bits, dirty_128));
+
+      /* Store.  */
+      emit_insn (gen_spu_stqx (dirty_128, tag_addr, dirty_off));
+    }
+
+  emit_label (cont_label);
+}
+
+static rtx
+expand_ea_mem (rtx mem, bool is_store)
+{
+  rtx ea_addr;
+  rtx data_addr = gen_reg_rtx (Pmode);
+  rtx new_mem;
+
+  ea_addr = force_reg (EAmode, XEXP (mem, 0));
+  if (optimize_size || optimize == 0)
+    ea_load_store (mem, is_store, ea_addr, data_addr);
+  else
+    ea_load_store_inline (mem, is_store, ea_addr, data_addr);
+
+  if (ea_alias_set == -1)
+    ea_alias_set = new_alias_set ();
+
+  /* We generate a new MEM RTX to refer to the copy of the data
+     in the cache.  We do not copy memory attributes (except the
+     alignment) from the original MEM, as they may no longer apply
+     to the cache copy.  */
+  new_mem = gen_rtx_MEM (GET_MODE (mem), data_addr);
+  set_mem_alias_set (new_mem, ea_alias_set);
+  set_mem_align (new_mem, MIN (MEM_ALIGN (mem), 128 * 8));
+
+  return new_mem;
+}
+
 int
 spu_expand_mov (rtx * ops, enum machine_mode mode)
 {
@@ -4298,9 +4633,17 @@ spu_expand_mov (rtx * ops, enum machine_mode mode)
 	}
     }
   if (MEM_P (ops[0]))
-    return spu_split_store (ops);
+    {
+      if (MEM_ADDR_SPACE (ops[0]))
+	ops[0] = expand_ea_mem (ops[0], true);
+      return spu_split_store (ops);
+    }
   if (MEM_P (ops[1]))
-    return spu_split_load (ops);
+    {
+      if (MEM_ADDR_SPACE (ops[1]))
+	ops[1] = expand_ea_mem (ops[1], false);
+      return spu_split_load (ops);
+    }
 
   return 0;
 }
@@ -6442,6 +6785,113 @@ spu_builtin_vec_perm (tree type, tree *mask_element_type)
   return d->fndecl;
 }
 
+/* Return the appropriate mode for a named address pointer.  */
+static enum machine_mode
+spu_addr_space_pointer_mode (addr_space_t addrspace)
+{
+  switch (addrspace)
+    {
+    case ADDR_SPACE_GENERIC:
+      return ptr_mode;
+    case ADDR_SPACE_EA:
+      return EAmode;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Return the appropriate address mode for a named address space.  */
+static enum machine_mode
+spu_addr_space_address_mode (addr_space_t addrspace)
+{
+  switch (addrspace)
+    {
+    case ADDR_SPACE_GENERIC:
+      return Pmode;
+    case ADDR_SPACE_EA:
+      return EAmode;
+    default:
+      gcc_unreachable ();
+    }
+}
+
+/* Determine if one named address space is a subset of another.  */
+
+static bool
+spu_addr_space_subset_p (addr_space_t subset, addr_space_t superset)
+{
+  gcc_assert (subset == ADDR_SPACE_GENERIC || subset == ADDR_SPACE_EA);
+  gcc_assert (superset == ADDR_SPACE_GENERIC || superset == ADDR_SPACE_EA);
+
+  if (subset == superset)
+    return true;
+
+  /* If we have -mno-address-space-conversion, treat __ea and generic as not
+     being subsets but instead as disjoint address spaces.  */
+  else if (!TARGET_ADDRESS_SPACE_CONVERSION)
+    return false;
+
+  else
+    return (subset == ADDR_SPACE_GENERIC && superset == ADDR_SPACE_EA);
+}
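[Editorial note — not part of the patch] Putting the pieces together: when the inline lookup is not used (expand_ea_mem takes the ea_load_store path at -O0 or when optimizing for size), a load through an __ea pointer becomes a libgcc call plus an ordinary local-store access, roughly:

    /* Sketch of what 'v = *p;' becomes for '__ea int *p'.  */
    int load_model (__ea int *p)
    {
      int *lsa = (int *) __cache_fetch ((__ea void *) p);  /* EA -> LS */
      return *lsa;   /* plain load from the cached copy in local store */
    }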
+
+/* Convert from one address space to another.  */
+static rtx
+spu_addr_space_convert (rtx op, tree from_type, tree to_type)
+{
+  addr_space_t from_as = TYPE_ADDR_SPACE (TREE_TYPE (from_type));
+  addr_space_t to_as = TYPE_ADDR_SPACE (TREE_TYPE (to_type));
+
+  gcc_assert (from_as == ADDR_SPACE_GENERIC || from_as == ADDR_SPACE_EA);
+  gcc_assert (to_as == ADDR_SPACE_GENERIC || to_as == ADDR_SPACE_EA);
+
+  if (to_as == ADDR_SPACE_GENERIC && from_as == ADDR_SPACE_EA)
+    {
+      rtx result, ls;
+
+      ls = gen_const_mem (DImode,
+			  gen_rtx_SYMBOL_REF (Pmode, "__ea_local_store"));
+      set_mem_align (ls, 128);
+
+      result = gen_reg_rtx (Pmode);
+      ls = force_reg (Pmode, convert_modes (Pmode, DImode, ls, 1));
+      op = force_reg (Pmode, convert_modes (Pmode, EAmode, op, 1));
+      ls = emit_conditional_move (ls, NE, op, const0_rtx, Pmode,
+				  ls, const0_rtx, Pmode, 1);
+
+      emit_insn (gen_subsi3 (result, op, ls));
+
+      return result;
+    }
+
+  else if (to_as == ADDR_SPACE_EA && from_as == ADDR_SPACE_GENERIC)
+    {
+      rtx result, ls;
+
+      ls = gen_const_mem (DImode,
+			  gen_rtx_SYMBOL_REF (Pmode, "__ea_local_store"));
+      set_mem_align (ls, 128);
+
+      result = gen_reg_rtx (EAmode);
+      ls = force_reg (EAmode, convert_modes (EAmode, DImode, ls, 1));
+      op = force_reg (Pmode, op);
+      ls = emit_conditional_move (ls, NE, op, const0_rtx, Pmode,
+				  ls, const0_rtx, EAmode, 1);
+      op = force_reg (EAmode, convert_modes (EAmode, Pmode, op, 1));
+
+      if (EAmode == SImode)
+	emit_insn (gen_addsi3 (result, op, ls));
+      else
+	emit_insn (gen_adddi3 (result, op, ls));
+
+      return result;
+    }
+
+  else
+    gcc_unreachable ();
+}
+
+
 /* Count the total number of instructions in each pipe and return the
    maximum, which is used as the Minimum Iteration Interval (MII)
    in the modulo scheduler.  get_pipe() will return -2, -1, 0, or 1.
@@ -6534,9 +6984,46 @@ spu_section_type_flags (tree decl, const char *name, int reloc)
   /* .toe needs to have type @nobits.  */
   if (strcmp (name, ".toe") == 0)
     return SECTION_BSS;
+  /* Don't load _ea into the current address space.  */
+  if (strcmp (name, "._ea") == 0)
+    return SECTION_WRITE | SECTION_DEBUG;
   return default_section_type_flags (decl, name, reloc);
 }
 
+/* Implement targetm.select_section.  */
+static section *
+spu_select_section (tree decl, int reloc, unsigned HOST_WIDE_INT align)
+{
+  /* Variables and constants defined in the __ea address space
+     go into a special section named "._ea".  */
+  if (TREE_TYPE (decl) != error_mark_node
+      && TYPE_ADDR_SPACE (TREE_TYPE (decl)) == ADDR_SPACE_EA)
+    {
+      /* We might get called with string constants, but get_named_section
+	 doesn't like them as they are not DECLs.  Also, we need to set
+	 flags in that case.  */
+      if (!DECL_P (decl))
+	return get_section ("._ea", SECTION_WRITE | SECTION_DEBUG, NULL);
+
+      return get_named_section (decl, "._ea", reloc);
+    }
+
+  return default_elf_select_section (decl, reloc, align);
+}
+
+/* Implement targetm.unique_section.  */
+static void
+spu_unique_section (tree decl, int reloc)
+{
+  /* We don't support unique section names in the __ea address
+     space for now.  */
+  if (TREE_TYPE (decl) != error_mark_node
+      && TYPE_ADDR_SPACE (TREE_TYPE (decl)) != 0)
+    return;
+
+  default_unique_section (decl, reloc);
+}
+
 /* Generate a constant or register which contains 2^SCALE.  We assume
    the result is valid for MODE.  Currently, MODE must be V4SFmode and
    SCALE must be SImode.  */
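[Editorial note — not part of the patch] The conversions spu_addr_space_convert emits amount to rebasing against __ea_local_store while preserving NULL (that is what the conditional moves are for); a scalar model of the 64-bit case:

    extern unsigned long long __ea_local_store;

    static unsigned int
    ea_to_generic (unsigned long long ea)   /* __ea -> generic */
    {
      return ea ? (unsigned int) (ea - __ea_local_store) : 0;
    }

    static unsigned long long
    generic_to_ea (unsigned int lsa)        /* generic -> __ea */
    {
      return lsa ? __ea_local_store + lsa : 0;
    }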
diff --git a/gcc/config/spu/spu.h b/gcc/config/spu/spu.h
index 67011a62126..369e6d76e9d 100644
--- a/gcc/config/spu/spu.h
+++ b/gcc/config/spu/spu.h
@@ -51,7 +51,7 @@ extern GTY(()) int spu_tune;
 /* Default target_flags if no switches specified.  */
 #ifndef TARGET_DEFAULT
 #define TARGET_DEFAULT (MASK_ERROR_RELOC | MASK_SAFE_DMA | MASK_BRANCH_HINTS \
-			| MASK_SAFE_HINTS)
+			| MASK_SAFE_HINTS | MASK_ADDRESS_SPACE_CONVERSION)
 #endif
 
@@ -469,6 +469,17 @@ targetm.resolve_overloaded_builtin = spu_resolve_overloaded_builtin; \
 #define ASM_OUTPUT_LABELREF(FILE, NAME) \
   asm_fprintf (FILE, "%U%s", default_strip_name_encoding (NAME))
 
+#define ASM_OUTPUT_SYMBOL_REF(FILE, X) \
+  do							\
+    {							\
+      tree decl;					\
+      assemble_name (FILE, XSTR ((X), 0));		\
+      if ((decl = SYMBOL_REF_DECL ((X))) != 0		\
+	  && TREE_CODE (decl) == VAR_DECL		\
+	  && TYPE_ADDR_SPACE (TREE_TYPE (decl)))	\
+	fputs ("@ppu", FILE);				\
+    } while (0)
+
 /* Instruction Output */
 #define REGISTER_NAMES \
@@ -590,6 +601,13 @@
 	} while (0)
 
+/* Address spaces.  */
+#define ADDR_SPACE_EA	1
+
+/* Named address space keywords.  */
+#define TARGET_ADDR_SPACE_KEYWORDS ADDR_SPACE_KEYWORD ("__ea", ADDR_SPACE_EA)
+
+
 /* Builtins.  */
 
 enum spu_builtin_type
diff --git a/gcc/config/spu/spu.opt b/gcc/config/spu/spu.opt
index 1589199b60b..4ad7128de51 100644
--- a/gcc/config/spu/spu.opt
+++ b/gcc/config/spu/spu.opt
@@ -82,3 +82,24 @@ Generate code for given CPU
 mtune=
 Target RejectNegative Joined Var(spu_tune_string)
 Schedule code for given CPU
+
+mea32
+Target Report RejectNegative Var(spu_ea_model,32) Init(32)
+Access variables in 32-bit PPU objects (default)
+
+mea64
+Target Report RejectNegative Var(spu_ea_model,64) VarExists
+Access variables in 64-bit PPU objects
+
+maddress-space-conversion
+Target Report Mask(ADDRESS_SPACE_CONVERSION)
+Allow conversions between __ea and generic pointers (default)
+
+mcache-size=
+Target Report RejectNegative Joined UInteger
+Size (in KB) of software data cache
+
+matomic-updates
+Target Report
+Atomically write back software data cache lines (default)
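[Editorial note — not part of the patch] The ASM_OUTPUT_SYMBOL_REF hook above tags references to __ea variables so the PPU-side link can resolve them; the emitted assembly looks roughly like this (a sketch; the variable name is hypothetical):

    .quad   shared_counter@ppu      # with -mea64; .long ... with -mea32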
diff --git a/gcc/config/spu/spu_cache.h b/gcc/config/spu/spu_cache.h
new file mode 100644
index 00000000000..66a679be5a0
--- /dev/null
+++ b/gcc/config/spu/spu_cache.h
@@ -0,0 +1,39 @@
+/* Copyright (C) 2008, 2009 Free Software Foundation, Inc.
+
+   This file is free software; you can redistribute it and/or modify it under
+   the terms of the GNU General Public License as published by the Free
+   Software Foundation; either version 3 of the License, or (at your option)
+   any later version.
+
+   This file is distributed in the hope that it will be useful, but WITHOUT
+   ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+   FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+   for more details.
+
+   Under Section 7 of GPL version 3, you are granted additional
+   permissions described in the GCC Runtime Library Exception, version
+   3.1, as published by the Free Software Foundation.
+
+   You should have received a copy of the GNU General Public License and
+   a copy of the GCC Runtime Library Exception along with this program;
+   see the files COPYING3 and COPYING.RUNTIME respectively.  If not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef _SPU_CACHE_H
+#define _SPU_CACHE_H
+
+void *__cache_fetch_dirty (__ea void *ea, int n_bytes_dirty);
+void *__cache_fetch (__ea void *ea);
+void __cache_evict (__ea void *ea);
+void __cache_flush (void);
+void __cache_touch (__ea void *ea);
+
+#define cache_fetch_dirty(_ea, _n_bytes_dirty) \
+  __cache_fetch_dirty(_ea, _n_bytes_dirty)
+
+#define cache_fetch(_ea) __cache_fetch(_ea)
+#define cache_touch(_ea) __cache_touch(_ea)
+#define cache_evict(_ea) __cache_evict(_ea)
+#define cache_flush() __cache_flush()
+
+#endif
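[Editorial note — not part of the patch] A sketch of using this header's API directly; the buffer name is hypothetical:

    #include <spu_cache.h>

    extern __ea char *msg;   /* hypothetical PPU-side buffer */

    void stamp (void)
    {
      /* Map 4 bytes into the local-store cache and mark them dirty so
         eviction writes them back.  */
      char *p = cache_fetch_dirty (msg, 4);
      p[0] = 'S'; p[1] = 'P'; p[2] = 'U'; p[3] = '\0';
      cache_flush ();        /* push the dirty line back to PPU memory now */
    }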
diff --git a/gcc/config/spu/t-spu-elf b/gcc/config/spu/t-spu-elf
index 0c9236fa89f..a54ede9fa25 100644
--- a/gcc/config/spu/t-spu-elf
+++ b/gcc/config/spu/t-spu-elf
@@ -66,14 +66,39 @@ fp-bit.c: $(srcdir)/config/fp-bit.c $(srcdir)/config/spu/t-spu-elf
 # Don't let CTOR_LIST end up in sdata section.
 CRTSTUFF_T_CFLAGS =
 
-#MULTILIB_OPTIONS=mlarge-mem/mtest-abi
-#MULTILIB_DIRNAMES=large-mem test-abi
-#MULTILIB_MATCHES=
+# Multi-lib support.
+MULTILIB_OPTIONS=mea64
 
 # Neither gcc or newlib seem to have a standard way to generate multiple
 # crt*.o files.  So we don't use the standard crt0.o name anymore.
 
-EXTRA_MULTILIB_PARTS = crtbegin.o crtend.o
+EXTRA_MULTILIB_PARTS = crtbegin.o crtend.o libgcc_cachemgr.a libgcc_cachemgr_nonatomic.a \
+	libgcc_cache8k.a libgcc_cache16k.a libgcc_cache32k.a libgcc_cache64k.a libgcc_cache128k.a
+
+$(T)cachemgr.o: $(srcdir)/config/spu/cachemgr.c
+	$(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) $(MULTILIB_CFLAGS) -c $< -o $@
+
+# Specialised rule to add a -D flag.
+$(T)cachemgr_nonatomic.o: $(srcdir)/config/spu/cachemgr.c
+	$(GCC_FOR_TARGET) $(LIBGCC2_CFLAGS) $(MULTILIB_CFLAGS) -DNONATOMIC -c $< -o $@
+
+$(T)libgcc_%.a: $(T)%.o
+	$(AR_FOR_TARGET) -rcs $@ $<
+
+$(T)cache8k.o: $(srcdir)/config/spu/cache.S
+	$(GCC_FOR_TARGET) $(MULTILIB_CFLAGS) -D__CACHE_SIZE__=8 -o $@ -c $<
+
+$(T)cache16k.o: $(srcdir)/config/spu/cache.S
+	$(GCC_FOR_TARGET) $(MULTILIB_CFLAGS) -D__CACHE_SIZE__=16 -o $@ -c $<
+
+$(T)cache32k.o: $(srcdir)/config/spu/cache.S
+	$(GCC_FOR_TARGET) $(MULTILIB_CFLAGS) -D__CACHE_SIZE__=32 -o $@ -c $<
+
+$(T)cache64k.o: $(srcdir)/config/spu/cache.S
+	$(GCC_FOR_TARGET) $(MULTILIB_CFLAGS) -D__CACHE_SIZE__=64 -o $@ -c $<
+
+$(T)cache128k.o: $(srcdir)/config/spu/cache.S
+	$(GCC_FOR_TARGET) $(MULTILIB_CFLAGS) -D__CACHE_SIZE__=128 -o $@ -c $<
 
 LIBGCC = stmp-multilib
 INSTALL_LIBGCC = install-multilib
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index b45df833f2e..4a9ffbfaf7f 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -846,7 +846,11 @@ See RS/6000 and PowerPC Options.
 -msafe-dma -munsafe-dma @gol
 -mbranch-hints @gol
 -msmall-mem -mlarge-mem -mstdmain @gol
--mfixed-range=@var{register-range}}
+-mfixed-range=@var{register-range} @gol
+-mea32 -mea64 @gol
+-maddress-space-conversion -mno-address-space-conversion @gol
+-mcache-size=@var{cache-size} @gol
+-matomic-updates -mno-atomic-updates}
 
 @emph{System V Options}
 @gccoptlist{-Qy -Qn -YP,@var{paths} -Ym,@var{dir}}
@@ -16358,6 +16362,46 @@ useful when compiling kernel code.
 A register range is specified as two registers separated by a dash.
 Multiple register ranges can be specified separated by a comma.
 
+@item -mea32
+@itemx -mea64
+@opindex mea32
+@opindex mea64
+Compile code assuming that pointers to the PPU address space accessed
+via the @code{__ea} named address space qualifier are either 32 or 64
+bits wide.  The default is 32 bits.  As this is an ABI-changing option,
+all object code in an executable must be compiled with the same setting.
+
+@item -maddress-space-conversion
+@itemx -mno-address-space-conversion
+@opindex maddress-space-conversion
+@opindex mno-address-space-conversion
+Allow/disallow treating the @code{__ea} address space as a superset
+of the generic address space.  This enables explicit type casts
+between @code{__ea} and generic pointers as well as implicit
+conversions of generic pointers to @code{__ea} pointers.  The
+default is to allow address space pointer conversions.
+
+@item -mcache-size=@var{cache-size}
+@opindex mcache-size
+This option controls the version of libgcc that the compiler links to an
+executable and selects a software-managed cache for accessing variables
+in the @code{__ea} address space with a particular cache size.  Possible
+options for @var{cache-size} are @samp{8}, @samp{16}, @samp{32}, @samp{64}
+and @samp{128}.  The default cache size is 64KB.
+
+@item -matomic-updates
+@itemx -mno-atomic-updates
+@opindex matomic-updates
+@opindex mno-atomic-updates
+This option controls the version of libgcc that the compiler links to an
+executable and selects whether atomic updates to the software-managed
+cache of PPU-side variables are used.  If you use atomic updates, changes
+to a PPU variable from SPU code using the @code{__ea} named address space
+qualifier will not interfere with changes to other PPU variables residing
+in the same cache line from PPU code.  If you do not use atomic updates,
+such interference may occur; however, writing back cache lines will be
+more efficient.  The default behavior is to use atomic updates.
+
 @item -mdual-nops
 @itemx -mdual-nops=@var{n}
 @opindex mdual-nops
-- 
2.30.2