runtime: copy memory hash code from Go 1.7
author: Ian Lance Taylor <ian@gcc.gnu.org>
Thu, 8 Dec 2016 16:37:54 +0000 (16:37 +0000)
committer: Ian Lance Taylor <ian@gcc.gnu.org>
Thu, 8 Dec 2016 16:37:54 +0000 (16:37 +0000)
    Rewrite the AES hashing code from gc assembler to C code using
    intrinsics.  The resulting code generates the same hash code for the
    same input as the gc code--that doesn't matter as such, but testing it
    ensures that the C code does something useful.

    Also change mips64pe32le to mips64p32le in configure script--noticed
    during CL review.

    Reviewed-on: https://go-review.googlesource.com/34022

From-SVN: r243445

22 files changed:
gcc/go/gofrontend/MERGE
gcc/go/gofrontend/types.cc
libgo/Makefile.am
libgo/Makefile.in
libgo/configure
libgo/configure.ac
libgo/go/runtime/alg.go
libgo/go/runtime/hash32.go [new file with mode: 0644]
libgo/go/runtime/hash64.go [new file with mode: 0644]
libgo/go/runtime/os_gccgo.go [new file with mode: 0644]
libgo/go/runtime/runtime2.go
libgo/go/runtime/stubs.go
libgo/go/runtime/unaligned1.go [new file with mode: 0644]
libgo/go/runtime/unaligned2.go [new file with mode: 0644]
libgo/runtime/aeshash.c [new file with mode: 0644]
libgo/runtime/go-libmain.c
libgo/runtime/go-main.c
libgo/runtime/go-type-identity.c
libgo/runtime/go-type.h
libgo/runtime/proc.c
libgo/runtime/runtime.h
libgo/runtime/runtime_c.c

index df38903801645e873194be0d979ad798d51a3dc5..6bc3797e895b6976b0a66f75412971cfbdc7c94f 100644 (file)
@@ -1,4 +1,4 @@
-2442fca7be8a4f51ddc91070fa69ef66e24593ac
+78e3527fcaf4ffd33b22e39a56e5d076844302be
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
index d540acb35b3dcd01777555ef8028b1323dfa02a8..f3cb32b5142bf12b36a36f88e0b023960c34ad62 100644 (file)
@@ -1648,7 +1648,7 @@ Type::type_functions(Gogo* gogo, Named_type* name, Function_type* hash_fntype,
   const char* equal_fnname;
   if (this->compare_is_identity(gogo))
     {
-      hash_fnname = "__go_type_hash_identity";
+      hash_fnname = "runtime.memhash";
       equal_fnname = "__go_type_equal_identity";
     }
   else
index 7165dfd3acf1f5f4be9c3bd28d126647725a6871..b9aee9d657dd2509f2916014797d6ef59eb1ac69 100644 (file)
@@ -422,6 +422,7 @@ endif
 endif
 
 runtime_files = \
+       runtime/aeshash.c \
        runtime/go-assert.c \
        runtime/go-breakpoint.c \
        runtime/go-caller.c \
index 9b87db00982366e2eb3f4d990ff4983c5a9e16fa..86d7aa84e7d041b1e8493b0e93fd932f33ef39af 100644 (file)
@@ -189,7 +189,7 @@ libgo_llgo_la_DEPENDENCIES = $(am__DEPENDENCIES_4)
 @LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@am__objects_4 =  \
 @LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@   getncpu-bsd.lo
 @LIBGO_IS_LINUX_TRUE@am__objects_4 = getncpu-linux.lo
-am__objects_5 = go-assert.lo go-breakpoint.lo go-caller.lo \
+am__objects_5 = aeshash.lo go-assert.lo go-breakpoint.lo go-caller.lo \
        go-callers.lo go-cdiv.lo go-cgo.lo go-construct-map.lo \
        go-ffi.lo go-fieldtrack.lo go-matherr.lo go-memclr.lo \
        go-memcmp.lo go-memequal.lo go-memmove.lo go-nanotime.lo \
@@ -767,6 +767,7 @@ toolexeclibgounicode_DATA = \
 @LIBGO_IS_DARWIN_TRUE@@LIBGO_IS_LINUX_FALSE@runtime_getncpu_file = runtime/getncpu-bsd.c
 @LIBGO_IS_LINUX_TRUE@runtime_getncpu_file = runtime/getncpu-linux.c
 runtime_files = \
+       runtime/aeshash.c \
        runtime/go-assert.c \
        runtime/go-breakpoint.c \
        runtime/go-caller.c \
@@ -1446,6 +1447,7 @@ mostlyclean-compile:
 distclean-compile:
        -rm -f *.tab.c
 
+@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/aeshash.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/env_posix.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-bsd.Plo@am__quote@
 @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/getncpu-irix.Plo@am__quote@
@@ -1573,6 +1575,13 @@ libgolibbegin_a-go-libmain.obj: runtime/go-libmain.c
 @AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
 @am__fastdepCC_FALSE@  $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(libgolibbegin_a_CFLAGS) $(CFLAGS) -c -o libgolibbegin_a-go-libmain.obj `if test -f 'runtime/go-libmain.c'; then $(CYGPATH_W) 'runtime/go-libmain.c'; else $(CYGPATH_W) '$(srcdir)/runtime/go-libmain.c'; fi`
 
+aeshash.lo: runtime/aeshash.c
+@am__fastdepCC_TRUE@   $(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT aeshash.lo -MD -MP -MF $(DEPDIR)/aeshash.Tpo -c -o aeshash.lo `test -f 'runtime/aeshash.c' || echo '$(srcdir)/'`runtime/aeshash.c
+@am__fastdepCC_TRUE@   $(am__mv) $(DEPDIR)/aeshash.Tpo $(DEPDIR)/aeshash.Plo
+@AMDEP_TRUE@@am__fastdepCC_FALSE@      source='runtime/aeshash.c' object='aeshash.lo' libtool=yes @AMDEPBACKSLASH@
+@AMDEP_TRUE@@am__fastdepCC_FALSE@      DEPDIR=$(DEPDIR) $(CCDEPMODE) $(depcomp) @AMDEPBACKSLASH@
+@am__fastdepCC_FALSE@  $(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -c -o aeshash.lo `test -f 'runtime/aeshash.c' || echo '$(srcdir)/'`runtime/aeshash.c
+
 go-assert.lo: runtime/go-assert.c
 @am__fastdepCC_TRUE@   $(LIBTOOL)  --tag=CC $(AM_LIBTOOLFLAGS) $(LIBTOOLFLAGS) --mode=compile $(CC) $(DEFS) $(DEFAULT_INCLUDES) $(INCLUDES) $(AM_CPPFLAGS) $(CPPFLAGS) $(AM_CFLAGS) $(CFLAGS) -MT go-assert.lo -MD -MP -MF $(DEPDIR)/go-assert.Tpo -c -o go-assert.lo `test -f 'runtime/go-assert.c' || echo '$(srcdir)/'`runtime/go-assert.c
 @am__fastdepCC_TRUE@   $(am__mv) $(DEPDIR)/go-assert.Tpo $(DEPDIR)/go-assert.Plo
index 9eac5c0e5c5f510fba10254ee23d65a5232623a0..7789c120a9b92f4b7737e9e5c5ca8799730509cc 100755 (executable)
@@ -13624,7 +13624,7 @@ esac
 # supported by the gofrontend and all architectures supported by the
 # gc toolchain.
 # N.B. Keep in sync with gcc/testsuite/go.test/go-test.exp (go-set-goarch).
-ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64pe32le ppc ppc64 ppc64le s390 s390x sparc sparc64"
+ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64p32le ppc ppc64 ppc64le s390 s390x sparc sparc64"
 
 # All known GOARCH_FAMILY values.
 ALLGOARCHFAMILY="I386 ALPHA AMD64 ARM ARM64 IA64 M68K MIPS MIPS64 PPC PPC64 S390 S390X SPARC SPARC64"
index 9e765404739e38127ec62dd17f9f672995286bf1..77a744ea4187f8672cf161c776f5b60cbdf7fa78 100644 (file)
@@ -197,7 +197,7 @@ AC_SUBST(USE_DEJAGNU)
 # supported by the gofrontend and all architectures supported by the
 # gc toolchain.
 # N.B. Keep in sync with gcc/testsuite/go.test/go-test.exp (go-set-goarch).
-ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64pe32le ppc ppc64 ppc64le s390 s390x sparc sparc64"
+ALLGOARCH="386 alpha amd64 amd64p32 arm armbe arm64 arm64be ia64 m68k mipso32 mipsn32 mipso64 mipsn64 mips mipsle mips64 mips64le mips64p32 mips64p32le ppc ppc64 ppc64le s390 s390x sparc sparc64"
 
 # All known GOARCH_FAMILY values.
 ALLGOARCHFAMILY="I386 ALPHA AMD64 ARM ARM64 IA64 M68K MIPS MIPS64 PPC PPC64 S390 S390X SPARC SPARC64"
index 8f7c3c0531c200cb4a4e2c3c801a1d0ae50925f2..5ec19d0a9f9ff33139e2710bca27b625ff17838d 100644 (file)
@@ -23,12 +23,29 @@ import (
 //go:linkname efacevaleq runtime.efacevaleq
 //go:linkname eqstring runtime.eqstring
 //go:linkname cmpstring runtime.cmpstring
+//
+// Temporary to be called from C code.
+//go:linkname alginit runtime.alginit
 
 const (
        c0 = uintptr((8-sys.PtrSize)/4*2860486313 + (sys.PtrSize-4)/4*33054211828000289)
        c1 = uintptr((8-sys.PtrSize)/4*3267000013 + (sys.PtrSize-4)/4*23344194077549503)
 )
 
+var useAeshash bool
+
+// in C code
+func aeshashbody(p unsafe.Pointer, h, s uintptr, sched []byte) uintptr
+
+func aeshash(p unsafe.Pointer, h, s uintptr) uintptr {
+       return aeshashbody(p, h, s, aeskeysched[:])
+}
+
+func aeshashstr(p unsafe.Pointer, h uintptr) uintptr {
+       ps := (*stringStruct)(p)
+       return aeshashbody(unsafe.Pointer(ps.str), h, uintptr(ps.len), aeskeysched[:])
+}
+
 func interhash(p unsafe.Pointer, h uintptr, size uintptr) uintptr {
        a := (*iface)(p)
        tab := a.tab
@@ -198,7 +215,35 @@ func cmpstring(x, y string) int {
 
 // Force the creation of function descriptors for equality and hash
 // functions.  These will be referenced directly by the compiler.
+var _ = memhash
 var _ = interhash
 var _ = interequal
 var _ = nilinterhash
 var _ = nilinterequal
+
+const hashRandomBytes = sys.PtrSize / 4 * 64
+
+// used in asm_{386,amd64}.s to seed the hash function
+var aeskeysched [hashRandomBytes]byte
+
+// used in hash{32,64}.go to seed the hash function
+var hashkey [4]uintptr
+
+func alginit() {
+       // Install aes hash algorithm if we have the instructions we need
+       if (GOARCH == "386" || GOARCH == "amd64") &&
+               GOOS != "nacl" &&
+               cpuid_ecx&(1<<25) != 0 && // aes (aesenc)
+               cpuid_ecx&(1<<9) != 0 && // sse3 (pshufb)
+               cpuid_ecx&(1<<19) != 0 { // sse4.1 (pinsr{d,q})
+               useAeshash = true
+               // Initialize with random data so hash collisions will be hard to engineer.
+               getRandomData(aeskeysched[:])
+               return
+       }
+       getRandomData((*[len(hashkey) * sys.PtrSize]byte)(unsafe.Pointer(&hashkey))[:])
+       hashkey[0] |= 1 // make sure these numbers are odd
+       hashkey[1] |= 1
+       hashkey[2] |= 1
+       hashkey[3] |= 1
+}
diff --git a/libgo/go/runtime/hash32.go b/libgo/go/runtime/hash32.go
new file mode 100644 (file)
index 0000000..cfb3a58
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Hashing algorithm inspired by
+//   xxhash: https://code.google.com/p/xxhash/
+// cityhash: https://code.google.com/p/cityhash/
+
+// +build 386 arm armbe m68k mipso32 mipsn32 mips mipsle ppc s390 sparc
+
+package runtime
+
+import "unsafe"
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname memhash runtime.memhash
+
+const (
+       // Constants for multiplication: four random odd 32-bit numbers.
+       m1 = 3168982561
+       m2 = 3339683297
+       m3 = 832293441
+       m4 = 2336365089
+)
+
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
+       if GOARCH == "386" && GOOS != "nacl" && useAeshash {
+               return aeshash(p, seed, s)
+       }
+       h := uint32(seed + s*hashkey[0])
+tail:
+       switch {
+       case s == 0:
+       case s < 4:
+               h ^= uint32(*(*byte)(p))
+               h ^= uint32(*(*byte)(add(p, s>>1))) << 8
+               h ^= uint32(*(*byte)(add(p, s-1))) << 16
+               h = rotl_15(h*m1) * m2
+       case s == 4:
+               h ^= readUnaligned32(p)
+               h = rotl_15(h*m1) * m2
+       case s <= 8:
+               h ^= readUnaligned32(p)
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, s-4))
+               h = rotl_15(h*m1) * m2
+       case s <= 16:
+               h ^= readUnaligned32(p)
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, 4))
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, s-8))
+               h = rotl_15(h*m1) * m2
+               h ^= readUnaligned32(add(p, s-4))
+               h = rotl_15(h*m1) * m2
+       default:
+               v1 := h
+               v2 := uint32(seed * hashkey[1])
+               v3 := uint32(seed * hashkey[2])
+               v4 := uint32(seed * hashkey[3])
+               for s >= 16 {
+                       v1 ^= readUnaligned32(p)
+                       v1 = rotl_15(v1*m1) * m2
+                       p = add(p, 4)
+                       v2 ^= readUnaligned32(p)
+                       v2 = rotl_15(v2*m2) * m3
+                       p = add(p, 4)
+                       v3 ^= readUnaligned32(p)
+                       v3 = rotl_15(v3*m3) * m4
+                       p = add(p, 4)
+                       v4 ^= readUnaligned32(p)
+                       v4 = rotl_15(v4*m4) * m1
+                       p = add(p, 4)
+                       s -= 16
+               }
+               h = v1 ^ v2 ^ v3 ^ v4
+               goto tail
+       }
+       h ^= h >> 17
+       h *= m3
+       h ^= h >> 13
+       h *= m4
+       h ^= h >> 16
+       return uintptr(h)
+}
+
+// Note: in order to get the compiler to issue rotl instructions, we
+// need to constant fold the shift amount by hand.
+// TODO: convince the compiler to issue rotl instructions after inlining.
+func rotl_15(x uint32) uint32 {
+       return (x << 15) | (x >> (32 - 15))
+}
diff --git a/libgo/go/runtime/hash64.go b/libgo/go/runtime/hash64.go
new file mode 100644 (file)
index 0000000..551d5b5
--- /dev/null
@@ -0,0 +1,94 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Hashing algorithm inspired by
+//   xxhash: https://code.google.com/p/xxhash/
+// cityhash: https://code.google.com/p/cityhash/
+
+// +build amd64 amd64p32 arm64 mips64 mips64le ppc64 ppc64le s390x alpha arm64be ia64 mipso64 mipsn64 mips64p32 mips64p32le sparc64
+
+package runtime
+
+import "unsafe"
+
+// For gccgo, use go:linkname to rename compiler-called functions to
+// themselves, so that the compiler will export them.
+//
+//go:linkname memhash runtime.memhash
+
+const (
+       // Constants for multiplication: four random odd 64-bit numbers.
+       m1 = 16877499708836156737
+       m2 = 2820277070424839065
+       m3 = 9497967016996688599
+       m4 = 15839092249703872147
+)
+
+func memhash(p unsafe.Pointer, seed, s uintptr) uintptr {
+       if GOARCH == "amd64" && GOOS != "nacl" && useAeshash {
+               return aeshash(p, seed, s)
+       }
+       h := uint64(seed + s*hashkey[0])
+tail:
+       switch {
+       case s == 0:
+       case s < 4:
+               h ^= uint64(*(*byte)(p))
+               h ^= uint64(*(*byte)(add(p, s>>1))) << 8
+               h ^= uint64(*(*byte)(add(p, s-1))) << 16
+               h = rotl_31(h*m1) * m2
+       case s <= 8:
+               h ^= uint64(readUnaligned32(p))
+               h ^= uint64(readUnaligned32(add(p, s-4))) << 32
+               h = rotl_31(h*m1) * m2
+       case s <= 16:
+               h ^= readUnaligned64(p)
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, s-8))
+               h = rotl_31(h*m1) * m2
+       case s <= 32:
+               h ^= readUnaligned64(p)
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, 8))
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, s-16))
+               h = rotl_31(h*m1) * m2
+               h ^= readUnaligned64(add(p, s-8))
+               h = rotl_31(h*m1) * m2
+       default:
+               v1 := h
+               v2 := uint64(seed * hashkey[1])
+               v3 := uint64(seed * hashkey[2])
+               v4 := uint64(seed * hashkey[3])
+               for s >= 32 {
+                       v1 ^= readUnaligned64(p)
+                       v1 = rotl_31(v1*m1) * m2
+                       p = add(p, 8)
+                       v2 ^= readUnaligned64(p)
+                       v2 = rotl_31(v2*m2) * m3
+                       p = add(p, 8)
+                       v3 ^= readUnaligned64(p)
+                       v3 = rotl_31(v3*m3) * m4
+                       p = add(p, 8)
+                       v4 ^= readUnaligned64(p)
+                       v4 = rotl_31(v4*m4) * m1
+                       p = add(p, 8)
+                       s -= 32
+               }
+               h = v1 ^ v2 ^ v3 ^ v4
+               goto tail
+       }
+
+       h ^= h >> 29
+       h *= m3
+       h ^= h >> 32
+       return uintptr(h)
+}
+
+// Note: in order to get the compiler to issue rotl instructions, we
+// need to constant fold the shift amount by hand.
+// TODO: convince the compiler to issue rotl instructions after inlining.
+func rotl_31(x uint64) uint64 {
+       return (x << 31) | (x >> (64 - 31))
+}
diff --git a/libgo/go/runtime/os_gccgo.go b/libgo/go/runtime/os_gccgo.go
new file mode 100644 (file)
index 0000000..4609432
--- /dev/null
@@ -0,0 +1,23 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package runtime
+
+import (
+       "unsafe"
+)
+
+var urandom_dev = []byte("/dev/urandom\x00")
+
+func getRandomData(r []byte) {
+       if startupRandomData != nil {
+               n := copy(r, startupRandomData)
+               extendRandom(r, n)
+               return
+       }
+       fd := open(&urandom_dev[0], 0 /* O_RDONLY */, 0)
+       n := read(fd, unsafe.Pointer(&r[0]), int32(len(r)))
+       closefd(fd)
+       extendRandom(r, int(n))
+}
index c8db7adde2281c6ec55d14b2c2d2b6b194a79099..4712318b76a63f4625d32bb2dec9a6a882cfc32e 100644 (file)
@@ -5,6 +5,7 @@
 package runtime
 
 import (
+       "runtime/internal/sys"
        "unsafe"
 )
 
@@ -668,7 +669,6 @@ type forcegcstate struct {
 // the ELF AT_RANDOM auxiliary vector (vdso_linux_amd64.go or os_linux_386.go).
 var startupRandomData []byte
 
-/*
 // extendRandom extends the random numbers in r[:n] to the whole slice r.
 // Treats n<0 as n==0.
 func extendRandom(r []byte, n int) {
@@ -689,7 +689,6 @@ func extendRandom(r []byte, n int) {
                }
        }
 }
-*/
 
 // deferred subroutine calls
 // This is the gccgo version.
@@ -770,11 +769,12 @@ var (
 
        sched schedt
 
-//     newprocs    int32
+       //      newprocs    int32
+
+       // Information about what cpu features are available.
+       // Set on startup.
+       cpuid_ecx uint32
 
-// Information about what cpu features are available.
-// Set on startup in asm_{x86,amd64}.s.
-//     cpuid_ecx         uint32
 //     cpuid_edx         uint32
 //     cpuid_ebx7        uint32
 //     lfenceBeforeRdtsc bool
index b2f1829feced1234bfeec52507b37e98dbd60594..b4fee6b9076b4b1f9b9146ed0870b45c5ecd4ca0 100644 (file)
@@ -248,6 +248,12 @@ func funcPC(f interface{}) uintptr {
        return **(**uintptr)(i.data)
 }
 
+// For gccgo, to communicate from the C code to the Go code.
+//go:linkname setCpuidECX runtime.setCpuidECX
+func setCpuidECX(v uint32) {
+       cpuid_ecx = v
+}
+
 // typedmemmove copies a typed value.
 // For gccgo for now.
 //go:nosplit
diff --git a/libgo/go/runtime/unaligned1.go b/libgo/go/runtime/unaligned1.go
new file mode 100644 (file)
index 0000000..c94f19e
--- /dev/null
@@ -0,0 +1,17 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build 386 amd64 amd64p32 arm64 ppc64 ppc64le s390x ppc s390 arm64be
+
+package runtime
+
+import "unsafe"
+
+func readUnaligned32(p unsafe.Pointer) uint32 {
+       return *(*uint32)(p)
+}
+
+func readUnaligned64(p unsafe.Pointer) uint64 {
+       return *(*uint64)(p)
+}
diff --git a/libgo/go/runtime/unaligned2.go b/libgo/go/runtime/unaligned2.go
new file mode 100644 (file)
index 0000000..e52d6ce
--- /dev/null
@@ -0,0 +1,20 @@
+// Copyright 2014 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// +build arm mips64 mips64le armbe m68k mipso32 mipsn32 mips mipsle sparc alpha ia64 mipso64 mipsn64 mips64p32 mips64p32le sparc64
+
+package runtime
+
+import "unsafe"
+
+// Note: These routines perform the read with an unspecified endianness.
+func readUnaligned32(p unsafe.Pointer) uint32 {
+       q := (*[4]byte)(p)
+       return uint32(q[0]) + uint32(q[1])<<8 + uint32(q[2])<<16 + uint32(q[3])<<24
+}
+
+func readUnaligned64(p unsafe.Pointer) uint64 {
+       q := (*[8]byte)(p)
+       return uint64(q[0]) + uint64(q[1])<<8 + uint64(q[2])<<16 + uint64(q[3])<<24 + uint64(q[4])<<32 + uint64(q[5])<<40 + uint64(q[6])<<48 + uint64(q[7])<<56
+}
diff --git a/libgo/runtime/aeshash.c b/libgo/runtime/aeshash.c
new file mode 100644 (file)
index 0000000..faa90e0
--- /dev/null
@@ -0,0 +1,583 @@
+// Copyright 2016 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// Hash code using AES intrinsics.
+
+#include "runtime.h"
+
+uintptr aeshashbody(void*, uintptr, uintptr, Slice)
+       __asm__(GOSYM_PREFIX "runtime.aeshashbody");
+
+uintptr aeshashbody(void*, uintptr, uintptr, Slice)
+       __attribute__((no_split_stack));
+
+#if defined(__i386__) || defined(__x86_64__)
+
+#include <emmintrin.h>
+#include <tmmintrin.h>
+#include <wmmintrin.h>
+
+// Force appropriate CPU level.  We won't call here unless the CPU
+// supports it.
+
+#pragma GCC target("ssse3", "aes")
+
+#ifdef __x86_64__
+
+// aeshashbody implements a hash function using AES instructions
+// available in recent x86 processors. Note this is not encryption,
+// just hashing.
+//
+// This is written to produce exactly the same results as the gc
+// implementation, not because that matters, but just to ensure that
+// this does something reasonable.
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+       __m128i mseed, mseed2, mseed3, mseed4, mseed5, mseed6, mseed7, mseed8;
+       __m128i mval, mval2, mval3, mval4, mval5, mval6, mval7, mval8;
+
+       // Start with hash seed.
+       mseed = _mm_cvtsi64_si128(seed);
+       // Get 16 bits of length.
+       mseed = _mm_insert_epi16(mseed, size, 4);
+       // Repeat length 4 times total.
+       mseed = _mm_shufflehi_epi16(mseed, 0);
+       // Save unscrambled seed.
+       mseed2 = mseed;
+       // XOR in per-process seed.
+       mseed ^= _mm_loadu_si128(aeskeysched.__values);
+       // Scramble seed.
+       mseed = _mm_aesenc_si128(mseed, mseed);
+
+       if (size <= 16) {
+               if (size == 0) {
+                       // Return scrambled input seed.
+                       return _mm_cvtsi128_si64(_mm_aesenc_si128(mseed, mseed));
+               } else if (size < 16) {
+                       if ((((uintptr)(p) + 16) & 0xff0) != 0) {
+                               static const uint64 masks[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0x00000000000000ff, 0x0000000000000000,
+                                   0x000000000000ffff, 0x0000000000000000,
+                                   0x0000000000ffffff, 0x0000000000000000,
+                                   0x00000000ffffffff, 0x0000000000000000,
+                                   0x000000ffffffffff, 0x0000000000000000,
+                                   0x0000ffffffffffff, 0x0000000000000000,
+                                   0x00ffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x00000000000000ff,
+                                   0xffffffffffffffff, 0x000000000000ffff,
+                                   0xffffffffffffffff, 0x0000000000ffffff,
+                                   0xffffffffffffffff, 0x00000000ffffffff,
+                                   0xffffffffffffffff, 0x000000ffffffffff,
+                                   0xffffffffffffffff, 0x0000ffffffffffff,
+                                   0xffffffffffffffff, 0x00ffffffffffffff
+                                 };
+
+                               // 16 bytes loaded at p won't cross a page
+                               // boundary, so we can load directly.
+                               mval = _mm_loadu_si128(p);
+                               mval &= *(const __m128i*)(&masks[size*2]);
+                       } else {
+                               static const uint64 shifts[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0xffffffffffffff0f, 0xffffffffffffffff,
+                                   0xffffffffffff0f0e, 0xffffffffffffffff,
+                                   0xffffffffff0f0e0d, 0xffffffffffffffff,
+                                   0xffffffff0f0e0d0c, 0xffffffffffffffff,
+                                   0xffffff0f0e0d0c0b, 0xffffffffffffffff,
+                                   0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
+                                   0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
+                                   0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
+                                   0x0e0d0c0b0a090807, 0xffffffffffffff0f,
+                                   0x0d0c0b0a09080706, 0xffffffffffff0f0e,
+                                   0x0c0b0a0908070605, 0xffffffffff0f0e0d,
+                                   0x0b0a090807060504, 0xffffffff0f0e0d0c,
+                                   0x0a09080706050403, 0xffffff0f0e0d0c0b,
+                                   0x0908070605040302, 0xffff0f0e0d0c0b0a,
+                                   0x0807060504030201, 0xff0f0e0d0c0b0a09,
+                                 };
+
+                               // address ends in 1111xxxx. Might be
+                               // up against a page boundary, so load
+                               // ending at last byte.  Then shift
+                               // bytes down using pshufb.
+                               mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
+                               mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
+                       }
+               } else {
+                       mval = _mm_loadu_si128(p);
+               }
+
+               // XOR data with seed.
+               mval ^= mseed;
+               // Scramble combo 3 times.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval = _mm_aesenc_si128(mval, mval);
+               return _mm_cvtsi128_si64(mval);
+       } else if (size <= 32) {
+               // Make second starting seed.
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               // Load data to be hashed.
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
+               // XOR with seed.
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               // Scramble 3 times.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               // Combine results.
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       } else if (size <= 64) {
+               // Make 3 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + 16));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               mval3 ^= mseed3;
+               mval4 ^= mseed4;
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       } else if (size <= 128) {
+               // Make 7 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed5 = mseed2;
+               mseed6 = mseed2;
+               mseed7 = mseed2;
+               mseed8 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
+               mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
+               mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
+               mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+               mseed5 = _mm_aesenc_si128(mseed5, mseed5);
+               mseed6 = _mm_aesenc_si128(mseed6, mseed6);
+               mseed7 = _mm_aesenc_si128(mseed7, mseed7);
+               mseed8 = _mm_aesenc_si128(mseed8, mseed8);
+
+               // Load data.
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + 16));
+               mval3 = _mm_loadu_si128((void*)((char*)p + 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + 48));
+               mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
+               mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
+               mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // XOR with seed.
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               mval3 ^= mseed3;
+               mval4 ^= mseed4;
+               mval5 ^= mseed5;
+               mval6 ^= mseed6;
+               mval7 ^= mseed7;
+               mval8 ^= mseed8;
+
+               // Scramble 3 times.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               // Combine results.
+               mval ^= mval5;
+               mval2 ^= mval6;
+               mval3 ^= mval7;
+               mval4 ^= mval8;
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       } else {
+               // Make 7 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed5 = mseed2;
+               mseed6 = mseed2;
+               mseed7 = mseed2;
+               mseed8 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed5 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 64));
+               mseed6 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 80));
+               mseed7 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 96));
+               mseed8 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 112));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+               mseed5 = _mm_aesenc_si128(mseed5, mseed5);
+               mseed6 = _mm_aesenc_si128(mseed6, mseed6);
+               mseed7 = _mm_aesenc_si128(mseed7, mseed7);
+               mseed8 = _mm_aesenc_si128(mseed8, mseed8);
+
+               // Start with last (possibly overlapping) block.
+               mval = _mm_loadu_si128((void*)((char*)p + size - 128));
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 112));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 96));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 80));
+               mval5 = _mm_loadu_si128((void*)((char*)p + size - 64));
+               mval6 = _mm_loadu_si128((void*)((char*)p + size - 48));
+               mval7 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval8 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // XOR in seed.
+               mval ^= mseed;
+               mval2 ^= mseed2;
+               mval3 ^= mseed3;
+               mval4 ^= mseed4;
+               mval5 ^= mseed5;
+               mval6 ^= mseed6;
+               mval7 ^= mseed7;
+               mval8 ^= mseed8;
+
+               // Compute number of remaining 128-byte blocks.
+               size--;
+               size >>= 7;
+               do {
+                       // Scramble state.
+                       mval = _mm_aesenc_si128(mval, mval);
+                       mval2 = _mm_aesenc_si128(mval2, mval2);
+                       mval3 = _mm_aesenc_si128(mval3, mval3);
+                       mval4 = _mm_aesenc_si128(mval4, mval4);
+                       mval5 = _mm_aesenc_si128(mval5, mval5);
+                       mval6 = _mm_aesenc_si128(mval6, mval6);
+                       mval7 = _mm_aesenc_si128(mval7, mval7);
+                       mval8 = _mm_aesenc_si128(mval8, mval8);
+
+                       // Scramble state, XOR in a block.
+                       mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
+                       mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
+                       mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
+                       mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
+                       mval5 = _mm_aesenc_si128(mval5, _mm_loadu_si128((void*)((char*)p + 64)));
+                       mval6 = _mm_aesenc_si128(mval6, _mm_loadu_si128((void*)((char*)p + 80)));
+                       mval7 = _mm_aesenc_si128(mval7, _mm_loadu_si128((void*)((char*)p + 96)));
+                       mval8 = _mm_aesenc_si128(mval8, _mm_loadu_si128((void*)((char*)p + 112)));
+
+                       p = (void*)((char*)p + 128);
+               } while (--size > 0);
+
+               // 3 more scrambles to finish.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+               mval5 = _mm_aesenc_si128(mval5, mval5);
+               mval6 = _mm_aesenc_si128(mval6, mval6);
+               mval7 = _mm_aesenc_si128(mval7, mval7);
+               mval8 = _mm_aesenc_si128(mval8, mval8);
+
+               mval ^= mval5;
+               mval2 ^= mval6;
+               mval3 ^= mval7;
+               mval4 ^= mval8;
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si64(mval);
+       }
+}
+
+#else // !defined(__x86_64__)
+
+// The 32-bit version of aeshashbody.
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+       __m128i mseed, mseed2, mseed3, mseed4;
+       __m128i mval, mval2, mval3, mval4;
+
+       // Start with hash seed.
+       mseed = _mm_cvtsi32_si128(seed);
+       // Get 16 bits of length.
+       mseed = _mm_insert_epi16(mseed, size, 4);
+       // Replace size with its low 2 bytes repeated 4 times.
+       mseed = _mm_shufflehi_epi16(mseed, 0);
+       // Save unscrambled seed.
+       mseed2 = mseed;
+       // XOR in per-process seed.
+       mseed ^= _mm_loadu_si128(aeskeysched.__values);
+       // Scramble seed.
+       mseed = _mm_aesenc_si128(mseed, mseed);
+
+       if (size <= 16) {
+               if (size == 0) {
+                       // Return scrambled input seed.
+                       return _mm_cvtsi128_si32(_mm_aesenc_si128(mseed, mseed));
+               } else if (size < 16) {
+                       if ((((uintptr)(p) + 16) & 0xff0) != 0) {
+                               static const uint64 masks[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0x00000000000000ff, 0x0000000000000000,
+                                   0x000000000000ffff, 0x0000000000000000,
+                                   0x0000000000ffffff, 0x0000000000000000,
+                                   0x00000000ffffffff, 0x0000000000000000,
+                                   0x000000ffffffffff, 0x0000000000000000,
+                                   0x0000ffffffffffff, 0x0000000000000000,
+                                   0x00ffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x0000000000000000,
+                                   0xffffffffffffffff, 0x00000000000000ff,
+                                   0xffffffffffffffff, 0x000000000000ffff,
+                                   0xffffffffffffffff, 0x0000000000ffffff,
+                                   0xffffffffffffffff, 0x00000000ffffffff,
+                                   0xffffffffffffffff, 0x000000ffffffffff,
+                                   0xffffffffffffffff, 0x0000ffffffffffff,
+                                   0xffffffffffffffff, 0x00ffffffffffffff
+                                 };
+
+                               // 16 bytes loaded at p won't cross a page
+                               // boundary, so we can load it directly.
+                               mval = _mm_loadu_si128(p);
+                               mval &= *(const __m128i*)(&masks[size*2]);
+                       } else {
+                               static const uint64 shifts[32]
+                                 __attribute__ ((aligned(16))) =
+                                 {
+                                   0x0000000000000000, 0x0000000000000000,
+                                   0xffffffffffffff0f, 0xffffffffffffffff,
+                                   0xffffffffffff0f0e, 0xffffffffffffffff,
+                                   0xffffffffff0f0e0d, 0xffffffffffffffff,
+                                   0xffffffff0f0e0d0c, 0xffffffffffffffff,
+                                   0xffffff0f0e0d0c0b, 0xffffffffffffffff,
+                                   0xffff0f0e0d0c0b0a, 0xffffffffffffffff,
+                                   0xff0f0e0d0c0b0a09, 0xffffffffffffffff,
+                                   0x0f0e0d0c0b0a0908, 0xffffffffffffffff,
+                                   0x0e0d0c0b0a090807, 0xffffffffffffff0f,
+                                   0x0d0c0b0a09080706, 0xffffffffffff0f0e,
+                                   0x0c0b0a0908070605, 0xffffffffff0f0e0d,
+                                   0x0b0a090807060504, 0xffffffff0f0e0d0c,
+                                   0x0a09080706050403, 0xffffff0f0e0d0c0b,
+                                   0x0908070605040302, 0xffff0f0e0d0c0b0a,
+                                   0x0807060504030201, 0xff0f0e0d0c0b0a09,
+                                 };
+
+                               // address ends in 1111xxxx. Might be
+                               // up against a page boundary, so load
+                               // ending at last byte.  Then shift
+                               // bytes down using pshufb.
+                               mval = _mm_loadu_si128((void*)((char*)p - 16 + size));
+                               mval = _mm_shuffle_epi8(mval, *(const __m128i*)(&shifts[size*2]));
+                       }
+               } else {
+                       mval = _mm_loadu_si128(p);
+               }
+
+               // Scramble input, XOR in seed.
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval = _mm_aesenc_si128(mval, mval);
+               return _mm_cvtsi128_si32(mval);
+       } else if (size <= 32) {
+               // Make second starting seed.
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               // Load data to be hashed.
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // Scramble 3 times.
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval2 = _mm_aesenc_si128(mval2, mseed2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+
+               // Combine results.
+               mval ^= mval2;
+               return _mm_cvtsi128_si32(mval);
+       } else if (size <= 64) {
+               // Make 3 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+               mval = _mm_loadu_si128(p);
+               mval2 = _mm_loadu_si128((void*)((char*)p + 16));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval2 = _mm_aesenc_si128(mval2, mseed2);
+               mval3 = _mm_aesenc_si128(mval3, mseed3);
+               mval4 = _mm_aesenc_si128(mval4, mseed4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si32(mval);
+       } else {
+               // Make 3 more starting seeds.
+               mseed3 = mseed2;
+               mseed4 = mseed2;
+               mseed2 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 16));
+               mseed3 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 32));
+               mseed4 ^= _mm_loadu_si128((void*)((char*)aeskeysched.__values + 48));
+               mseed2 = _mm_aesenc_si128(mseed2, mseed2);
+               mseed3 = _mm_aesenc_si128(mseed3, mseed3);
+               mseed4 = _mm_aesenc_si128(mseed4, mseed4);
+
+               // Start with last (possibly overlapping) block.
+               mval = _mm_loadu_si128((void*)((char*)p + size - 64));
+               mval2 = _mm_loadu_si128((void*)((char*)p + size - 48));
+               mval3 = _mm_loadu_si128((void*)((char*)p + size - 32));
+               mval4 = _mm_loadu_si128((void*)((char*)p + size - 16));
+
+               // Scramble state once.
+               mval = _mm_aesenc_si128(mval, mseed);
+               mval2 = _mm_aesenc_si128(mval2, mseed2);
+               mval3 = _mm_aesenc_si128(mval3, mseed3);
+               mval4 = _mm_aesenc_si128(mval4, mseed4);
+
+               // Compute number of remaining 64-byte blocks.
+               size--;
+               size >>= 6;
+               do {
+                       // Scramble state, XOR in a block.
+                       mval = _mm_aesenc_si128(mval, _mm_loadu_si128(p));
+                       mval2 = _mm_aesenc_si128(mval2, _mm_loadu_si128((void*)((char*)p + 16)));
+                       mval3 = _mm_aesenc_si128(mval3, _mm_loadu_si128((void*)((char*)p + 32)));
+                       mval4 = _mm_aesenc_si128(mval4, _mm_loadu_si128((void*)((char*)p + 48)));
+
+                       // Scramble state.
+                       mval = _mm_aesenc_si128(mval, mval);
+                       mval2 = _mm_aesenc_si128(mval2, mval2);
+                       mval3 = _mm_aesenc_si128(mval3, mval3);
+                       mval4 = _mm_aesenc_si128(mval4, mval4);
+
+                       p = (void*)((char*)p + 64);
+               } while (--size > 0);
+
+               // 2 more scrambles to finish.
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval = _mm_aesenc_si128(mval, mval);
+               mval2 = _mm_aesenc_si128(mval2, mval2);
+               mval3 = _mm_aesenc_si128(mval3, mval3);
+               mval4 = _mm_aesenc_si128(mval4, mval4);
+
+               mval ^= mval3;
+               mval2 ^= mval4;
+               mval ^= mval2;
+               return _mm_cvtsi128_si32(mval);
+       }
+}
+
+#endif // !defined(__x86_64__)
+
+#else // !defined(__i386__) && !defined(__x86_64__)
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+       // We should never get here on a non-x86 system.
+       runtime_throw("impossible call to aeshashbody");
+}
+
+#endif // !defined(__i386__) && !defined(__x86_64__)
index 6884f3a5f56d796271ebe70f16f0f46e018c6094..c62ad93c185220e8d5298d4ab3693d83667f541d 100644 (file)
@@ -61,6 +61,7 @@ initfn (int argc, char **argv, char** env __attribute__ ((unused)))
 
   runtime_isarchive = true;
 
+  runtime_cpuinit ();
   runtime_initsig(true);
 
   a = (struct args *) malloc (sizeof *a);
index ff2958c239a30eb6b945691936ad93992088219b..622a77d96af8e80f2531ee41e6ac1c09e5929060 100644 (file)
@@ -47,6 +47,7 @@ main (int argc, char **argv)
   runtime_isstarted = true;
 
   __go_end = (uintptr)_end;
+  runtime_cpuinit ();
   runtime_check ();
   runtime_args (argc, (byte **) argv);
   runtime_osinit ();
index d58aa75e5ed2cfb39f0feb635b2266c52d3d3c11..842fa249bd0c482b47949d31bc4d5045c2c11d26 100644 (file)
@@ -9,44 +9,14 @@
 #include "runtime.h"
 #include "go-type.h"
 
-/* An identity hash function for a type.  This is used for types where
-   we can simply use the type value itself as a hash code.  This is
-   true of, e.g., integers and pointers.  */
+/* The hash function for types that can compare as identity is
+   written in Go.  */
 
-uintptr_t
-__go_type_hash_identity (const void *key, uintptr_t seed, uintptr_t key_size)
-{
-  uintptr_t ret;
-  uintptr_t i;
-  const unsigned char *p;
-
-  if (key_size <= 8)
-    {
-      union
-      {
-       uint64 v;
-       unsigned char a[8];
-      } u;
-      u.v = 0;
-#ifdef WORDS_BIGENDIAN
-      __builtin_memcpy (&u.a[8 - key_size], key, key_size);
-#else
-      __builtin_memcpy (&u.a[0], key, key_size);
-#endif
-      if (sizeof (uintptr_t) >= 8)
-       return (uintptr_t) u.v ^ seed;
-      else
-       return (uintptr_t) ((u.v >> 32) ^ (u.v & 0xffffffff)) ^ seed;
-    }
-
-  ret = seed;
-  for (i = 0, p = (const unsigned char *) key; i < key_size; i++, p++)
-    ret = ret * 33 + *p;
-  return ret;
-}
+extern uintptr runtime_memhash(void *, uintptr, uintptr)
+  __asm__ (GOSYM_PREFIX "runtime.memhash");
 
 const FuncVal __go_type_hash_identity_descriptor =
-  { (void *) __go_type_hash_identity };
+  { (void *) runtime_memhash };
 
 /* An identity equality function for a type.  This is used for types
    where we can check for equality by checking that the values have
index 7c3149badc741575a778ed342321e446a588c3a3..2d5965c64d3eaf9fe6230ea12c88b8c153ec1c51 100644 (file)
@@ -362,7 +362,6 @@ extern _Bool
 __go_type_descriptors_equal(const struct __go_type_descriptor*,
                            const struct __go_type_descriptor*);
 
-extern uintptr_t __go_type_hash_identity (const void *, uintptr_t, uintptr_t);
 extern const FuncVal __go_type_hash_identity_descriptor;
 extern _Bool __go_type_equal_identity (const void *, const void *, uintptr_t);
 extern const FuncVal __go_type_equal_identity_descriptor;
index dd5562bb57668ece482d95c26abf66062b20d05a..be7e083f080be6c4b1c06f3bde8c663a492497dd 100644 (file)
@@ -455,7 +455,8 @@ runtime_schedinit(void)
        // runtime_symtabinit();
        runtime_mallocinit();
        mcommoninit(m);
-       
+       runtime_alginit(); // maps must not be used before this call
+
        // Initialize the itable value for newErrorCString,
        // so that the next time it gets called, possibly
        // in a fault during a garbage collection, it will not
index f793fea9cb2dfbc5c4c808263b8bbbf074807333..424b429b0516ea568344874cea4106c169553e74 100644 (file)
@@ -265,6 +265,8 @@ struct __go_func_type;
 void   runtime_args(int32, byte**)
   __asm__ (GOSYM_PREFIX "runtime.args");
 void   runtime_osinit();
+void   runtime_alginit(void)
+  __asm__ (GOSYM_PREFIX "runtime.alginit");
 void   runtime_goargs(void)
   __asm__ (GOSYM_PREFIX "runtime.goargs");
 void   runtime_goenvs(void);
@@ -592,3 +594,7 @@ extern void *getitab(const struct __go_type_descriptor *,
                     const struct __go_type_descriptor *,
                     _Bool)
   __asm__ (GOSYM_PREFIX "runtime.getitab");
+
+extern void runtime_cpuinit(void);
+extern void setCpuidECX(uint32)
+  __asm__ (GOSYM_PREFIX "runtime.setCpuidECX");
index 16be0891aeab91cf70ea3b966c9b5fce726ff112..3387401b427c3419a814ce1ddf0d88704677e75f 100644 (file)
@@ -6,6 +6,10 @@
 #include <signal.h>
 #include <unistd.h>
 
+#if defined(__i386__) || defined(__x86_64__)
+#include <cpuid.h>
+#endif
+
 #include "config.h"
 
 #include "runtime.h"
@@ -204,3 +208,18 @@ go_errno()
 {
   return (intgo)errno;
 }
+
+// CPU-specific initialization.
+// Fetch CPUID info on x86.
+
+void
+runtime_cpuinit()
+{
+#if defined(__i386__) || defined(__x86_64__)
+       unsigned int eax, ebx, ecx, edx;
+
+       if (__get_cpuid(1, &eax, &ebx, &ecx, &edx)) {
+               setCpuidECX(ecx);
+       }
+#endif
+}