From 1b28253347dedba72abaf81e86b1812863d07a11 Mon Sep 17 00:00:00 2001
From: Ian Lance Taylor
Date: Mon, 1 Oct 2018 20:14:29 +0000
Subject: [PATCH] runtime: add arm64 version of AES hash code

Rewrite the arm64 AES hashing code from gc assembler to C code using
intrinsics.  The resulting code generates the same hash code for the
same input as the gc code--that doesn't matter as such, but testing it
ensures that the C code does something useful.

Reviewed-on: https://go-review.googlesource.com/138535

From-SVN: r264771
---
 gcc/go/gofrontend/MERGE |   2 +-
 libgo/runtime/aeshash.c | 403 +++++++++++++++++++++++++++++++++++++++-
 2 files changed, 402 insertions(+), 3 deletions(-)

diff --git a/gcc/go/gofrontend/MERGE b/gcc/go/gofrontend/MERGE
index 590d2eb5b4e..69dd8b746b7 100644
--- a/gcc/go/gofrontend/MERGE
+++ b/gcc/go/gofrontend/MERGE
@@ -1,4 +1,4 @@
-f4a224ec481957ca4f14d0e8cc4fe59cc95b3a49
+013a9e68c9a31f888733d46182d19f9e5d956f27
 
 The first line of this file holds the git revision number of the last
 merge done from the gofrontend repository.
diff --git a/libgo/runtime/aeshash.c b/libgo/runtime/aeshash.c
index 7f29baa07b2..00658d7a896 100644
--- a/libgo/runtime/aeshash.c
+++ b/libgo/runtime/aeshash.c
@@ -573,13 +573,412 @@ uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
 
 #endif // !defined(__x86_64__)
 
-#else // !defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)
+#elif defined(__aarch64__)
+
+// Undefine some identifiers that we pick up from the Go runtime package that
+// are used in arm_neon.h.
+
+#undef t1
+#undef tx
+#undef t2
+#undef t3
+#undef t4
+#undef t5
+
+#include <arm_neon.h>
+
+// Force appropriate CPU level.  We won't call here unless the CPU
+// supports it.
+
+#pragma GCC target("+crypto")
+
+// The arm64 version of aeshashbody.
+
+uintptr aeshashbody(void* p, uintptr seed, uintptr size, Slice aeskeysched) {
+	uint8x16_t *pseed;
+	uint32x4_t vinit32;
+	uint8x16_t vinit;
+	uint8x16_t vseed, vseed2, vseed3, vseed4;
+	uint8x16_t vseed5, vseed6, vseed7, vseed8;
+	uint8x16_t vval, vval2, vval3, vval4;
+	uint8x16_t vval5, vval6, vval7, vval8;
+	uint8x16_t vvalLoop, vvalLoop2, vvalLoop3, vvalLoop4;
+	uint8x16_t vvalLoop5, vvalLoop6, vvalLoop7, vvalLoop8;
+	uint8x16x2_t avval2;
+	uint8x16x3_t avseed3;
+
+	pseed = (uint8x16_t*)(aeskeysched.__values);
+
+	// Combined hash seed and length.
+	vinit32 = vdupq_n_u32(0);
+	vinit32[0] = (uint32)seed;
+	vinit32[1] = (uint32)size;
+	vinit = vreinterpretq_u8_u32(vinit32);
+
+	// Mix in per-process seed.
+	vseed = vaeseq_u8(*pseed, vinit);
+	++pseed;
+	// Scramble seed.
+	vseed = vaesmcq_u8(vseed);
+
+	if (size <= 16) {
+		if (size == 0) {
+			// Return 64 bits of scrambled input seed.
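+			// Lane 0 of the vector reinterpreted as two
+			// 64-bit lanes is its low 64 bits.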
+			return vreinterpretq_u64_u8(vseed)[0];
+		} else if (size < 16) {
+			vval = vreinterpretq_u8_u32(vdupq_n_u32(0));
+			if ((size & 8) != 0) {
+				vval = vreinterpretq_u8_u64(vld1q_lane_u64((uint64_t*)(p), vreinterpretq_u64_u8(vval), 0));
+				p = (void*)((uint64_t*)(p) + 1);
+			}
+			if ((size & 4) != 0) {
+				vval = vreinterpretq_u8_u32(vld1q_lane_u32((uint32_t*)(p), vreinterpretq_u32_u8(vval), 2));
+				p = (void*)((uint32_t*)(p) + 1);
+			}
+			if ((size & 2) != 0) {
+				vval = vreinterpretq_u8_u16(vld1q_lane_u16((uint16_t*)(p), vreinterpretq_u16_u8(vval), 6));
+				p = (void*)((uint16_t*)(p) + 1);
+			}
+			if ((size & 1) != 0) {
+				vval = vld1q_lane_u8((uint8*)(p), vval, 14);
+			}
+		} else {
+			vval = *(uint8x16_t*)(p);
+		}
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval = vaeseq_u8(vval, vseed);
+		return vreinterpretq_u64_u8(vval)[0];
+	} else if (size <= 32) {
+		// Make a second seed.
+		vseed2 = vaeseq_u8(*pseed, vinit);
+		vseed2 = vaesmcq_u8(vseed2);
+		vval = *(uint8x16_t*)(p);
+		vval2 = *(uint8x16_t*)((char*)(p) + (size - 16));
+
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval2 = vaesmcq_u8(vval2);
+
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval2 = vaesmcq_u8(vval2);
+
+		vval = vaeseq_u8(vval, vseed);
+		vval2 = vaeseq_u8(vval2, vseed2);
+
+		vval ^= vval2;
+
+		return vreinterpretq_u64_u8(vval)[0];
+	} else if (size <= 64) {
+		avseed3 = vld1q_u8_x3((uint8*)(pseed));
+		vseed2 = avseed3.val[0];
+		vseed3 = avseed3.val[1];
+		vseed4 = avseed3.val[2];
+
+		vseed2 = vaeseq_u8(vseed2, vinit);
+		vseed2 = vaesmcq_u8(vseed2);
+		vseed3 = vaeseq_u8(vseed3, vinit);
+		vseed3 = vaesmcq_u8(vseed3);
+		vseed4 = vaeseq_u8(vseed4, vinit);
+		vseed4 = vaesmcq_u8(vseed4);
+
+		avval2 = vld1q_u8_x2((uint8*)(p));
+		vval = avval2.val[0];
+		vval2 = avval2.val[1];
+		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+		vval3 = avval2.val[0];
+		vval4 = avval2.val[1];
+
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval2 = vaesmcq_u8(vval2);
+		vval3 = vaeseq_u8(vval3, vseed3);
+		vval3 = vaesmcq_u8(vval3);
+		vval4 = vaeseq_u8(vval4, vseed4);
+		vval4 = vaesmcq_u8(vval4);
+
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval2 = vaesmcq_u8(vval2);
+		vval3 = vaeseq_u8(vval3, vseed3);
+		vval3 = vaesmcq_u8(vval3);
+		vval4 = vaeseq_u8(vval4, vseed4);
+		vval4 = vaesmcq_u8(vval4);
+
+		vval = vaeseq_u8(vval, vseed);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval3 = vaeseq_u8(vval3, vseed3);
+		vval4 = vaeseq_u8(vval4, vseed4);
+
+		vval ^= vval3;
+		vval2 ^= vval4;
+		vval ^= vval2;
+
+		return vreinterpretq_u64_u8(vval)[0];
+	} else if (size <= 128) {
+		// For some reason vld1q_u8_x4 is missing.
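+		// Emulate it with two three-register loads plus a
+		// single load to pick up the seven extra seeds.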
+		avseed3 = vld1q_u8_x3((uint8*)(pseed));
+		vseed2 = avseed3.val[0];
+		vseed3 = avseed3.val[1];
+		vseed4 = avseed3.val[2];
+		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
+		vseed5 = avseed3.val[0];
+		vseed6 = avseed3.val[1];
+		vseed7 = avseed3.val[2];
+		vseed8 = *(pseed + 6);
+
+		vseed2 = vaeseq_u8(vseed2, vinit);
+		vseed2 = vaesmcq_u8(vseed2);
+		vseed3 = vaeseq_u8(vseed3, vinit);
+		vseed3 = vaesmcq_u8(vseed3);
+		vseed4 = vaeseq_u8(vseed4, vinit);
+		vseed4 = vaesmcq_u8(vseed4);
+		vseed5 = vaeseq_u8(vseed5, vinit);
+		vseed5 = vaesmcq_u8(vseed5);
+		vseed6 = vaeseq_u8(vseed6, vinit);
+		vseed6 = vaesmcq_u8(vseed6);
+		vseed7 = vaeseq_u8(vseed7, vinit);
+		vseed7 = vaesmcq_u8(vseed7);
+		vseed8 = vaeseq_u8(vseed8, vinit);
+		vseed8 = vaesmcq_u8(vseed8);
+
+		avval2 = vld1q_u8_x2((uint8*)(p));
+		vval = avval2.val[0];
+		vval2 = avval2.val[1];
+		avval2 = vld1q_u8_x2((uint8*)(p) + 32);
+		vval3 = avval2.val[0];
+		vval4 = avval2.val[1];
+		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
+		vval5 = avval2.val[0];
+		vval6 = avval2.val[1];
+		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+		vval7 = avval2.val[0];
+		vval8 = avval2.val[1];
+
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval2 = vaesmcq_u8(vval2);
+		vval3 = vaeseq_u8(vval3, vseed3);
+		vval3 = vaesmcq_u8(vval3);
+		vval4 = vaeseq_u8(vval4, vseed4);
+		vval4 = vaesmcq_u8(vval4);
+		vval5 = vaeseq_u8(vval5, vseed5);
+		vval5 = vaesmcq_u8(vval5);
+		vval6 = vaeseq_u8(vval6, vseed6);
+		vval6 = vaesmcq_u8(vval6);
+		vval7 = vaeseq_u8(vval7, vseed7);
+		vval7 = vaesmcq_u8(vval7);
+		vval8 = vaeseq_u8(vval8, vseed8);
+		vval8 = vaesmcq_u8(vval8);
+
+		vval = vaeseq_u8(vval, vseed);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval2 = vaesmcq_u8(vval2);
+		vval3 = vaeseq_u8(vval3, vseed3);
+		vval3 = vaesmcq_u8(vval3);
+		vval4 = vaeseq_u8(vval4, vseed4);
+		vval4 = vaesmcq_u8(vval4);
+		vval5 = vaeseq_u8(vval5, vseed5);
+		vval5 = vaesmcq_u8(vval5);
+		vval6 = vaeseq_u8(vval6, vseed6);
+		vval6 = vaesmcq_u8(vval6);
+		vval7 = vaeseq_u8(vval7, vseed7);
+		vval7 = vaesmcq_u8(vval7);
+		vval8 = vaeseq_u8(vval8, vseed8);
+		vval8 = vaesmcq_u8(vval8);
+
+		vval = vaeseq_u8(vval, vseed);
+		vval2 = vaeseq_u8(vval2, vseed2);
+		vval3 = vaeseq_u8(vval3, vseed3);
+		vval4 = vaeseq_u8(vval4, vseed4);
+		vval5 = vaeseq_u8(vval5, vseed5);
+		vval6 = vaeseq_u8(vval6, vseed6);
+		vval7 = vaeseq_u8(vval7, vseed7);
+		vval8 = vaeseq_u8(vval8, vseed8);
+
+		vval ^= vval5;
+		vval2 ^= vval6;
+		vval3 ^= vval7;
+		vval4 ^= vval8;
+		vval ^= vval3;
+		vval2 ^= vval4;
+		vval ^= vval2;
+
+		return vreinterpretq_u64_u8(vval)[0];
+	} else {
+		// For some reason vld1q_u8_x4 is missing.
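+		// As above, emulate the missing four-register load
+		// with two x3 loads plus a single load.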
+		avseed3 = vld1q_u8_x3((uint8*)(pseed));
+		vseed2 = avseed3.val[0];
+		vseed3 = avseed3.val[1];
+		vseed4 = avseed3.val[2];
+		avseed3 = vld1q_u8_x3((uint8*)(pseed + 3));
+		vseed5 = avseed3.val[0];
+		vseed6 = avseed3.val[1];
+		vseed7 = avseed3.val[2];
+		vseed8 = *(pseed + 6);
+
+		vseed2 = vaeseq_u8(vseed2, vinit);
+		vseed2 = vaesmcq_u8(vseed2);
+		vseed3 = vaeseq_u8(vseed3, vinit);
+		vseed3 = vaesmcq_u8(vseed3);
+		vseed4 = vaeseq_u8(vseed4, vinit);
+		vseed4 = vaesmcq_u8(vseed4);
+		vseed5 = vaeseq_u8(vseed5, vinit);
+		vseed5 = vaesmcq_u8(vseed5);
+		vseed6 = vaeseq_u8(vseed6, vinit);
+		vseed6 = vaesmcq_u8(vseed6);
+		vseed7 = vaeseq_u8(vseed7, vinit);
+		vseed7 = vaesmcq_u8(vseed7);
+		vseed8 = vaeseq_u8(vseed8, vinit);
+		vseed8 = vaesmcq_u8(vseed8);
+
+		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 128));
+		vval = avval2.val[0];
+		vval2 = avval2.val[1];
+		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 96));
+		vval3 = avval2.val[0];
+		vval4 = avval2.val[1];
+		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 64));
+		vval5 = avval2.val[0];
+		vval6 = avval2.val[1];
+		avval2 = vld1q_u8_x2((uint8*)(p) + (size - 32));
+		vval7 = avval2.val[0];
+		vval8 = avval2.val[1];
+
+		vvalLoop = vseed;
+		vvalLoop2 = vseed2;
+		vvalLoop3 = vseed3;
+		vvalLoop4 = vseed4;
+		vvalLoop5 = vseed5;
+		vvalLoop6 = vseed6;
+		vvalLoop7 = vseed7;
+		vvalLoop8 = vseed8;
+
+		size--;
+		size >>= 7;
+		do {
+			vval = vaeseq_u8(vval, vvalLoop);
+			vval = vaesmcq_u8(vval);
+			vval2 = vaeseq_u8(vval2, vvalLoop2);
+			vval2 = vaesmcq_u8(vval2);
+			vval3 = vaeseq_u8(vval3, vvalLoop3);
+			vval3 = vaesmcq_u8(vval3);
+			vval4 = vaeseq_u8(vval4, vvalLoop4);
+			vval4 = vaesmcq_u8(vval4);
+			vval5 = vaeseq_u8(vval5, vvalLoop5);
+			vval5 = vaesmcq_u8(vval5);
+			vval6 = vaeseq_u8(vval6, vvalLoop6);
+			vval6 = vaesmcq_u8(vval6);
+			vval7 = vaeseq_u8(vval7, vvalLoop7);
+			vval7 = vaesmcq_u8(vval7);
+			vval8 = vaeseq_u8(vval8, vvalLoop8);
+			vval8 = vaesmcq_u8(vval8);
+
+			avval2 = vld1q_u8_x2((uint8*)(p));
+			vvalLoop = avval2.val[0];
+			vvalLoop2 = avval2.val[1];
+			avval2 = vld1q_u8_x2((uint8*)(p) + 32);
+			vvalLoop3 = avval2.val[0];
+			vvalLoop4 = avval2.val[1];
+			avval2 = vld1q_u8_x2((uint8*)(p) + 64);
+			vvalLoop5 = avval2.val[0];
+			vvalLoop6 = avval2.val[1];
+			avval2 = vld1q_u8_x2((uint8*)(p) + 96);
+			vvalLoop7 = avval2.val[0];
+			vvalLoop8 = avval2.val[1];
+
+			p = (void *)((uint8*)(p) + 128);
+
+			vval = vaeseq_u8(vval, vvalLoop);
+			vval = vaesmcq_u8(vval);
+			vval2 = vaeseq_u8(vval2, vvalLoop2);
+			vval2 = vaesmcq_u8(vval2);
+			vval3 = vaeseq_u8(vval3, vvalLoop3);
+			vval3 = vaesmcq_u8(vval3);
+			vval4 = vaeseq_u8(vval4, vvalLoop4);
+			vval4 = vaesmcq_u8(vval4);
+			vval5 = vaeseq_u8(vval5, vvalLoop5);
+			vval5 = vaesmcq_u8(vval5);
+			vval6 = vaeseq_u8(vval6, vvalLoop6);
+			vval6 = vaesmcq_u8(vval6);
+			vval7 = vaeseq_u8(vval7, vvalLoop7);
+			vval7 = vaesmcq_u8(vval7);
+			vval8 = vaeseq_u8(vval8, vvalLoop8);
+			vval8 = vaesmcq_u8(vval8);
+		} while (--size > 0);
+
+		vval = vaeseq_u8(vval, vvalLoop);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vvalLoop2);
+		vval2 = vaesmcq_u8(vval2);
+		vval3 = vaeseq_u8(vval3, vvalLoop3);
+		vval3 = vaesmcq_u8(vval3);
+		vval4 = vaeseq_u8(vval4, vvalLoop4);
+		vval4 = vaesmcq_u8(vval4);
+		vval5 = vaeseq_u8(vval5, vvalLoop5);
+		vval5 = vaesmcq_u8(vval5);
+		vval6 = vaeseq_u8(vval6, vvalLoop6);
+		vval6 = vaesmcq_u8(vval6);
+		vval7 = vaeseq_u8(vval7, vvalLoop7);
+		vval7 = vaesmcq_u8(vval7);
+		vval8 = vaeseq_u8(vval8, vvalLoop8);
+		vval8 = vaesmcq_u8(vval8);
+
+
+		vval = vaeseq_u8(vval, vvalLoop);
+		vval = vaesmcq_u8(vval);
+		vval2 = vaeseq_u8(vval2, vvalLoop2);
+		vval2 = vaesmcq_u8(vval2);
+		vval3 = vaeseq_u8(vval3, vvalLoop3);
+		vval3 = vaesmcq_u8(vval3);
+		vval4 = vaeseq_u8(vval4, vvalLoop4);
+		vval4 = vaesmcq_u8(vval4);
+		vval5 = vaeseq_u8(vval5, vvalLoop5);
+		vval5 = vaesmcq_u8(vval5);
+		vval6 = vaeseq_u8(vval6, vvalLoop6);
+		vval6 = vaesmcq_u8(vval6);
+		vval7 = vaeseq_u8(vval7, vvalLoop7);
+		vval7 = vaesmcq_u8(vval7);
+		vval8 = vaeseq_u8(vval8, vvalLoop8);
+		vval8 = vaesmcq_u8(vval8);
+
+		vval = vaeseq_u8(vval, vvalLoop);
+		vval2 = vaeseq_u8(vval2, vvalLoop2);
+		vval3 = vaeseq_u8(vval3, vvalLoop3);
+		vval4 = vaeseq_u8(vval4, vvalLoop4);
+		vval5 = vaeseq_u8(vval5, vvalLoop5);
+		vval6 = vaeseq_u8(vval6, vvalLoop6);
+		vval7 = vaeseq_u8(vval7, vvalLoop7);
+		vval8 = vaeseq_u8(vval8, vvalLoop8);
+
+		vval ^= vval5;
+		vval2 ^= vval6;
+		vval3 ^= vval7;
+		vval4 ^= vval8;
+		vval ^= vval3;
+		vval2 ^= vval4;
+		vval ^= vval2;
+
+		return vreinterpretq_u64_u8(vval)[0];
+	}
+}
+
+#else // (!defined(__i386__) && !defined(__x86_64__) || !defined(HAVE_AS_X86_AES)) && !defined(__aarch64__)
 
 uintptr
 aeshashbody(void* p __attribute__((unused)),
 	    uintptr seed __attribute__((unused)),
 	    uintptr size __attribute__((unused)),
 	    Slice aeskeysched __attribute__((unused))) {
-	// We should never get here on a non-x86 system.
+	// We should never get here on a non-x86, non-arm64 system.
 	runtime_throw("impossible call to aeshashbody");
 }
-- 
2.30.2