From d345693aa8a5089cfc0080ed5bc3f5d0abf37603 Mon Sep 17 00:00:00 2001 From: Konstantinos Margaritis Date: Sun, 12 Mar 2023 10:44:43 +0000 Subject: [PATCH] [WIP] xchacha20 SVP64 implementation using pypowersim wrapper --- crypto/chacha20/Makefile | 21 ++-- crypto/chacha20/src/xchacha20.c | 9 +- crypto/chacha20/src/xchacha20_svp64.s | 125 ++++++++++++++++++++++++ crypto/chacha20/src/xchacha20_wrapper.c | 105 ++++++++++++++++++++ crypto/chacha20/src/xchacha20_wrapper.h | 4 + 5 files changed, 250 insertions(+), 14 deletions(-) create mode 100644 crypto/chacha20/src/xchacha20_svp64.s create mode 100644 crypto/chacha20/src/xchacha20_wrapper.c create mode 100644 crypto/chacha20/src/xchacha20_wrapper.h diff --git a/crypto/chacha20/Makefile b/crypto/chacha20/Makefile index e356fa63..6978e8f1 100644 --- a/crypto/chacha20/Makefile +++ b/crypto/chacha20/Makefile @@ -1,5 +1,5 @@ # A simple Makefile, to build run: make all -TARGET = test +TARGET = test-chacha20 CROSS ?= powerpc64le-linux-gnu- AS = $(CROSS)as @@ -7,25 +7,26 @@ CC = $(CROSS)gcc LD = $(CROSS)ld #compiler flags here -CFLAGS = -O3 -Wall -Wextra -static -mno-vsx -mno-altivec -DDUMP +CFLAGS = -O -Wall -Wextra -mno-vsx -mno-altivec -DDUMP -I../../media/pypowersim_wrapper -I/usr/include/python3.7m + +# assembler flags here +ASFLAGS= -mlibresoc -mregnames #linker flags here -LDFLAGS = -Wall -static +LDFLAGS = -Wall -pthread -lpython3.7m SRCDIR = src -SOURCES := $(SRCDIR)/xchacha20.c $(SRCDIR)/test.c -INCLUDES := $(wildcard $(SRCDIR)/*.h)) -OBJECTS := $(SOURCES:$(SRCDIR)/%.c=$(SRCDIR)/%.o) +CFILES := $(SRCDIR)/xchacha20.c $(SRCDIR)/test.c $(SRCDIR)/xchacha20_wrapper.c +ASFILES := $(SRCDIR)/xchacha20_svp64.s +INCLUDES := $(wildcard $(SRCDIR)/*.h)) +OBJECTS := $(CFILES:$(SRCDIR)/%.c=$(SRCDIR)/%.o) $(ASFILES:$(SRCDIR)/%.s=$(SRCDIR)/%.o) .PHONY: all clean remove all: ${TARGET} $(TARGET): $(OBJECTS) - $(CC) -o $@ $(LDFLAGS) $(OBJECTS) - -$(OBJECTS): $(SRCDIR)/%.o : $(SRCDIR)/%.c - $(CC) $(CFLAGS) -c $< -o $@ + $(CC) -o $@ 
$(OBJECTS) $(LDFLAGS) clean: $ rm -f $(OBJECTS) diff --git a/crypto/chacha20/src/xchacha20.c b/crypto/chacha20/src/xchacha20.c index 27fb8f96..cc8c0420 100644 --- a/crypto/chacha20/src/xchacha20.c +++ b/crypto/chacha20/src/xchacha20.c @@ -10,6 +10,7 @@ #include #include #include "xchacha20.h" +#include "xchacha20_wrapper.h" #include @@ -66,7 +67,7 @@ void xchacha_hchacha20(uint8_t *out, const uint8_t *in, const uint8_t *k){ x1 = 0x3320646e; x2 = 0x79622d32; x3 = 0x6b206574; - + (void)k; x4 = U8TO32_LITTLE(k + 0); x5 = U8TO32_LITTLE(k + 4); x6 = U8TO32_LITTLE(k + 8); @@ -80,7 +81,7 @@ void xchacha_hchacha20(uint8_t *out, const uint8_t *in, const uint8_t *k){ x14 = U8TO32_LITTLE(in + 8); x15 = U8TO32_LITTLE(in + 12); - for (i = 0; i < 10; i++){ + //for (i = 0; i < 10; i++){ QUARTERROUND(x0, x4, x8, x12); QUARTERROUND(x1, x5, x9, x13); QUARTERROUND(x2, x6, x10, x14); @@ -89,7 +90,7 @@ void xchacha_hchacha20(uint8_t *out, const uint8_t *in, const uint8_t *k){ QUARTERROUND(x1, x6, x11, x12); QUARTERROUND(x2, x7, x8, x13); QUARTERROUND(x3, x4, x9, x14); - } + //} U32TO8_LITTLE(out + 0, x0); U32TO8_LITTLE(out + 4, x1); @@ -117,7 +118,7 @@ void xchacha_keysetup(XChaCha_ctx *ctx, const uint8_t *k, uint8_t *iv){ * We then use this sub-key and the last 8 bytes of the iv * as normal. 
*/ - xchacha_hchacha20(k2, iv, k); + xchacha_hchacha20_svp64(k2, iv, k); ctx->input[0] = 0x61707865;
diff --git a/crypto/chacha20/src/xchacha20_svp64.s b/crypto/chacha20/src/xchacha20_svp64.s
new file mode 100644
index 00000000..095362cb
--- /dev/null
+++ b/crypto/chacha20/src/xchacha20_svp64.s
@@ -0,0 +1,125 @@
+.set out_ptr, 3  # GPR 3: output pointer (set by the C wrapper)
+.set in_ptr, 4  # GPR 4: input pointer (set by the C wrapper)
+.set k_ptr, 5  # GPR 5: key pointer (set by the C wrapper)
+.set ctr, 7  # GPR 7: round counter, only referenced by the disabled outer loop
+.set SHAPE0, 8  # GPRs 8-11: index table for SVSHAPE0
+.set SHAPE1, 12  # GPRs 12-15: index table for SVSHAPE1
+.set SHAPE2, 16  # GPRs 16-19: index table for SVSHAPE2
+.set SHIFTS, 20  # GPRs 20-21: rotate amounts
+.set x, 24  # first GPR of the state; each GPR packs two 32-bit words
+	# lwi rD, const: load a constant that fits in a signed 32-bit value (helper of ldi)
+.macro lwi rD, const
+.if (\const >= -0x8000) && (\const <= 0x7fff)
+	li \rD, \const
+.else
+	lis \rD, \const@h  # @h, not @ha: the low half is merged with ori, which does not sign-extend
+	ori \rD, \rD, \const@l
+.endif
+.endm
+	# ldi rD, const: load an arbitrary 64-bit constant into rD
+.macro ldi rD, const
+.if (\const >= -0x80000000) && (\const <= 0x7fffffff)
+	lwi \rD, \const
+.else
+	# load high word into the high word of rD
+	lis \rD,\const@highest # load const bits 48-63 into rD bits 16-31
+	ori \rD,\rD,\const@higher # load const bits 32-47 into rD bits 0-15
+
+	rldicr \rD,\rD,32,31 # rotate rD's low word into rD's high word, clearing the low word
+
+	# load low word into the low word of rD
+	oris \rD,\rD,\const@h # load const bits 16-31 into rD bits 16-31
+	ori \rD,\rD,\const@l # load const bits 0-15 into rD bits 0-15
+.endif
+.endm
+
+	.machine libresoc
+	.file "xchacha20_svp64.s"
+	.abiversion 2
+	.section ".text"
+	.align 2
+	.globl xchacha_hchacha20_svp64_real
+	.type xchacha_hchacha20_svp64_real, @function
+xchacha_hchacha20_svp64_real:
+.LFB0:
+	.cfi_startproc
+	# load x[0] = 0x61707865, x[1] = 0x3320646e
+	ldi x+0, 0x3320646e61707865
+	# load x[2] = 0x79622d32, x[3] = 0x6b206574
+	ldi x+1, 0x6b20657479622d32
+	# load SHAPE0 indices
+	ldi SHAPE0+0, 0x901090108000800
+	ldi SHAPE0+1, 0xb030b030a020a02
+	ldi SHAPE0+2, 0xb010b010a000a00
+	ldi SHAPE0+3, 0x903090308020802
+	# load SHAPE1 indices
+	ldi SHAPE1+0, 0xd050d050c040c04
+	ldi SHAPE1+1, 0xf070f070e060e06
+	ldi SHAPE1+2, 0xc060c060f050f05
+	ldi SHAPE1+3, 0xe040e040d070d07
+	# load SHAPE2 indices
+	ldi SHAPE2+0, 0x50d050d040c040c
+	ldi SHAPE2+1, 0x70f070f060e060e
+	ldi SHAPE2+2, 0x60c060c050f050f
+	ldi SHAPE2+3, 0x40e040e070d070d
+	#shift values
+	ldi SHIFTS+0, 0x0000000c00000010
+	ldi SHIFTS+1, 0x0000000700000008
+
+	# Load 8 values from k_ptr
+	setvl 0,0,4,0,1,1 # Set VL to 8 elements
+	sv.ld *x+2, 0(k_ptr)
+
+	# Load 4 values from in_ptr
+	setvl 0,0,2,0,1,1 # Set VL to 4 elements
+	sv.ld *x+6, 0(in_ptr)
+
+	# after this step, registers 16-32 hold the values that will be in the main loop
+	# establish CTR for outer round count
+	#li ctr, 10
+	#mtctr ctr # Set up counter
+
+	# outer loop begins here (standard CTR loop)
+	# set up VL=32 vertical-first, and SVSHAPEs 0-2
+	# vertical-first, set MAXVL (and r22)
+	setvl 22, 0, 16, 1, 0, 1
+	# SHAPE0, used by sv.add starts at GPR #8, need to offset those indices for x=24
+	svindex 4, 0, 1, 3, 0, 1, 0 # SVSHAPE0, a
+	# SHAPE1, used by sv.xor starts at GPR #12
+	svindex 6, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
+	# SHAPE2, used by sv.rldcl starts at GPR #16
+	svindex 8, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
+	# SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
+	# The inner loop will do 16 iterations, but there are only 4 shift values, so we mod 4
+	svshape2 0, 0, 3, 4, 0, 1 # SVSHAPE3, shift amount, mod 4
+
+.outer:
+	# outer loop begins here (standard CTR loop)
+	setvl 22, 22, 16, 1, 1, 0 # vertical-first, set VL from r22
+	# inner loop begins here. add-xor-rotl32 with remap, step, branch
+.inner:
+	svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
+	sv.add/w=32 *x+24, *x+24, *x+24 # NOTE(review): x is already 24, so *x+24 names GPR 48 while the state was loaded at *x - confirm
+	svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
+	sv.xor/w=32 *x+24, *x+24, *x+24
+	svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
+	sv.rldcl/w=32 *x+24, *x+24, *SHIFTS, 0
+	svstep. 16, 1, 0 # step to next in-regs element
+	bc 6, 3, .inner # svstep. Rc=1 loop-end-condition?
+	# inner-loop done: outer loop standard CTR-decrement to setvl again
+	#bdnz .outer # Loop until CTR is zero
+
+	# store x0-x3 directly to *out_ptr
+	setvl 0,0,2,0,1,1 # Set VL to 4 elements
+	sv.std *x, 0(out_ptr)
+	# store x12-x15 to *out_ptr + 16
+	sv.std *x+6, 16(out_ptr)
+	blr
+	.long 0
+	.byte 0,0,0,0,0,3,0,0
+	.cfi_endproc
+
+.LFE0:
+	.size xchacha_hchacha20_svp64_real,.-xchacha_hchacha20_svp64_real
+	.ident "GCC: (Debian 8.3.0-6) 8.3.0"
+	.section .note.GNU-stack,"",@progbits
diff --git a/crypto/chacha20/src/xchacha20_wrapper.c b/crypto/chacha20/src/xchacha20_wrapper.c
new file mode 100644
index 00000000..4fd7a4dd
--- /dev/null
+++ b/crypto/chacha20/src/xchacha20_wrapper.c
@@ -0,0 +1,147 @@
+#include <Python.h>
+#include <inttypes.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "pypowersim_wrapper_common.h"
+#include "xchacha20_wrapper.h"
+#include "xchacha20.h"
+
+/* Buffer sizes in bytes; all are multiples of 8 so that the data can be
+ * exchanged with the simulator as whole 64-bit words. */
+enum {
+    HCHACHA_IN_BYTES = 16,
+    HCHACHA_KEY_BYTES = 32,
+    HCHACHA_OUT_BYTES = 32
+};
+
+/* Print the pending Python exception (if any) and msg, finalize the
+ * simulator state and terminate the process. */
+static void svp64_die(pypowersim_state_t *state, const char *msg) {
+    /* PyErr_Print() must only be called while an exception is set. */
+    if (PyErr_Occurred()) {
+        PyErr_Print();
+    }
+    printf("%s\n", msg);
+    pypowersim_finalize(state);
+    exit(1);
+}
+
+/* Store value in entry regnum of the simulator's initial register list. */
+static void svp64_set_gpr(pypowersim_state_t *state, Py_ssize_t regnum, uint64_t value) {
+    PyObject *pyvalue = PyLong_FromUnsignedLongLong(value);
+    /* PyList_SetItem takes over the reference to pyvalue, even on failure. */
+    if (!pyvalue || PyList_SetItem(state->initial_regs, regnum, pyvalue) < 0) {
+        svp64_die(state, "Error setting initial register");
+    }
+}
+
+/* Copy len bytes from src into the simulator's initial memory dict, one
+ * 64-bit word per entry, keyed by the simulated byte address. */
+static void svp64_load_mem(pypowersim_state_t *state, uint64_t sim_addr,
+                           const uint8_t *src, size_t len) {
+    for (size_t off = 0; off < len; off += sizeof(uint64_t)) {
+        uint64_t word;
+        /* memcpy instead of a pointer cast: src is a plain byte buffer. */
+        memcpy(&word, src + off, sizeof word);
+
+        PyObject *address = PyLong_FromUnsignedLongLong(sim_addr + off);
+        PyObject *value = PyLong_FromUnsignedLongLong(word);
+        int failed = !address || !value ||
+                     PyDict_SetItem(state->initial_mem, address, value) < 0;
+        /* PyDict_SetItem does not take over the key/value references. */
+        Py_XDECREF(address);
+        Py_XDECREF(value);
+        if (failed) {
+            svp64_die(state, "Error loading initial memory");
+        }
+    }
+}
+
+/*
+ * Compute HChaCha20 by running xchacha_hchacha20_svp64_real under pypowersim.
+ *
+ * out receives HCHACHA_OUT_BYTES bytes; in and k supply HCHACHA_IN_BYTES and
+ * HCHACHA_KEY_BYTES bytes.  The inputs are copied into simulator memory, the
+ * routine is simulated, and the result is copied back into out.  The C
+ * implementation is run as well and both results are printed.  Any failure
+ * prints a message and exits the process.
+ */
+void xchacha_hchacha20_svp64(uint8_t *out, const uint8_t *in, const uint8_t *k) {
+    /* Result of the C implementation, only printed for comparison. */
+    uint8_t out2[HCHACHA_OUT_BYTES];
+    xchacha_hchacha20(out2, in, k);
+
+    /* The simulator is a separate CPU/RAM, so host pointers cannot be used
+     * there; the data is copied to these arbitrarily chosen addresses. */
+    const uint64_t inptr_svp64 = 0x100000;
+    const uint64_t outptr_svp64 = 0x200000;
+    const uint64_t keyptr_svp64 = 0x300000;
+
+    pypowersim_state_t *state = pypowersim_prepare();
+
+    /* Code to simulate.  NOTE(review): 1000 is a fixed byte count, not the
+     * measured size of the routine - confirm it is a safe upper bound. */
+    state->binary = PyBytes_FromStringAndSize((const char *)&xchacha_hchacha20_svp64_real, 1000);
+    if (!state->binary) {
+        svp64_die(state, "Error creating binary object");
+    }
+
+    /* GPRs 3, 4 and 5 carry the out, in and key pointers respectively. */
+    svp64_set_gpr(state, 3, outptr_svp64);
+    svp64_set_gpr(state, 4, inptr_svp64);
+    svp64_set_gpr(state, 5, keyptr_svp64);
+    svp64_load_mem(state, inptr_svp64, in, HCHACHA_IN_BYTES);
+    svp64_load_mem(state, keyptr_svp64, k, HCHACHA_KEY_BYTES);
+
+    pypowersim_prepareargs(state);
+
+    state->result_obj = PyObject_CallObject(state->simulator, state->args);
+    if (!state->result_obj) {
+        svp64_die(state, "Error invoking 'run_a_simulation'");
+    }
+
+    /* The memory contents are the "mem" attribute of the result's "mem". */
+    PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem");
+    if (!memobj) {
+        svp64_die(state, "Error getting mem object");
+    }
+    PyObject *mem = PyObject_GetAttrString(memobj, "mem");
+    if (!mem) {
+        svp64_die(state, "Error getting mem dict");
+    }
+    Py_DECREF(memobj);
+
+    for (size_t off = 0; off < HCHACHA_OUT_BYTES; off += sizeof(uint64_t)) {
+        /* Result memory is looked up by byte address divided by 8. */
+        PyObject *address = PyLong_FromUnsignedLongLong((outptr_svp64 + off) / 8);
+        /* PyDict_GetItem returns a borrowed reference, or NULL if absent. */
+        PyObject *pyval = address ? PyDict_GetItem(mem, address) : NULL;
+        Py_XDECREF(address);
+        if (!pyval) {
+            svp64_die(state, "Error reading result from simulator memory");
+        }
+        uint64_t val = PyLong_AsUnsignedLongLong(pyval);
+        if (val == (uint64_t)-1 && PyErr_Occurred()) {
+            svp64_die(state, "Error converting result word");
+        }
+        memcpy(out + off, &val, sizeof val);
+        printf("out: %p -> %016" PRIx64 "\t val: %016" PRIx64 " -> %" PRIx64 "\n",
+               (void *)(out + off), val, val, (uint64_t)(outptr_svp64 + off));
+    }
+    Py_DECREF(mem);
+
+    for (int i = 0; i < HCHACHA_OUT_BYTES; i += 8) {
+        printf("out[%d] : %02x %02x %02x %02x %02x %02x %02x %02x\n", i,
+               out[i+0], out[i+1], out[i+2], out[i+3],
+               out[i+4], out[i+5], out[i+6], out[i+7]);
+        printf("out2[%d] : %02x %02x %02x %02x %02x %02x %02x %02x\n", i,
+               out2[i+0], out2[i+1], out2[i+2], out2[i+3],
+               out2[i+4], out2[i+5], out2[i+6], out2[i+7]);
+    }
+
+    /* NOTE(review): as before, state is not finalized on success - confirm
+     * who releases it when this function is called more than once. */
+}
diff --git a/crypto/chacha20/src/xchacha20_wrapper.h b/crypto/chacha20/src/xchacha20_wrapper.h
new file mode 100644
index 00000000..717aaaf6
--- /dev/null
+++ b/crypto/chacha20/src/xchacha20_wrapper.h
@@ -0,0 +1,4 @@
+#include <stdint.h>
+
+void xchacha_hchacha20_svp64_real(uint8_t *out, const uint8_t *in, const uint8_t *k);
+void xchacha_hchacha20_svp64(uint8_t *out, const uint8_t *in, const uint8_t *k);
-- 
2.30.2