# A simple Makefile, to build run: make all
-TARGET = test
+# Name of the executable produced by the $(TARGET) link rule.
+TARGET = test-chacha20
+# CROSS is only a default (?=); it can be overridden from the environment or command line.
CROSS ?= powerpc64le-linux-gnu-
AS = $(CROSS)as
LD = $(CROSS)ld
#compiler flags here
-CFLAGS = -O3 -Wall -Wextra -static -mno-vsx -mno-altivec -DDUMP
+# NOTE(review): the two -I paths presumably locate pypowersim_wrapper_common.h and
+# Python.h for the wrapper source -- confirm they match the build host.
+CFLAGS = -O -Wall -Wextra -mno-vsx -mno-altivec -DDUMP -I../../media/pypowersim_wrapper -I/usr/include/python3.7m
+
+# assembler flags here
+ASFLAGS= -mlibresoc -mregnames
#linker flags here
-LDFLAGS = -Wall -static
+# The Python library version here should match the Python include path in CFLAGS.
+LDFLAGS = -Wall -pthread -lpython3.7m
SRCDIR = src
-SOURCES := $(SRCDIR)/xchacha20.c $(SRCDIR)/test.c
-INCLUDES := $(wildcard $(SRCDIR)/*.h))
-OBJECTS := $(SOURCES:$(SRCDIR)/%.c=$(SRCDIR)/%.o)
+# C sources of the test program.
+CFILES := $(SRCDIR)/xchacha20.c $(SRCDIR)/test.c $(SRCDIR)/xchacha20_wrapper.c
+# Assembly sources, assembled with $(AS) and $(ASFLAGS).
+ASFILES := $(SRCDIR)/xchacha20_svp64.s
+# Project headers.  The wildcard call is closed exactly once: a second ')'
+# would be appended literally to the last header name in the list.
+INCLUDES := $(wildcard $(SRCDIR)/*.h)
+# One object per C file and per assembly file, placed next to its source.
+OBJECTS := $(CFILES:$(SRCDIR)/%.c=$(SRCDIR)/%.o) $(ASFILES:$(SRCDIR)/%.s=$(SRCDIR)/%.o)
.PHONY: all clean remove
all: ${TARGET}
+# Link all objects into the executable.  $(LDFLAGS) now follows $(OBJECTS),
+# presumably because it carries -lpython3.7m and the linker resolves
+# libraries in command-line order.
+# NOTE(review): the explicit compile rule is removed without a replacement, so
+# the .o files are presumably produced by make's built-in rules -- confirm.
$(TARGET): $(OBJECTS)
- $(CC) -o $@ $(LDFLAGS) $(OBJECTS)
-
-$(OBJECTS): $(SRCDIR)/%.o : $(SRCDIR)/%.c
- $(CC) $(CFLAGS) -c $< -o $@
+ $(CC) -o $@ $(OBJECTS) $(LDFLAGS)
clean:
$ rm -f $(OBJECTS)
#include <stdlib.h>
#include <stdint.h>
#include "xchacha20.h"
+#include "xchacha20_wrapper.h"
#include <stdio.h>
x1 = 0x3320646e;
x2 = 0x79622d32;
x3 = 0x6b206574;
-
+ (void)k;
x4 = U8TO32_LITTLE(k + 0);
x5 = U8TO32_LITTLE(k + 4);
x6 = U8TO32_LITTLE(k + 8);
x14 = U8TO32_LITTLE(in + 8);
x15 = U8TO32_LITTLE(in + 12);
- for (i = 0; i < 10; i++){
+ //for (i = 0; i < 10; i++){
QUARTERROUND(x0, x4, x8, x12);
QUARTERROUND(x1, x5, x9, x13);
QUARTERROUND(x2, x6, x10, x14);
QUARTERROUND(x1, x6, x11, x12);
QUARTERROUND(x2, x7, x8, x13);
QUARTERROUND(x3, x4, x9, x14);
- }
+ //}
U32TO8_LITTLE(out + 0, x0);
U32TO8_LITTLE(out + 4, x1);
* We then use this sub-key and the last 8 bytes of the iv
* as normal.
*/
- xchacha_hchacha20(k2, iv, k);
+ xchacha_hchacha20_svp64(k2, iv, k);
ctx->input[0] = 0x61707865;
--- /dev/null
+# Symbolic GPR numbers, so the code below can name registers and offset
+# from them (e.g. x+2, SHAPE0+1).
+# Pointer arguments: output, input and key buffers in GPRs 3, 4 and 5.
+.set out_ptr, 3
+.set in_ptr, 4
+.set k_ptr, 5
+# ctr is only referenced by the commented-out outer-loop setup.
+.set ctr, 7
+# First GPR of each group of four index registers loaded for SVSHAPE0-2.
+.set SHAPE0, 8
+.set SHAPE1, 12
+.set SHAPE2, 16
+# First of the two GPRs loaded with the rotate amounts.
+.set SHIFTS, 20
+# First GPR of the working state.
+.set x, 24
+
+# lwi rD, const -- load a constant in the signed 32-bit range into rD.
+# A value that fits in 16 signed bits needs a single li; anything else is
+# built as lis (upper half) followed by ori (lower half).
+# The upper half is taken with @h, not @ha: @ha is pre-adjusted for a
+# sign-extended low half (addi, load/store displacements), whereas ori
+# zero-extends its immediate, so @ha would leave the result 0x10000 too
+# high whenever bit 15 of the constant is set.
+.macro lwi rD, const
+.if (\const >= -0x8000) && (\const <= 0x7fff)
+ li \rD, \const
+.else
+ lis \rD, \const@h
+ ori \rD, \rD, \const@l
+.endif
+.endm
+
+# ldi rD, const -- load a 64-bit constant into rD.
+# Constants in the signed 32-bit range are delegated to lwi; larger ones
+# are assembled 16 bits at a time: the upper word is built in the low half
+# of rD, moved up by 32 bits, then the lower word is OR-ed in.
+.macro ldi rD, const
+.if (\const >= -0x80000000) && (\const <= 0x7fffffff)
+ lwi \rD, \const
+.else
+ # build the high word of the constant in the low word of rD
+ lis \rD,\const@highest # load const bits 48-63 into rD bits 16-31
+ ori \rD,\rD,\const@higher # load const bits 32-47 into rD bits 0-15
+
+ rldicr \rD,\rD,32,31 # rotate rD's low word into rD's high word, clearing the low word
+
+ # load low word into the low word of rD
+ oris \rD,\rD,\const@h # load const bits 16-31 into rD bits 16-31
+ ori \rD,\rD,\const@l # load const bits 0-15 into rD bits 0-15
+.endif
+.endm
+
+# xchacha_hchacha20_svp64_real(out_ptr, in_ptr, k_ptr)
+#
+# SVP64 routine assembled for the libresoc machine; judging by its name and
+# constants it is presumably the HChaCha20 core -- confirm.
+# It loads two constant registers at x, the key from k_ptr into x+2 onwards
+# and the input block from in_ptr into x+6 onwards, runs a vertical-first
+# add / xor / rotate loop driven by the index tables in SHAPE0-2 and the
+# rotate amounts in SHIFTS, then stores registers from x and from x+6 to
+# out_ptr at offsets 0 and 16.
+#
+# NOTE(review): the CTR-based outer loop (li/mtctr ... bdnz) is commented
+# out, so the inner loop currently runs as a single pass -- confirm whether
+# the 10-iteration outer loop should be re-enabled.
+ .machine libresoc
+ .file "xchacha20_svp64.s"
+ .abiversion 2
+ .section ".text"
+ .align 2
+ .globl xchacha_hchacha20_svp64_real
+ .type xchacha_hchacha20_svp64_real, @function
+xchacha_hchacha20_svp64_real:
+.LFB0:
+ .cfi_startproc
+ # load x[0] = 0x61707865, x[1] = 0x3320646e
+ ldi x+0, 0x3320646e61707865
+ # load x[2] = 0x79622d32, x[3] = 0x6b206574
+ ldi x+1, 0x6b20657479622d32
+ # load SHAPE0 indices
+ ldi SHAPE0+0, 0x901090108000800
+ ldi SHAPE0+1, 0xb030b030a020a02
+ ldi SHAPE0+2, 0xb010b010a000a00
+ ldi SHAPE0+3, 0x903090308020802
+ # load SHAPE1 indices
+ ldi SHAPE1+0, 0xd050d050c040c04
+ ldi SHAPE1+1, 0xf070f070e060e06
+ ldi SHAPE1+2, 0xc060c060f050f05
+ ldi SHAPE1+3, 0xe040e040d070d07
+ # load SHAPE2 indices
+ ldi SHAPE2+0, 0x50d050d040c040c
+ ldi SHAPE2+1, 0x70f070f060e060e
+ ldi SHAPE2+2, 0x60c060c050f050f
+ ldi SHAPE2+3, 0x40e040e070d070d
+ #shift values
+ ldi SHIFTS+0, 0x0000000c00000010
+ ldi SHIFTS+1, 0x0000000700000008
+
+ # Load 8 values from k_ptr
+ setvl 0,0,4,0,1,1 # Set VL to 8 elements
+ sv.ld *x+2, 0(k_ptr)
+
+ # Load 4 values from in_ptr
+ setvl 0,0,2,0,1,1 # Set VL to 4 elements
+ sv.ld *x+6, 0(in_ptr)
+
+ # after this step, the loaded registers hold the values used by the main loop
+ # NOTE(review): this comment used to say "registers 16-32", but x is 24 -- presumably GPRs x+0..x+7; confirm
+ # establish CTR for outer round count
+ #li ctr, 10
+ #mtctr ctr # Set up counter
+
+ # outer loop begins here (standard CTR loop)
+ # set up VL=32 vertical-first, and SVSHAPEs 0-2
+ # vertical-first, set MAXVL (and r22)
+ setvl 22, 0, 16, 1, 0, 1
+ # SHAPE0, used by sv.add starts at GPR #8, need to offset those indices for x=24
+ svindex 4, 0, 1, 3, 0, 1, 0 # SVSHAPE0, a
+ # SHAPE1, used by sv.xor starts at GPR #12
+ svindex 6, 1, 1, 3, 0, 1, 0 # SVSHAPE1, b
+ # SHAPE2, used by sv.rldcl starts at GPR #16
+ svindex 8, 2, 1, 3, 0, 1, 0 # SVSHAPE2, c
+ # SHAPE3, used also by sv.rldcl to hold the shift values starts at GPR #20
+ # The inner loop will do 16 iterations, but there are only 4 shift values, so we mod 4
+ svshape2 0, 0, 3, 4, 0, 1 # SVSHAPE3, shift amount, mod 4
+
+.outer:
+ # outer loop begins here (standard CTR loop)
+ setvl 22, 22, 16, 1, 1, 0 # vertical-first, set VL from r22
+ # inner loop begins here. add-xor-rotl32 with remap, step, branch
+.inner:
+ svremap 31, 1, 0, 0, 0, 0, 0 # RA=1, RB=0, RT=0 (0b01011)
+ sv.add/w=32 *x+24, *x+24, *x+24
+ svremap 31, 2, 0, 2, 2, 0, 0 # RA=2, RB=0, RS=2 (0b00111)
+ sv.xor/w=32 *x+24, *x+24, *x+24
+ svremap 31, 0, 3, 2, 2, 0, 0 # RA=2, RB=3, RS=2 (0b01110)
+ sv.rldcl/w=32 *x+24, *x+24, *SHIFTS, 0
+ svstep. 16, 1, 0 # step to next in-regs element
+ bc 6, 3, .inner # svstep. Rc=1 loop-end-condition?
+ # inner-loop done: outer loop standard CTR-decrement to setvl again
+ #bdnz .outer # Loop until CTR is zero
+
+ # store x0-x3 directly to *out_ptr
+ setvl 0,0,2,0,1,1 # Set VL to 4 elements
+ sv.std *x, 0(out_ptr)
+ # store x12-x15 to *out_ptr + 16
+ sv.std *x+6, 16(out_ptr)
+ blr
+ .long 0
+ .byte 0,0,0,0,0,3,0,0
+ .cfi_endproc
+
+.LFE0:
+ .size xchacha_hchacha20_svp64_real,.-xchacha_hchacha20_svp64_real
+ .ident "GCC: (Debian 8.3.0-6) 8.3.0"
+ .section .note.GNU-stack,"",@progbits
--- /dev/null
+#include <Python.h>
+#include <stdint.h>
+#include <stdio.h>
+
+#include "pypowersim_wrapper_common.h"
+#include "xchacha20_wrapper.h"
+#include "xchacha20.h"
+
+/*
+ * Report a fatal wrapper error: print the pending Python exception (if any),
+ * then `msg`, release the simulator state and terminate the process.
+ */
+static void svp64_fail(pypowersim_state_t *state, const char *msg) {
+    // PyErr_Print() may only be called while an exception is actually set
+    if (PyErr_Occurred())
+        PyErr_Print();
+    printf("%s", msg);
+    pypowersim_finalize(state);
+    exit(1);
+}
+
+/*
+ * Copy one host-order 64-bit word from `src` into the `initial_mem` dict
+ * under the key `address`.  Returns 0 on success, -1 on failure.
+ */
+static int svp64_store_dword(PyObject *initial_mem, uint64_t address, const uint8_t *src) {
+    uint64_t value;
+    int rc = -1;
+
+    // memcpy rather than a uint64_t* cast: the byte buffer carries no
+    // alignment guarantee (memcpy is declared via Python.h -> <string.h>)
+    memcpy(&value, src, sizeof(value));
+
+    PyObject *key = PyLong_FromUnsignedLongLong(address);
+    PyObject *word = PyLong_FromUnsignedLongLong(value);
+    if (key && word)
+        rc = PyDict_SetItem(initial_mem, key, word);
+
+    // PyDict_SetItem does not steal references, so ours must be released
+    Py_XDECREF(key);
+    Py_XDECREF(word);
+    return rc;
+}
+
+/*
+ * Run xchacha_hchacha20_svp64_real inside the pypowersim simulator.
+ *
+ * out: receives 32 bytes read back from simulator memory
+ * in:  16 input bytes copied into simulator memory
+ * k:   32 key bytes copied into simulator memory
+ *
+ * The reference xchacha_hchacha20() result is also computed and both
+ * results are printed side by side.  Any failure while talking to the
+ * simulator prints a message and exits the process with status 1.
+ */
+void xchacha_hchacha20_svp64(uint8_t *out, const uint8_t *in, const uint8_t *k) {
+
+    uint8_t out2[32];
+    xchacha_hchacha20(out2, in, k);
+
+    // These cannot be the same pointers as the original function, as the simulator is really a separate CPU/RAM;
+    // the data is copied to these addresses, which were chosen arbitrarily
+    uint64_t inptr_svp64 = 0x100000;
+    uint64_t outptr_svp64 = 0x200000;
+    uint64_t keyptr_svp64 = 0x300000;
+
+    // Create the pypowersim_state
+    pypowersim_state_t *state = pypowersim_prepare();
+
+    // Change the relevant elements, mandatory: body
+    // NOTE(review): 1000 is a fixed upper bound, not the real size of the routine -- confirm it is large enough
+    state->binary = PyBytes_FromStringAndSize((const char *)&xchacha_hchacha20_svp64_real, 1000);
+
+    // Set GPR #3 to the output pointer (PyList_SetItem steals the reference)
+    PyObject *out_address = PyLong_FromUnsignedLongLong(outptr_svp64);
+    PyList_SetItem(state->initial_regs, 3, out_address);
+
+    // Set GPR #4 to the input pointer
+    PyObject *in_address = PyLong_FromUnsignedLongLong(inptr_svp64);
+    PyList_SetItem(state->initial_regs, 4, in_address);
+
+    // Load the 16 input bytes into simulator memory, 8 bytes at a time
+    for (int i=0; i < 16; i += 8) {
+        if (svp64_store_dword(state->initial_mem, inptr_svp64 + i, in + i) != 0)
+            svp64_fail(state, "Error storing input block\n");
+    }
+
+    // Set GPR #5 to the key pointer
+    PyObject *key_address = PyLong_FromUnsignedLongLong(keyptr_svp64);
+    PyList_SetItem(state->initial_regs, 5, key_address);
+
+    // Load the 32 key bytes into simulator memory, 8 bytes at a time
+    for (int i=0; i < 32; i += 8) {
+        if (svp64_store_dword(state->initial_mem, keyptr_svp64 + i, k + i) != 0)
+            svp64_fail(state, "Error storing key\n");
+    }
+
+    // Prepare the arguments object for the call
+    pypowersim_prepareargs(state);
+
+    // Call the function and get the resulting object
+    state->result_obj = PyObject_CallObject(state->simulator, state->args);
+    if (!state->result_obj)
+        svp64_fail(state, "Error invoking 'run_a_simulation'\n");
+
+    // Every later step dereferences the previous result, so each failure is fatal
+    PyObject *memobj = PyObject_GetAttrString(state->result_obj, "mem");
+    if (!memobj)
+        svp64_fail(state, "Error getting mem object\n");
+
+    PyObject *mem = PyObject_GetAttrString(memobj, "mem");
+    if (!mem)
+        svp64_fail(state, "Error getting mem dict\n");
+
+    // Read the 32 result bytes back; the result dict is indexed by address/8
+    for (int i=0; i < 32; i += 8) {
+        PyObject *svp64_address = PyLong_FromUnsignedLongLong((outptr_svp64 + i)/8);
+        // PyDict_GetItem returns a borrowed reference, or NULL if the key is absent
+        PyObject *pyval = svp64_address ? PyDict_GetItem(mem, svp64_address) : NULL;
+        Py_XDECREF(svp64_address);
+        if (!pyval)
+            svp64_fail(state, "Error reading result memory\n");
+
+        uint64_t val = PyLong_AsUnsignedLongLong(pyval);
+        if (val == (uint64_t)-1 && PyErr_Occurred())
+            svp64_fail(state, "Error converting result memory\n");
+
+        memcpy(out + i, &val, sizeof(val));
+        printf("out: %p -> %016lx\t val: %016lx -> %lx\n", (void *)(out + i), val, val, outptr_svp64 + i);
+    }
+
+    // Both attribute lookups returned new references
+    Py_DECREF(mem);
+    Py_DECREF(memobj);
+    // NOTE(review): state is not finalized on the success path, as before -- confirm who owns it
+
+    for (int i=0; i < 32; i+= 8) {
+        printf("out[%d] : %02x %02x %02x %02x %02x %02x %02x %02x\n", i, out[i+0], out[i+1], out[i+2], out[i+3],
+               out[i+4], out[i+5], out[i+6], out[i+7]);
+        printf("out2[%d] : %02x %02x %02x %02x %02x %02x %02x %02x\n", i, out2[i+0], out2[i+1], out2[i+2], out2[i+3],
+               out2[i+4], out2[i+5], out2[i+6], out2[i+7]);
+
+    }
+}
--- /dev/null
+#ifndef XCHACHA20_WRAPPER_H
+#define XCHACHA20_WRAPPER_H
+
+#include <stdint.h>
+
+/* Assembly routine (xchacha20_svp64.s); the wrapper hands its machine code
+ * to the simulator rather than calling it directly. */
+void xchacha_hchacha20_svp64_real(uint8_t *out, const uint8_t *in, const uint8_t *k);
+
+/* C entry point: runs the routine above inside the simulator and writes the
+ * 32 bytes it reads back to `out`; `in` is 16 bytes and `k` is 32 bytes. */
+void xchacha_hchacha20_svp64(uint8_t *out, const uint8_t *in, const uint8_t *k);
+
+#endif /* XCHACHA20_WRAPPER_H */