From 0bc47c95b9b77d2237d36e23c1d45f233f9f6326 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Wed, 12 Apr 2023 13:26:21 +0100 Subject: [PATCH] add LD/ST-Index-Shifted provisional instructions to optable.csv for v2 ls012 --- openpower/sv/rfc/Makefile | 1 + openpower/sv/rfc/ls012.mdwn | 17 ++++++++++++- openpower/sv/rfc/ls012/optable.csv | 40 ++++++++++++++++++++++++++++++ 3 files changed, 57 insertions(+), 1 deletion(-) diff --git a/openpower/sv/rfc/Makefile b/openpower/sv/rfc/Makefile index acebf478d..8932897f1 100644 --- a/openpower/sv/rfc/Makefile +++ b/openpower/sv/rfc/Makefile @@ -11,6 +11,7 @@ all: $(pdfs) ls012 = $(realpath ls012) $(ls012)/areas.mdwn $(ls012)/xo_cost.mdwn: ls012_optable.py ls012/optable.csv + @echo making optables python3 ls012_optable.py tex_out/%.mdwn.d: %.mdwn ../../mdwn_inline.py diff --git a/openpower/sv/rfc/ls012.mdwn b/openpower/sv/rfc/ls012.mdwn index 0964bc883..99e545418 100644 --- a/openpower/sv/rfc/ls012.mdwn +++ b/openpower/sv/rfc/ls012.mdwn @@ -80,6 +80,8 @@ Audio/Visual, High-Performance Compute, GPU workloads and DSP. * 4 - INT<->FP mv [[ls006]] * 19 - GPR LD/ST-PostIncrement-Update (saves hugely in hot-loops) [[ls011]] * ~12 - FPR LD/ST-PostIncrement-Update (ditto) [[ls011]] +* 26 - GPR LD/ST-Shifted (again saves hugely in hot-loops) [[ls004]] +* 11 - FPR LD/ST-Shifted (ditto) [[ls004]] * 2 - Float-Load-Immediate (always saves one LD L1/2/3 D-Cache op) [[ls002]] * 5 - Big-Integer Chained 3-in 2-out (64-bit Carry) [[sv/biginteger]] * 6 - Bitmanip LUT2/3 operations. high cost high reward [[sv/bitmanip]] @@ -375,7 +377,7 @@ Whilst this is a "solution" it is less than ideal, and the opportunity exists now with the EXT2xx Primary Opcodes to correct this and bring Power ISA up a level. -## Shift-and-add +## Shift-and-add (and LD/ST Indexed-Shift) Shift-and-Add are proposed in [[ls004]]. 
They mitigate the need to add LD-ST-Shift instructions which are a high-priority aspect of both x86 @@ -389,6 +391,19 @@ Being a 10-bit XO it would be somewhat punitive to place these in EXT2xx when their whole purpose and value is to reduce binary size in Address offset computation, thus they are best placed in EXT0xx. +Also included because it is important to see the quantity of instructions: +LD/ST-Indexed-Shifted. Across Update variants, Byte-reverse variants, +Arithmetic and FP, the total is a slightly-eye-watering **37** instructions, +only ameliorated by the fact that they are all 9-bit XO. The upside as +far as adding them is concerned is that existing hardware will already +have amalgamated pipelines with very few actual back-end (Micro-Coded) +internal operations (likely just two: one load, one store). +Passing a 2-bit additional immediate field down to those pipelines really +is not hard. + +*(Readers unfamiliar with Micro-coding should look at the Microwatt VHDL +source code)* + \newpage{} # Vectorisation: SVP64 and SVP64Single diff --git a/openpower/sv/rfc/ls012/optable.csv b/openpower/sv/rfc/ls012/optable.csv index 3113ff5ec..7483fd8f0 100644 --- a/openpower/sv/rfc/ls012/optable.csv +++ b/openpower/sv/rfc/ls012/optable.csv @@ -28,6 +28,46 @@ stfdu, ls011, high, PO, yes, EXT2xx, no, isa/pifixedstore, 2R1W stfsu, ls011, high, PO, yes, EXT2xx, no, isa/pifixedstore, 2R1W stfdux, ls011, high, 10, yes, EXT2xx, no, isa/pifixedstore, 3R1W stfsux, ls011, high, 10, yes, EXT2xx, no, isa/pifixedstore, 3R1W +# LD/ST-Index-Shifted (w/Update) +lbzsx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lbzusx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R2W +lhzsx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lhzusx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R2W +lhasx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lhausx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R2W +lwzsx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lwzusx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R2W 
+lwasx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lwausx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R2W +ldsx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +ldusx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R2W +lhbrsx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lwbrsx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +ldbrsx, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +stbsx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +stbusx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R1W +sthsx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +sthusx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R1W +stwsx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +stwusx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R1W +stdsx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +stdusx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R1W +sthbrsx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +stwbrsx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +stdbrsx, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +# FP LD/ST-Index-Shifted (w/Update) +lfsxs, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lfsuxs, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lfdxs, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lfduxs, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lfiwaxs, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +lfiwzxs, ls004, high, 9, yes, EXT0xx, no, ls004, 2R1W +stfsxs, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +stfsuxs, ls004, high, 9, yes, EXT0xx, no, ls004, 3R1W +stfdxs, ls004, high, 9, yes, EXT0xx, no, ls004, 3R +stfduxs, ls004, high, 9, yes, EXT0xx, no, ls004, 3R1W +stfiwxs, ls004, high, 9, yes, EXT0xx, no, ls004, 3R + # Bitmanip LUT2/3 operations. high cost high reward grevlut, TBD, high, 3, yes, TBD, no, sv/bitmanip, 2R1W grevluti, TBD, high, 3, yes, TBD, yes, sv/bitmanip, 1R1W -- 2.30.2