src/openpower/test/algorithms/svp64_utf_8_validation.py

   1 # SPDX-License-Identifier: LGPL-3-or-later
   2 # Copyright 2022 Jacob Lifshay
   3
   4 import enum
   5 from openpower.decoder.selectable_int import SelectableInt
   6 from openpower.simulator.program import Program
   7 from openpower.test.common import TestAccumulatorBase
   8 from openpower.test.state import ExpectedState
   9 from openpower.sv.trans.svp64 import SVP64Asm
  10
  11
  12 SVP64_UTF8_VALIDATION_DATA_ADDR = 0x10000
  13
  14
  15 class UTF8FirstTwoBytesError(enum.IntFlag):
  16     """ Error conditions that are detectable from just the first two bytes in
  17     a UTF-8 sequence.
  18     """
  19
  20     TooLong = 1 << 0
  21     """ ascii byte followed by a continuation byte """
  22
  23     TooShort = 1 << 1
  24     """ leading byte followed by something other than a continuation byte """
  25
  26     Overlong2 = 1 << 2
  27     """ value is `< 0x80` but is encoded using 2 bytes """
  28
  29     Surrogate = 1 << 3
  30     """ value is a surrogate (`0xD800 <= value <= 0xDFFF`) """
  31
  32     Overlong3 = 1 << 4
  33     """ value is `< 0x800` but is encoded using 3 bytes """
  34
  35     Overlong4OrTooLarge = 1 << 5
  36     """ value is either:
  37         * `< 0x10000` but is encoded using 4 bytes
  38         * or the value is `>= 0x140000` with the first continuation byte
  39             being `<= 0x8F`
  40
  41         The rest of the cases where the value is `> 0x10FFFF` are covered by
  42         `TooLarge`.
  43     """
  44
  45     TooLarge = 1 << 6
  46     """ value is `> 0x10FFFF` with the first continuation byte being `>= 0x90`
  47
  48         The rest of the cases where the value is `> 0x10FFFF` are covered by
  49         `Overlong4OrTooLarge`.
  50     """
  51
  52     TwoContinuations = 1 << 7
  53     """ not actually an error -- two continuations in a row """
  54
  55     AllActualErrors = (TooLong | TooShort | Overlong2 | Surrogate |
  56                        Overlong3 | Overlong4OrTooLarge | TooLarge)
  57
  58
  59 # look up tables for checking for errors in the first two bytes, the final
  60 # error flags are generated by looking up the nibbles of the first two bytes
  61 # in the appropriate tables, and bitwise ANDing the results together.
  62 # To figure out what to put in each entry in the LUTs, look for all cases
  63 # that could match the comment.
  64
  65 _TLN = UTF8FirstTwoBytesError.TooLong
  66 _TS = UTF8FirstTwoBytesError.TooShort
  67 _O2 = UTF8FirstTwoBytesError.Overlong2
  68 _SG = UTF8FirstTwoBytesError.Surrogate
  69 _O3 = UTF8FirstTwoBytesError.Overlong3
  70 _O4TL = UTF8FirstTwoBytesError.Overlong4OrTooLarge
  71 _TLG = UTF8FirstTwoBytesError.TooLarge
  72 _2C = UTF8FirstTwoBytesError.TwoContinuations
  73
  74 FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF00
  75 FIRST_BYTE_HIGH_NIBBLE_LUT = [
  76     _TLN,  # first 2 bytes are 0x0? 0x??
  77     _TLN,  # first 2 bytes are 0x1? 0x??
  78     _TLN,  # first 2 bytes are 0x2? 0x??
  79     _TLN,  # first 2 bytes are 0x3? 0x??
  80     _TLN,  # first 2 bytes are 0x4? 0x??
  81     _TLN,  # first 2 bytes are 0x5? 0x??
  82     _TLN,  # first 2 bytes are 0x6? 0x??
  83     _TLN,  # first 2 bytes are 0x7? 0x??
  84     _2C,  # first 2 bytes are 0x8? 0x??
  85     _2C,  # first 2 bytes are 0x9? 0x??
  86     _2C,  # first 2 bytes are 0xA? 0x??
  87     _2C,  # first 2 bytes are 0xB? 0x??
  88     _TS | _O2,  # first 2 bytes are 0xC? 0x??
  89     _TS,  # first 2 bytes are 0xD? 0x??
  90     _TS | _SG | _O3,  # first 2 bytes are 0xE? 0x??
  91     _TS | _O4TL | _TLG,  # first 2 bytes are 0xF? 0x??
  92 ]
  93 FIRST_BYTE_LOW_NIBBLE_LUT_ADDR = 0xFF10
  94 FIRST_BYTE_LOW_NIBBLE_LUT = [
  95     _TLN | _TS | _O2 | _O3 | _O4TL | _2C,  # first 2 bytes are 0x?0 0x??
  96     _TLN | _TS | _O2 | _2C,  # first 2 bytes are 0x?1 0x??
  97     _TLN | _TS | _2C,  # first 2 bytes are 0x?2 0x??
  98     _TLN | _TS | _2C,  # first 2 bytes are 0x?3 0x??
  99     _TLN | _TS | _TLG | _2C,  # first 2 bytes are 0x?4 0x??
 100     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?5 0x??
 101     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?6 0x??
 102     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?7 0x??
 103     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?8 0x??
 104     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?9 0x??
 105     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?A 0x??
 106     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?B 0x??
 107     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?C 0x??
 108     _TLN | _TS | _SG | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?D 0x??
 109     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?E 0x??
 110     _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?F 0x??
 111 ]
 112 SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF20
 113 SECOND_BYTE_HIGH_NIBBLE_LUT = [
 114     _TS,  # first 2 bytes are 0x?? 0x0?
 115     _TS,  # first 2 bytes are 0x?? 0x1?
 116     _TS,  # first 2 bytes are 0x?? 0x2?
 117     _TS,  # first 2 bytes are 0x?? 0x3?
 118     _TS,  # first 2 bytes are 0x?? 0x4?
 119     _TS,  # first 2 bytes are 0x?? 0x5?
 120     _TS,  # first 2 bytes are 0x?? 0x6?
 121     _TS,  # first 2 bytes are 0x?? 0x7?
 122     _TLN | _O2 | _O3 | _O4TL | _2C,  # first 2 bytes are 0x?? 0x8?
 123     _TLN | _O2 | _O3 | _TLG | _2C,  # first 2 bytes are 0x?? 0x9?
 124     _TLN | _O2 | _SG | _TLG | _2C,  # first 2 bytes are 0x?? 0xA?
 125     _TLN | _O2 | _SG | _TLG | _2C,  # first 2 bytes are 0x?? 0xB?
 126     _TS,  # first 2 bytes are 0x?? 0xC?
 127     _TS,  # first 2 bytes are 0x?? 0xD?
 128     _TS,  # first 2 bytes are 0x?? 0xE?
 129     _TS,  # first 2 bytes are 0x?? 0xF?
 130 ]
 131
 132
 133 def svp64_utf8_validation_asm():
 134     # raise NotImplementedError("not finished")
 135     return [
 136         # input addr in r3, input length in r4
 137         # prev bytes in r45-r47 -- u64x3
 138         # cur bytes in r48-r63 -- u64x16
 139         # nibbles to look up in r80-r95 -- u64x16
 140         # error flags in r64-r79 -- u64x16
 141         "setvl 0, 0, 3, 0, 1, 1",  # set VL to 3
 142         "sv.addi *45, 0, 0",  # clear prev bytes
 143         f"lis 6, {FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR >> 16}",
 144         f"ori 6, 6, {FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR & 0xFFFF}",
 145         f"lis 7, {FIRST_BYTE_LOW_NIBBLE_LUT_ADDR >> 16}",
 146         f"ori 7, 7, {FIRST_BYTE_LOW_NIBBLE_LUT_ADDR & 0xFFFF}",
 147         f"lis 8, {SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR >> 16}",
 148         f"ori 8, 8, {SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR & 0xFFFF}",
 149         "loop:",
 150         "setvl 0, 0, 3, 0, 1, 1",  # set VL to 3
 151         "sv.ori *45, *61, 0",  # copy prev bytes from end of cur bytes
 152
 153         # clear cur bytes, so bytes beyond end end up being zeros
 154         "setvl 0, 0, 16, 0, 1, 1",  # set VL to 16
 155         "sv.addi *48, 0, 0",  # clear cur bytes
 156         "setvl. 5, 4, 16, 0, 1, 1",  # set VL to min(16, r4)
 157         "beq final_check",  # if no bytes left to load, run final check
 158         "sv.lbz/els *48, 0(3)",  # load bytes
 159         "setvl 0, 0, 16, 0, 1, 1",  # set VL to 16
 160         # now we can operate on 16 byte chunks, branch to `fail` if they don't
 161         # pass validation.
 162
 163         # get high nibbles of input shifted by 1 byte
 164         f"sv.rldicl *80, *47, {64 - 4}, 4",  # srdi *80, *47, 4
 165         # look-up nibbles in table, writing to error flags
 166         "sv.lbzx *64, 6, *80",
 167
 168         # get low nibbles of input shifted by 1 byte
 169         f"sv.andi. *80, *47, {0xF}",  # there is no andi without Rc
 170         # look-up nibbles in table
 171         "sv.lbzx *80, 6, *80",
 172         # bitwise and into error flags
 173         "sv.and *64, *64, *80",
 174
 175         # get high nibbles of input
 176         "sv.srdi *80, *48, 4",
 177         # look-up nibbles in table
 178         "sv.lbzx *80, 6, *80",
 179         # bitwise and into error flags
 180         "sv.and *64, *64, *80",
 181
 182         # or-reduce error flags into r96
 183         "sv.mv *80, *64"
 184         "sv.or *81, *80, *81",
 185         # check for any actual error flags set
 186         f"sv.andi. 96, 96, {UTF8FirstTwoBytesError.AllActualErrors}",
 187         "bne fail",
 188
 189         # check for the correct number of continuation bytes for 3/4-byte cases
 190         # set bit 0x80 (TwoContinuations) if input is >= 0xE0
 191         f"sv.subi/satu *80, *46, {0xE0 - 0x80}",
 192         # xor into error flags
 193         "sv.xor *64, *64, *80",
 194         # set bit 0x80 (TwoContinuations) if input is >= 0xF0
 195         f"sv.subi/satu *80, *45, {0xF0 - 0x80}",
 196         # xor into error flags
 197         "sv.xor *80, *64, *80",
 198         # now bit 0x80 is set in r80-95 if there's an error
 199         # or-reduce into r96
 200         "sv.or *81, *80, *81",
 201         # adjust count/pointer
 202         "add 3, 3, 5",  # increment pointer
 203         "sub 4, 4, 5",  # decrement count
 204         f"sv.andi. 96, 96, {0x80}",  # check if any errors
 205         "beq loop",  # if no errors loop, else fail
 206         "fail:",
 207         "li 3, 0",
 208         "blr",
 209         "final_check:",
 210         # check if prev input is incomplete
 211         # check if byte 3 bytes from end needed 4 bytes
 212         f"sv.cmpli 0, 1, 45, {0xF0}",
 213         "bge fail",
 214         # check if byte 2 bytes from end needed 3 bytes
 215         f"sv.cmpli 0, 1, 46, {0xE0}",
 216         "bge fail",
 217         # check if byte 1 bytes from end needed 2 bytes
 218         f"sv.cmpli 0, 1, 47, {0xC0}",
 219         "bge fail",
 220         "li 3, 1",
 221         "blr",
 222     ]
 223
 224
 225 class SVP64UTF8ValidationTestCase(TestAccumulatorBase):
 226     def run_case(self, data):
 227         # type: (bytes) -> None
 228         expected = 1
 229         try:
 230             data.decode("utf-8")
 231         except UnicodeDecodeError:
 232             expected = 0
 233         isa = SVP64Asm(svp64_utf8_validation_asm())
 234         lst = list(isa)
 235         initial_regs = [0x15cee3293aa9bfbe] * 128  # fill with junk
 236         initial_regs[3] = 0x10000  # pointer to bytes to check
 237         initial_regs[4] = len(data)  # length of bytes to check
 238
 239         initial_mem = {}
 240         for i, v in enumerate(data):
 241             initial_mem[i + initial_regs[3]] = v, 1
 242         stop_at_pc = 0x10000000
 243         initial_sprs = {8: SelectableInt(stop_at_pc, 64)}
 244         e = ExpectedState(pc=stop_at_pc)
 245         e.intregs[3] = expected
 246         self.add_case(Program(lst, 0), initial_regs, initial_mem=initial_mem,
 247                       initial_sprs=initial_sprs, stop_at_pc=stop_at_pc,
 248                       expected=e)
 249
 250     def run_cases(self, data):
 251         # type: (bytes | str) -> None
 252         if isinstance(data, str):
 253             data = data.encode("utf-8")
 254         data = b' ' * 8 + data + b' ' * 8
 255         for i in range(len(data)):
 256             part = data[i:]
 257             for j in range(len(part)):
 258                 self.run_case(part[:j])
 259
 260     def case_empty(self):
 261         self.run_case(b"")
 262
 263     def case_nul(self):
 264         self.run_cases("\u0000")  # min 1-byte
 265
 266     def case_a(self):
 267         self.run_cases("a")
 268
 269     def case_7f(self):
 270         self.run_cases("\u007F")  # max 1-byte
 271
 272     def case_c0_80(self):
 273         self.run_cases(b"\xC0\x80")  # min 2-byte overlong encoding
 274
 275     def case_c1_bf(self):
 276         self.run_cases(b"\xC1\xBF")  # max 2-byte overlong encoding
 277
 278     def case_u0080(self):
 279         self.run_cases("\u0080")  # min 2-byte
 280
 281     def case_u07ff(self):
 282         self.run_cases("\u07FF")  # max 2-byte
 283
 284     def case_e0_80_80(self):
 285         self.run_cases(b"\xE0\x80\x80")  # min 3-byte overlong encoding
 286
 287     def case_e0_9f_bf(self):
 288         self.run_cases(b"\xE0\x9F\xBF")  # max 3-byte overlong encoding
 289
 290     def case_u0800(self):
 291         self.run_cases("\u0800")  # min 3-byte
 292
 293     def case_u0fff(self):
 294         self.run_cases("\u0FFF")
 295
 296     def case_u1000(self):
 297         self.run_cases("\u1000")
 298
 299     def case_ucfff(self):
 300         self.run_cases("\uCFFF")
 301
 302     def case_ud000(self):
 303         self.run_cases("\uD000")
 304
 305     def case_ud7ff(self):
 306         self.run_cases("\uD7FF")
 307
 308     def case_ud800(self):
 309         self.run_cases("\uD800")  # surrogate
 310
 311     def case_udbff(self):
 312         self.run_cases("\uDBFF")  # surrogate
 313
 314     def case_udc00(self):
 315         self.run_cases("\uDC00")  # surrogate
 316
 317     def case_udfff(self):
 318         self.run_cases("\uDFFF")  # surrogate
 319
 320     def case_ue000(self):
 321         self.run_cases("\uE000")
 322
 323     def case_uffff(self):
 324         self.run_cases("\uFFFF")  # max 3-byte
 325
 326     def case_f0_80_80_80(self):
 327         self.run_cases(b"\xF0\x80\x80\x80")  # min 4-byte overlong encoding
 328
 329     def case_f0_bf_bf_bf(self):
 330         self.run_cases(b"\xF0\x8F\xBF\xBF")  # max 4-byte overlong encoding
 331
 332     def case_u00010000(self):
 333         self.run_cases("\U00010000")  # min 4-byte
 334
 335     def case_u0003ffff(self):
 336         self.run_cases("\U0003FFFF")
 337
 338     def case_u00040000(self):
 339         self.run_cases("\U00040000")
 340
 341     def case_u000fffff(self):
 342         self.run_cases("\U000FFFFF")
 343
 344     def case_u00100000(self):
 345         self.run_cases("\U00100000")
 346
 347     def case_u0010ffff(self):
 348         self.run_cases("\U0010FFFF")  # max 4-byte
 349
 350     def case_f4_90_80_80(self):
 351         self.run_cases(b"\xF4\x90\x80\x80")  # first too-big encoding
 352
 353     def case_f7_bf_bf_bf(self):
 354         self.run_cases(b"\xF7\xBF\xBF\xBF")  # max too-big 4-byte encoding
 355
 356     def case_f8_x4_80(self):
 357         self.run_cases(b"\xF8" + b"\x80" * 4)  # min too-big 5-byte encoding
 358
 359     def case_fb_x4_bf(self):
 360         self.run_cases(b"\xFB" + b"\xBF" * 4)  # max too-big 5-byte encoding
 361
 362     def case_fc_x5_80(self):
 363         self.run_cases(b"\xFC" + b"\x80" * 5)  # min too-big 6-byte encoding
 364
 365     def case_fd_x5_bf(self):
 366         self.run_cases(b"\xFD" + b"\xBF" * 5)  # max too-big 6-byte encoding
 367
 368     def case_fe_x6_80(self):
 369         self.run_cases(b"\xFE" + b"\x80" * 6)  # min too-big 7-byte encoding
 370
 371     def case_fe_x6_bf(self):
 372         self.run_cases(b"\xFE" + b"\xBF" * 6)  # max too-big 7-byte encoding
 373
 374     def case_ff_x7_80(self):
 375         self.run_cases(b"\xFF" + b"\x80" * 7)  # min too-big 8-byte encoding
 376
 377     def case_ff_x7_bf(self):
 378         self.run_cases(b"\xFF" + b"\xBF" * 7)  # max too-big 8-byte encoding