test fewer cases of UTF-8 validation, to avoid taking forever
[openpower-isa.git] / src / openpower / test / algorithms / svp64_utf_8_validation.py
1 # SPDX-License-Identifier: LGPL-3-or-later
2 # Copyright 2022 Jacob Lifshay
3
4 import enum
5 import re
6 from openpower.decoder.selectable_int import SelectableInt
7 from openpower.simulator.program import Program
8 from openpower.test.common import TestAccumulatorBase, skip_case
9 from openpower.test.state import ExpectedState
10 from openpower.sv.trans.svp64 import SVP64Asm
11 from cached_property import cached_property
12
13
# Base address of the UTF-8 input data in the simulator's memory.
# NOTE(review): presumably the address the test case places the bytes under
# test at (it points r3 at this same value, 0x10000) -- confirm.
SVP64_UTF8_VALIDATION_DATA_ADDR = 0x10000
15
16
class UTF8FirstTwoBytesError(enum.IntFlag):
    """Bit flags classifying what can be decided about a UTF-8 sequence
    from only its first two bytes.

    Bits 0 through 6 are real error conditions (collected in
    `AllActualErrors`); bit 7, `TwoContinuations`, is a marker and not an
    error by itself.
    """

    # ascii byte followed by a continuation byte
    TooLong = 0x01

    # leading byte followed by something other than a continuation byte
    TooShort = 0x02

    # value is `< 0x80` but is encoded using 2 bytes
    Overlong2 = 0x04

    # value is a surrogate (`0xD800 <= value <= 0xDFFF`)
    Surrogate = 0x08

    # value is `< 0x800` but is encoded using 3 bytes
    Overlong3 = 0x10

    # value is either:
    # * `< 0x10000` but is encoded using 4 bytes
    # * or the value is `>= 0x140000` with the first continuation byte
    #   being `<= 0x8F`
    #
    # The rest of the cases where the value is `> 0x10FFFF` are covered by
    # `TooLarge`.
    Overlong4OrTooLarge = 0x20

    # value is `> 0x10FFFF` with the first continuation byte being `>= 0x90`
    #
    # The rest of the cases where the value is `> 0x10FFFF` are covered by
    # `Overlong4OrTooLarge`.
    TooLarge = 0x40

    # not actually an error -- two continuations in a row
    TwoContinuations = 0x80

    # every flag above except `TwoContinuations`
    AllActualErrors = (
        TooLong
        | TooShort
        | Overlong2
        | Surrogate
        | Overlong3
        | Overlong4OrTooLarge
        | TooLarge
    )
59
60
# Look-up tables for checking for errors in the first two bytes of a
# sequence. Each table has 16 entries, indexed by one nibble (0x0-0xF) of
# one of the two bytes, and each entry is the set of
# `UTF8FirstTwoBytesError` flags that are still possible given that nibble.
# The final error flags are generated by looking up the nibbles of the
# first two bytes in the appropriate tables, and bitwise ANDing the results
# together -- so a flag survives only if all three tables allow it.
# To figure out what to put in each entry in the LUTs, look for all cases
# that could match the comment.

# short aliases so the table rows below stay readable
_TLN = UTF8FirstTwoBytesError.TooLong
_TS = UTF8FirstTwoBytesError.TooShort
_O2 = UTF8FirstTwoBytesError.Overlong2
_SG = UTF8FirstTwoBytesError.Surrogate
_O3 = UTF8FirstTwoBytesError.Overlong3
_O4TL = UTF8FirstTwoBytesError.Overlong4OrTooLarge
_TLG = UTF8FirstTwoBytesError.TooLarge
_2C = UTF8FirstTwoBytesError.TwoContinuations

# indexed by the high nibble of the first byte; the `*_ADDR` constants are
# the addresses the tables are expected at in the simulator's memory
FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF00
FIRST_BYTE_HIGH_NIBBLE_LUT = [
    _TLN,  # first 2 bytes are 0x0? 0x??
    _TLN,  # first 2 bytes are 0x1? 0x??
    _TLN,  # first 2 bytes are 0x2? 0x??
    _TLN,  # first 2 bytes are 0x3? 0x??
    _TLN,  # first 2 bytes are 0x4? 0x??
    _TLN,  # first 2 bytes are 0x5? 0x??
    _TLN,  # first 2 bytes are 0x6? 0x??
    _TLN,  # first 2 bytes are 0x7? 0x??
    _2C,  # first 2 bytes are 0x8? 0x??
    _2C,  # first 2 bytes are 0x9? 0x??
    _2C,  # first 2 bytes are 0xA? 0x??
    _2C,  # first 2 bytes are 0xB? 0x??
    _TS | _O2,  # first 2 bytes are 0xC? 0x??
    _TS,  # first 2 bytes are 0xD? 0x??
    _TS | _SG | _O3,  # first 2 bytes are 0xE? 0x??
    _TS | _O4TL | _TLG,  # first 2 bytes are 0xF? 0x??
]
# indexed by the low nibble of the first byte
FIRST_BYTE_LOW_NIBBLE_LUT_ADDR = 0xFF10
FIRST_BYTE_LOW_NIBBLE_LUT = [
    _TLN | _TS | _O2 | _O3 | _O4TL | _2C,  # first 2 bytes are 0x?0 0x??
    _TLN | _TS | _O2 | _2C,  # first 2 bytes are 0x?1 0x??
    _TLN | _TS | _2C,  # first 2 bytes are 0x?2 0x??
    _TLN | _TS | _2C,  # first 2 bytes are 0x?3 0x??
    _TLN | _TS | _TLG | _2C,  # first 2 bytes are 0x?4 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?5 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?6 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?7 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?8 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?9 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?A 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?B 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?C 0x??
    _TLN | _TS | _SG | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?D 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?E 0x??
    _TLN | _TS | _O4TL | _TLG | _2C,  # first 2 bytes are 0x?F 0x??
]
# indexed by the high nibble of the second byte
SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF20
SECOND_BYTE_HIGH_NIBBLE_LUT = [
    _TS,  # first 2 bytes are 0x?? 0x0?
    _TS,  # first 2 bytes are 0x?? 0x1?
    _TS,  # first 2 bytes are 0x?? 0x2?
    _TS,  # first 2 bytes are 0x?? 0x3?
    _TS,  # first 2 bytes are 0x?? 0x4?
    _TS,  # first 2 bytes are 0x?? 0x5?
    _TS,  # first 2 bytes are 0x?? 0x6?
    _TS,  # first 2 bytes are 0x?? 0x7?
    _TLN | _O2 | _O3 | _O4TL | _2C,  # first 2 bytes are 0x?? 0x8?
    _TLN | _O2 | _O3 | _TLG | _2C,  # first 2 bytes are 0x?? 0x9?
    _TLN | _O2 | _SG | _TLG | _2C,  # first 2 bytes are 0x?? 0xA?
    _TLN | _O2 | _SG | _TLG | _2C,  # first 2 bytes are 0x?? 0xB?
    _TS,  # first 2 bytes are 0x?? 0xC?
    _TS,  # first 2 bytes are 0x?? 0xD?
    _TS,  # first 2 bytes are 0x?? 0xE?
    _TS,  # first 2 bytes are 0x?? 0xF?
]
133
134
def svp64_utf8_validation_asm():
    # type: () -> list[str]
    """ Build the SVP64 assembly for a UTF-8 validation routine.

    Returns a list of strings, each one either an assembly instruction or
    a label line (`loop:`, `fail:`, `final_check:`).

    Register usage of the generated code:

    * r3 -- in: address of the input bytes; out: 1 if the input passed
      validation, 0 if it reached `fail`
    * r4 -- in: number of input bytes (decremented as chunks are consumed)
    * r5 -- number of bytes in the current chunk (set by `setvl.`)
    * r6, r7, r8 -- addresses of the first-byte-high-nibble,
      first-byte-low-nibble and second-byte-high-nibble look-up tables
    * r9 -- scalar scratch
    * r45 and up -- vector working registers, see the comments below

    The input is processed `vec_sz` bytes per iteration of `loop`; the last
    `prev_bytes_sz` bytes of each chunk are carried into the next iteration
    so sequences spanning a chunk boundary are still checked.
    """
    vec_sz = 8  # limited by number of CR fields implemented in the simulator
    inp_addr = 3
    # cur bytes in r48-r55 -- u64x8 (vec_sz registers starting at r48)
    cur_bytes = 48
    # prev bytes in r45-r47 -- u64x3
    # they sit directly below cur bytes, so `cur_bytes - k` is the start of
    # a view of the input shifted by k bytes
    prev_bytes_sz = 3
    prev_bytes = cur_bytes - prev_bytes_sz
    # error flags in r56-r63 -- u64x8
    temp_vec1 = cur_bytes + vec_sz
    # nibbles to look up in r64-r71 -- u64x8
    temp_vec2 = cur_bytes + vec_sz * 2
    # r72 -- first register past temp_vec2, receives an or-reduction result
    temp_vec2_end = temp_vec2 + vec_sz

    def sv_set_0x80_if_ge(out_v, inp_v, temp_s, compare_rhs):
        # type: (int, int, int, int) -> list[str]
        """ generate values with bit 0x80 set if the input vector is
        unsigned `>= compare_rhs`, this assumes `0x80 <= compare_rhs <= 0xFF`
        and the input vector elements are in `0 <= v <= 0xFF`.

        can't use CRs for this, since vectors of CRs used as masks currently
        max out at 4 in the simulator.

        `out_v` and `inp_v` are the first registers of the output and input
        vectors, `temp_s` is a scalar register that gets overwritten.
        Only bit 0x80 of the result is meaningful: elements below
        `compare_rhs` come out as 0x7F, the rest as `0x80 + v - compare_rhs`.
        """
        assert 0x80 <= compare_rhs <= 0xFF, \
            "the algorithm only works if compare_rhs is in range"
        # clamp from below to compare_rhs - 1, then shift so that
        # compare_rhs - 1 maps to 0x7F and compare_rhs maps to 0x80
        max_arg = compare_rhs - 1
        add_arg = 0x80 - compare_rhs
        return [
            f"addi {temp_s}, 0, {max_arg}",
            f"sv.maxu *{out_v}, *{inp_v}, {temp_s}",
            f"sv.addi *{out_v}, *{out_v}, {add_arg}"
        ]
    return [
        # input addr in r3, input length in r4
        f"setvl 0, 0, {prev_bytes_sz}, 0, 1, 1",  # set VL to prev_bytes_sz
        # clear what will go into prev bytes
        f"sv.addi *{cur_bytes + vec_sz - prev_bytes_sz}, 0, 0",
        # load the three look-up table addresses into r6, r7, r8
        f"addis 6, 0, {FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR >> 16}",
        f"ori 6, 6, {FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR & 0xFFFF}",
        f"addis 7, 0, {FIRST_BYTE_LOW_NIBBLE_LUT_ADDR >> 16}",
        f"ori 7, 7, {FIRST_BYTE_LOW_NIBBLE_LUT_ADDR & 0xFFFF}",
        f"addis 8, 0, {SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR >> 16}",
        f"ori 8, 8, {SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR & 0xFFFF}",
        f"loop:",
        f"setvl 0, 0, {prev_bytes_sz}, 0, 1, 1",  # set VL to prev_bytes_sz
        # copy prev bytes from end of cur bytes
        f"sv.ori *{prev_bytes}, *{cur_bytes + vec_sz - prev_bytes_sz}, 0",

        # clear cur bytes, so bytes beyond end end up being zeros
        f"setvl 0, 0, {vec_sz}, 0, 1, 1",  # set VL to vec_sz
        f"sv.addi *{cur_bytes}, 0, 0",  # clear cur bytes
        f"setvl. 5, 4, {vec_sz}, 0, 1, 1",  # set VL to min(vec_sz, r4)
        # if no bytes left to load, run final check
        f"bc 12, 2, final_check # beq final_check",
        # sv.lbz/els is buggy, use sv.lbzx instead:
        # cur bytes were just cleared, so adding 1 to the preceding element
        # gives offsets 0, 1, 2, ...
        # NOTE(review): relies on elements being processed in order so each
        # result feeds the next element -- confirm against the SVP64 spec
        f"sv.addi *{cur_bytes + 1}, *{cur_bytes}, 1",  # create indexes
        f"sv.lbzx *{cur_bytes}, {inp_addr}, *{cur_bytes}",  # load bytes
        f"setvl 0, 0, {vec_sz}, 0, 1, 1",  # set VL to vec_sz
        # now we can operate on vec_sz byte chunks, branch to `fail` if they
        # don't pass validation.

        # get high nibbles of input shifted by 1 byte
        (f"sv.rldicl *{temp_vec2}, *{cur_bytes - 1}, {64 - 4}, 4"
         f" # sv.srdi *{temp_vec2}, *{cur_bytes - 1}, 4"),
        # look-up nibbles in table, writing to error flags
        f"sv.lbzx *{temp_vec1}, 6, *{temp_vec2}",

        # get low nibbles of input shifted by 1 byte
        # there is no andi without Rc
        # sv.andi. with scalars is buggy, so use a temporary and sv.and
        f"addi 9, 0, {0xF}",
        f"sv.and *{temp_vec2}, *{cur_bytes - 1}, 9",
        # look-up nibbles in table
        f"sv.lbzx *{temp_vec2}, 7, *{temp_vec2}",
        # bitwise and into error flags
        f"sv.and *{temp_vec1}, *{temp_vec1}, *{temp_vec2}",

        # get high nibbles of input
        # srdi *{temp_vec2}, *{cur_bytes}, 4
        f"sv.rldicl *{temp_vec2}, *{cur_bytes}, {64 - 4}, 4",
        # look-up nibbles in table
        f"sv.lbzx *{temp_vec2}, 8, *{temp_vec2}",
        # bitwise and into error flags
        f"sv.and *{temp_vec1}, *{temp_vec1}, *{temp_vec2}",

        # or-reduce error flags into temp_vec2_end:
        # clear the register just past temp_vec2, copy the flags into
        # temp_vec2, then or each element into the one after it so the last
        # write (to temp_vec2_end) holds the or of all elements
        f"sv.addi {temp_vec2_end}, 0, 0",
        f"sv.ori *{temp_vec2}, *{temp_vec1}, 0",
        f"sv.or *{temp_vec2 + 1}, *{temp_vec2}, *{temp_vec2 + 1}",
        # check for any actual error flags set
        # sv.andi. is buggy, so use sv.and, then compare
        f"addi 9, 0, {UTF8FirstTwoBytesError.AllActualErrors}",
        f"sv.and 9, {temp_vec2_end}, 9",
        f"cmpli 0, 1, 9, 0",
        f"bc 4, 2, fail # bne fail",

        # check for the correct number of continuation bytes for 3/4-byte cases

        # set bit 0x80 (TwoContinuations) if input is >= 0xE0
        # (input here is the byte 2 positions back)
        *sv_set_0x80_if_ge(out_v=temp_vec2, inp_v=cur_bytes - 2,
                           temp_s=9, compare_rhs=0xE0),
        # xor into error flags
        f"sv.xor *{temp_vec1}, *{temp_vec1}, *{temp_vec2}",
        # set bit 0x80 (TwoContinuations) if input is >= 0xF0
        # (input here is the byte 3 positions back)
        *sv_set_0x80_if_ge(out_v=temp_vec2, inp_v=cur_bytes - 3,
                           temp_s=9, compare_rhs=0xF0),
        # xor into error flags
        f"sv.xor *{temp_vec1}, *{temp_vec1}, *{temp_vec2}",
        # now bit 0x80 is set in temp_vec1 if there's an error
        # (the lower bits are garbage at this point and are masked off below)
        # or-reduce into temp_vec2 (its first register is the one just past
        # temp_vec1, same scheme as the or-reduction above)
        f"sv.addi {temp_vec2}, 0, 0",
        f"sv.or *{temp_vec1 + 1}, *{temp_vec1}, *{temp_vec1 + 1}",
        # adjust count/pointer
        f"add 3, 3, 5",  # increment pointer
        f"subf 4, 5, 4",  # decrement count
        # sv.andi. is buggy, so move to r9 first
        f"sv.ori 9, {temp_vec2}, 0",
        f"andi. 9, 9, {0x80}",  # check if any errors
        f"bc 12, 2, loop # beq loop",  # if no errors loop, else fail
        f"fail:",
        # report failure: r3 = 0, then return
        f"addi 3, 0, 0",
        f"bclr 20, 0, 0 # blr",
        f"final_check:",

        # need to set VL to something non-zero otherwise all our scalar
        # instructions don't run --- I definitely don't like that ... scalar
        # instructions should run regardless of VL.
        f"setvl 0, 0, 1, 0, 1, 1",  # set VL to 1

        # check if prev input is incomplete
        # (the prev bytes now hold the last 3 bytes of the final chunk)
        # check if byte 3 bytes from end needed 4 bytes
        f"sv.cmpli 0, 1, {cur_bytes - 3}, {0xF0}",
        f"bc 4, 0, fail # bge fail",
        # check if byte 2 bytes from end needed 3 bytes
        f"sv.cmpli 0, 1, {cur_bytes - 2}, {0xE0}",
        f"bc 4, 0, fail # bge fail",
        # check if byte 1 bytes from end needed 2 bytes
        f"sv.cmpli 0, 1, {cur_bytes - 1}, {0xC0}",
        f"bc 4, 0, fail # bge fail",
        # report success: r3 = 1, then return
        f"addi 3, 0, 1",
        f"bclr 20, 0, 0 # blr",
    ]
277
278
def assemble(instructions, start_pc=0):
    """ Turn a list of assembly lines containing `name:` label lines into a
    `Program`.

    Label lines are removed from the instruction stream and each label is
    instead emitted as a trailing `.set name, . - <distance>` directive,
    where the distance is measured back from the end of the code.
    Instructions starting with `sv.` are counted as 8 bytes, everything else
    as 4 bytes.

    `start_pc` is the address assumed for the first instruction; it only
    affects the absolute address shown in the comment of each `.set` line.

    Raises `ValueError` if the same label is defined more than once.
    """
    label_pattern = re.compile(r" *([a-zA-Z0-9_]+): *(#.*)?")
    svp64_pattern = re.compile(r" *sv\.[a-zA-Z0-9_].*")
    label_addrs = {}
    kept = []
    addr = start_pc
    for line in instructions:
        label = label_pattern.fullmatch(line)
        if label is None:
            # ordinary instruction: keep it and advance by its encoded size
            addr += 8 if svp64_pattern.fullmatch(line) is not None else 4
            kept.append(line)
            continue
        label_name = label.group(1)
        if label_name in label_addrs:
            raise ValueError(f"label {label_name!r} defined multiple times")
        # a label marks the address of whatever instruction comes next
        label_addrs[label_name] = addr
    # `.` at this point is the end of the code, so express each label as an
    # offset back from there
    kept.extend(
        f".set {label_name}, . - 0x{addr - label_addr:X} # 0x{label_addr:X}"
        for label_name, label_addr in label_addrs.items())
    return Program(list(SVP64Asm(kept)), 0)
300
301
class SVP64UTF8ValidationTestCase(TestAccumulatorBase):
    """ Test cases that run the SVP64 UTF-8 validation program on assorted
    byte strings and expect r3 to end up as 1 for valid UTF-8 and 0 for
    invalid UTF-8, using Python's own strict `utf-8` decoder as the oracle.
    """

    def __init__(self):
        # set up before calling the base constructor.
        # NOTE(review): presumably the base constructor already invokes the
        # `case_*` methods, which need this set to exist -- confirm.
        self.__seen_cases = set()
        super().__init__()

    @cached_property
    def program(self):
        """ the assembled validation program, built once and shared by all
        cases.
        """
        return assemble(svp64_utf8_validation_asm())

    def run_case(self, data, src_loc_at=0):
        # type: (bytes, int) -> None
        """ add one test case checking `data`, unless the exact same bytes
        were already added.

        `src_loc_at` is forwarded to `add_case` incremented by one.
        NOTE(review): presumably the number of extra stack frames to skip
        when recording the case's source location -- confirm.
        """
        if data in self.__seen_cases:
            return
        self.__seen_cases.add(data)
        # Python's decoder rejects overlong encodings, surrogates and values
        # above 0x10FFFF, so it serves as the reference result
        try:
            data.decode("utf-8")
        except UnicodeDecodeError:
            expected = 0
        else:
            expected = 1
        initial_regs = [0x15cee3293aa9bfbe] * 128  # fill with junk
        # pointer to bytes to check
        initial_regs[3] = SVP64_UTF8_VALIDATION_DATA_ADDR
        initial_regs[4] = len(data)  # length of bytes to check

        # memory entries are `address: (value, 1)`.
        # NOTE(review): the 1 is presumably the width in bytes -- confirm.
        initial_mem = {}
        for i, v in enumerate(data):
            initial_mem[SVP64_UTF8_VALIDATION_DATA_ADDR + i] = v, 1
        luts = (
            (FIRST_BYTE_LOW_NIBBLE_LUT_ADDR, FIRST_BYTE_LOW_NIBBLE_LUT),
            (FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR, FIRST_BYTE_HIGH_NIBBLE_LUT),
            (SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR, SECOND_BYTE_HIGH_NIBBLE_LUT),
        )
        for lut_addr, lut in luts:
            for i, v in enumerate(lut):
                initial_mem[lut_addr + i] = int(v), 1
        # the program returns with `blr`, so preload SPR 8 with the address
        # the run is told to stop at.
        # NOTE(review): SPR 8 is presumably the link register -- confirm.
        stop_at_pc = 0x10000000
        initial_sprs = {8: SelectableInt(stop_at_pc, 64)}
        # only r0-r3 are checked: r0-r2 must be untouched, r3 is the result
        e = ExpectedState(pc=stop_at_pc, int_regs=4, crregs=0, fp_regs=0,
                          so=None, ov=None, ca=None)
        e.intregs[:3] = initial_regs[:3]
        e.intregs[3] = expected
        with self.subTest(data=data, expected=expected):
            self.add_case(self.program, initial_regs, initial_mem=initial_mem,
                          initial_sprs=initial_sprs, stop_at_pc=stop_at_pc,
                          expected=e,
                          src_loc_at=src_loc_at + 1)

    def run_cases(self, data):
        # type: (bytes | str) -> None
        """ add test cases for every suffix and every proper prefix of
        `data`, after encoding a `str` as UTF-8 and padding the bytes with
        spaces on both sides to at least 8 bytes.

        Slicing at every position also produces inputs cut in the middle of
        a multi-byte sequence.
        """
        if isinstance(data, str):
            data = data.encode("utf-8")
        data = data.center(8, b' ')
        for i in range(len(data)):
            self.run_case(data[i:], src_loc_at=1)
            self.run_case(data[:i], src_loc_at=1)

    def case_empty(self):
        self.run_case(b"")

    def case_x6_sp_nul(self):
        self.run_case(b' ' * 6 + b'\x00')

    def case_nul(self):
        self.run_cases("\u0000")  # min 1-byte

    def case_a(self):
        self.run_cases("a")

    def case_7f(self):
        self.run_cases("\u007F")  # max 1-byte

    def case_c0_80(self):
        self.run_cases(b"\xC0\x80")  # min 2-byte overlong encoding

    def case_c1_bf(self):
        self.run_cases(b"\xC1\xBF")  # max 2-byte overlong encoding

    def case_u0080(self):
        self.run_cases("\u0080")  # min 2-byte

    def case_u07ff(self):
        self.run_cases("\u07FF")  # max 2-byte

    def case_e0_80_80(self):
        self.run_cases(b"\xE0\x80\x80")  # min 3-byte overlong encoding

    def case_e0_9f_bf(self):
        self.run_cases(b"\xE0\x9F\xBF")  # max 3-byte overlong encoding

    def case_u0800(self):
        self.run_cases("\u0800")  # min 3-byte

    def case_u0fff(self):
        self.run_cases("\u0FFF")

    def case_u1000(self):
        self.run_cases("\u1000")

    def case_ucfff(self):
        self.run_cases("\uCFFF")

    def case_ud000(self):
        self.run_cases("\uD000")

    def case_ud7ff(self):
        self.run_cases("\uD7FF")

    def case_ed_a0_80(self):
        self.run_cases(b"\xED\xA0\x80")  # first high surrogate

    def case_ed_af_bf(self):
        self.run_cases(b"\xED\xAF\xBF")  # last high surrogate

    def case_ed_b0_80(self):
        self.run_cases(b"\xED\xB0\x80")  # first low surrogate

    def case_ed_bf_bf(self):
        self.run_cases(b"\xED\xBF\xBF")  # last low surrogate

    def case_ue000(self):
        self.run_cases("\uE000")

    def case_uffff(self):
        self.run_cases("\uFFFF")  # max 3-byte

    def case_f0_80_80_80(self):
        self.run_cases(b"\xF0\x80\x80\x80")  # min 4-byte overlong encoding

    # NOTE(review): the method name says f0_bf_bf_bf but the bytes tested
    # are F0 8F BF BF, which is what the comment describes; the name is kept
    # so the test case's identifier doesn't change.
    def case_f0_bf_bf_bf(self):
        self.run_cases(b"\xF0\x8F\xBF\xBF")  # max 4-byte overlong encoding

    def case_u00010000(self):
        self.run_cases("\U00010000")  # min 4-byte

    def case_u0003ffff(self):
        self.run_cases("\U0003FFFF")

    def case_u00040000(self):
        self.run_cases("\U00040000")

    def case_u000fffff(self):
        self.run_cases("\U000FFFFF")

    def case_u00100000(self):
        self.run_cases("\U00100000")

    def case_u0010ffff(self):
        self.run_cases("\U0010FFFF")  # max 4-byte

    def case_f4_90_80_80(self):
        self.run_cases(b"\xF4\x90\x80\x80")  # first too-big encoding

    def case_f7_bf_bf_bf(self):
        self.run_cases(b"\xF7\xBF\xBF\xBF")  # max too-big 4-byte encoding

    def case_f8_x4_80(self):
        self.run_cases(b"\xF8" + b"\x80" * 4)  # min too-big 5-byte encoding

    def case_fb_x4_bf(self):
        self.run_cases(b"\xFB" + b"\xBF" * 4)  # max too-big 5-byte encoding

    def case_fc_x5_80(self):
        self.run_cases(b"\xFC" + b"\x80" * 5)  # min too-big 6-byte encoding

    def case_fd_x5_bf(self):
        self.run_cases(b"\xFD" + b"\xBF" * 5)  # max too-big 6-byte encoding

    def case_fe_x6_80(self):
        self.run_cases(b"\xFE" + b"\x80" * 6)  # min too-big 7-byte encoding

    def case_fe_x6_bf(self):
        self.run_cases(b"\xFE" + b"\xBF" * 6)  # max too-big 7-byte encoding

    def case_ff_x7_80(self):
        self.run_cases(b"\xFF" + b"\x80" * 7)  # min too-big 8-byte encoding

    def case_ff_x7_bf(self):
        self.run_cases(b"\xFF" + b"\xBF" * 7)  # max too-big 8-byte encoding