efa5dcfe9e2ef2d0984976346876f13a30b9ca68
[openpower-isa.git] / src / openpower / test / algorithms / svp64_utf_8_validation.py
1 # SPDX-License-Identifier: LGPL-3-or-later
2 # Copyright 2022 Jacob Lifshay
3
4 import enum
5 from openpower.decoder.selectable_int import SelectableInt
6 from openpower.simulator.program import Program
7 from openpower.test.common import TestAccumulatorBase
8 from openpower.test.state import ExpectedState
9 from openpower.sv.trans.svp64 import SVP64Asm
10
11
# Address 64 KiB into memory; `SVP64UTF8ValidationTestCase.run_case` places
# the bytes to validate at this same address.
SVP64_UTF8_VALIDATION_DATA_ADDR = 0x1_0000
13
14
class UTF8FirstTwoBytesError(enum.IntFlag):
    """ Flags for the problems that can be recognised by inspecting only the
    first two bytes of a UTF-8 sequence.
    """

    TooLong = 0x01
    """ a continuation byte directly after an ASCII byte """

    TooShort = 0x02
    """ a leading byte whose next byte is not a continuation byte """

    Overlong2 = 0x04
    """ a 2-byte encoding of a value that is `< 0x80` """

    Surrogate = 0x08
    """ the value is a surrogate, i.e. `0xD800 <= value <= 0xDFFF` """

    Overlong3 = 0x10
    """ a 3-byte encoding of a value that is `< 0x800` """

    Overlong4OrTooLarge = 0x20
    """ one of:
    * a 4-byte encoding of a value that is `< 0x10000`
    * a value `>= 0x140000` whose first continuation byte is `<= 0x8F`

    Every other way for the value to be `> 0x10FFFF` is reported through
    `TooLarge` instead.
    """

    TooLarge = 0x40
    """ the value is `> 0x10FFFF` and the first continuation byte is
    `>= 0x90`

    Every other way for the value to be `> 0x10FFFF` is reported through
    `Overlong4OrTooLarge` instead.
    """

    TwoContinuations = 0x80
    """ two continuation bytes in a row -- this is not an error by itself """

    AllActualErrors = (
        TooLong
        | TooShort
        | Overlong2
        | Surrogate
        | Overlong3
        | Overlong4OrTooLarge
        | TooLarge
    )
57
58
# Look-up tables used to detect errors in the first two bytes of a sequence.
# Each table is indexed by one nibble of one of the two bytes; the error
# flags for a byte pair are the bitwise AND of the entries selected from the
# tables. An entry therefore has to contain every flag whose error pattern
# could still match given only the nibble that selects that entry.

_TLN = UTF8FirstTwoBytesError.TooLong
_TS = UTF8FirstTwoBytesError.TooShort
_O2 = UTF8FirstTwoBytesError.Overlong2
_SG = UTF8FirstTwoBytesError.Surrogate
_O3 = UTF8FirstTwoBytesError.Overlong3
_O4TL = UTF8FirstTwoBytesError.Overlong4OrTooLarge
_TLG = UTF8FirstTwoBytesError.TooLarge
_2C = UTF8FirstTwoBytesError.TwoContinuations

# indexed by the high nibble of the first byte
FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF00
FIRST_BYTE_HIGH_NIBBLE_LUT = (
    [_TLN] * 8  # first byte is 0x0? through 0x7? (ASCII)
    + [_2C] * 4  # first byte is 0x8? through 0xB? (continuation)
    + [
        _TS | _O2,  # first byte is 0xC?
        _TS,  # first byte is 0xD?
        _TS | _SG | _O3,  # first byte is 0xE?
        _TS | _O4TL | _TLG,  # first byte is 0xF?
    ]
)

# indexed by the low nibble of the first byte
FIRST_BYTE_LOW_NIBBLE_LUT_ADDR = 0xFF10
FIRST_BYTE_LOW_NIBBLE_LUT = (
    [
        _TLN | _TS | _O2 | _O3 | _O4TL | _2C,  # first byte is 0x?0
        _TLN | _TS | _O2 | _2C,  # first byte is 0x?1
        _TLN | _TS | _2C,  # first byte is 0x?2
        _TLN | _TS | _2C,  # first byte is 0x?3
        _TLN | _TS | _TLG | _2C,  # first byte is 0x?4
    ]
    + [_TLN | _TS | _O4TL | _TLG | _2C] * 8  # first byte is 0x?5 .. 0x?C
    + [_TLN | _TS | _SG | _O4TL | _TLG | _2C]  # first byte is 0x?D
    + [_TLN | _TS | _O4TL | _TLG | _2C] * 2  # first byte is 0x?E, 0x?F
)

# indexed by the high nibble of the second byte
SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR = 0xFF20
SECOND_BYTE_HIGH_NIBBLE_LUT = (
    [_TS] * 8  # second byte is 0x0? through 0x7?
    + [
        _TLN | _O2 | _O3 | _O4TL | _2C,  # second byte is 0x8?
        _TLN | _O2 | _O3 | _TLG | _2C,  # second byte is 0x9?
        _TLN | _O2 | _SG | _TLG | _2C,  # second byte is 0xA?
        _TLN | _O2 | _SG | _TLG | _2C,  # second byte is 0xB?
    ]
    + [_TS] * 4  # second byte is 0xC? through 0xF?
)
131
132
def svp64_utf8_validation_asm():
    """ Build the SVP64 assembly listing of the UTF-8 validation routine.

    Register usage of the generated code:

    * r3 -- input address on entry; result on exit (1 = valid, 0 = invalid)
    * r4 -- number of input bytes still to be checked
    * r5 -- number of bytes in the current chunk (written by `setvl.`)
    * r6, r7, r8 -- base addresses of `FIRST_BYTE_HIGH_NIBBLE_LUT`,
      `FIRST_BYTE_LOW_NIBBLE_LUT` and `SECOND_BYTE_HIGH_NIBBLE_LUT`
    * r45-r47 -- previous 3 bytes (u64x3)
    * r48-r63 -- current 16 bytes (u64x16)
    * r64-r79 -- error flags (u64x16)
    * r80-r95 -- nibbles to look up / scratch (u64x16)
    * r96 -- or-reduction of the error flags

    The look-up tables must be present in memory at their `*_LUT_ADDR`
    addresses when the code is run.

    Returns:
        list of str, one assembly line (instruction or label) per item.
    """
    # raise NotImplementedError("not finished")
    first_high_addr = FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR
    first_low_addr = FIRST_BYTE_LOW_NIBBLE_LUT_ADDR
    second_high_addr = SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR
    # int() so the mask is always formatted as a plain decimal number,
    # independent of how the enum type formats itself
    actual_errors = int(UTF8FirstTwoBytesError.AllActualErrors)
    return [
        # input addr in r3, input length in r4
        # prev bytes in r45-r47 -- u64x3
        # cur bytes in r48-r63 -- u64x16
        # nibbles to look up in r80-r95 -- u64x16
        # error flags in r64-r79 -- u64x16
        "setvl 0, 0, 3, 0, 1, 1",  # set VL to 3
        # clear the last 3 cur bytes (r61-r63): the top of the loop copies
        # them into the prev bytes, so this makes the prev bytes start out
        # as zeros. Clearing r45-r47 here would be overwritten by that copy.
        "sv.addi *61, 0, 0",
        f"lis 6, {first_high_addr >> 16}",
        f"ori 6, 6, {first_high_addr & 0xFFFF}",
        f"lis 7, {first_low_addr >> 16}",
        f"ori 7, 7, {first_low_addr & 0xFFFF}",
        f"lis 8, {second_high_addr >> 16}",
        f"ori 8, 8, {second_high_addr & 0xFFFF}",
        "loop:",
        "setvl 0, 0, 3, 0, 1, 1",  # set VL to 3
        "sv.ori *45, *61, 0",  # copy prev bytes from end of cur bytes

        # clear cur bytes, so bytes beyond end end up being zeros
        "setvl 0, 0, 16, 0, 1, 1",  # set VL to 16
        "sv.addi *48, 0, 0",  # clear cur bytes
        "setvl. 5, 4, 16, 0, 1, 1",  # set VL to min(16, r4)
        "beq final_check",  # if no bytes left to load, run final check
        # NOTE(review): confirm that element-strided mode with a
        # displacement of 0 steps through consecutive bytes.
        "sv.lbz/els *48, 0(3)",  # load bytes
        "setvl 0, 0, 16, 0, 1, 1",  # set VL to 16
        # now we can operate on 16 byte chunks, branch to `fail` if they don't
        # pass validation.

        # get high nibbles of input shifted by 1 byte
        f"sv.rldicl *80, *47, {64 - 4}, 4",  # srdi *80, *47, 4
        # look-up nibbles in first-byte-high table, writing to error flags
        "sv.lbzx *64, 6, *80",

        # get low nibbles of input shifted by 1 byte
        f"sv.andi. *80, *47, {0xF}",  # there is no andi without Rc
        # look-up nibbles in first-byte-low table (base address in r7)
        "sv.lbzx *80, 7, *80",
        # bitwise and into error flags
        "sv.and *64, *64, *80",

        # get high nibbles of input
        f"sv.rldicl *80, *48, {64 - 4}, 4",  # srdi *80, *48, 4
        # look-up nibbles in second-byte-high table (base address in r8)
        "sv.lbzx *80, 8, *80",
        # bitwise and into error flags
        "sv.and *64, *64, *80",

        # or-reduce error flags into r96
        # r96 is the last element the scan below ORs into, so it has to
        # start out as zero
        "sv.addi 96, 0, 0",
        "sv.mv *80, *64",
        "sv.or *81, *80, *81",
        # check for any actual error flags set
        f"sv.andi. 96, 96, {actual_errors}",
        "bne fail",

        # check for the correct number of continuation bytes for 3/4-byte cases
        # set bit 0x80 (TwoContinuations) if input is >= 0xE0
        f"sv.subi/satu *80, *46, {0xE0 - 0x80}",
        # xor into error flags
        "sv.xor *64, *64, *80",
        # set bit 0x80 (TwoContinuations) if input is >= 0xF0
        f"sv.subi/satu *80, *45, {0xF0 - 0x80}",
        # xor into error flags
        "sv.xor *80, *64, *80",
        # now bit 0x80 is set in r80-95 if there's an error
        # or-reduce into r96 (r96 is zero here: the `sv.andi.` above left
        # it at zero, otherwise `bne fail` would have been taken)
        "sv.or *81, *80, *81",
        # adjust count/pointer
        "add 3, 3, 5",  # increment pointer
        "sub 4, 4, 5",  # decrement count
        f"sv.andi. 96, 96, {0x80}",  # check if any errors
        "beq loop",  # if no errors loop, else fail
        "fail:",
        "li 3, 0",
        "blr",
        "final_check:",
        # check if prev input is incomplete
        # check if byte 3 bytes from end needed 4 bytes
        f"sv.cmpli 0, 1, 45, {0xF0}",
        "bge fail",
        # check if byte 2 bytes from end needed 3 bytes
        f"sv.cmpli 0, 1, 46, {0xE0}",
        "bge fail",
        # check if byte 1 bytes from end needed 2 bytes
        f"sv.cmpli 0, 1, 47, {0xC0}",
        "bge fail",
        "li 3, 1",
        "blr",
    ]
223
224
class SVP64UTF8ValidationTestCase(TestAccumulatorBase):
    """ Test cases that run the SVP64 UTF-8 validation routine on byte
    strings and compare its result in r3 against Python's own strict UTF-8
    decoder.
    """

    def run_case(self, data):
        # type: (bytes) -> None
        """ Add one test case that validates exactly the bytes in `data`.

        The expected result is 1 when `data.decode("utf-8")` succeeds and 0
        when it raises `UnicodeDecodeError`.
        """
        try:
            data.decode("utf-8")
        except UnicodeDecodeError:
            expected = 0
        else:
            expected = 1
        isa = SVP64Asm(svp64_utf8_validation_asm())
        lst = list(isa)
        initial_regs = [0x15cee3293aa9bfbe] * 128  # fill with junk
        # pointer to bytes to check
        initial_regs[3] = SVP64_UTF8_VALIDATION_DATA_ADDR
        initial_regs[4] = len(data)  # length of bytes to check

        initial_mem = {}
        # The assembly reads the look-up tables from memory, so they have to
        # be present at the addresses it loads into r6-r8. Entries use the
        # same `(value, 1)` form as the input bytes below -- presumably
        # (value, width in bytes); confirm against the test harness.
        luts = (
            (FIRST_BYTE_HIGH_NIBBLE_LUT_ADDR, FIRST_BYTE_HIGH_NIBBLE_LUT),
            (FIRST_BYTE_LOW_NIBBLE_LUT_ADDR, FIRST_BYTE_LOW_NIBBLE_LUT),
            (SECOND_BYTE_HIGH_NIBBLE_LUT_ADDR, SECOND_BYTE_HIGH_NIBBLE_LUT),
        )
        for base, lut in luts:
            for i, v in enumerate(lut):
                initial_mem[base + i] = int(v), 1
        for i, v in enumerate(data):
            initial_mem[i + initial_regs[3]] = v, 1
        stop_at_pc = 0x10000000
        # SPR 8 holds the address the routine's final `blr` jumps to
        initial_sprs = {8: SelectableInt(stop_at_pc, 64)}
        e = ExpectedState(pc=stop_at_pc)
        e.intregs[3] = expected
        self.add_case(Program(lst, 0), initial_regs, initial_mem=initial_mem,
                      initial_sprs=initial_sprs, stop_at_pc=stop_at_pc,
                      expected=e)

    def run_cases(self, data):
        # type: (bytes | str) -> None
        """ Add a test case for every distinct contiguous slice of `data`
        padded with 8 spaces on each side.

        `str` input is encoded as UTF-8 first. Slicing the padded input
        exercises the sequence at different offsets and truncated at every
        possible point.
        """
        if isinstance(data, str):
            data = data.encode("utf-8")
        data = b' ' * 8 + data + b' ' * 8
        # Many slices are identical (e.g. runs of padding spaces); every
        # case is always loaded at the same address, so running a duplicate
        # adds nothing.
        seen = set()
        for start in range(len(data)):
            # `len(data) + 1` so slices that end at the very last byte
            # (including the full padded input) are covered too
            for end in range(start, len(data) + 1):
                part = data[start:end]
                if part in seen:
                    continue
                seen.add(part)
                self.run_case(part)

    def case_empty(self):
        self.run_case(b"")

    def case_nul(self):
        self.run_cases("\u0000")  # min 1-byte

    def case_a(self):
        self.run_cases("a")

    def case_7f(self):
        self.run_cases("\u007F")  # max 1-byte

    def case_c0_80(self):
        self.run_cases(b"\xC0\x80")  # min 2-byte overlong encoding

    def case_c1_bf(self):
        self.run_cases(b"\xC1\xBF")  # max 2-byte overlong encoding

    def case_u0080(self):
        self.run_cases("\u0080")  # min 2-byte

    def case_u07ff(self):
        self.run_cases("\u07FF")  # max 2-byte

    def case_e0_80_80(self):
        self.run_cases(b"\xE0\x80\x80")  # min 3-byte overlong encoding

    def case_e0_9f_bf(self):
        self.run_cases(b"\xE0\x9F\xBF")  # max 3-byte overlong encoding

    def case_u0800(self):
        self.run_cases("\u0800")  # min 3-byte

    def case_u0fff(self):
        self.run_cases("\u0FFF")

    def case_u1000(self):
        self.run_cases("\u1000")

    def case_ucfff(self):
        self.run_cases("\uCFFF")

    def case_ud000(self):
        self.run_cases("\uD000")

    def case_ud7ff(self):
        self.run_cases("\uD7FF")

    def case_ud800(self):
        self.run_cases("\uD800")  # surrogate

    def case_udbff(self):
        self.run_cases("\uDBFF")  # surrogate

    def case_udc00(self):
        self.run_cases("\uDC00")  # surrogate

    def case_udfff(self):
        self.run_cases("\uDFFF")  # surrogate

    def case_ue000(self):
        self.run_cases("\uE000")

    def case_uffff(self):
        self.run_cases("\uFFFF")  # max 3-byte

    def case_f0_80_80_80(self):
        self.run_cases(b"\xF0\x80\x80\x80")  # min 4-byte overlong encoding

    def case_f0_bf_bf_bf(self):
        # NOTE: despite the method name, the bytes are F0 8F BF BF, which is
        # the largest 4-byte overlong encoding; the name is kept so the test
        # id stays stable.
        self.run_cases(b"\xF0\x8F\xBF\xBF")  # max 4-byte overlong encoding

    def case_u00010000(self):
        self.run_cases("\U00010000")  # min 4-byte

    def case_u0003ffff(self):
        self.run_cases("\U0003FFFF")

    def case_u00040000(self):
        self.run_cases("\U00040000")

    def case_u000fffff(self):
        self.run_cases("\U000FFFFF")

    def case_u00100000(self):
        self.run_cases("\U00100000")

    def case_u0010ffff(self):
        self.run_cases("\U0010FFFF")  # max 4-byte

    def case_f4_90_80_80(self):
        self.run_cases(b"\xF4\x90\x80\x80")  # first too-big encoding

    def case_f7_bf_bf_bf(self):
        self.run_cases(b"\xF7\xBF\xBF\xBF")  # max too-big 4-byte encoding

    def case_f8_x4_80(self):
        self.run_cases(b"\xF8" + b"\x80" * 4)  # min too-big 5-byte encoding

    def case_fb_x4_bf(self):
        self.run_cases(b"\xFB" + b"\xBF" * 4)  # max too-big 5-byte encoding

    def case_fc_x5_80(self):
        self.run_cases(b"\xFC" + b"\x80" * 5)  # min too-big 6-byte encoding

    def case_fd_x5_bf(self):
        self.run_cases(b"\xFD" + b"\xBF" * 5)  # max too-big 6-byte encoding

    def case_fe_x6_80(self):
        self.run_cases(b"\xFE" + b"\x80" * 6)  # min too-big 7-byte encoding

    def case_fe_x6_bf(self):
        self.run_cases(b"\xFE" + b"\xBF" * 6)  # max too-big 7-byte encoding

    def case_ff_x7_80(self):
        self.run_cases(b"\xFF" + b"\x80" * 7)  # min too-big 8-byte encoding

    def case_ff_x7_bf(self):
        self.run_cases(b"\xFF" + b"\xBF" * 7)  # max too-big 8-byte encoding