sl@0
|
1 |
# This file is derived from
|
sl@0
|
2 |
#
|
sl@0
|
3 |
# http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
|
sl@0
|
4 |
#
|
sl@0
|
5 |
# Which was created by Markus Kuhn <mkuhn@acm.org> - 2000-09-02
|
sl@0
|
6 |
#
|
sl@0
|
7 |
# Lines beginning with # and blank lines are ignored
|
sl@0
|
8 |
#
|
sl@0
|
9 |
# Beyond that, this file consists of a series of test cases. Each test case consists of
|
sl@0
|
10 |
# 2 or 3 lines:
|
sl@0
|
11 |
#
|
sl@0
|
12 |
# 1. A UTF-8 string
|
sl@0
|
13 |
# 2. A status
|
sl@0
|
14 |
# VALID : The string is a valid UTF-8 representation of valid Unicode
|
sl@0
|
15 |
# INCOMPLETE : The string has a partial character at the end
|
sl@0
|
16 |
# NOTUNICODE : The string is valid UTF-8, but the characters represented
|
sl@0
|
17 |
# are not valid Unicode (e.g. surrogates, noncharacters, or values above U+10FFFF)
|
sl@0
|
18 |
# OVERLONG : The string includes overlong sequences
|
sl@0
|
19 |
# MALFORMED : The string is not valid UTF-8
|
sl@0
|
20 |
# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
|
sl@0
|
21 |
# as a series of hex numbers.
|
sl@0
|
22 |
|
sl@0
|
23 |
# 1 Some correct UTF-8 text
|
sl@0
|
24 |
κόσμε
|
sl@0
|
25 |
VALID
|
sl@0
|
26 |
03ba 1f79 03c3 03bc 03b5
|
sl@0
|
27 |
|
sl@0
|
28 |
# 2.1 First possible sequence of a certain length
|
sl@0
|
29 |
#
|
sl@0
|
30 |
# FIXME - handle NULLS?
|
sl@0
|
31 |
#
|
sl@0
|
32 |
# [ NULL BYTE ]
|
sl@0
|
33 |
#VALID
|
sl@0
|
34 |
#0000
|
sl@0
|
35 |
|
sl@0
|
36 |
€
|
sl@0
|
37 |
VALID
|
sl@0
|
38 |
0080
|
sl@0
|
39 |
|
sl@0
|
40 |
øˆ€€€
|
sl@0
|
41 |
NOTUNICODE
|
sl@0
|
42 |
00200000
|
sl@0
|
43 |
|
sl@0
|
44 |
ü„€€€€
|
sl@0
|
45 |
NOTUNICODE
|
sl@0
|
46 |
04000000
|
sl@0
|
47 |
|
sl@0
|
48 |
|
sl@0
|
49 |
VALID
|
sl@0
|
50 |
0000007f
|
sl@0
|
51 |
|
sl@0
|
52 |
ß¿
|
sl@0
|
53 |
VALID
|
sl@0
|
54 |
000007ff
|
sl@0
|
55 |
|
sl@0
|
56 |
ï¿¿
|
sl@0
|
57 |
NOTUNICODE
|
sl@0
|
58 |
0000ffff
|
sl@0
|
59 |
|
sl@0
|
60 |
÷¿¿¿
|
sl@0
|
61 |
NOTUNICODE
|
sl@0
|
62 |
001fffff
|
sl@0
|
63 |
|
sl@0
|
64 |
û¿¿¿¿
|
sl@0
|
65 |
NOTUNICODE
|
sl@0
|
66 |
03ffffff
|
sl@0
|
67 |
|
sl@0
|
68 |
ý¿¿¿¿¿
|
sl@0
|
69 |
NOTUNICODE
|
sl@0
|
70 |
7fffffff
|
sl@0
|
71 |
|
sl@0
|
72 |
# 2.3 Other boundary conditions
|
sl@0
|
73 |
|
sl@0
|
74 |
퟿
|
sl@0
|
75 |
VALID
|
sl@0
|
76 |
d7ff
|
sl@0
|
77 |
|
sl@0
|
78 |
�
|
sl@0
|
79 |
VALID
|
sl@0
|
80 |
fffd
|
sl@0
|
81 |
|
sl@0
|
82 |
ô¿¿
|
sl@0
|
83 |
NOTUNICODE
|
sl@0
|
84 |
0010ffff
|
sl@0
|
85 |
|
sl@0
|
86 |
ô€€
|
sl@0
|
87 |
NOTUNICODE
|
sl@0
|
88 |
00110000
|
sl@0
|
89 |
|
sl@0
|
90 |
# 3.1 Unexpected continuation bytes
|
sl@0
|
91 |
|
sl@0
|
92 |
€
|
sl@0
|
93 |
MALFORMED
|
sl@0
|
94 |
¿
|
sl@0
|
95 |
MALFORMED
|
sl@0
|
96 |
€¿
|
sl@0
|
97 |
MALFORMED
|
sl@0
|
98 |
€¿€
|
sl@0
|
99 |
MALFORMED
|
sl@0
|
100 |
€¿€¿
|
sl@0
|
101 |
MALFORMED
|
sl@0
|
102 |
€¿€¿€
|
sl@0
|
103 |
MALFORMED
|
sl@0
|
104 |
€¿€¿€¿
|
sl@0
|
105 |
MALFORMED
|
sl@0
|
106 |
€¿€¿€¿€
|
sl@0
|
107 |
MALFORMED
|
sl@0
|
108 |
€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬®¯°±²³´µ¶·¸¹º»¼½¾¿
|
sl@0
|
109 |
MALFORMED
|
sl@0
|
110 |
|
sl@0
|
111 |
# 3.2 Lonely start characters
|
sl@0
|
112 |
|
sl@0
|
113 |
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß
|
sl@0
|
114 |
MALFORMED
|
sl@0
|
115 |
à á â ã ä å æ ç è é ê ë ì í î ï
|
sl@0
|
116 |
MALFORMED
|
sl@0
|
117 |
ð ñ ò ó ô õ ö ÷
|
sl@0
|
118 |
MALFORMED
|
sl@0
|
119 |
ø ù ú û
|
sl@0
|
120 |
MALFORMED
|
sl@0
|
121 |
ü ý
|
sl@0
|
122 |
MALFORMED
|
sl@0
|
123 |
|
sl@0
|
124 |
# 3.3 Sequences with last continuation byte missing
|
sl@0
|
125 |
|
sl@0
|
126 |
À
|
sl@0
|
127 |
INCOMPLETE
|
sl@0
|
128 |
à€
|
sl@0
|
129 |
INCOMPLETE
|
sl@0
|
130 |
ð€€
|
sl@0
|
131 |
INCOMPLETE
|
sl@0
|
132 |
ø€€€
|
sl@0
|
133 |
INCOMPLETE
|
sl@0
|
134 |
ü€€€€
|
sl@0
|
135 |
INCOMPLETE
|
sl@0
|
136 |
ß
|
sl@0
|
137 |
INCOMPLETE
|
sl@0
|
138 |
ï¿
|
sl@0
|
139 |
INCOMPLETE
|
sl@0
|
140 |
÷¿¿
|
sl@0
|
141 |
INCOMPLETE
|
sl@0
|
142 |
û¿¿¿
|
sl@0
|
143 |
INCOMPLETE
|
sl@0
|
144 |
ý¿¿¿¿
|
sl@0
|
145 |
INCOMPLETE
|
sl@0
|
146 |
|
sl@0
|
147 |
# 3.4 Concatenation of incomplete sequences
|
sl@0
|
148 |
|
sl@0
|
149 |
Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿
|
sl@0
|
150 |
MALFORMED
|
sl@0
|
151 |
|
sl@0
|
152 |
# 3.5 Impossible bytes
|
sl@0
|
153 |
|
sl@0
|
154 |
þ
|
sl@0
|
155 |
MALFORMED
|
sl@0
|
156 |
ÿ
|
sl@0
|
157 |
MALFORMED
|
sl@0
|
158 |
þþÿÿ
|
sl@0
|
159 |
MALFORMED
|
sl@0
|
160 |
|
sl@0
|
161 |
# Examples of an overlong ASCII character
|
sl@0
|
162 |
|
sl@0
|
163 |
À¯
|
sl@0
|
164 |
OVERLONG
|
sl@0
|
165 |
à€¯
|
sl@0
|
166 |
OVERLONG
|
sl@0
|
167 |
ð€€¯
|
sl@0
|
168 |
OVERLONG
|
sl@0
|
169 |
ø€€€¯
|
sl@0
|
170 |
OVERLONG
|
sl@0
|
171 |
ü€€€€¯
|
sl@0
|
172 |
OVERLONG
|
sl@0
|
173 |
|
sl@0
|
174 |
# Maximum overlong sequences
|
sl@0
|
175 |
|
sl@0
|
176 |
Á¿
|
sl@0
|
177 |
OVERLONG
|
sl@0
|
178 |
àŸ¿
|
sl@0
|
179 |
OVERLONG
|
sl@0
|
180 |
ð¿¿
|
sl@0
|
181 |
OVERLONG
|
sl@0
|
182 |
ø‡¿¿¿
|
sl@0
|
183 |
OVERLONG
|
sl@0
|
184 |
üƒ¿¿¿¿
|
sl@0
|
185 |
OVERLONG
|
sl@0
|
186 |
|
sl@0
|
187 |
# Overlong representation of the NUL character
|
sl@0
|
188 |
|
sl@0
|
189 |
À€
|
sl@0
|
190 |
OVERLONG
|
sl@0
|
191 |
à€€
|
sl@0
|
192 |
OVERLONG
|
sl@0
|
193 |
ð€€€
|
sl@0
|
194 |
OVERLONG
|
sl@0
|
195 |
ø€€€€
|
sl@0
|
196 |
OVERLONG
|
sl@0
|
197 |
ü€€€€€
|
sl@0
|
198 |
OVERLONG
|
sl@0
|
199 |
|
sl@0
|
200 |
# Illegal code positions
|
sl@0
|
201 |
|
sl@0
|
202 |
# Single UTF-16 surrogates
|
sl@0
|
203 |
|
sl@0
|
204 |
í €
|
sl@0
|
205 |
NOTUNICODE
|
sl@0
|
206 |
d800
|
sl@0
|
207 |
|
sl@0
|
208 |
í¿
|
sl@0
|
209 |
NOTUNICODE
|
sl@0
|
210 |
db7f
|
sl@0
|
211 |
|
sl@0
|
212 |
í®€
|
sl@0
|
213 |
NOTUNICODE
|
sl@0
|
214 |
db80
|
sl@0
|
215 |
|
sl@0
|
216 |
í¯¿
|
sl@0
|
217 |
NOTUNICODE
|
sl@0
|
218 |
dbff
|
sl@0
|
219 |
|
sl@0
|
220 |
í°€
|
sl@0
|
221 |
NOTUNICODE
|
sl@0
|
222 |
dc00
|
sl@0
|
223 |
|
sl@0
|
224 |
í¾€
|
sl@0
|
225 |
NOTUNICODE
|
sl@0
|
226 |
df80
|
sl@0
|
227 |
|
sl@0
|
228 |
í¿¿
|
sl@0
|
229 |
NOTUNICODE
|
sl@0
|
230 |
dfff
|
sl@0
|
231 |
|
sl@0
|
232 |
# Paired UTF-16 surrogates
|
sl@0
|
233 |
|
sl@0
|
234 |
í €í°€
|
sl@0
|
235 |
NOTUNICODE
|
sl@0
|
236 |
d800 dc00
|
sl@0
|
237 |
|
sl@0
|
238 |
í €í¿¿
|
sl@0
|
239 |
NOTUNICODE
|
sl@0
|
240 |
d800 dfff
|
sl@0
|
241 |
|
sl@0
|
242 |
í¿í°€
|
sl@0
|
243 |
NOTUNICODE
|
sl@0
|
244 |
db7f dc00
|
sl@0
|
245 |
|
sl@0
|
246 |
í¿í¿¿
|
sl@0
|
247 |
NOTUNICODE
|
sl@0
|
248 |
db7f dfff
|
sl@0
|
249 |
|
sl@0
|
250 |
󰀀
|
sl@0
|
251 |
NOTUNICODE
|
sl@0
|
252 |
db80 dc00
|
sl@0
|
253 |
|
sl@0
|
254 |
󰏿
|
sl@0
|
255 |
NOTUNICODE
|
sl@0
|
256 |
db80 dfff
|
sl@0
|
257 |
|
sl@0
|
258 |
􏰀
|
sl@0
|
259 |
NOTUNICODE
|
sl@0
|
260 |
dbff dc00
|
sl@0
|
261 |
|
sl@0
|
262 |
􏿿
|
sl@0
|
263 |
NOTUNICODE
|
sl@0
|
264 |
dbff dfff
|
sl@0
|
265 |
|
sl@0
|
266 |
# Other illegal code positions
|
sl@0
|
267 |
|
sl@0
|
268 |
￾
|
sl@0
|
269 |
NOTUNICODE
|
sl@0
|
270 |
fffe
|
sl@0
|
271 |
|
sl@0
|
272 |
ï¿¿
|
sl@0
|
273 |
NOTUNICODE
|
sl@0
|
274 |
ffff
|
sl@0
|
275 |
|
sl@0
|
276 |
################
|
sl@0
|
277 |
#
|
sl@0
|
278 |
# Some more tests, not from Markus Kuhn's file
|
sl@0
|
279 |
#
|
sl@0
|
280 |
|
sl@0
|
281 |
# Mixed plane 0 and higher planes
|
sl@0
|
282 |
|