os/ossrv/glib/tests/utf8.txt
author sl
Tue, 10 Jun 2014 14:32:02 +0200 (2014-06-10)
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
# This file is derived from 
sl@0
     2
#
sl@0
     3
#    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
sl@0
     4
#    
sl@0
     5
# Which was created by   Markus Kuhn <mkuhn@acm.org> - 2000-09-02 
sl@0
     6
#
sl@0
     7
# Lines beginning with # and blank lines are ignored
sl@0
     8
#
sl@0
     9
# Beyond that, this file consists of a series of test cases. Each test case consists of
sl@0
    10
# 2 or 3 lines:
sl@0
    11
#
sl@0
    12
#  1. A UTF-8 string
sl@0
    13
#  2. A status
sl@0
    14
#      VALID      : The string is a valid UTF-8 representation of valid Unicode
sl@0
    15
#      INCOMPLETE : The string has a partial character at the end
sl@0
    16
#      NOTUNICODE : The string is valid UTF-8, but the characters represented
sl@0
    17
#                   are not valid Unicode (e.g. surrogates, U+FFFE/U+FFFF, or values above U+10FFFF)
sl@0
    18
#      OVERLONG   : The string includes overlong sequences
sl@0
    19
#      MALFORMED  : The string is not valid UTF-8
sl@0
    20
# 3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
sl@0
    21
#    as a series of hex numbers.
sl@0
    22
sl@0
    23
# 1  Some correct UTF-8 text
sl@0
    24
κόσμε
sl@0
    25
VALID
sl@0
    26
03ba 1f79 03c3 03bc 03b5
sl@0
    27
sl@0
    28
# 2.1  First possible sequence of a certain length
sl@0
    29
#
sl@0
    30
# FIXME - handle NULLS?
sl@0
    31
#
sl@0
    32
# [ NULL BYTE ]
sl@0
    33
#VALID
sl@0
    34
#0000
sl@0
    35
sl@0
    36
€
sl@0
    37
VALID
sl@0
    38
0080
sl@0
    39
sl@0
    40
øˆ€€€
sl@0
    41
NOTUNICODE
sl@0
    42
00200000
sl@0
    43
sl@0
    44
ü„€€€€
sl@0
    45
NOTUNICODE
sl@0
    46
04000000
sl@0
    47
sl@0
    48

sl@0
    49
VALID
sl@0
    50
0000007f
sl@0
    51
sl@0
    52
ß¿
sl@0
    53
VALID
sl@0
    54
000007ff
sl@0
    55
sl@0
    56
ï¿¿
sl@0
    57
NOTUNICODE
sl@0
    58
0000ffff
sl@0
    59
sl@0
    60
÷¿¿¿
sl@0
    61
NOTUNICODE
sl@0
    62
001fffff
sl@0
    63
sl@0
    64
û¿¿¿¿
sl@0
    65
NOTUNICODE
sl@0
    66
03ffffff
sl@0
    67
sl@0
    68
ý¿¿¿¿¿
sl@0
    69
NOTUNICODE
sl@0
    70
7fffffff
sl@0
    71
sl@0
    72
# 2.3  Other boundary conditions
sl@0
    73
sl@0
    74
퟿
sl@0
    75
VALID
sl@0
    76
d7ff
sl@0
    77
sl@0
    78
�
sl@0
    79
VALID
sl@0
    80
fffd
sl@0
    81
sl@0
    82
􏿿
sl@0
    83
NOTUNICODE
sl@0
    84
0010ffff
sl@0
    85
sl@0
    86
ô€€
sl@0
    87
NOTUNICODE
sl@0
    88
00110000
sl@0
    89
sl@0
    90
# 3.1  Unexpected continuation bytes
sl@0
    91
sl@0
    92
sl@0
    93
MALFORMED
sl@0
    94
¿
sl@0
    95
MALFORMED
sl@0
    96
€¿
sl@0
    97
MALFORMED
sl@0
    98
€¿€
sl@0
    99
MALFORMED
sl@0
   100
€¿€¿
sl@0
   101
MALFORMED
sl@0
   102
€¿€¿€
sl@0
   103
MALFORMED
sl@0
   104
€¿€¿€¿
sl@0
   105
MALFORMED
sl@0
   106
€¿€¿€¿€
sl@0
   107
MALFORMED
sl@0
   108
€‚ƒ„…†‡ˆ‰Š‹ŒŽ‘’“”•–—˜™š›œžŸ¡¢£¤¥¦§¨©ª«¬­®¯°±²³´µ¶·¸¹º»¼½¾¿
sl@0
   109
MALFORMED
sl@0
   110
sl@0
   111
# 3.2  Lonely start characters
sl@0
   112
sl@0
   113
À Á Â Ã Ä Å Æ Ç È É Ê Ë Ì Í Î Ï Ð Ñ Ò Ó Ô Õ Ö × Ø Ù Ú Û Ü Ý Þ ß 
sl@0
   114
MALFORMED
sl@0
   115
à á â ã ä å æ ç è é ê ë ì í î ï 
sl@0
   116
MALFORMED
sl@0
   117
ð ñ ò ó ô õ ö ÷ 
sl@0
   118
MALFORMED
sl@0
   119
ø ù ú û 
sl@0
   120
MALFORMED
sl@0
   121
ü ý 
sl@0
   122
MALFORMED
sl@0
   123
sl@0
   124
# 3.3  Sequences with last continuation byte missing
sl@0
   125
sl@0
   126
À
sl@0
   127
INCOMPLETE
sl@0
   128
à€
sl@0
   129
INCOMPLETE
sl@0
   130
ð€€
sl@0
   131
INCOMPLETE
sl@0
   132
ø€€€
sl@0
   133
INCOMPLETE
sl@0
   134
ü€€€€
sl@0
   135
INCOMPLETE
sl@0
   136
ß
sl@0
   137
INCOMPLETE
sl@0
   138
ï¿
sl@0
   139
INCOMPLETE
sl@0
   140
÷¿¿
sl@0
   141
INCOMPLETE
sl@0
   142
û¿¿¿
sl@0
   143
INCOMPLETE
sl@0
   144
ý¿¿¿¿
sl@0
   145
INCOMPLETE
sl@0
   146
sl@0
   147
# 3.4  Concatenation of incomplete sequences
sl@0
   148
sl@0
   149
Àà€ð€€ø€€€ü€€€€ßï¿÷¿¿û¿¿¿ý¿¿¿¿
sl@0
   150
MALFORMED
sl@0
   151
sl@0
   152
# 3.5  Impossible bytes
sl@0
   153
sl@0
   154
þ
sl@0
   155
MALFORMED
sl@0
   156
ÿ
sl@0
   157
MALFORMED
sl@0
   158
þþÿÿ
sl@0
   159
MALFORMED
sl@0
   160
sl@0
   161
#  Examples of an overlong ASCII character
sl@0
   162
sl@0
   163
À¯
sl@0
   164
OVERLONG
sl@0
   165
à€¯
sl@0
   166
OVERLONG
sl@0
   167
ð€€¯
sl@0
   168
OVERLONG
sl@0
   169
ø€€€¯
sl@0
   170
OVERLONG
sl@0
   171
ü€€€€¯
sl@0
   172
OVERLONG
sl@0
   173
sl@0
   174
#  Maximum overlong sequences
sl@0
   175
sl@0
   176
Á¿
sl@0
   177
OVERLONG
sl@0
   178
àŸ¿
sl@0
   179
OVERLONG
sl@0
   180
ð¿¿
sl@0
   181
OVERLONG
sl@0
   182
ø‡¿¿¿
sl@0
   183
OVERLONG
sl@0
   184
üƒ¿¿¿¿
sl@0
   185
OVERLONG
sl@0
   186
sl@0
   187
# Overlong representation of the NUL character
sl@0
   188
sl@0
   189
À€
sl@0
   190
OVERLONG
sl@0
   191
à€€
sl@0
   192
OVERLONG
sl@0
   193
ð€€€
sl@0
   194
OVERLONG
sl@0
   195
ø€€€€
sl@0
   196
OVERLONG
sl@0
   197
ü€€€€€
sl@0
   198
OVERLONG
sl@0
   199
sl@0
   200
# Illegal code positions
sl@0
   201
sl@0
   202
# Single UTF-16 surrogates
sl@0
   203
sl@0
   204
í €
sl@0
   205
NOTUNICODE
sl@0
   206
d800
sl@0
   207
sl@0
   208
í­¿
sl@0
   209
NOTUNICODE
sl@0
   210
db7f
sl@0
   211
sl@0
   212
í®€
sl@0
   213
NOTUNICODE
sl@0
   214
db80
sl@0
   215
sl@0
   216
í¯¿
sl@0
   217
NOTUNICODE
sl@0
   218
dbff
sl@0
   219
sl@0
   220
í°€
sl@0
   221
NOTUNICODE
sl@0
   222
dc00
sl@0
   223
sl@0
   224
í¾€
sl@0
   225
NOTUNICODE
sl@0
   226
df80
sl@0
   227
sl@0
   228
í¿¿
sl@0
   229
NOTUNICODE
sl@0
   230
dfff
sl@0
   231
sl@0
   232
# Paired UTF-16 surrogates
sl@0
   233
sl@0
   234
𐀀
sl@0
   235
NOTUNICODE
sl@0
   236
d800 dc00
sl@0
   237
sl@0
   238
𐏿
sl@0
   239
NOTUNICODE
sl@0
   240
d800 dfff
sl@0
   241
sl@0
   242
í­¿í°€
sl@0
   243
NOTUNICODE
sl@0
   244
db7f dc00
sl@0
   245
sl@0
   246
í­¿í¿¿
sl@0
   247
NOTUNICODE
sl@0
   248
db7f dfff
sl@0
   249
sl@0
   250
󰀀
sl@0
   251
NOTUNICODE
sl@0
   252
db80 dc00
sl@0
   253
sl@0
   254
󰏿
sl@0
   255
NOTUNICODE
sl@0
   256
db80 dfff
sl@0
   257
sl@0
   258
􏰀
sl@0
   259
NOTUNICODE
sl@0
   260
dbff dc00
sl@0
   261
sl@0
   262
􏿿
sl@0
   263
NOTUNICODE
sl@0
   264
dbff dfff
sl@0
   265
sl@0
   266
# Other illegal code positions
sl@0
   267
sl@0
   268
￾
sl@0
   269
NOTUNICODE
sl@0
   270
fffe
sl@0
   271
sl@0
   272
ï¿¿
sl@0
   273
NOTUNICODE
sl@0
   274
ffff
sl@0
   275
sl@0
   276
################
sl@0
   277
#
sl@0
   278
# Some more tests, not from Markus Kuhn's file
sl@0
   279
#
sl@0
   280
sl@0
   281
# Mixed plane 0 and higher planes
sl@0
   282