os/ossrv/glib/tests/utf8.txt
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 # This file is derived from 
     2 #
     3 #    http://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt
     4 #    
     5 # Which was created by   Markus Kuhn <mkuhn@acm.org> - 2000-09-02 
     6 #
     7 # Lines beginning with # and blank lines are ignored.
     8 #
     9 # Beyond that, this file consists of a series of test cases. Each test case consists of
    10 # 2 or 3 lines:
    11 #
    12 #  1. A UTF-8 string
    13 #  2. A status
    14 #      VALID      : The string is a valid UTF-8 representation of valid Unicode
    15 #      INCOMPLETE : The string has a partial character at the end
    16 #      NOTUNICODE : The string is valid UTF-8, but the characters represented
    17 #                   are not valid Unicode (e.g. surrogates, noncharacters, or values above 10ffff)
    18 #      OVERLONG   : The string includes overlong sequences
    19 #      MALFORMED  : The string is not valid UTF-8
    20 #  3. If the status is VALID or NOTUNICODE, the UCS-4 representation of the string,
    21 #     as a series of hex numbers.
    22 
    23 # 1  Some correct UTF-8 text
    24 κόσμε
    25 VALID
    26 03ba 1f79 03c3 03bc 03b5
    27 
    28 # 2.1  First possible sequence of a certain length
    29 #
    30 # FIXME - handle NULLS?
    31 #
    32 # [ NULL BYTE ]
    33 #VALID
    34 #0000
    35 
    36 €
    37 VALID
    38 0080
    39 
    40 
    41 NOTUNICODE
    42 00200000
    43 
    44 
    45 NOTUNICODE
    46 04000000
    47 # 2.2  Last possible sequence of a certain length
    48 
    49 VALID
    50 0000007f
    51 
    52 ߿
    53 VALID
    54 000007ff
    55 
    56 ￿
    57 NOTUNICODE
    58 0000ffff
    59 
    60 
    61 NOTUNICODE
    62 001fffff
    63 
    64 
    65 NOTUNICODE
    66 03ffffff
    67 
    68 
    69 NOTUNICODE
    70 7fffffff
    71 
    72 # 2.3  Other boundary conditions
    73 
    74
    75 VALID
    76 d7ff
    77 
    78
    79 VALID
    80 fffd
    81 
    82 􏿿
    83 NOTUNICODE
    84 0010ffff
    85 
    86 
    87 NOTUNICODE
    88 00110000
    89 
    90 # 3.1  Unexpected continuation bytes
    91 
    92 
    93 MALFORMED
    94 
    95 MALFORMED
    96 
    97 MALFORMED
    98 
    99 MALFORMED
   100 
   101 MALFORMED
   102 
   103 MALFORMED
   104 
   105 MALFORMED
   106 
   107 MALFORMED
   108 
   109 MALFORMED
   110 
   111 # 3.2  Lonely start characters
   112 
   113                                 
   114 MALFORMED
   115                 
   116 MALFORMED
   117         
   118 MALFORMED
   119     
   120 MALFORMED
   121   
   122 MALFORMED
   123 
   124 # 3.3  Sequences with last continuation byte missing
   125 
   126 
   127 INCOMPLETE
   128 
   129 INCOMPLETE
   130 
   131 INCOMPLETE
   132 
   133 INCOMPLETE
   134 
   135 INCOMPLETE
   136 
   137 INCOMPLETE
   138 
   139 INCOMPLETE
   140 
   141 INCOMPLETE
   142 
   143 INCOMPLETE
   144 
   145 INCOMPLETE
   146 
   147 # 3.4  Concatenation of incomplete sequences
   148 
   149 
   150 MALFORMED
   151 
   152 # 3.5  Impossible bytes
   153 
   154 
   155 MALFORMED
   156 
   157 MALFORMED
   158 
   159 MALFORMED
   160 
   161 #  Examples of an overlong ASCII character
   162 
   163 
   164 OVERLONG
   165 
   166 OVERLONG
   167 
   168 OVERLONG
   169 
   170 OVERLONG
   171 
   172 OVERLONG
   173 
   174 #  Maximum overlong sequences
   175 
   176 
   177 OVERLONG
   178 
   179 OVERLONG
   180 
   181 OVERLONG
   182 
   183 OVERLONG
   184 
   185 OVERLONG
   186 
   187 # Overlong representation of the NUL character
   188 
   189 
   190 OVERLONG
   191 
   192 OVERLONG
   193 
   194 OVERLONG
   195 
   196 OVERLONG
   197 
   198 OVERLONG
   199 
   200 # Illegal code positions
   201 
   202 # Single UTF-16 surrogates
   203 
   204 
   205 NOTUNICODE
   206 d800
   207 
   208 
   209 NOTUNICODE
   210 db7f
   211 
   212 
   213 NOTUNICODE
   214 db80
   215 
   216 
   217 NOTUNICODE
   218 dbff
   219 
   220 
   221 NOTUNICODE
   222 dc00
   223 
   224 
   225 NOTUNICODE
   226 df80
   227 
   228 
   229 NOTUNICODE
   230 dfff
   231 
   232 # Paired UTF-16 surrogates
   233 
   234 
   235 NOTUNICODE
   236 d800 dc00
   237 
   238 
   239 NOTUNICODE
   240 d800 dfff
   241 
   242 
   243 NOTUNICODE
   244 db7f dc00
   245 
   246 
   247 NOTUNICODE
   248 db7f dfff
   249 
   250 
   251 NOTUNICODE
   252 db80 dc00
   253 
   254 
   255 NOTUNICODE
   256 db80 dfff
   257 
   258 
   259 NOTUNICODE
   260 dbff dc00
   261 
   262 
   263 NOTUNICODE
   264 dbff dfff
   265 
   266 # Other illegal code positions
   267 
   268
   269 NOTUNICODE
   270 fffe
   271 
   272 ￿
   273 NOTUNICODE
   274 ffff
   275 
   276 ################
   277 #
   278 # Some more tests, not from Markus Kuhn's file
   279 #
   280 
   281 # Mixed plane 0 and higher planes
   282