os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/utf16.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 *******************************************************************************
     3 *
     4 *   Copyright (C) 1999-2005, International Business Machines
     5 *   Corporation and others.  All Rights Reserved.
     6 *
     7 *******************************************************************************
     8 *   file name:  utf16.h
     9 *   encoding:   US-ASCII
    10 *   tab size:   8 (not used)
    11 *   indentation:4
    12 *
    13 *   created on: 1999sep09
    14 *   created by: Markus W. Scherer
    15 */
    16 
    17 /**
    18  * \file
    19  * \brief C API: 16-bit Unicode handling macros
    20  * 
    21  * This file defines macros to deal with 16-bit Unicode (UTF-16) code units and strings.
    22  * utf16.h is included by utf.h after unicode/umachine.h
    23  * and some common definitions.
    24  *
    25  * For more information see utf.h and the ICU User Guide Strings chapter
    26  * (http://icu.sourceforge.net/userguide/strings.html).
    27  *
    28  * <em>Usage:</em>
    29  * ICU coding guidelines for if() statements should be followed when using these macros.
    30  * Compound statements (curly braces {}) must be used  for if-else-while... 
    31  * bodies and all macro statements should be terminated with semicolon.
    32  */
    33 
    34 #ifndef __UTF16_H__
    35 #define __UTF16_H__
    36 
    37 /* utf.h must be included first. */
    38 #ifndef __UTF_H__
    39 #   include "unicode/utf.h"
    40 #endif
    41 
    42 /* single-code point definitions -------------------------------------------- */
    43 
    44 /**
    45  * Does this code unit alone encode a code point (BMP, not a surrogate)?
    46  * @param c 16-bit code unit
    47  * @return TRUE or FALSE
    48  * @stable ICU 2.4
    49  */
    50 #define U16_IS_SINGLE(c) !U_IS_SURROGATE(c)
    51 
    52 /**
    53  * Is this code unit a lead surrogate (U+d800..U+dbff)?
    54  * @param c 16-bit code unit
    55  * @return TRUE or FALSE
    56  * @stable ICU 2.4
    57  */
    58 #define U16_IS_LEAD(c) (((c)&0xfffffc00)==0xd800)
    59 
    60 /**
    61  * Is this code unit a trail surrogate (U+dc00..U+dfff)?
    62  * @param c 16-bit code unit
    63  * @return TRUE or FALSE
    64  * @stable ICU 2.4
    65  */
    66 #define U16_IS_TRAIL(c) (((c)&0xfffffc00)==0xdc00)
    67 
    68 /**
    69  * Is this code unit a surrogate (U+d800..U+dfff)?
    70  * @param c 16-bit code unit
    71  * @return TRUE or FALSE
    72  * @stable ICU 2.4
    73  */
    74 #define U16_IS_SURROGATE(c) U_IS_SURROGATE(c)
    75 
    76 /**
    77  * Assuming c is a surrogate code point (U16_IS_SURROGATE(c)),
    78  * is it a lead surrogate?
    79  * @param c 16-bit code unit
    80  * @return TRUE or FALSE
    81  * @stable ICU 2.4
    82  */
    83 #define U16_IS_SURROGATE_LEAD(c) (((c)&0x400)==0)
    84 
    85 /**
    86  * Helper constant for U16_GET_SUPPLEMENTARY.
    87  * @internal
    88  */
    89 #define U16_SURROGATE_OFFSET ((0xd800<<10UL)+0xdc00-0x10000)
    90 
    91 /**
    92  * Get a supplementary code point value (U+10000..U+10ffff)
    93  * from its lead and trail surrogates.
    94  * The result is undefined if the input values are not
    95  * lead and trail surrogates.
    96  *
    97  * @param lead lead surrogate (U+d800..U+dbff)
    98  * @param trail trail surrogate (U+dc00..U+dfff)
    99  * @return supplementary code point (U+10000..U+10ffff)
   100  * @stable ICU 2.4
   101  */
   102 #define U16_GET_SUPPLEMENTARY(lead, trail) \
   103     (((UChar32)(lead)<<10UL)+(UChar32)(trail)-U16_SURROGATE_OFFSET)
   104 
   105 
   106 /**
   107  * Get the lead surrogate (0xd800..0xdbff) for a
   108  * supplementary code point (0x10000..0x10ffff).
   109  * @param supplementary 32-bit code point (U+10000..U+10ffff)
   110  * @return lead surrogate (U+d800..U+dbff) for supplementary
   111  * @stable ICU 2.4
   112  */
   113 #define U16_LEAD(supplementary) (UChar)(((supplementary)>>10)+0xd7c0)
   114 
   115 /**
   116  * Get the trail surrogate (0xdc00..0xdfff) for a
   117  * supplementary code point (0x10000..0x10ffff).
   118  * @param supplementary 32-bit code point (U+10000..U+10ffff)
   119  * @return trail surrogate (U+dc00..U+dfff) for supplementary
   120  * @stable ICU 2.4
   121  */
   122 #define U16_TRAIL(supplementary) (UChar)(((supplementary)&0x3ff)|0xdc00)
   123 
   124 /**
   125  * How many 16-bit code units are used to encode this Unicode code point? (1 or 2)
   126  * The result is not defined if c is not a Unicode code point (U+0000..U+10ffff).
   127  * @param c 32-bit code point
   128  * @return 1 or 2
   129  * @stable ICU 2.4
   130  */
   131 #define U16_LENGTH(c) ((uint32_t)(c)<=0xffff ? 1 : 2)
   132 
   133 /**
   134  * The maximum number of 16-bit code units per Unicode code point (U+0000..U+10ffff).
   135  * @return 2
   136  * @stable ICU 2.4
   137  */
/* Upper bound of U16_LENGTH: a code point above U+ffff is written as two units (lead + trail). */
#define U16_MAX_LENGTH 2
   139 
   140 /**
   141  * Get a code point from a string at a random-access offset,
   142  * without changing the offset.
   143  * "Unsafe" macro, assumes well-formed UTF-16.
   144  *
   145  * The offset may point to either the lead or trail surrogate unit
   146  * for a supplementary code point, in which case the macro will read
   147  * the adjacent matching surrogate as well.
   148  * The result is undefined if the offset points to a single, unpaired surrogate.
   149  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
   150  *
   151  * @param s const UChar * string
   152  * @param i string offset
   153  * @param c output UChar32 variable
   154  * @see U16_GET
   155  * @stable ICU 2.4
   156  */
   157 #define U16_GET_UNSAFE(s, i, c) { \
   158     (c)=(s)[i]; \
   159     if(U16_IS_SURROGATE(c)) { \
   160         if(U16_IS_SURROGATE_LEAD(c)) { \
   161             (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)+1]); \
   162         } else { \
   163             (c)=U16_GET_SUPPLEMENTARY((s)[(i)-1], (c)); \
   164         } \
   165     } \
   166 }
   167 
   168 /**
   169  * Get a code point from a string at a random-access offset,
   170  * without changing the offset.
   171  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   172  *
   173  * The offset may point to either the lead or trail surrogate unit
   174  * for a supplementary code point, in which case the macro will read
   175  * the adjacent matching surrogate as well.
   176  * If the offset points to a single, unpaired surrogate, then that itself
   177  * will be returned as the code point.
   178  * Iteration through a string is more efficient with U16_NEXT_UNSAFE or U16_NEXT.
   179  *
   180  * @param s const UChar * string
   181  * @param start starting string offset (usually 0)
   182  * @param i string offset, start<=i<length
   183  * @param length string length
   184  * @param c output UChar32 variable
   185  * @see U16_GET_UNSAFE
   186  * @stable ICU 2.4
   187  */
   188 #define U16_GET(s, start, i, length, c) { \
   189     (c)=(s)[i]; \
   190     if(U16_IS_SURROGATE(c)) { \
   191         uint16_t __c2; \
   192         if(U16_IS_SURROGATE_LEAD(c)) { \
   193             if((i)+1<(length) && U16_IS_TRAIL(__c2=(s)[(i)+1])) { \
   194                 (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
   195             } \
   196         } else { \
   197             if((i)-1>=(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
   198                 (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
   199             } \
   200         } \
   201     } \
   202 }
   203 
   204 /* definitions with forward iteration --------------------------------------- */
   205 
   206 /**
   207  * Get a code point from a string at a code point boundary offset,
   208  * and advance the offset to the next code point boundary.
   209  * (Post-incrementing forward iteration.)
   210  * "Unsafe" macro, assumes well-formed UTF-16.
   211  *
   212  * The offset may point to the lead surrogate unit
   213  * for a supplementary code point, in which case the macro will read
   214  * the following trail surrogate as well.
   215  * If the offset points to a trail surrogate, then that itself
   216  * will be returned as the code point.
   217  * The result is undefined if the offset points to a single, unpaired lead surrogate.
   218  *
   219  * @param s const UChar * string
   220  * @param i string offset
   221  * @param c output UChar32 variable
   222  * @see U16_NEXT
   223  * @stable ICU 2.4
   224  */
   225 #define U16_NEXT_UNSAFE(s, i, c) { \
   226     (c)=(s)[(i)++]; \
   227     if(U16_IS_LEAD(c)) { \
   228         (c)=U16_GET_SUPPLEMENTARY((c), (s)[(i)++]); \
   229     } \
   230 }
   231 
   232 /**
   233  * Get a code point from a string at a code point boundary offset,
   234  * and advance the offset to the next code point boundary.
   235  * (Post-incrementing forward iteration.)
   236  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   237  *
   238  * The offset may point to the lead surrogate unit
   239  * for a supplementary code point, in which case the macro will read
   240  * the following trail surrogate as well.
   241  * If the offset points to a trail surrogate or
   242  * to a single, unpaired lead surrogate, then that itself
   243  * will be returned as the code point.
   244  *
   245  * @param s const UChar * string
   246  * @param i string offset, i<length
   247  * @param length string length
   248  * @param c output UChar32 variable
   249  * @see U16_NEXT_UNSAFE
   250  * @stable ICU 2.4
   251  */
   252 #define U16_NEXT(s, i, length, c) { \
   253     (c)=(s)[(i)++]; \
   254     if(U16_IS_LEAD(c)) { \
   255         uint16_t __c2; \
   256         if((i)<(length) && U16_IS_TRAIL(__c2=(s)[(i)])) { \
   257             ++(i); \
   258             (c)=U16_GET_SUPPLEMENTARY((c), __c2); \
   259         } \
   260     } \
   261 }
   262 
   263 /**
   264  * Append a code point to a string, overwriting 1 or 2 code units.
   265  * The offset points to the current end of the string contents
   266  * and is advanced (post-increment).
   267  * "Unsafe" macro, assumes a valid code point and sufficient space in the string.
   268  * Otherwise, the result is undefined.
   269  *
   270  * @param s UChar * string buffer (written to)
   271  * @param i string offset
   272  * @param c code point to append
   273  * @see U16_APPEND
   274  * @stable ICU 2.4
   275  */
   276 #define U16_APPEND_UNSAFE(s, i, c) { \
   277     if((uint32_t)(c)<=0xffff) { \
   278         (s)[(i)++]=(uint16_t)(c); \
   279     } else { \
   280         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
   281         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
   282     } \
   283 }
   284 
   285 /**
   286  * Append a code point to a string, overwriting 1 or 2 code units.
   287  * The offset points to the current end of the string contents
   288  * and is advanced (post-increment).
   289  * "Safe" macro, checks for a valid code point.
   290  * If a surrogate pair is written, checks for sufficient space in the string.
   291  * If the code point is not valid or a trail surrogate does not fit,
   292  * then isError is set to TRUE.
   293  *
   294  * @param s UChar * string buffer (written to)
   295  * @param i string offset, i<capacity
   296  * @param capacity size of the string buffer
   297  * @param c code point to append
   298  * @param isError output UBool set to TRUE if an error occurs, otherwise not modified
   299  * @see U16_APPEND_UNSAFE
   300  * @stable ICU 2.4
   301  */
   302 #define U16_APPEND(s, i, capacity, c, isError) { \
   303     if((uint32_t)(c)<=0xffff) { \
   304         (s)[(i)++]=(uint16_t)(c); \
   305     } else if((uint32_t)(c)<=0x10ffff && (i)+1<(capacity)) { \
   306         (s)[(i)++]=(uint16_t)(((c)>>10)+0xd7c0); \
   307         (s)[(i)++]=(uint16_t)(((c)&0x3ff)|0xdc00); \
   308     } else /* c>0x10ffff or not enough space */ { \
   309         (isError)=TRUE; \
   310     } \
   311 }
   312 
   313 /**
   314  * Advance the string offset from one code point boundary to the next.
   315  * (Post-incrementing iteration.)
   316  * "Unsafe" macro, assumes well-formed UTF-16.
   317  *
   318  * @param s const UChar * string
   319  * @param i string offset
   320  * @see U16_FWD_1
   321  * @stable ICU 2.4
   322  */
   323 #define U16_FWD_1_UNSAFE(s, i) { \
   324     if(U16_IS_LEAD((s)[(i)++])) { \
   325         ++(i); \
   326     } \
   327 }
   328 
   329 /**
   330  * Advance the string offset from one code point boundary to the next.
   331  * (Post-incrementing iteration.)
   332  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   333  *
   334  * @param s const UChar * string
   335  * @param i string offset, i<length
   336  * @param length string length
   337  * @see U16_FWD_1_UNSAFE
   338  * @stable ICU 2.4
   339  */
   340 #define U16_FWD_1(s, i, length) { \
   341     if(U16_IS_LEAD((s)[(i)++]) && (i)<(length) && U16_IS_TRAIL((s)[i])) { \
   342         ++(i); \
   343     } \
   344 }
   345 
   346 /**
   347  * Advance the string offset from one code point boundary to the n-th next one,
   348  * i.e., move forward by n code points.
   349  * (Post-incrementing iteration.)
   350  * "Unsafe" macro, assumes well-formed UTF-16.
   351  *
   352  * @param s const UChar * string
   353  * @param i string offset
   354  * @param n number of code points to skip
   355  * @see U16_FWD_N
   356  * @stable ICU 2.4
   357  */
   358 #define U16_FWD_N_UNSAFE(s, i, n) { \
   359     int32_t __N=(n); \
   360     while(__N>0) { \
   361         U16_FWD_1_UNSAFE(s, i); \
   362         --__N; \
   363     } \
   364 }
   365 
   366 /**
   367  * Advance the string offset from one code point boundary to the n-th next one,
   368  * i.e., move forward by n code points.
   369  * (Post-incrementing iteration.)
   370  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   371  *
   372  * @param s const UChar * string
   373  * @param i string offset, i<length
   374  * @param length string length
   375  * @param n number of code points to skip
   376  * @see U16_FWD_N_UNSAFE
   377  * @stable ICU 2.4
   378  */
   379 #define U16_FWD_N(s, i, length, n) { \
   380     int32_t __N=(n); \
   381     while(__N>0 && (i)<(length)) { \
   382         U16_FWD_1(s, i, length); \
   383         --__N; \
   384     } \
   385 }
   386 
   387 /**
   388  * Adjust a random-access offset to a code point boundary
   389  * at the start of a code point.
   390  * If the offset points to the trail surrogate of a surrogate pair,
   391  * then the offset is decremented.
   392  * Otherwise, it is not modified.
   393  * "Unsafe" macro, assumes well-formed UTF-16.
   394  *
   395  * @param s const UChar * string
   396  * @param i string offset
   397  * @see U16_SET_CP_START
   398  * @stable ICU 2.4
   399  */
   400 #define U16_SET_CP_START_UNSAFE(s, i) { \
   401     if(U16_IS_TRAIL((s)[i])) { \
   402         --(i); \
   403     } \
   404 }
   405 
   406 /**
   407  * Adjust a random-access offset to a code point boundary
   408  * at the start of a code point.
   409  * If the offset points to the trail surrogate of a surrogate pair,
   410  * then the offset is decremented.
   411  * Otherwise, it is not modified.
   412  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   413  *
   414  * @param s const UChar * string
   415  * @param start starting string offset (usually 0)
   416  * @param i string offset, start<=i
   417  * @see U16_SET_CP_START_UNSAFE
   418  * @stable ICU 2.4
   419  */
   420 #define U16_SET_CP_START(s, start, i) { \
   421     if(U16_IS_TRAIL((s)[i]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
   422         --(i); \
   423     } \
   424 }
   425 
   426 /* definitions with backward iteration -------------------------------------- */
   427 
   428 /**
   429  * Move the string offset from one code point boundary to the previous one
   430  * and get the code point between them.
   431  * (Pre-decrementing backward iteration.)
   432  * "Unsafe" macro, assumes well-formed UTF-16.
   433  *
   434  * The input offset may be the same as the string length.
   435  * If the offset is behind a trail surrogate unit
   436  * for a supplementary code point, then the macro will read
   437  * the preceding lead surrogate as well.
   438  * If the offset is behind a lead surrogate, then that itself
   439  * will be returned as the code point.
   440  * The result is undefined if the offset is behind a single, unpaired trail surrogate.
   441  *
   442  * @param s const UChar * string
   443  * @param i string offset
   444  * @param c output UChar32 variable
   445  * @see U16_PREV
   446  * @stable ICU 2.4
   447  */
   448 #define U16_PREV_UNSAFE(s, i, c) { \
   449     (c)=(s)[--(i)]; \
   450     if(U16_IS_TRAIL(c)) { \
   451         (c)=U16_GET_SUPPLEMENTARY((s)[--(i)], (c)); \
   452     } \
   453 }
   454 
   455 /**
   456  * Move the string offset from one code point boundary to the previous one
   457  * and get the code point between them.
   458  * (Pre-decrementing backward iteration.)
   459  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   460  *
   461  * The input offset may be the same as the string length.
   462  * If the offset is behind a trail surrogate unit
   463  * for a supplementary code point, then the macro will read
   464  * the preceding lead surrogate as well.
   465  * If the offset is behind a lead surrogate or behind a single, unpaired
   466  * trail surrogate, then that itself
   467  * will be returned as the code point.
   468  *
   469  * @param s const UChar * string
   470  * @param start starting string offset (usually 0)
   471  * @param i string offset, start<=i
   472  * @param c output UChar32 variable
   473  * @see U16_PREV_UNSAFE
   474  * @stable ICU 2.4
   475  */
   476 #define U16_PREV(s, start, i, c) { \
   477     (c)=(s)[--(i)]; \
   478     if(U16_IS_TRAIL(c)) { \
   479         uint16_t __c2; \
   480         if((i)>(start) && U16_IS_LEAD(__c2=(s)[(i)-1])) { \
   481             --(i); \
   482             (c)=U16_GET_SUPPLEMENTARY(__c2, (c)); \
   483         } \
   484     } \
   485 }
   486 
   487 /**
   488  * Move the string offset from one code point boundary to the previous one.
   489  * (Pre-decrementing backward iteration.)
   490  * The input offset may be the same as the string length.
   491  * "Unsafe" macro, assumes well-formed UTF-16.
   492  *
   493  * @param s const UChar * string
   494  * @param i string offset
   495  * @see U16_BACK_1
   496  * @stable ICU 2.4
   497  */
   498 #define U16_BACK_1_UNSAFE(s, i) { \
   499     if(U16_IS_TRAIL((s)[--(i)])) { \
   500         --(i); \
   501     } \
   502 }
   503 
   504 /**
   505  * Move the string offset from one code point boundary to the previous one.
   506  * (Pre-decrementing backward iteration.)
   507  * The input offset may be the same as the string length.
   508  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   509  *
   510  * @param s const UChar * string
   511  * @param start starting string offset (usually 0)
   512  * @param i string offset, start<=i
   513  * @see U16_BACK_1_UNSAFE
   514  * @stable ICU 2.4
   515  */
   516 #define U16_BACK_1(s, start, i) { \
   517     if(U16_IS_TRAIL((s)[--(i)]) && (i)>(start) && U16_IS_LEAD((s)[(i)-1])) { \
   518         --(i); \
   519     } \
   520 }
   521 
   522 /**
   523  * Move the string offset from one code point boundary to the n-th one before it,
   524  * i.e., move backward by n code points.
   525  * (Pre-decrementing backward iteration.)
   526  * The input offset may be the same as the string length.
   527  * "Unsafe" macro, assumes well-formed UTF-16.
   528  *
   529  * @param s const UChar * string
   530  * @param i string offset
   531  * @param n number of code points to skip
   532  * @see U16_BACK_N
   533  * @stable ICU 2.4
   534  */
   535 #define U16_BACK_N_UNSAFE(s, i, n) { \
   536     int32_t __N=(n); \
   537     while(__N>0) { \
   538         U16_BACK_1_UNSAFE(s, i); \
   539         --__N; \
   540     } \
   541 }
   542 
   543 /**
   544  * Move the string offset from one code point boundary to the n-th one before it,
   545  * i.e., move backward by n code points.
   546  * (Pre-decrementing backward iteration.)
   547  * The input offset may be the same as the string length.
   548  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   549  *
   550  * @param s const UChar * string
   551  * @param start start of string
   552  * @param i string offset, start<=i
   553  * @param n number of code points to skip
   554  * @see U16_BACK_N_UNSAFE
   555  * @stable ICU 2.4
   556  */
   557 #define U16_BACK_N(s, start, i, n) { \
   558     int32_t __N=(n); \
   559     while(__N>0 && (i)>(start)) { \
   560         U16_BACK_1(s, start, i); \
   561         --__N; \
   562     } \
   563 }
   564 
   565 /**
   566  * Adjust a random-access offset to a code point boundary after a code point.
   567  * If the offset is behind the lead surrogate of a surrogate pair,
   568  * then the offset is incremented.
   569  * Otherwise, it is not modified.
   570  * The input offset may be the same as the string length.
   571  * "Unsafe" macro, assumes well-formed UTF-16.
   572  *
   573  * @param s const UChar * string
   574  * @param i string offset
   575  * @see U16_SET_CP_LIMIT
   576  * @stable ICU 2.4
   577  */
   578 #define U16_SET_CP_LIMIT_UNSAFE(s, i) { \
   579     if(U16_IS_LEAD((s)[(i)-1])) { \
   580         ++(i); \
   581     } \
   582 }
   583 
   584 /**
   585  * Adjust a random-access offset to a code point boundary after a code point.
   586  * If the offset is behind the lead surrogate of a surrogate pair,
   587  * then the offset is incremented.
   588  * Otherwise, it is not modified.
   589  * The input offset may be the same as the string length.
   590  * "Safe" macro, handles unpaired surrogates and checks for string boundaries.
   591  *
   592  * @param s const UChar * string
   593  * @param start starting string offset (usually 0)
   594  * @param i string offset, start<=i<=length
   595  * @param length string length
   596  * @see U16_SET_CP_LIMIT_UNSAFE
   597  * @stable ICU 2.4
   598  */
   599 #define U16_SET_CP_LIMIT(s, start, i, length) { \
   600     if((start)<(i) && (i)<(length) && U16_IS_LEAD((s)[(i)-1]) && U16_IS_TRAIL((s)[i])) { \
   601         ++(i); \
   602     } \
   603 }
   604 
   605 #endif