os/textandloc/fontservices/textshaperplugin/IcuSource/common/unormimp.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200 (2014-06-10)
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
*******************************************************************************
sl@0
     3
*
sl@0
     4
*   Copyright (C) 2001-2004, International Business Machines
sl@0
     5
*   Corporation and others.  All Rights Reserved.
sl@0
     6
*
sl@0
     7
*******************************************************************************
sl@0
     8
*   file name:  unormimp.h
sl@0
     9
*   encoding:   US-ASCII
sl@0
    10
*   tab size:   8 (not used)
sl@0
    11
*   indentation:4
sl@0
    12
*
sl@0
    13
*   created on: 2001may25
sl@0
    14
*   created by: Markus W. Scherer
sl@0
    15
*/
sl@0
    16
sl@0
    17
#ifndef __UNORMIMP_H__
sl@0
    18
#define __UNORMIMP_H__
sl@0
    19
sl@0
    20
#include "unicode/utypes.h"
sl@0
    21
sl@0
    22
#if !UCONFIG_NO_NORMALIZATION
sl@0
    23
sl@0
    24
#ifdef XP_CPLUSPLUS
sl@0
    25
#include "unicode/uniset.h"
sl@0
    26
#endif
sl@0
    27
sl@0
    28
#include "unicode/uiter.h"
sl@0
    29
#include "unicode/unorm.h"
sl@0
    30
#include "unicode/uset.h"
sl@0
    31
#include "utrie.h"
sl@0
    32
#include "ustr_imp.h"
sl@0
    33
#include "udataswp.h"
sl@0
    34
sl@0
    35
/*
sl@0
    36
 * This new implementation of the normalization code loads its data from
sl@0
    37
 * unorm.icu, which is generated with the gennorm tool.
sl@0
    38
 * The format of that file is described at the end of this file.
sl@0
    39
 */
sl@0
    40
sl@0
    41
/* norm32 value constants */
sl@0
    42
enum {
    /*
     * Quick check flags occupy bits 5..0 of a norm32 word.
     * Bits 3..0, when set, mean "no" for their normalization form;
     * bits 5..4, when set, mean "maybe" (only NFC and NFKC have one).
     */
    _NORM_QC_NFC=0x11,                  /* NFC:  no (bit 0) | maybe (bit 4) */
    _NORM_QC_NFKC=0x22,                 /* NFKC: no (bit 1) | maybe (bit 5) */
    _NORM_QC_NFD=0x04,                  /* NFD:  no (bit 2) */
    _NORM_QC_NFKD=0x08,                 /* NFKD: no (bit 3) */

    _NORM_QC_ANY_NO=0x0f,               /* all four "no" flags */

    /* to see whether any "maybe" flag is set, test flags>=_NORM_QC_MAYBE */
    _NORM_QC_MAYBE=0x10,
    _NORM_QC_ANY_MAYBE=0x30,

    /* all quick check flags together (0x3f) */
    _NORM_QC_MASK=_NORM_QC_ANY_NO|_NORM_QC_ANY_MAYBE,

    /* bits 7..6: the character combines with a following/preceding character */
    _NORM_COMBINES_FWD=0x40,
    _NORM_COMBINES_BACK=0x80,
    _NORM_COMBINES_ANY=_NORM_COMBINES_FWD|_NORM_COMBINES_BACK,      /* 0xc0 */

    /* bits 15..8: combining class from UnicodeData.txt */
    _NORM_CC_SHIFT=8,
    _NORM_CC_MASK=0xff<<_NORM_CC_SHIFT,                             /* 0xff00 */

    /* bits 31..16: index to UChars and other extra data */
    _NORM_EXTRA_SHIFT=16,
    _NORM_EXTRA_INDEX_TOP=0xfc00,       /* start of surrogate specials after shift */

    _NORM_EXTRA_SURROGATE_MASK=0x3ff,
    _NORM_EXTRA_SURROGATE_TOP=0x3f0,    /* hangul etc. */

    /* "special" extra index values, counted up from _NORM_EXTRA_SURROGATE_TOP */
    _NORM_EXTRA_HANGUL=_NORM_EXTRA_SURROGATE_TOP,                   /* 0x3f0 */
    _NORM_EXTRA_JAMO_L=_NORM_EXTRA_HANGUL+1,                        /* 0x3f1 */
    _NORM_EXTRA_JAMO_V=_NORM_EXTRA_HANGUL+2,                        /* 0x3f2 */
    _NORM_EXTRA_JAMO_T=_NORM_EXTRA_HANGUL+3                         /* 0x3f3 */
};
sl@0
    75
sl@0
    76
/* norm32 value constants using >16 bits */
sl@0
    77
#define _NORM_MIN_SPECIAL       0xfc000000
sl@0
    78
#define _NORM_SURROGATES_TOP    0xfff00000
sl@0
    79
#define _NORM_MIN_HANGUL        0xfff00000
sl@0
    80
#define _NORM_MIN_JAMO_V        0xfff20000
sl@0
    81
#define _NORM_JAMO_V_TOP        0xfff30000
sl@0
    82
sl@0
    83
/* value constants for auxTrie */
sl@0
    84
enum {
    _NORM_AUX_COMP_EX_SHIFT=10,         /* auxTrie bit 10: composition exclusion */
    _NORM_AUX_UNSAFE_SHIFT=11,          /* auxTrie bit 11: not a safe starter for canonical closure */
    _NORM_AUX_NFC_SKIPPABLE_F_SHIFT=12  /* auxTrie bit 12: not NFC_Skippable (f), formatVersion>=2.2 */
};

/* one more than the largest value that fits into auxTrie bits 9..0 */
#define _NORM_AUX_MAX_FNC           ((int32_t)1<<_NORM_AUX_COMP_EX_SHIFT)

/*
 * Masks for the auxTrie data word.
 * Every expansion is fully parenthesized so that it behaves as a single
 * operand in any expression context (e.g. as the operand of sizeof).
 */
#define _NORM_AUX_FNC_MASK          ((uint32_t)(_NORM_AUX_MAX_FNC-1))
#define _NORM_AUX_COMP_EX_MASK      ((uint32_t)1<<_NORM_AUX_COMP_EX_SHIFT)
#define _NORM_AUX_UNSAFE_MASK       ((uint32_t)1<<_NORM_AUX_UNSAFE_SHIFT)
#define _NORM_AUX_NFC_SKIP_F_MASK   ((uint32_t)1<<_NORM_AUX_NFC_SKIPPABLE_F_SHIFT)
sl@0
    96
sl@0
    97
/* canonStartSets[0..31] contains indexes for what is in the array */
sl@0
    98
enum {
    /* slot numbers in canonStartSets[0.._NORM_SET_INDEX_TOP-1]; all values count uint16_t units */
    _NORM_SET_INDEX_CANON_SETS_LENGTH=0,        /* length of the canonical starter sets */
    _NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH=1,   /* length of the BMP search table (contains pairs) */
    _NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH=2,  /* length of the supplementary search table (contains triplets) */

    /* from formatVersion 2.3: offsets relative to canonStartSets[0] */
    _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET=3,     /* exclusion set for CJK compatibility characters */
    _NORM_SET_INDEX_NX_UNICODE32_OFFSET=4,      /* exclusion set for Unicode 3.2 characters */
    _NORM_SET_INDEX_NX_RESERVED_OFFSET=5,       /* end of the previous exclusion set */

    _NORM_SET_INDEX_TOP=32                      /* changing this requires a new formatVersion */
};
sl@0
   113
sl@0
   114
/* more constants for canonical starter sets */
sl@0
   115
sl@0
   116
/* 14 bit indexes to canonical USerializedSets */
sl@0
   117
#define _NORM_MAX_CANON_SETS            0x4000
sl@0
   118
sl@0
   119
/* single-code point BMP sets are encoded directly in the search table except if result=0x4000..0x7fff */
sl@0
   120
#define _NORM_CANON_SET_BMP_MASK        0xc000
sl@0
   121
#define _NORM_CANON_SET_BMP_IS_INDEX    0x4000
sl@0
   122
sl@0
   123
/* indexes[] value names */
sl@0
   124
enum {
    /* slot numbers in the indexes[] array at the start of the data */
    _NORM_INDEX_TRIE_SIZE=0,            /* number of bytes in normalization trie */
    _NORM_INDEX_UCHAR_COUNT=1,          /* number of UChars in extra data */

    _NORM_INDEX_COMBINE_DATA_COUNT=2,   /* number of uint16_t words for combining data */
    _NORM_INDEX_COMBINE_FWD_COUNT=3,    /* number of code points that combine forward */
    _NORM_INDEX_COMBINE_BOTH_COUNT=4,   /* number of code points that combine forward and backward */
    _NORM_INDEX_COMBINE_BACK_COUNT=5,   /* number of code points that combine backward */

    /* first code point with a quick check value of NO/MAYBE, per normalization form */
    _NORM_INDEX_MIN_NFC_NO_MAYBE=6,
    _NORM_INDEX_MIN_NFKC_NO_MAYBE=7,
    _NORM_INDEX_MIN_NFD_NO_MAYBE=8,
    _NORM_INDEX_MIN_NFKD_NO_MAYBE=9,

    _NORM_INDEX_FCD_TRIE_SIZE=10,       /* number of bytes in FCD trie */

    _NORM_INDEX_AUX_TRIE_SIZE=11,       /* number of bytes in the auxiliary trie */
    _NORM_INDEX_CANON_SET_COUNT=12,     /* number of uint16_t in the array of serialized USet */

    _NORM_INDEX_TOP=32                  /* changing this requires a new formatVersion */
};
sl@0
   145
sl@0
   146
enum {
    /*
     * FCD check shortcut: every code point below this limit is known
     * to have a lead combining class of 0.
     */
    _NORM_MIN_WITH_LEAD_CC=0x0300
};
sl@0
   150
sl@0
   151
enum {
    /**
     * Flag in bit 7 of the length byte for a decomposition string in extra data.
     * Set: the decomposition string is preceded by a 16-bit word holding the
     * leading and trailing cc of the decomposition (like for A-umlaut).
     * Clear: both cc's are zero (like for compatibility ideographs).
     */
    _NORM_DECOMP_FLAG_LENGTH_HAS_CC=1<<7,
    /**
     * The remaining bits 6..0 of the length byte hold the actual length.
     */
    _NORM_DECOMP_LENGTH_MASK=_NORM_DECOMP_FLAG_LENGTH_HAS_CC-1
};
sl@0
   165
sl@0
   166
#endif /* #if !UCONFIG_NO_NORMALIZATION */
sl@0
   167
sl@0
   168
/* Korean Hangul and Jamo constants */
sl@0
   169
enum {
    /* base code points of the three conjoining Jamo kinds */
    JAMO_L_BASE=0x1100,     /* "lead" jamo */
    JAMO_V_BASE=0x1161,     /* "vowel" jamo */
    JAMO_T_BASE=0x11a7,     /* "trail" jamo */
                            /* NOTE(review): presumably one below the first trailing jamo so that
                               trail index 0 means "no trail" - confirm against the Unicode
                               Hangul syllable algorithm */

    /* base code point of the Hangul syllables */
    HANGUL_BASE=0xac00,

    /* number of values of each Jamo kind that take part in syllables */
    JAMO_L_COUNT=19,
    JAMO_V_COUNT=21,
    JAMO_T_COUNT=28,

    /* number of Hangul syllables: 19*21*28=11172 */
    HANGUL_COUNT=JAMO_L_COUNT*(JAMO_V_COUNT*JAMO_T_COUNT)
};
sl@0
   182
sl@0
   183
#if !UCONFIG_NO_NORMALIZATION
sl@0
   184
sl@0
   185
/* Constants for options flags for normalization. @draft ICU 2.6 */
sl@0
   186
enum {
    /** Options bit 0: do not decompose Hangul syllables. @draft ICU 2.6 */
    UNORM_NX_HANGUL=1<<0,
    /** Options bit 1: do not decompose CJK compatibility characters. @draft ICU 2.6 */
    UNORM_NX_CJK_COMPAT=1<<1,
    /**
     * Options bit 8: use the buggy recomposition described in
     * Unicode Public Review Issue #29
     * at http://www.unicode.org/review/resolved-pri.html#pri29
     *
     * The IDNA implementation uses this, following a strict interpretation
     * of the IDNA definition, which is based on Unicode 3.2 and therefore
     * predates PRI #29.
     */
    UNORM_BEFORE_PRI_29=1<<8
};
sl@0
   201
sl@0
   202
/**
sl@0
   203
 * Is the normalizer data loaded?
sl@0
   204
 * This is used internally before other internal normalizer functions
sl@0
   205
 * are called.
sl@0
   206
 * It saves this check in each of many normalization calls that
sl@0
   207
 * are made for, e.g., collation.
sl@0
   208
 *
sl@0
   209
 * @param pErrorCode as usual
sl@0
   210
 * @return boolean value for whether the normalization data is loaded
sl@0
   211
 *
sl@0
   212
 * @internal
sl@0
   213
 */
sl@0
   214
U_CAPI UBool U_EXPORT2
sl@0
   215
unorm_haveData(UErrorCode *pErrorCode);
sl@0
   216
sl@0
   217
/**
sl@0
   218
 * Internal API for normalizing.
sl@0
   219
 * Does not check for bad input.
sl@0
   220
 * @internal
sl@0
   221
 */
sl@0
   222
U_CAPI int32_t U_EXPORT2
sl@0
   223
unorm_internalNormalize(UChar *dest, int32_t destCapacity,
sl@0
   224
                        const UChar *src, int32_t srcLength,
sl@0
   225
                        UNormalizationMode mode, int32_t options,
sl@0
   226
                        UErrorCode *pErrorCode);
sl@0
   227
sl@0
   228
#ifdef XP_CPLUSPLUS
sl@0
   229
sl@0
   230
/**
sl@0
   231
 * Internal API for normalizing.
sl@0
   232
 * Does not check for bad input.
sl@0
   233
 * Requires unorm_haveData() to be true.
sl@0
   234
 * @internal
sl@0
   235
 */
sl@0
   236
U_CFUNC int32_t
sl@0
   237
unorm_internalNormalizeWithNX(UChar *dest, int32_t destCapacity,
sl@0
   238
                              const UChar *src, int32_t srcLength,
sl@0
   239
                              UNormalizationMode mode, int32_t options, const UnicodeSet *nx,
sl@0
   240
                              UErrorCode *pErrorCode);
sl@0
   241
sl@0
   242
#endif
sl@0
   243
sl@0
   244
/**
sl@0
   245
 * internal API, used by normlzr.cpp
sl@0
   246
 * @internal
sl@0
   247
 */
sl@0
   248
U_CAPI int32_t U_EXPORT2
sl@0
   249
unorm_decompose(UChar *dest, int32_t destCapacity,
sl@0
   250
                const UChar *src, int32_t srcLength,
sl@0
   251
                UBool compat, int32_t options,
sl@0
   252
                UErrorCode *pErrorCode);
sl@0
   253
sl@0
   254
/**
sl@0
   255
 * internal API, used by normlzr.cpp
sl@0
   256
 * @internal
sl@0
   257
 */
sl@0
   258
U_CAPI int32_t U_EXPORT2
sl@0
   259
unorm_compose(UChar *dest, int32_t destCapacity,
sl@0
   260
              const UChar *src, int32_t srcLength,
sl@0
   261
              UBool compat, int32_t options,
sl@0
   262
              UErrorCode *pErrorCode);
sl@0
   263
sl@0
   264
#ifdef XP_CPLUSPLUS
sl@0
   265
sl@0
   266
/**
sl@0
   267
 * internal API, used by unormcmp.cpp
sl@0
   268
 * @internal
sl@0
   269
 */
sl@0
   270
U_CFUNC UNormalizationCheckResult
sl@0
   271
unorm_internalQuickCheck(const UChar *src,
sl@0
   272
                         int32_t srcLength,
sl@0
   273
                         UNormalizationMode mode,
sl@0
   274
                         UBool allowMaybe,
sl@0
   275
                         const UnicodeSet *nx,
sl@0
   276
                         UErrorCode *pErrorCode);
sl@0
   277
sl@0
   278
#endif
sl@0
   279
sl@0
   280
#endif /* #if !UCONFIG_NO_NORMALIZATION */
sl@0
   281
sl@0
   282
/**
sl@0
   283
 * Internal option for unorm_cmpEquivFold() for decomposing.
sl@0
   284
 * If not set, just do strcasecmp().
sl@0
   285
 * @internal
sl@0
   286
 */
sl@0
   287
#define _COMPARE_EQUIV 0x80000
sl@0
   288
sl@0
   289
#ifndef U_COMPARE_IGNORE_CASE
sl@0
   290
/* see also unorm.h */
sl@0
   291
/**
sl@0
   292
 * Option bit for unorm_compare:
sl@0
   293
 * Perform case-insensitive comparison.
sl@0
   294
 * @draft ICU 2.2
sl@0
   295
 */
sl@0
   296
#define U_COMPARE_IGNORE_CASE       0x10000
sl@0
   297
#endif
sl@0
   298
sl@0
   299
/**
sl@0
   300
 * Internal option for unorm_cmpEquivFold() for strncmp style.
sl@0
   301
 * If set, checks for both string length and terminating NUL.
sl@0
   302
 * @internal
sl@0
   303
 */
sl@0
   304
#define _STRNCMP_STYLE 0x1000
sl@0
   305
sl@0
   306
#if !UCONFIG_NO_NORMALIZATION
sl@0
   307
sl@0
   308
/**
sl@0
   309
 * Internal API to get the 16-bit FCD value (lccc + tccc) for c,
sl@0
   310
 * for u_getIntPropertyValue().
sl@0
   311
 * @internal
sl@0
   312
 */
sl@0
   313
U_CAPI uint16_t U_EXPORT2
sl@0
   314
unorm_getFCD16FromCodePoint(UChar32 c);
sl@0
   315
sl@0
   316
/**
sl@0
   317
 * Internal API, used by collation code.
sl@0
   318
 * Get access to the internal FCD trie table to be able to perform
sl@0
   319
 * incremental, per-code unit, FCD checks in collation.
sl@0
   320
 * One pointer is sufficient because the trie index values are offset
sl@0
   321
 * by the index size, so that the same pointer is used to access the trie data.
sl@0
   322
 * @internal
sl@0
   323
 */
sl@0
   324
U_CAPI const uint16_t * U_EXPORT2
sl@0
   325
unorm_getFCDTrie(UErrorCode *pErrorCode);
sl@0
   326
sl@0
   327
#ifdef XP_CPLUSPLUS
sl@0
   328
sl@0
   329
U_NAMESPACE_BEGIN
sl@0
   330
/**
sl@0
   331
 * Internal API, used by collation code.
sl@0
   332
 * Get the FCD value for a code unit, with
sl@0
   333
 * bits 15..8   lead combining class
sl@0
   334
 * bits  7..0   trail combining class
sl@0
   335
 *
sl@0
   336
 * If c is a lead surrogate and the value is not 0,
sl@0
   337
 * then instead of combining classes the value
sl@0
   338
 * is used in unorm_getFCD16FromSurrogatePair() to get the real value
sl@0
   339
 * of the supplementary code point.
sl@0
   340
 *
sl@0
   341
 * @internal
sl@0
   342
 */
sl@0
   343
inline uint16_t
sl@0
   344
unorm_getFCD16(const uint16_t *fcdTrieIndex, UChar c) {
sl@0
   345
    return
sl@0
   346
        fcdTrieIndex[
sl@0
   347
            (fcdTrieIndex[
sl@0
   348
                c>>UTRIE_SHIFT
sl@0
   349
            ]<<UTRIE_INDEX_SHIFT)+
sl@0
   350
            (c&UTRIE_MASK)
sl@0
   351
        ];
sl@0
   352
}
sl@0
   353
sl@0
   354
/**
sl@0
   355
 * Internal API, used by collation code.
sl@0
   356
 * Get the FCD value for a supplementary code point, with
sl@0
   357
 * bits 15..8   lead combining class
sl@0
   358
 * bits  7..0   trail combining class
sl@0
   359
 *
sl@0
   360
 * @param fcd16  The FCD value for the lead surrogate, not 0.
sl@0
   361
 * @param c2     The trail surrogate code unit.
sl@0
   362
 *
sl@0
   363
 * @internal
sl@0
   364
 */
sl@0
   365
inline uint16_t
sl@0
   366
unorm_getFCD16FromSurrogatePair(const uint16_t *fcdTrieIndex, uint16_t fcd16, UChar c2) {
sl@0
   367
    return
sl@0
   368
        fcdTrieIndex[
sl@0
   369
            (fcdTrieIndex[
sl@0
   370
                (int32_t)fcd16+((c2&0x3ff)>>UTRIE_SHIFT)
sl@0
   371
            ]<<UTRIE_INDEX_SHIFT)+
sl@0
   372
            (c2&UTRIE_MASK)
sl@0
   373
        ];
sl@0
   374
}
sl@0
   375
sl@0
   376
U_NAMESPACE_END
sl@0
   377
sl@0
   378
#endif
sl@0
   379
sl@0
   380
/**
sl@0
   381
 * internal API, used by StringPrep
sl@0
   382
 * @internal
sl@0
   383
 */
sl@0
   384
U_CAPI void U_EXPORT2
sl@0
   385
unorm_getUnicodeVersion(UVersionInfo *versionInfo, UErrorCode *pErrorCode);
sl@0
   386
sl@0
   387
/**
sl@0
   388
 * Get the canonical decomposition for one code point.
sl@0
   389
 * Requires unorm_haveData() and buffer!=NULL and pLength!=NULL.
sl@0
   390
 * @param c code point
sl@0
   391
 * @param buffer out-only buffer for algorithmic decompositions of Hangul
sl@0
   392
 * @param pLength out-only, receives the length of the decomposition, if any
sl@0
   393
 * @return pointer to decomposition, or 0 if none
sl@0
   394
 * @internal
sl@0
   395
 */
sl@0
   396
U_CFUNC const UChar *
sl@0
   397
unorm_getCanonicalDecomposition(UChar32 c, UChar buffer[4], int32_t *pLength);
sl@0
   398
sl@0
   399
/**
sl@0
   400
 * internal API, used by the canonical iterator
sl@0
   401
 * TODO Consider using signature similar to unorm_getCanonicalDecomposition()
sl@0
   402
 * for more efficiency
sl@0
   403
 * @internal
sl@0
   404
 */
sl@0
   405
U_CAPI int32_t U_EXPORT2
sl@0
   406
unorm_getDecomposition(UChar32 c, UBool compat,
sl@0
   407
                       UChar *dest, int32_t destCapacity);
sl@0
   408
sl@0
   409
/**
sl@0
   410
 * internal API, used by uprops.cpp
sl@0
   411
 * @internal
sl@0
   412
 */
sl@0
   413
U_CAPI UBool U_EXPORT2
sl@0
   414
unorm_internalIsFullCompositionExclusion(UChar32 c);
sl@0
   415
sl@0
   416
/**
sl@0
   417
 * Internal API, used by enumeration of canonically equivalent strings
sl@0
   418
 * @internal
sl@0
   419
 */
sl@0
   420
U_CAPI UBool U_EXPORT2
sl@0
   421
unorm_isCanonSafeStart(UChar32 c);
sl@0
   422
sl@0
   423
/**
sl@0
   424
 * Internal API, used by enumeration of canonically equivalent strings
sl@0
   425
 * @internal
sl@0
   426
 */
sl@0
   427
U_CAPI UBool U_EXPORT2
sl@0
   428
unorm_getCanonStartSet(UChar32 c, USerializedSet *fillSet);
sl@0
   429
sl@0
   430
/**
sl@0
   431
 * Is c an NF<mode>-skippable code point? See the "NF* Skippable" conditions at the end of this file.
sl@0
   432
 * @internal
sl@0
   433
 */
sl@0
   434
U_CAPI UBool U_EXPORT2
sl@0
   435
unorm_isNFSkippable(UChar32 c, UNormalizationMode mode);
sl@0
   436
sl@0
   437
#ifdef XP_CPLUSPLUS
sl@0
   438
sl@0
   439
/**
sl@0
   440
 * Get normalization exclusion set for the options.
sl@0
   441
 * Requires unorm_haveData().
sl@0
   442
 * @internal
sl@0
   443
 */
sl@0
   444
U_CFUNC const UnicodeSet *
sl@0
   445
unorm_getNX(int32_t options, UErrorCode *pErrorCode);
sl@0
   446
sl@0
   447
#endif
sl@0
   448
sl@0
   449
/**
sl@0
   450
 * Enumerate each normalization data trie and add the
sl@0
   451
 * start of each range of same properties to the set.
sl@0
   452
 * @internal
sl@0
   453
 */
sl@0
   454
U_CAPI void U_EXPORT2
sl@0
   455
unorm_addPropertyStarts(const USetAdder *sa, UErrorCode *pErrorCode);
sl@0
   456
sl@0
   457
/**
sl@0
   458
 * Swap unorm.icu. See udataswp.h.
sl@0
   459
 * @internal
sl@0
   460
 */
sl@0
   461
U_CAPI int32_t U_EXPORT2
sl@0
   462
unorm_swap(const UDataSwapper *ds,
sl@0
   463
           const void *inData, int32_t length, void *outData,
sl@0
   464
           UErrorCode *pErrorCode);
sl@0
   465
sl@0
   466
/**
sl@0
   467
 * Get the NF*_QC property for a code point, for u_getIntPropertyValue().
sl@0
   468
 * @internal
sl@0
   469
 */
sl@0
   470
U_CAPI UNormalizationCheckResult U_EXPORT2
sl@0
   471
unorm_getQuickCheck(UChar32 c, UNormalizationMode mode);
sl@0
   472
sl@0
   473
/**
sl@0
   474
 * Description of the format of unorm.icu version 2.3.
sl@0
   475
 *
sl@0
   476
 * Main change from version 1 to version 2:
sl@0
   477
 * Use of new, common UTrie instead of normalization-specific tries.
sl@0
   478
 * Change to version 2.1: add third/auxiliary trie with associated data.
sl@0
   479
 * Change to version 2.2: add skippable (f) flag data (_NORM_AUX_NFC_SKIP_F_MASK).
sl@0
   480
 * Change to version 2.3: add serialized sets for normalization exclusions
sl@0
   481
 *                        stored inside canonStartSets[]
sl@0
   482
 *
sl@0
   483
 * For more details of how to use the data structures see the code
sl@0
   484
 * in unorm.cpp (runtime normalization code) and
sl@0
   485
 * in gennorm.c and gennorm/store.c (build-time data generation).
sl@0
   486
 *
sl@0
   487
 * For the serialized format of UTrie see utrie.c/UTrieHeader.
sl@0
   488
 *
sl@0
   489
 * - Overall partition
sl@0
   490
 *
sl@0
   491
 * unorm.icu customarily begins with a UDataInfo structure, see udata.h and udata.c.
sl@0
   492
 * After that there are the following structures:
sl@0
   493
 *
sl@0
   494
 * int32_t indexes[_NORM_INDEX_TOP];            -- _NORM_INDEX_TOP=32, see enum in this file
sl@0
   495
 *
sl@0
   496
 * UTrie normTrie;                              -- size in bytes=indexes[_NORM_INDEX_TRIE_SIZE]
sl@0
   497
 * 
sl@0
   498
 * uint16_t extraData[extraDataTop];            -- extraDataTop=indexes[_NORM_INDEX_UCHAR_COUNT]
sl@0
   499
 *                                                 extraData[0] contains the number of units for
sl@0
   500
 *                                                 FC_NFKC_Closure (formatVersion>=2.1)
sl@0
   501
 *
sl@0
   502
 * uint16_t combiningTable[combiningTableTop];  -- combiningTableTop=indexes[_NORM_INDEX_COMBINE_DATA_COUNT]
sl@0
   503
 *                                                 combiningTableTop may include one 16-bit padding unit
sl@0
   504
 *                                                 to make sure that fcdTrie is 32-bit-aligned
sl@0
   505
 *
sl@0
   506
 * UTrie fcdTrie;                               -- size in bytes=indexes[_NORM_INDEX_FCD_TRIE_SIZE]
sl@0
   507
 *
sl@0
   508
 * UTrie auxTrie;                               -- size in bytes=indexes[_NORM_INDEX_AUX_TRIE_SIZE]
sl@0
   509
 *
sl@0
   510
 * uint16_t canonStartSets[canonStartSetsTop]   -- canonStartSetsTop=indexes[_NORM_INDEX_CANON_SET_COUNT]
sl@0
   511
 *                                                 serialized USets and binary search tables, see below
sl@0
   512
 *
sl@0
   513
 *
sl@0
   514
 * The indexes array contains lengths and sizes of the following arrays and structures
sl@0
   515
 * as well as the following values:
sl@0
   516
 *  indexes[_NORM_INDEX_COMBINE_FWD_COUNT]=combineFwdTop
sl@0
   517
 *      -- one more than the highest combining index computed for forward-only-combining characters
sl@0
   518
 *  indexes[_NORM_INDEX_COMBINE_BOTH_COUNT]=combineBothTop-combineFwdTop
sl@0
   519
 *      -- number of combining indexes computed for both-ways-combining characters
sl@0
   520
 *  indexes[_NORM_INDEX_COMBINE_BACK_COUNT]=combineBackTop-combineBothTop
sl@0
   521
 *      -- number of combining indexes computed for backward-only-combining characters
sl@0
   522
 *
sl@0
   523
 *  indexes[_NORM_INDEX_MIN_NF*_NO_MAYBE] (where *={ C, D, KC, KD })
sl@0
   524
 *      -- first code point with a quick check NF* value of NO/MAYBE
sl@0
   525
 *
sl@0
   526
 *
sl@0
   527
 * - Tries
sl@0
   528
 *
sl@0
   529
 * The main structures are two UTrie tables ("compact arrays"),
sl@0
   530
 * each with one index array and one data array.
sl@0
   531
 * See utrie.h and utrie.c.
sl@0
   532
 *
sl@0
   533
 *
sl@0
   534
 * - Tries in unorm.icu
sl@0
   535
 *
sl@0
   536
 * The first trie (normTrie above)
sl@0
   537
 * provides data for the NF* quick checks and normalization.
sl@0
   538
 * The second trie (fcdTrie above) provides data just for FCD checks.
sl@0
   539
 *
sl@0
   540
 *
sl@0
   541
 * - norm32 data words from the first trie
sl@0
   542
 *
sl@0
   543
 * The norm32Table contains one 32-bit word "norm32" per code point.
sl@0
   544
 * It contains the following bit fields:
sl@0
   545
 * 31..16   extra data index, _NORM_EXTRA_SHIFT is used to shift this field down
sl@0
   546
 *          if this index is <_NORM_EXTRA_INDEX_TOP then it is an index into
sl@0
   547
 *              extraData[] where variable-length normalization data for this
sl@0
   548
 *              code point is found
sl@0
   549
 *          if this index is <_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP
sl@0
   550
 *              then this is a norm32 for a leading surrogate, and the index
sl@0
   551
 *              value is used together with the following trailing surrogate
sl@0
   552
 *              code unit in the second trie access
sl@0
   553
 *          if this index is >=_NORM_EXTRA_INDEX_TOP+_NORM_EXTRA_SURROGATE_TOP
sl@0
   554
 *              then this is a norm32 for a "special" character,
sl@0
   555
 *              i.e., the character is a Hangul syllable or a Jamo
sl@0
   556
 *              see _NORM_EXTRA_HANGUL etc.
sl@0
   557
 *          generally, instead of extracting this index from the norm32 and
sl@0
   558
 *              comparing it with the above constants,
sl@0
   559
 *              the normalization code compares the entire norm32 value
sl@0
   560
 *              with _NORM_MIN_SPECIAL, _NORM_SURROGATES_TOP, _NORM_MIN_HANGUL etc.
sl@0
   561
 *
sl@0
   562
 * 15..8    combining class (cc) according to UnicodeData.txt
sl@0
   563
 *
sl@0
   564
 *  7..6    _NORM_COMBINES_ANY flags, used in composition to see if a character
sl@0
   565
 *              combines with any following or preceding character(s)
sl@0
   566
 *              at all
sl@0
   567
 *     7    _NORM_COMBINES_BACK
sl@0
   568
 *     6    _NORM_COMBINES_FWD
sl@0
   569
 *
sl@0
   570
 *  5..0    quick check flags, set for "no" or "maybe", with separate flags for
sl@0
   571
 *              each normalization form
sl@0
   572
 *              the higher bits are "maybe" flags; for NF*D there are no such flags
sl@0
   573
 *              the lower bits are "no" flags for all forms, in the same order
sl@0
   574
 *              as the "maybe" flags,
sl@0
   575
 *              which is (MSB to LSB): NFKD NFD NFKC NFC
sl@0
   576
 *  5..4    _NORM_QC_ANY_MAYBE
sl@0
   577
 *  3..0    _NORM_QC_ANY_NO
sl@0
   578
 *              see further related constants
sl@0
   579
 *
sl@0
   580
 *
sl@0
   581
 * - Extra data per code point
sl@0
   582
 *
sl@0
   583
 * "Extra data" is referenced by the index in norm32.
sl@0
   584
 * It is variable-length data. It is only present, and only those parts
sl@0
   585
 * of it are, as needed for a given character.
sl@0
   586
 * The norm32 extra data index is added to the beginning of extraData[]
sl@0
   587
 * to get to a vector of 16-bit words with data at the following offsets:
sl@0
   588
 *
sl@0
   589
 * [-1]     Combining index for composition.
sl@0
   590
 *              Stored only if norm32&_NORM_COMBINES_ANY .
sl@0
   591
 * [0]      Lengths of the canonical and compatibility decomposition strings.
sl@0
   592
 *              Stored only if there are decompositions, i.e.,
sl@0
   593
 *              if norm32&(_NORM_QC_NFD|_NORM_QC_NFKD)
sl@0
   594
 *          High byte: length of NFKD, or 0 if none
sl@0
   595
 *          Low byte: length of NFD, or 0 if none
sl@0
   596
 *          Each length byte also has another flag:
sl@0
   597
 *              Bit 7 of a length byte is set if there are non-zero
sl@0
   598
 *              combining classes (cc's) associated with the respective
sl@0
   599
 *              decomposition. If this flag is set, then the decomposition
sl@0
   600
 *              is preceded by a 16-bit word that contains the
sl@0
   601
 *              leading and trailing cc's.
sl@0
   602
 *              Bits 6..0 of a length byte are the length of the
sl@0
   603
 *              decomposition string, not counting the cc word.
sl@0
   604
 * [1..n]   NFD
sl@0
   605
 * [n+1..]  NFKD
sl@0
   606
 *
sl@0
   607
 * Each of the two decompositions consists of up to two parts:
sl@0
   608
 * - The 16-bit word with the leading and trailing cc's.
sl@0
   609
 *   This is only stored if bit 7 of the corresponding length byte
sl@0
   610
 *   is set. In this case, at least one of the cc's is not zero.
sl@0
   611
 *   High byte: leading cc==cc of the first code point in the decomposition string
sl@0
   612
 *   Low byte: trailing cc==cc of the last code point in the decomposition string
sl@0
   613
 * - The decomposition string in UTF-16, with as many code units as bits 6..0 of the length byte indicate.
sl@0
   614
 *
sl@0
   615
 *
sl@0
   616
 * - Combining indexes and combiningTable[]
sl@0
   617
 *
sl@0
   618
 * Combining indexes are stored at the [-1] offset of the extra data
sl@0
   619
 * if the character combines forward or backward with any other characters.
sl@0
   620
 * They are used for (re)composition in NF*C.
sl@0
   621
 * Values of combining indexes are arranged according to whether a character
sl@0
   622
 * combines forward, backward, or both ways:
sl@0
   623
 *    forward-only < both ways < backward-only
sl@0
   624
 *
sl@0
   625
 * The index values for forward-only and both-ways combining characters
sl@0
   626
 * are indexes into the combiningTable[].
sl@0
   627
 * The index values for backward-only combining characters are simply
sl@0
   628
 * incremented from the preceding index values to be unique.
sl@0
   629
 *
sl@0
   630
 * In the combiningTable[], a variable-length list
sl@0
   631
 * of variable-length (back-index, code point) pair entries is stored
sl@0
   632
 * for each forward-combining character.
sl@0
   633
 *
sl@0
   634
 * These back-indexes are the combining indexes of both-ways or backward-only
sl@0
   635
 * combining characters that the forward-combining character combines with.
sl@0
   636
 *
sl@0
   637
 * Each list is sorted in ascending order of back-indexes.
sl@0
   638
 * Each list is terminated with the last back-index having bit 15 set.
sl@0
   639
 *
sl@0
   640
 * Each pair (back-index, code point) takes up either 2 or 3
sl@0
   641
 * 16-bit words.
sl@0
   642
 * The first word of a list entry is the back-index, with its bit 15 set if
sl@0
   643
 * this is the last pair in the list.
sl@0
   644
 *
sl@0
   645
 * The second word contains flags in bits 15..13 that determine
sl@0
   646
 * if there is a third word and how the combined character is encoded:
sl@0
   647
 * 15   set if there is a third word in this list entry
sl@0
   648
 * 14   set if the result is a supplementary character
sl@0
   649
 * 13   set if the result itself combines forward
sl@0
   650
 *
sl@0
   651
 * According to these bits 15..14 of the second word,
sl@0
   652
 * the result character is encoded as follows:
sl@0
   653
 * 00 or 01 The result is <=0x1fff and stored in bits 12..0 of
sl@0
   654
 *          the second word.
sl@0
   655
 * 10       The result is 0x2000..0xffff and stored in the third word.
sl@0
   656
 *          Bits 12..0 of the second word are not used.
sl@0
   657
 * 11       The result is a supplementary character.
sl@0
   658
 *          Bits 9..0 of the leading surrogate are in bits 9..0 of
sl@0
   659
 *          the second word.
sl@0
   660
 *          Add 0xd800 to these bits to get the complete surrogate.
sl@0
   661
 *          Bits 12..10 of the second word are not used.
sl@0
   662
 *          The trailing surrogate is stored in the third word.
sl@0
   663
 *
sl@0
   664
 *
sl@0
   665
 * - FCD trie
sl@0
   666
 *
sl@0
   667
 * The FCD trie is very simple.
sl@0
   668
 * It is a folded trie with 16-bit data words.
sl@0
   669
 * In each word, the high byte contains the leading cc of the character,
sl@0
   670
 * and the low byte contains the trailing cc of the character.
sl@0
   671
 * These cc's are the cc's of the first and last code points in the
sl@0
   672
 * canonical decomposition of the character.
sl@0
   673
 *
sl@0
   674
 * Since all 16 bits are used for cc's, lead surrogates must be tested
sl@0
   675
 * by checking the code unit instead of the trie data.
sl@0
   676
 * This is done only if the 16-bit data word is not zero.
sl@0
   677
 * If the code unit is a leading surrogate and the data word is not zero,
sl@0
   678
 * then instead of cc's it contains the offset for the second trie lookup.
sl@0
   679
 *
sl@0
   680
 *
sl@0
   681
 * - Auxiliary trie and data
sl@0
   682
 *
sl@0
   683
 * The auxiliary 16-bit trie contains data for additional properties.
sl@0
   684
 * Bits
sl@0
   685
 * 15..13   reserved
sl@0
   686
 *     12   not NFC_Skippable (f) (formatVersion>=2.2)
sl@0
   687
 *     11   flag: not a safe starter for canonical closure
sl@0
   688
 *     10   composition exclusion
sl@0
   689
 *  9.. 0   index into extraData[] to FC_NFKC_Closure string
sl@0
   690
 *          (not for lead surrogate),
sl@0
   691
 *          or lead surrogate offset (for lead surrogate, if 9..0 not zero)
sl@0
   692
 *
sl@0
   693
 * - FC_NFKC_Closure strings in extraData[]
sl@0
   694
 *
sl@0
   695
 * Strings are either stored as a single code unit or as the length
sl@0
   696
 * followed by that many units.
sl@0
   697
 *   const UChar *s=extraData+(index from auxTrie data bits 9..0);
sl@0
   698
 *   int32_t length;
sl@0
   699
 *   if(*s<0xff00) {
sl@0
   700
 *     // s points to the single-unit string
sl@0
   701
 *     length=1;
sl@0
   702
 *   } else {
sl@0
   703
 *     length=*s&0xff;
sl@0
   704
 *     ++s;
sl@0
   705
 *   }
sl@0
   706
 *
sl@0
   707
 * Conditions for "NF* Skippable" from Mark Davis' com.ibm.text.UCD.NFSkippable:
sl@0
   708
 * (used in NormalizerTransliterator)
sl@0
   709
 *
sl@0
   710
 * A skippable character is
sl@0
   711
 * a) unassigned, or ALL of the following:
sl@0
   712
 * b) of combining class 0.
sl@0
   713
 * c) not decomposed by this normalization form.
sl@0
   714
 * AND if NFC or NFKC,
sl@0
   715
 * d) can never compose with a previous character.
sl@0
   716
 * e) can never compose with a following character.
sl@0
   717
 * f) can never change if another character is added.
sl@0
   718
 *    Example: a-breve might satisfy all but f, but if you
sl@0
   719
 *    add an ogonek it changes to a-ogonek + breve
sl@0
   720
 *
sl@0
   721
 * a)..e) must be tested from norm32.
sl@0
   722
 * Since f) is more complicated, the (not-)NFC_Skippable flag (f) is built
sl@0
   723
 * into the auxiliary trie.
sl@0
   724
 * The same bit is used for NFC and NFKC; (c) differs for them.
sl@0
   725
 * As usual, we build the "not skippable" flags so that unassigned
sl@0
   726
 * code points get a 0 bit.
sl@0
   727
 * This bit is only valid after (a)..(e) test FALSE; test NFD_NO before (f) as well.
sl@0
   728
 * Test Hangul LV syllables entirely in code.
sl@0
   729
 *
sl@0
   730
 *
sl@0
   731
 * - structure inside canonStartSets[]
sl@0
   732
 *
sl@0
   733
 * This array maps from code points c to sets of code points (USerializedSet).
sl@0
   734
 * The result sets are the code points whose canonical decompositions start
sl@0
   735
 * with c.
sl@0
   736
 *
sl@0
   737
 * canonStartSets[] contains the following sub-arrays:
sl@0
   738
 *
sl@0
   739
 * indexes[_NORM_SET_INDEX_TOP]
sl@0
   740
 *   - contains lengths of sub-arrays etc.
sl@0
   741
 *
sl@0
   742
 * startSets[indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH]-_NORM_SET_INDEX_TOP]
sl@0
   743
 *   - contains serialized sets (USerializedSet) of canonical starters for
sl@0
   744
 *     enumerating canonically equivalent strings
sl@0
   745
 *     indexes[_NORM_SET_INDEX_CANON_SETS_LENGTH] includes _NORM_SET_INDEX_TOP
sl@0
   746
 *     for details about the structure see uset.c
sl@0
   747
 *
sl@0
   748
 * bmpTable[indexes[_NORM_SET_INDEX_CANON_BMP_TABLE_LENGTH]]
sl@0
   749
 *   - a sorted search table for BMP code points whose results are
sl@0
   750
 *     either indexes to USerializedSets or single code points for
sl@0
   751
 *     single-code point sets;
sl@0
   752
 *     each entry is a pair of { code point, result } with result=(binary) yy xxxxxx xxxxxxxx
sl@0
   753
 *     if yy==01 then there is a USerializedSet at canonStartSets+x
sl@0
   754
 *     else build a USerializedSet with result as the single code point
sl@0
   755
 *
sl@0
   756
 * suppTable[indexes[_NORM_SET_INDEX_CANON_SUPP_TABLE_LENGTH]]
sl@0
   757
 *   - a sorted search table for supplementary code points whose results are
sl@0
   758
 *     either indexes to USerializedSets or single code points for
sl@0
   759
 *     single-code point sets;
sl@0
   760
 *     each entry is a triplet of { high16(cp), low16(cp), result }
sl@0
   761
 *     each code point's high-word may contain extra data in bits 15..5:
sl@0
   762
 *     if the high word has bit 15 set, then build a set with a single code point
sl@0
   763
 *     which is ((high16(cp)&0x1f00)<<8)|result;
sl@0
   764
 *     else there is a USerializedSet at canonStartSets+result
sl@0
   765
 *
sl@0
   766
 * FormatVersion 2.3 adds 2 serialized sets for normalization exclusions.
sl@0
   767
 * They are stored in the data file so that the runtime normalization code need
sl@0
   768
 * not depend on other properties and their data and implementation files.
sl@0
   769
 * The _NORM_SET_INDEX_NX_..._OFFSET offsets in the canonStartSets index table
sl@0
   770
 * give the location for each set.
sl@0
   771
 * There is no set stored for UNORM_NX_HANGUL because it's trivial to create
sl@0
   772
 * without using properties.
sl@0
   773
 *
sl@0
   774
 * Set contents:
sl@0
   775
 *
sl@0
   776
 * _NORM_SET_INDEX_NX_CJK_COMPAT_OFFSET (for UNORM_NX_CJK_COMPAT)
sl@0
   777
 *     [[:Ideographic:]&[:NFD_QC=No:]]
sl@0
   778
 *     =[CJK Ideographs]&[has canonical decomposition]
sl@0
   779
 *
sl@0
   780
 * _NORM_SET_INDEX_NX_UNICODE32_OFFSET (for UNORM_UNICODE_3_2)
sl@0
   781
 *     [:^Age=3.2:]
sl@0
   782
 *     =set with all code points that were not designated by the specified Unicode version
sl@0
   783
 *
sl@0
   784
 * _NORM_SET_INDEX_NX_RESERVED_OFFSET
sl@0
   785
 *     This is an offset that points to where the next, future set would start.
sl@0
   786
 *     Currently it indicates where the previous set ends, and thus its length.
sl@0
   787
 *     The name for this enum constant may in the future be applied to different
sl@0
   788
 *     index slots. In order to get the limit of a set, use its index slot and
sl@0
   789
 *     the immediately following one regardless of that one's enum name.
sl@0
   790
 */
sl@0
   791
sl@0
   792
#endif /* #if !UCONFIG_NO_NORMALIZATION */
sl@0
   793
sl@0
   794
#endif