os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/uidna.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
 *******************************************************************************
sl@0
     3
 *
sl@0
     4
 *   Copyright (C) 2003-2005, International Business Machines
sl@0
     5
 *   Corporation and others.  All Rights Reserved.
sl@0
     6
 *
sl@0
     7
 *******************************************************************************
sl@0
     8
 *   file name:  uidna.h
sl@0
     9
 *   encoding:   US-ASCII
sl@0
    10
 *   tab size:   8 (not used)
sl@0
    11
 *   indentation:4
sl@0
    12
 *
sl@0
    13
 *   created on: 2003feb1
sl@0
    14
 *   created by: Ram Viswanadha
sl@0
    15
 */
sl@0
    16
sl@0
    17
#ifndef __UIDNA_H__
sl@0
    18
#define __UIDNA_H__
sl@0
    19
sl@0
    20
#include "unicode/utypes.h"
sl@0
    21
sl@0
    22
#if !UCONFIG_NO_IDNA
sl@0
    23
sl@0
    24
#include "unicode/parseerr.h"
sl@0
    25
  
sl@0
    26
/**
sl@0
    27
 * \file
sl@0
    28
 * \brief C API: Internationalized Domain Names in Applications Transformation
sl@0
    29
 *
sl@0
    30
 * UIDNA API implements the IDNA protocol as defined in the IDNA RFC 
sl@0
    31
 * (http://www.ietf.org/rfc/rfc3490.txt).
sl@0
    32
 * The RFC defines 2 operations: ToASCII and ToUnicode. Domain labels 
sl@0
    33
 * containing non-ASCII code points are required to be processed by
sl@0
    34
 * ToASCII operation before passing it to resolver libraries. Domain names
sl@0
    35
 * that are obtained from resolver libraries are required to be processed by
sl@0
    36
 * ToUnicode operation before displaying the domain name to the user.
sl@0
    37
 * IDNA requires that implementations process input strings with Nameprep
sl@0
    38
 * (http://www.ietf.org/rfc/rfc3491.txt), 
sl@0
    39
 * which is a profile of Stringprep (http://www.ietf.org/rfc/rfc3454.txt), 
sl@0
    40
 * and then with Punycode (http://www.ietf.org/rfc/rfc3492.txt). 
sl@0
    41
 * Implementations of IDNA MUST fully implement Nameprep and Punycode; 
sl@0
    42
 * neither Nameprep nor Punycode is optional.
sl@0
    43
 * The input and output of ToASCII and ToUnicode operations are Unicode 
sl@0
    44
 * and are designed to be chainable, i.e., applying ToASCII or ToUnicode operations
sl@0
    45
 * multiple times to an input string will yield the same result as applying the operation
sl@0
    46
 * once.
sl@0
    47
 * ToUnicode(ToUnicode(ToUnicode...(ToUnicode(string)))) == ToUnicode(string) 
sl@0
    48
 * ToASCII(ToASCII(ToASCII...(ToASCII(string)))) == ToASCII(string).
sl@0
    49
 *
sl@0
    50
 */
sl@0
    51
sl@0
    52
#ifndef U_HIDE_DRAFT_API
sl@0
    53
sl@0
    54
/** 
sl@0
    55
 * Default option: prohibit processing of unassigned code points in the input and
sl@0
    56
 * do not check if the input conforms to STD-3 ASCII rules.
sl@0
    57
 * 
sl@0
    58
 * @see  uidna_toASCII uidna_toUnicode
sl@0
    59
 * @stable ICU 2.6
sl@0
    60
 */
sl@0
    61
#define UIDNA_DEFAULT          0x0000
sl@0
    62
/** 
sl@0
    63
 * Option to allow processing of unassigned codepoints in the input
sl@0
    64
 * 
sl@0
    65
 * @see  uidna_toASCII uidna_toUnicode
sl@0
    66
 * @stable ICU 2.6
sl@0
    67
 */
sl@0
    68
#define UIDNA_ALLOW_UNASSIGNED 0x0001
sl@0
    69
/** 
sl@0
    70
 * Option to check if input conforms to STD-3 ASCII rules
sl@0
    71
 * 
sl@0
    72
 * @see  uidna_toASCII uidna_toUnicode
sl@0
    73
 * @stable ICU 2.6
sl@0
    74
 */
sl@0
    75
#define UIDNA_USE_STD3_RULES   0x0002
sl@0
    76
sl@0
    77
#endif /*U_HIDE_DRAFT_API*/
sl@0
    78
    
sl@0
    79
/**
sl@0
    80
 * This function implements the ToASCII operation as defined in the IDNA RFC.
sl@0
    81
 * This operation is done on <b>single labels</b> before sending it to something that expects
sl@0
    82
 * ASCII names. A label is an individual part of a domain name. Labels are usually
sl@0
    83
 * separated by dots; e.g., "www.example.com" is composed of 3 labels 
sl@0
    84
 * "www","example", and "com".
sl@0
    85
 *
sl@0
    86
 *
sl@0
    87
 * @param src               Input UChar array containing label in Unicode.
sl@0
    88
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
sl@0
    89
 * @param dest              Output UChar array with ASCII (ACE encoded) label.
sl@0
    90
 * @param destCapacity      Size of dest.
sl@0
    91
 * @param options           A bit set of options:
sl@0
    92
 *
sl@0
    93
 *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
sl@0
    94
 *                              and do not use STD3 ASCII rules
sl@0
    95
 *                              If unassigned code points are found the operation fails with 
sl@0
    96
 *                              U_IDNA_UNASSIGNED_ERROR error code.
sl@0
    97
 *
sl@0
    98
 *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
sl@0
    99
 *                              If this option is set, the unassigned code points in the input 
sl@0
   100
 *                              are treated as normal Unicode code points.
sl@0
   101
 *                          
sl@0
   102
 *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
sl@0
   103
 *                              If this option is set and the input does not satisfy STD3 rules,  
sl@0
   104
 *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
sl@0
   105
 *
sl@0
   106
 * @param parseError        Pointer to UParseError struct to receive information on position 
sl@0
   107
 *                          of error if an error is encountered. Can be NULL.
sl@0
   108
 * @param status            ICU in/out error code parameter.
sl@0
   109
 *                          U_INVALID_CHAR_FOUND if src contains
sl@0
   110
 *                          unmatched single surrogates.
sl@0
   111
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
sl@0
   112
 *                          too many code points.
sl@0
   113
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
sl@0
   114
 * @return                  Number of ASCII characters converted.
sl@0
   115
 * @stable ICU 2.6
sl@0
   116
 */
sl@0
   117
U_STABLE int32_t U_EXPORT2
sl@0
   118
uidna_toASCII(const UChar* src, int32_t srcLength, 
sl@0
   119
              UChar* dest, int32_t destCapacity,
sl@0
   120
              int32_t options,
sl@0
   121
              UParseError* parseError,
sl@0
   122
              UErrorCode* status);
sl@0
   123
sl@0
   124
sl@0
   125
/**
sl@0
   126
 * This function implements the ToUnicode operation as defined in the IDNA RFC.
sl@0
   127
 * This operation is done on <b>single labels</b> before sending it to something that expects
sl@0
   128
 * Unicode names. A label is an individual part of a domain name. Labels are usually
sl@0
   129
 * separated by dots; e.g., "www.example.com" is composed of 3 labels 
sl@0
   130
 * "www","example", and "com".
sl@0
   131
 *
sl@0
   132
 * @param src               Input UChar array containing ASCII (ACE encoded) label.
sl@0
   133
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
sl@0
   134
 * @param dest              Output UChar array containing the converted Unicode equivalent of label.
sl@0
   135
 * @param destCapacity      Size of dest.
sl@0
   136
 * @param options           A bit set of options:
sl@0
   137
 *  
sl@0
   138
 *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
sl@0
   139
 *                              and do not use STD3 ASCII rules
sl@0
   140
 *                              If unassigned code points are found the operation fails with 
sl@0
   141
 *                              U_IDNA_UNASSIGNED_ERROR error code.
sl@0
   142
 *
sl@0
   143
 *  - UIDNA_ALLOW_UNASSIGNED      Unassigned values can be converted to ASCII for query operations
sl@0
   144
 *                              If this option is set, the unassigned code points in the input 
sl@0
   145
 *                              are treated as normal Unicode code points. <b> Note: </b> This option is 
sl@0
   146
 *                              required on toUnicode operation because the RFC mandates 
sl@0
   147
 *                              verification of decoded ACE input by applying toASCII and comparing
sl@0
   148
 *                              its output with source
sl@0
   149
 *
sl@0
   150
 *                          
sl@0
   151
 *                          
sl@0
   152
 *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
sl@0
   153
 *                              If this option is set and the input does not satisfy STD3 rules,  
sl@0
   154
 *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
sl@0
   155
 *
sl@0
   156
 * @param parseError        Pointer to UParseError struct to receive information on position 
sl@0
   157
 *                          of error if an error is encountered. Can be NULL.
sl@0
   158
 * @param status            ICU in/out error code parameter.
sl@0
   159
 *                          U_INVALID_CHAR_FOUND if src contains
sl@0
   160
 *                          unmatched single surrogates.
sl@0
   161
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
sl@0
   162
 *                          too many code points.
sl@0
   163
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
sl@0
   164
 * @return                  Number of Unicode characters converted.
sl@0
   165
 * @stable ICU 2.6
sl@0
   166
 */
sl@0
   167
U_STABLE int32_t U_EXPORT2
sl@0
   168
uidna_toUnicode(const UChar* src, int32_t srcLength,
sl@0
   169
                UChar* dest, int32_t destCapacity,
sl@0
   170
                int32_t options,
sl@0
   171
                UParseError* parseError,
sl@0
   172
                UErrorCode* status);
sl@0
   173
sl@0
   174
sl@0
   175
/**
sl@0
   176
 * Convenience function that implements the IDNToASCII operation as defined in the IDNA RFC.
sl@0
   177
 * This operation is done on complete domain names, e.g: "www.example.com". 
sl@0
   178
 * It is important to note that this operation can fail. If it fails, then the input 
sl@0
   179
 * domain name cannot be used as an Internationalized Domain Name and the application
sl@0
   180
 * should have methods defined to deal with the failure.
sl@0
   181
 * 
sl@0
   182
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
sl@0
   183
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
sl@0
   184
 * and then convert. This function does not offer that level of granularity. The options once  
sl@0
   185
 * set will apply to all labels in the domain name
sl@0
   186
 *
sl@0
   187
 * @param src               Input UChar array containing IDN in Unicode.
sl@0
   188
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
sl@0
   189
 * @param dest              Output UChar array with ASCII (ACE encoded) IDN.
sl@0
   190
 * @param destCapacity      Size of dest.
sl@0
   191
 * @param options           A bit set of options:
sl@0
   192
 *  
sl@0
   193
 *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
sl@0
   194
 *                              and do not use STD3 ASCII rules
sl@0
   195
 *                              If unassigned code points are found the operation fails with 
sl@0
   196
 *                              U_IDNA_UNASSIGNED_ERROR error code.
sl@0
   197
 *
sl@0
   198
 *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
sl@0
   199
 *                              If this option is set, the unassigned code points in the input 
sl@0
   200
 *                              are treated as normal Unicode code points.
sl@0
   201
 *                          
sl@0
   202
 *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
sl@0
   203
 *                              If this option is set and the input does not satisfy STD3 rules,  
sl@0
   204
 *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
sl@0
   205
 * 
sl@0
   206
 * @param parseError        Pointer to UParseError struct to receive information on position 
sl@0
   207
 *                          of error if an error is encountered. Can be NULL.
sl@0
   208
 * @param status            ICU in/out error code parameter.
sl@0
   209
 *                          U_INVALID_CHAR_FOUND if src contains
sl@0
   210
 *                          unmatched single surrogates.
sl@0
   211
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
sl@0
   212
 *                          too many code points.
sl@0
   213
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
sl@0
   214
 * @return                  Number of ASCII characters converted.
sl@0
   215
 * @stable ICU 2.6
sl@0
   216
 */
sl@0
   217
U_STABLE int32_t U_EXPORT2
sl@0
   218
uidna_IDNToASCII(  const UChar* src, int32_t srcLength,
sl@0
   219
                   UChar* dest, int32_t destCapacity,
sl@0
   220
                   int32_t options,
sl@0
   221
                   UParseError* parseError,
sl@0
   222
                   UErrorCode* status);
sl@0
   223
sl@0
   224
/**
sl@0
   225
 * Convenience function that implements the IDNToUnicode operation as defined in the IDNA RFC.
sl@0
   226
 * This operation is done on complete domain names, e.g: "www.example.com". 
sl@0
   227
 *
sl@0
   228
 * <b>Note:</b> IDNA RFC specifies that a conformant application should divide a domain name
sl@0
   229
 * into separate labels, decide whether to apply allowUnassigned and useSTD3ASCIIRules on each, 
sl@0
   230
 * and then convert. This function does not offer that level of granularity. The options once  
sl@0
   231
 * set will apply to all labels in the domain name
sl@0
   232
 *
sl@0
   233
 * @param src               Input UChar array containing IDN in ASCII (ACE encoded) form.
sl@0
   234
 * @param srcLength         Number of UChars in src, or -1 if NUL-terminated.
sl@0
   235
 * @param dest              Output UChar array containing Unicode equivalent of source IDN.
sl@0
   236
 * @param destCapacity      Size of dest.
sl@0
   237
 * @param options           A bit set of options:
sl@0
   238
 *  
sl@0
   239
 *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
sl@0
   240
 *                              and do not use STD3 ASCII rules
sl@0
   241
 *                              If unassigned code points are found the operation fails with 
sl@0
   242
 *                              U_IDNA_UNASSIGNED_ERROR error code.
sl@0
   243
 *
sl@0
   244
 *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
sl@0
   245
 *                              If this option is set, the unassigned code points in the input 
sl@0
   246
 *                              are treated as normal Unicode code points.
sl@0
   247
 *                          
sl@0
   248
 *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
sl@0
   249
 *                              If this option is set and the input does not satisfy STD3 rules,  
sl@0
   250
 *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
sl@0
   251
 *
sl@0
   252
 * @param parseError        Pointer to UParseError struct to receive information on position 
sl@0
   253
 *                          of error if an error is encountered. Can be NULL.
sl@0
   254
 * @param status            ICU in/out error code parameter.
sl@0
   255
 *                          U_INVALID_CHAR_FOUND if src contains
sl@0
   256
 *                          unmatched single surrogates.
sl@0
   257
 *                          U_INDEX_OUTOFBOUNDS_ERROR if src contains
sl@0
   258
 *                          too many code points.
sl@0
   259
 *                          U_BUFFER_OVERFLOW_ERROR if destCapacity is not enough
sl@0
   260
 * @return                  Number of Unicode characters converted.
sl@0
   261
 * @stable ICU 2.6
sl@0
   262
 */
sl@0
   263
U_STABLE int32_t U_EXPORT2
sl@0
   264
uidna_IDNToUnicode(  const UChar* src, int32_t srcLength,
sl@0
   265
                     UChar* dest, int32_t destCapacity,
sl@0
   266
                     int32_t options,
sl@0
   267
                     UParseError* parseError,
sl@0
   268
                     UErrorCode* status);
sl@0
   269
sl@0
   270
/**
sl@0
   271
 * Compare two IDN strings for equivalence.
sl@0
   272
 * This function splits the domain names into labels and compares them.
sl@0
   273
 * According to IDN RFC, whenever two labels are compared, they are 
sl@0
   274
 * considered equal if and only if their ASCII forms (obtained by 
sl@0
   275
 * applying toASCII) match using a case-insensitive ASCII comparison.
sl@0
   276
 * Two domain names are considered a match if and only if all labels 
sl@0
   277
 * match regardless of whether label separators match.
sl@0
   278
 *
sl@0
   279
 * @param s1                First source string.
sl@0
   280
 * @param length1           Length of first source string, or -1 if NUL-terminated.
sl@0
   281
 *
sl@0
   282
 * @param s2                Second source string.
sl@0
   283
 * @param length2           Length of second source string, or -1 if NUL-terminated.
sl@0
   284
 * @param options           A bit set of options:
sl@0
   285
 *  
sl@0
   286
 *  - UIDNA_DEFAULT             Use default options, i.e., do not process unassigned code points
sl@0
   287
 *                              and do not use STD3 ASCII rules
sl@0
   288
 *                              If unassigned code points are found the operation fails with 
sl@0
   289
 *                              U_IDNA_UNASSIGNED_ERROR error code.
sl@0
   290
 *
sl@0
   291
 *  - UIDNA_ALLOW_UNASSIGNED    Unassigned values can be converted to ASCII for query operations
sl@0
   292
 *                              If this option is set, the unassigned code points are in the input 
sl@0
   293
 *                              are treated as normal Unicode code points.
sl@0
   294
 *                          
sl@0
   295
 *  - UIDNA_USE_STD3_RULES      Use STD3 ASCII rules for host name syntax restrictions
sl@0
   296
 *                              If this option is set and the input does not satisfy STD3 rules,  
sl@0
   297
 *                              the operation will fail with U_IDNA_STD3_ASCII_RULES_ERROR
sl@0
   298
 *
sl@0
   299
 * @param status            ICU error code in/out parameter.
sl@0
   300
 *                          Must fulfill U_SUCCESS before the function call.
sl@0
   301
 * @return <0 or 0 or >0 as usual for string comparisons
sl@0
   302
 * @stable ICU 2.6
sl@0
   303
 */
sl@0
   304
U_STABLE int32_t U_EXPORT2
sl@0
   305
uidna_compare(  const UChar *s1, int32_t length1,
sl@0
   306
                const UChar *s2, int32_t length2,
sl@0
   307
                int32_t options,
sl@0
   308
                UErrorCode* status);
sl@0
   309
sl@0
   310
#endif /* #if !UCONFIG_NO_IDNA */
sl@0
   311
sl@0
   312
#endif