os/textandloc/fontservices/textshaperplugin/IcuSource/common/unicode/caniter.h
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
sl@0
     1
/*
sl@0
     2
 *******************************************************************************
sl@0
     3
 * Copyright (C) 1996-2005, International Business Machines Corporation and    *
sl@0
     4
 * others. All Rights Reserved.                                                *
sl@0
     5
 *******************************************************************************
sl@0
     6
 */
sl@0
     7
sl@0
     8
#ifndef CANITER_H
sl@0
     9
#define CANITER_H
sl@0
    10
sl@0
    11
#include "unicode/utypes.h"
sl@0
    12
sl@0
    13
#if !UCONFIG_NO_NORMALIZATION
sl@0
    14
sl@0
    15
#include "unicode/uobject.h"
sl@0
    16
#include "unicode/unistr.h"
sl@0
    17
sl@0
    18
/**
sl@0
    19
 * \file
sl@0
    20
 * \brief C++ API: Canonical Iterator
sl@0
    21
 */
sl@0
    22
 
sl@0
    23
/**
 * Should permutation skip characters with combining class zero?
 * Must be either TRUE or FALSE. This is a compile-time option: the default
 * below is only applied when CANITER_SKIP_ZEROES has not already been
 * defined, so a build can override it by defining the macro before this
 * header is included.
 * @stable ICU 2.4
 */
#ifndef CANITER_SKIP_ZEROES
#define CANITER_SKIP_ZEROES TRUE
#endif
sl@0
    30
sl@0
    31
U_NAMESPACE_BEGIN
sl@0
    32
sl@0
    33
class Hashtable;
sl@0
    34
sl@0
    35
/**
sl@0
    36
 * This class allows one to iterate through all the strings that are canonically equivalent to a given
sl@0
    37
 * string. For example, here are some sample results:
sl@0
    38
Results for: {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
sl@0
    39
1: \\u0041\\u030A\\u0064\\u0307\\u0327
sl@0
    40
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
sl@0
    41
2: \\u0041\\u030A\\u0064\\u0327\\u0307
sl@0
    42
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
sl@0
    43
3: \\u0041\\u030A\\u1E0B\\u0327
sl@0
    44
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
sl@0
    45
4: \\u0041\\u030A\\u1E11\\u0307
sl@0
    46
 = {LATIN CAPITAL LETTER A}{COMBINING RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
sl@0
    47
5: \\u00C5\\u0064\\u0307\\u0327
sl@0
    48
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
sl@0
    49
6: \\u00C5\\u0064\\u0327\\u0307
sl@0
    50
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
sl@0
    51
7: \\u00C5\\u1E0B\\u0327
sl@0
    52
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
sl@0
    53
8: \\u00C5\\u1E11\\u0307
sl@0
    54
 = {LATIN CAPITAL LETTER A WITH RING ABOVE}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
sl@0
    55
9: \\u212B\\u0064\\u0307\\u0327
sl@0
    56
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING DOT ABOVE}{COMBINING CEDILLA}
sl@0
    57
10: \\u212B\\u0064\\u0327\\u0307
sl@0
    58
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D}{COMBINING CEDILLA}{COMBINING DOT ABOVE}
sl@0
    59
11: \\u212B\\u1E0B\\u0327
sl@0
    60
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH DOT ABOVE}{COMBINING CEDILLA}
sl@0
    61
12: \\u212B\\u1E11\\u0307
sl@0
    62
 = {ANGSTROM SIGN}{LATIN SMALL LETTER D WITH CEDILLA}{COMBINING DOT ABOVE}
sl@0
    63
 *<br>Note: the code is intended for use with small strings, and is not suitable for larger ones,
sl@0
    64
 * since it has not been optimized for that situation.
sl@0
    65
 * Note, CanonicalIterator is not intended to be subclassed.
sl@0
    66
 * @author M. Davis
sl@0
    67
 * @author C++ port by V. Weinstein
sl@0
    68
 * @stable ICU 2.4
sl@0
    69
 */
sl@0
    70
class U_COMMON_API CanonicalIterator : public UObject {
sl@0
    71
public:
sl@0
    72
    /**
sl@0
    73
     * Construct a CanonicalIterator object
sl@0
    74
     * @param source    string to get results for
sl@0
    75
     * @param status    Fill-in parameter which receives the status of this operation.
sl@0
    76
     * @stable ICU 2.4
sl@0
    77
     */
sl@0
    78
    CanonicalIterator(const UnicodeString &source, UErrorCode &status);
sl@0
    79
sl@0
    80
    /** Destructor
sl@0
    81
     *  Cleans pieces
sl@0
    82
     * @stable ICU 2.4
sl@0
    83
     */
sl@0
    84
    virtual ~CanonicalIterator();
sl@0
    85
sl@0
    86
    /**
sl@0
    87
     * Gets the NFD form of the current source we are iterating over.
sl@0
    88
     * @return gets the source: NOTE: it is the NFD form of source
sl@0
    89
     * @stable ICU 2.4
sl@0
    90
     */
sl@0
    91
    UnicodeString getSource();
sl@0
    92
sl@0
    93
    /**
sl@0
    94
     * Resets the iterator so that one can start again from the beginning.
sl@0
    95
     * @stable ICU 2.4
sl@0
    96
     */
sl@0
    97
    void reset();
sl@0
    98
sl@0
    99
    /**
sl@0
   100
     * Get the next canonically equivalent string.
sl@0
   101
     * <br><b>Warning: The strings are not guaranteed to be in any particular order.</b>
sl@0
   102
     * @return the next string that is canonically equivalent. A bogus string is returned when
sl@0
   103
     * the iteration is done.
sl@0
   104
     * @stable ICU 2.4
sl@0
   105
     */
sl@0
   106
    UnicodeString next();
sl@0
   107
sl@0
   108
    /**
sl@0
   109
     * Set a new source for this iterator. Allows object reuse.
sl@0
   110
     * @param newSource     the source string to iterate against. This allows the same iterator to be used
sl@0
   111
     *                     while changing the source string, saving object creation.
sl@0
   112
     * @param status        Fill-in parameter which receives the status of this operation.
sl@0
   113
     * @stable ICU 2.4
sl@0
   114
     */
sl@0
   115
    void setSource(const UnicodeString &newSource, UErrorCode &status);
sl@0
   116
sl@0
   117
    /**
sl@0
   118
     * Dumb recursive implementation of permutation.
sl@0
   119
     * TODO: optimize
sl@0
   120
     * @param source     the string to find permutations for
sl@0
   121
     * @param skipZeros  determine if skip zeros
sl@0
   122
     * @param result     the results in a set.
sl@0
   123
     * @param status       Fill-in parameter which receives the status of this operation.
sl@0
   124
     * @internal
sl@0
   125
     */
sl@0
   126
    static void U_EXPORT2 permute(UnicodeString &source, UBool skipZeros, Hashtable *result, UErrorCode &status);
sl@0
   127
sl@0
   128
    /**
sl@0
   129
     * ICU "poor man's RTTI", returns a UClassID for this class.
sl@0
   130
     *
sl@0
   131
     * @stable ICU 2.2
sl@0
   132
     */
sl@0
   133
    static UClassID U_EXPORT2 getStaticClassID();
sl@0
   134
sl@0
   135
    /**
sl@0
   136
     * ICU "poor man's RTTI", returns a UClassID for the actual class.
sl@0
   137
     *
sl@0
   138
     * @stable ICU 2.2
sl@0
   139
     */
sl@0
   140
    virtual UClassID getDynamicClassID() const;
sl@0
   141
sl@0
   142
private:
sl@0
   143
    // ===================== PRIVATES ==============================
sl@0
   144
    // private default constructor
sl@0
   145
    CanonicalIterator();
sl@0
   146
sl@0
   147
sl@0
   148
    /**
sl@0
   149
     * Copy constructor. Private for now.
sl@0
   150
     * @internal
sl@0
   151
     */
sl@0
   152
    CanonicalIterator(const CanonicalIterator& other);
sl@0
   153
sl@0
   154
    /**
sl@0
   155
     * Assignment operator. Private for now.
sl@0
   156
     * @internal
sl@0
   157
     */
sl@0
   158
    CanonicalIterator& operator=(const CanonicalIterator& other);
sl@0
   159
sl@0
   160
    // fields
sl@0
   161
    UnicodeString source;
sl@0
   162
    UBool done;
sl@0
   163
sl@0
   164
    // 2 dimensional array holds the pieces of the string with
sl@0
   165
    // their different canonically equivalent representations
sl@0
   166
    UnicodeString **pieces;
sl@0
   167
    int32_t pieces_length;
sl@0
   168
    int32_t *pieces_lengths;
sl@0
   169
sl@0
   170
    // current is used in iterating to combine pieces
sl@0
   171
    int32_t *current;
sl@0
   172
    int32_t current_length;
sl@0
   173
sl@0
   174
    // transient fields
sl@0
   175
    UnicodeString buffer;
sl@0
   176
sl@0
   177
    // we have a segment, in NFD. Find all the strings that are canonically equivalent to it.
sl@0
   178
    UnicodeString *getEquivalents(const UnicodeString &segment, int32_t &result_len, UErrorCode &status); //private String[] getEquivalents(String segment)
sl@0
   179
sl@0
   180
    //Set getEquivalents2(String segment);
sl@0
   181
    Hashtable *getEquivalents2(const UChar *segment, int32_t segLen, UErrorCode &status);
sl@0
   182
    //Hashtable *getEquivalents2(const UnicodeString &segment, int32_t segLen, UErrorCode &status);
sl@0
   183
sl@0
   184
    /**
sl@0
   185
     * See if the decomposition of cp2 is at segment starting at segmentPos
sl@0
   186
     * (with canonical rearrangment!)
sl@0
   187
     * If so, take the remainder, and return the equivalents
sl@0
   188
     */
sl@0
   189
    //Set extract(int comp, String segment, int segmentPos, StringBuffer buffer);
sl@0
   190
    Hashtable *extract(UChar32 comp, const UChar *segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
sl@0
   191
    //Hashtable *extract(UChar32 comp, const UnicodeString &segment, int32_t segLen, int32_t segmentPos, UErrorCode &status);
sl@0
   192
sl@0
   193
    void cleanPieces();
sl@0
   194
sl@0
   195
};
sl@0
   196
sl@0
   197
U_NAMESPACE_END
sl@0
   198
sl@0
   199
#endif /* #if !UCONFIG_NO_NORMALIZATION */
sl@0
   200
sl@0
   201
#endif