sl@0: /*
sl@0: *******************************************************************************
sl@0: *
sl@0: *   Copyright (C) 2004-2005, International Business Machines
sl@0: *   Corporation and others.  All Rights Reserved.
sl@0: *
sl@0: *******************************************************************************
sl@0: *   file name:  ucase.h
sl@0: *   encoding:   US-ASCII
sl@0: *   tab size:   8 (not used)
sl@0: *   indentation:4
sl@0: *
sl@0: *   created on: 2004aug30
sl@0: *   created by: Markus W. Scherer
sl@0: *
sl@0: *   Low-level Unicode character/string case mapping code.
sl@0: */
sl@0: 
sl@0: #ifndef __UCASE_H__
sl@0: #define __UCASE_H__
sl@0: 
sl@0: #include "unicode/utypes.h"
sl@0: #include "unicode/uset.h"
sl@0: #include "uset_imp.h"
sl@0: #include "udataswp.h"
sl@0: 
sl@0: U_CDECL_BEGIN
sl@0: 
sl@0: /* library API -------------------------------------------------------------- */
sl@0: /**
sl@0:  * Opaque handle for the case mapping properties.
sl@0:  * This header only forward-declares the struct; its members are not visible
sl@0:  * here, so users of this header can only hold and pass pointers to it.
sl@0:  */
sl@0: struct UCaseProps;
sl@0: typedef struct UCaseProps UCaseProps;
sl@0: /**
sl@0:  * Returns a non-const UCaseProps pointer.
sl@0:  * NOTE(review): presumably loads the UCASE_DATA_NAME/UCASE_DATA_TYPE data item
sl@0:  * and must be paired with ucase_close() -- the implementation is not visible
sl@0:  * in this header, confirm there.
sl@0:  *
sl@0:  * @param pErrorCode ICU error code pointer; assumed to follow the usual
sl@0:  *                   in/out UErrorCode convention -- TODO confirm.
sl@0:  * @return Pointer to a UCaseProps object (what is returned on failure is not
sl@0:  *         stated in this header).
sl@0:  */
sl@0: U_CAPI UCaseProps * U_EXPORT2
sl@0: ucase_open(UErrorCode *pErrorCode);
sl@0: /**
sl@0:  * Returns a non-const UCaseProps pointer, like ucase_open(), but is given a
sl@0:  * caller-supplied byte buffer.
sl@0:  * NOTE(review): presumably bin holds the binary case properties data and must
sl@0:  * stay valid while the returned object is in use -- confirm against the
sl@0:  * implementation.
sl@0:  *
sl@0:  * @param bin Pointer to the input bytes.
sl@0:  * @param length Presumably the number of bytes available at bin -- TODO confirm
sl@0:  *               (including whether a negative value has a special meaning).
sl@0:  * @param pErrorCode ICU error code pointer; assumed to follow the usual
sl@0:  *                   in/out UErrorCode convention -- TODO confirm.
sl@0:  * @return Pointer to a UCaseProps object.
sl@0:  */
sl@0: U_CAPI UCaseProps * U_EXPORT2
sl@0: ucase_openBinary(const uint8_t *bin, int32_t length, UErrorCode *pErrorCode);
sl@0: /**
sl@0:  * Takes the non-const UCaseProps pointer type that ucase_open() and
sl@0:  * ucase_openBinary() return.
sl@0:  * NOTE(review): judging by the name this releases such an object; confirm in
sl@0:  * the implementation, including whether NULL is accepted.
sl@0:  *
sl@0:  * @param csp Case mapping properties object to close.
sl@0:  */
sl@0: U_CAPI void U_EXPORT2
sl@0: ucase_close(UCaseProps *csp);
sl@0: 
sl@0: /**
sl@0:  * Returns a const UCaseProps pointer.
sl@0:  * NOTE(review): presumably a shared instance that the caller does not own;
sl@0:  * the result is const while ucase_close() takes a non-const pointer, which
sl@0:  * suggests it is not meant to be closed -- confirm in the implementation.
sl@0:  *
sl@0:  * @param pErrorCode ICU error code pointer; assumed to follow the usual
sl@0:  *                   in/out UErrorCode convention -- TODO confirm.
sl@0:  * @return Pointer to the singleton case mapping properties.
sl@0:  */
sl@0: U_CAPI const UCaseProps * U_EXPORT2
sl@0: ucase_getSingleton(UErrorCode *pErrorCode);
sl@0: 
sl@0: /**
sl@0:  * Get a singleton dummy object, one that works with no real data.
sl@0:  * This can be used when the real data is not available.
sl@0:  * Using the dummy can reduce checks for available data after an initial failure.
sl@0:  *
sl@0:  * @param pErrorCode ICU error code pointer; assumed to follow the usual
sl@0:  *                   in/out UErrorCode convention -- TODO confirm.
sl@0:  * @return Const pointer to the dummy object; it has the same type as the
sl@0:  *         result of ucase_getSingleton(), so it can be passed wherever a
sl@0:  *         const UCaseProps * is expected.
sl@0:  */
sl@0: U_CAPI const UCaseProps * U_EXPORT2
sl@0: ucase_getDummy(UErrorCode *pErrorCode);
sl@0: 
sl@0: /**
sl@0:  * Data swapping entry point for the case properties data.
sl@0:  * NOTE(review): the contract is not stated in this header; presumably it
sl@0:  * follows the swapper conventions of udataswp.h (included above), converting
sl@0:  * the data at inData into outData -- confirm there.
sl@0:  *
sl@0:  * @param ds Swapper object (UDataSwapper is not declared in this file).
sl@0:  * @param inData Input data; not modified through this pointer (const).
sl@0:  * @param length Presumably the byte length of inData -- TODO confirm.
sl@0:  * @param outData Output buffer.
sl@0:  * @param pErrorCode ICU error code pointer; assumed to follow the usual
sl@0:  *                   in/out UErrorCode convention -- TODO confirm.
sl@0:  * @return int32_t; presumably a byte count -- TODO confirm.
sl@0:  */
sl@0: U_CAPI int32_t U_EXPORT2
sl@0: ucase_swap(const UDataSwapper *ds,
sl@0:            const void *inData, int32_t length, void *outData,
sl@0:            UErrorCode *pErrorCode);
sl@0: /**
sl@0:  * Reports data from csp through the set adder sa.
sl@0:  * NOTE(review): judging by the name, this adds the start code point of each
sl@0:  * range of code points that share the same case properties -- confirm in the
sl@0:  * implementation.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param sa Set adder that receives the values (USetAdder is not declared in
sl@0:  *           this file).
sl@0:  * @param pErrorCode ICU error code pointer; assumed to follow the usual
sl@0:  *                   in/out UErrorCode convention -- TODO confirm.
sl@0:  */
sl@0: U_CAPI void U_EXPORT2
sl@0: ucase_addPropertyStarts(const UCaseProps *csp, const USetAdder *sa, UErrorCode *pErrorCode);
sl@0: 
sl@0: /**
sl@0:  * Requires non-NULL locale ID but otherwise does the equivalent of
sl@0:  * checking for language codes as if uloc_getLanguage() were called:
sl@0:  * Accepts both 2- and 3-letter codes and accepts case variants.
sl@0:  *
sl@0:  * @param locale Locale ID; must not be NULL.
sl@0:  * @param locCache Presumably the same kind of cache slot as the locCache
sl@0:  *                 parameter of ucase_toFullLower() (caches the result of
sl@0:  *                 parsing the locale ID) -- TODO confirm, including whether
sl@0:  *                 NULL is accepted here.
sl@0:  * @return int32_t code for the case locale; the possible values are not
sl@0:  *         defined in this header.
sl@0:  */
sl@0: U_CFUNC int32_t
sl@0: ucase_getCaseLocale(const char *locale, int32_t *locCache);
sl@0: 
sl@0: /**
sl@0:  * Bit mask for getting just the options from a string compare options word
sl@0:  * that are relevant for case-insensitive string comparison.
sl@0:  * See uchar.h. Also include _STRNCMP_STYLE and U_COMPARE_CODE_POINT_ORDER.
sl@0:  * The value 0xffff keeps the low 16 bits of the options word.
sl@0:  *
sl@0:  * NOTE(review): the name starts with an underscore followed by an uppercase
sl@0:  * letter, a form that C reserves for the implementation. It is left unchanged
sl@0:  * because renaming would break existing users of this macro.
sl@0:  * @internal
sl@0:  */
sl@0: #define _STRCASECMP_OPTIONS_MASK 0xffff
sl@0: 
sl@0: /**
sl@0:  * Bit mask for getting just the options from a string compare options word
sl@0:  * that are relevant for case folding (of a single string or code point).
sl@0:  * See uchar.h.
sl@0:  * The value 0xff keeps the low 8 bits of the options word; these bits are a
sl@0:  * subset of the bits kept by _STRCASECMP_OPTIONS_MASK (0xffff).
sl@0:  *
sl@0:  * NOTE(review): the name starts with an underscore followed by an uppercase
sl@0:  * letter, a form that C reserves for the implementation. It is left unchanged
sl@0:  * because renaming would break existing users of this macro.
sl@0:  * @internal
sl@0:  */
sl@0: #define _FOLD_CASE_OPTIONS_MASK 0xff
sl@0: 
sl@0: /* single-code point functions */
sl@0: /**
sl@0:  * Single code point lowercase mapping of c.
sl@0:  * NOTE(review): presumably the simple mapping, returning c itself when there
sl@0:  * is none -- TODO confirm. For the full mapping, whose result may be a string,
sl@0:  * see ucase_toFullLower().
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to be mapped.
sl@0:  * @return The mapped code point.
sl@0:  */
sl@0: U_CAPI UChar32 U_EXPORT2
sl@0: ucase_tolower(const UCaseProps *csp, UChar32 c);
sl@0: /**
sl@0:  * Single code point uppercase mapping of c.
sl@0:  * NOTE(review): presumably the simple mapping, returning c itself when there
sl@0:  * is none -- TODO confirm. For the full mapping, whose result may be a string,
sl@0:  * see ucase_toFullUpper().
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to be mapped.
sl@0:  * @return The mapped code point.
sl@0:  */
sl@0: U_CAPI UChar32 U_EXPORT2
sl@0: ucase_toupper(const UCaseProps *csp, UChar32 c);
sl@0: /**
sl@0:  * Single code point titlecase mapping of c.
sl@0:  * NOTE(review): presumably the simple mapping, returning c itself when there
sl@0:  * is none -- TODO confirm. For the full mapping, whose result may be a string,
sl@0:  * see ucase_toFullTitle().
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to be mapped.
sl@0:  * @return The mapped code point.
sl@0:  */
sl@0: U_CAPI UChar32 U_EXPORT2
sl@0: ucase_totitle(const UCaseProps *csp, UChar32 c);
sl@0: /**
sl@0:  * Single code point case folding of c.
sl@0:  * NOTE(review): presumably the simple case folding, returning c itself when
sl@0:  * there is none -- TODO confirm. For the full folding, whose result may be a
sl@0:  * string, see ucase_toFullFolding().
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to be folded.
sl@0:  * @param options Case folding options; presumably the bits selected by
sl@0:  *                _FOLD_CASE_OPTIONS_MASK (see uchar.h) -- TODO confirm.
sl@0:  * @return The folded code point.
sl@0:  */
sl@0: U_CAPI UChar32 U_EXPORT2
sl@0: ucase_fold(const UCaseProps *csp, UChar32 c, uint32_t options);
sl@0: 
sl@0: /**
sl@0:  * Adds all simple case mappings and the full case folding for c to sa,
sl@0:  * and also adds special case closure mappings.
sl@0:  * c itself is not added.
sl@0:  * For example, the mappings
sl@0:  * - for s include long s
sl@0:  * - for sharp s include ss
sl@0:  * - for k include the Kelvin sign
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point whose case closure is added.
sl@0:  * @param sa Set adder that receives the mappings.
sl@0:  */
sl@0: U_CAPI void U_EXPORT2
sl@0: ucase_addCaseClosure(const UCaseProps *csp, UChar32 c, const USetAdder *sa);
sl@0: 
sl@0: /**
sl@0:  * Maps the string to single code points and adds the associated case closure
sl@0:  * mappings.
sl@0:  * The string is mapped to code points if it is their full case folding string.
sl@0:  * In other words, this performs a reverse full case folding and then
sl@0:  * adds the case closure items of the resulting code points.
sl@0:  * If the string is found and its closure applied, then
sl@0:  * the string itself is added as well as part of its code points' closure.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param s The string to look up.
sl@0:  * @param length Length of s; must be >=0.
sl@0:  * @param sa Set adder that receives the closure items.
sl@0:  * @return TRUE if the string was found
sl@0:  */
sl@0: U_CAPI UBool U_EXPORT2
sl@0: ucase_addStringCaseClosure(const UCaseProps *csp, const UChar *s, int32_t length, const USetAdder *sa);
sl@0: 
sl@0: /**
sl@0:  * Get the case type of c.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to look up.
sl@0:  * @return UCASE_NONE, UCASE_LOWER, UCASE_UPPER, UCASE_TITLE
sl@0:  *         (enum constants defined further down in this header)
sl@0:  */
sl@0: U_CAPI int32_t U_EXPORT2
sl@0: ucase_getType(const UCaseProps *csp, UChar32 c);
sl@0: 
sl@0: /**
sl@0:  * Get the case type of c, or an indication that c is case-ignorable.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to look up.
sl@0:  * @return same as ucase_getType(), or <0 if c is case-ignorable
sl@0:  */
sl@0: U_CAPI int32_t U_EXPORT2
sl@0: ucase_getTypeOrIgnorable(const UCaseProps *csp, UChar32 c);
sl@0: /**
sl@0:  * Boolean query on c.
sl@0:  * NOTE(review): judging by the name, tests whether c is a soft-dotted
sl@0:  * character (compare UCASE_SOFT_DOTTED and UCASE_EXC_SOFT_DOTTED further down
sl@0:  * in this header) -- confirm in the implementation.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to test.
sl@0:  * @return UBool result of the test.
sl@0:  */
sl@0: U_CAPI UBool U_EXPORT2
sl@0: ucase_isSoftDotted(const UCaseProps *csp, UChar32 c);
sl@0: /**
sl@0:  * Boolean query on c.
sl@0:  * NOTE(review): judging by the name, tests the case-sensitive flag of c
sl@0:  * (compare UCASE_SENSITIVE further down in this header) -- confirm in the
sl@0:  * implementation.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Code point to test.
sl@0:  * @return UBool result of the test.
sl@0:  */
sl@0: U_CAPI UBool U_EXPORT2
sl@0: ucase_isCaseSensitive(const UCaseProps *csp, UChar32 c);
sl@0: 
sl@0: /* string case mapping functions */
sl@0: 
sl@0: /**
sl@0:  * Iterator function for string case mappings, which need to look at the
sl@0:  * context (surrounding text) of a given character for conditional mappings.
sl@0:  *
sl@0:  * The iterator only needs to go backward or forward away from the
sl@0:  * character in question. It does not use any indexes on this interface.
sl@0:  * It does not support random access or an arbitrary change of
sl@0:  * iteration direction.
sl@0:  *
sl@0:  * The code point being case-mapped itself is never returned by
sl@0:  * this iterator.
sl@0:  *
sl@0:  * Note that this typedef names a function type, not a pointer type:
sl@0:  * the ucase_toFull...() functions take a UCaseContextIterator * together
sl@0:  * with a separate context pointer that is handed back to the iterator.
sl@0:  *
sl@0:  * @param context A pointer to the iterator's working data.
sl@0:  * @param dir If <0 then start iterating backward from the character;
sl@0:  *            if >0 then start iterating forward from the character;
sl@0:  *            if 0 then continue iterating in the current direction.
sl@0:  * @return Next code point, or <0 when the iteration is done.
sl@0:  */
sl@0: typedef UChar32 U_CALLCONV
sl@0: UCaseContextIterator(void *context, int8_t dir);
sl@0: 
sl@0: /**
sl@0:  * Sample struct which may be used by some implementations of
sl@0:  * UCaseContextIterator.
sl@0:  *
sl@0:  * This header does not define the meaning of the individual fields; each
sl@0:  * iterator implementation that uses the struct decides how to use them.
sl@0:  * NOTE(review): presumably p points at the text, start/index/limit and
sl@0:  * cpStart/cpLimit are offsets into it, dir remembers the current iteration
sl@0:  * direction, and b1..b3 are spare bytes -- confirm against the iterator
sl@0:  * implementations.
sl@0:  */
sl@0: struct UCaseContext {
sl@0:     void *p;
sl@0:     int32_t start, index, limit;
sl@0:     int32_t cpStart, cpLimit;
sl@0:     int8_t dir;
sl@0:     int8_t b1, b2, b3;
sl@0: };
sl@0: typedef struct UCaseContext UCaseContext;
sl@0: 
sl@0: enum {
sl@0:     /**
sl@0:      * For string case mappings, a single character (a code point) is mapped
sl@0:      * either to itself (in which case in-place mapping functions do nothing),
sl@0:      * or to another single code point, or to a string.
sl@0:      * Aside from the string contents, these are indicated with a single int32_t
sl@0:      * value as follows:
sl@0:      *
sl@0:      * Mapping to self: Negative values (~self instead of -self to support U+0000)
sl@0:      *
sl@0:      * Mapping to another code point: Positive values >UCASE_MAX_STRING_LENGTH
sl@0:      *
sl@0:      * Mapping to a string: The string length (0..UCASE_MAX_STRING_LENGTH) is
sl@0:      * returned. Note that the string result may indeed have zero length.
sl@0:      *
sl@0:      * UCASE_MAX_STRING_LENGTH is 0x1f (31), so results 0..31 are string
sl@0:      * lengths and results of 32 and above are code points.
sl@0:      */
sl@0:     UCASE_MAX_STRING_LENGTH=0x1f
sl@0: };
sl@0: 
sl@0: /**
sl@0:  * Get the full lowercase mapping for c.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Character to be mapped.
sl@0:  * @param iter Character iterator, used for context-sensitive mappings.
sl@0:  *             See UCaseContextIterator for details.
sl@0:  *             If iter==NULL then a context-independent result is returned.
sl@0:  * @param context Pointer to be passed into iter.
sl@0:  * @param pString If the mapping result is a string, then the pointer is
sl@0:  *                written to *pString.
sl@0:  * @param locale Locale ID for locale-dependent mappings.
sl@0:  * @param locCache Initialize to 0; may be used to cache the result of parsing
sl@0:  *                 the locale ID for subsequent calls.
sl@0:  *                 Can be NULL.
sl@0:  * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
sl@0:  *         As described there: a negative value is ~c (c maps to itself),
sl@0:  *         a value in 0..UCASE_MAX_STRING_LENGTH is the length of the string
sl@0:  *         at *pString, and a larger value is the single result code point.
sl@0:  *
sl@0:  * @see UCaseContextIterator
sl@0:  * @see UCASE_MAX_STRING_LENGTH
sl@0:  * @internal
sl@0:  */
sl@0: U_CAPI int32_t U_EXPORT2
sl@0: ucase_toFullLower(const UCaseProps *csp, UChar32 c,
sl@0:                   UCaseContextIterator *iter, void *context,
sl@0:                   const UChar **pString,
sl@0:                   const char *locale, int32_t *locCache);
sl@0: /**
sl@0:  * Full uppercase mapping for c.
sl@0:  * The signature is the same as that of ucase_toFullLower(); see there for the
sl@0:  * parameters and for how to interpret the return value.
sl@0:  * NOTE(review): it is assumed, not shown in this header, that every
sl@0:  * convention documented for ucase_toFullLower() applies here unchanged --
sl@0:  * TODO confirm.
sl@0:  *
sl@0:  * @see ucase_toFullLower
sl@0:  * @see UCASE_MAX_STRING_LENGTH
sl@0:  */
sl@0: U_CAPI int32_t U_EXPORT2
sl@0: ucase_toFullUpper(const UCaseProps *csp, UChar32 c,
sl@0:                   UCaseContextIterator *iter, void *context,
sl@0:                   const UChar **pString,
sl@0:                   const char *locale, int32_t *locCache);
sl@0: /**
sl@0:  * Full titlecase mapping for c.
sl@0:  * The signature is the same as that of ucase_toFullLower(); see there for the
sl@0:  * parameters and for how to interpret the return value.
sl@0:  * NOTE(review): it is assumed, not shown in this header, that every
sl@0:  * convention documented for ucase_toFullLower() applies here unchanged --
sl@0:  * TODO confirm.
sl@0:  *
sl@0:  * @see ucase_toFullLower
sl@0:  * @see UCASE_MAX_STRING_LENGTH
sl@0:  */
sl@0: U_CAPI int32_t U_EXPORT2
sl@0: ucase_toFullTitle(const UCaseProps *csp, UChar32 c,
sl@0:                   UCaseContextIterator *iter, void *context,
sl@0:                   const UChar **pString,
sl@0:                   const char *locale, int32_t *locCache);
sl@0: /**
sl@0:  * Full case folding for c.
sl@0:  * Unlike ucase_toFullLower() and its siblings, this function takes no context
sl@0:  * iterator and no locale; it takes an options word instead.
sl@0:  * NOTE(review): the return value and pString are assumed to follow the
sl@0:  * conventions described at UCASE_MAX_STRING_LENGTH, and options is presumably
sl@0:  * the set of bits selected by _FOLD_CASE_OPTIONS_MASK -- TODO confirm.
sl@0:  *
sl@0:  * @param csp Case mapping properties.
sl@0:  * @param c Character to be folded.
sl@0:  * @param pString Receives the string pointer when the result is a string.
sl@0:  * @param options Case folding options.
sl@0:  * @return Output code point or string length, see UCASE_MAX_STRING_LENGTH.
sl@0:  */
sl@0: U_CAPI int32_t U_EXPORT2
sl@0: ucase_toFullFolding(const UCaseProps *csp, UChar32 c,
sl@0:                     const UChar **pString,
sl@0:                     uint32_t options);
sl@0: 
sl@0: /* file definitions --------------------------------------------------------- */
sl@0: /*
sl@0:  * Name and type strings for the case properties data.
sl@0:  * NOTE(review): presumably the name and type under which the data item is
sl@0:  * looked up when it is loaded -- confirm at the place where these are used.
sl@0:  */
sl@0: #define UCASE_DATA_NAME "ucase"
sl@0: #define UCASE_DATA_TYPE "icu"
sl@0: 
sl@0: /*
sl@0:  * format "cAsE"
sl@0:  * The four values are the ASCII codes of the letters c, A, S and E.
sl@0:  */
sl@0: #define UCASE_FMT_0 0x63 /* ASCII c */
sl@0: #define UCASE_FMT_1 0x41 /* ASCII A */
sl@0: #define UCASE_FMT_2 0x53 /* ASCII S */
sl@0: #define UCASE_FMT_3 0x45 /* ASCII E */
sl@0: 
sl@0: /*
sl@0:  * indexes into indexes[]
sl@0:  * The first five constants take the values 0..4, and positions 5..14 have no
sl@0:  * named constant here. The meaning of each entry comes from the data file
sl@0:  * format, which is not described in this header.
sl@0:  * NOTE(review): UCASE_IX_TOP is presumably the number of entries in
sl@0:  * indexes[] -- confirm where the array is declared.
sl@0:  */
sl@0: enum {
sl@0:     UCASE_IX_INDEX_TOP,         /* 0 */
sl@0:     UCASE_IX_LENGTH,            /* 1 */
sl@0:     UCASE_IX_TRIE_SIZE,         /* 2 */
sl@0:     UCASE_IX_EXC_LENGTH,        /* 3 */
sl@0:     UCASE_IX_UNFOLD_LENGTH,     /* 4 */
sl@0: 
sl@0:     UCASE_IX_MAX_FULL_LENGTH=15,
sl@0:     UCASE_IX_TOP=16
sl@0: };
sl@0: 
sl@0: /* definitions for 16-bit case properties word ------------------------------ */
sl@0: 
sl@0: /*
sl@0:  * 2-bit constants for types of cased characters
sl@0:  * UCASE_TYPE_MASK (3) selects bits 1..0 of the properties word, and
sl@0:  * UCASE_GET_TYPE() extracts them, yielding one of the enum values below.
sl@0:  */
sl@0: #define UCASE_TYPE_MASK     3
sl@0: enum {
sl@0:     UCASE_NONE,     /* 0 */
sl@0:     UCASE_LOWER,    /* 1 */
sl@0:     UCASE_UPPER,    /* 2 */
sl@0:     UCASE_TITLE     /* 3 */
sl@0: };
sl@0: 
sl@0: #define UCASE_GET_TYPE(props) ((props)&UCASE_TYPE_MASK)
sl@0: /*
sl@0:  * Single-bit flags in the properties word: 4 is bit 2 and 8 is bit 3.
sl@0:  * NOTE(review): presumably UCASE_EXCEPTION selects between the no-exception
sl@0:  * layout (delta in bits 15..6) and the exception layout (index in bits 15..4)
sl@0:  * that the comments below describe -- confirm in the implementation.
sl@0:  */
sl@0: #define UCASE_SENSITIVE     4
sl@0: #define UCASE_EXCEPTION     8
sl@0: /*
sl@0:  * 2-bit field in bits 5..4 of the properties word (mask 0x30).
sl@0:  * The enum values below are already shifted into that position, so they can
sl@0:  * be compared directly against (props & UCASE_DOT_MASK).
sl@0:  */
sl@0: #define UCASE_DOT_MASK      0x30
sl@0: enum {
sl@0:     UCASE_NO_DOT=0,         /* normal characters with cc=0 */
sl@0:     UCASE_SOFT_DOTTED=0x10, /* soft-dotted characters with cc=0 */
sl@0:     UCASE_ABOVE=0x20,       /* "above" accents with cc=230 */
sl@0:     UCASE_OTHER_ACCENT=0x30 /* other accent character (0<cc!=230) */
sl@0: };
sl@0: 
sl@0: /*
sl@0:  * no exception: bits 15..6 are a 10-bit signed case mapping delta
sl@0:  * The representable range is UCASE_MIN_DELTA..UCASE_MAX_DELTA,
sl@0:  * that is -0x200..0x1ff.
sl@0:  *
sl@0:  * UCASE_GET_DELTA() casts props to int16_t before shifting so that bit 15
sl@0:  * acts as the sign bit of the delta.
sl@0:  * NOTE(review): this relies on >> of a negative signed value being an
sl@0:  * arithmetic (sign-propagating) shift, which C leaves implementation-defined.
sl@0:  */
sl@0: #define UCASE_DELTA_SHIFT   6
sl@0: #define UCASE_DELTA_MASK    0xffc0
sl@0: #define UCASE_MAX_DELTA     0x1ff
sl@0: #define UCASE_MIN_DELTA     (-UCASE_MAX_DELTA-1)
sl@0: 
sl@0: #define UCASE_GET_DELTA(props) ((int16_t)(props)>>UCASE_DELTA_SHIFT)
sl@0: 
sl@0: /*
sl@0:  * case-ignorable uses one of the delta bits, see gencase/store.c
sl@0:  * 0x40 is bit 6, which is the lowest bit of the delta field
sl@0:  * (UCASE_DELTA_SHIFT is 6).
sl@0:  */
sl@0: #define UCASE_CASE_IGNORABLE 0x40
sl@0: 
sl@0: /*
sl@0:  * exception: bits 15..4 are an unsigned 12-bit index into the exceptions array
sl@0:  * UCASE_MAX_EXCEPTIONS (0x1000) is 2^12, the number of distinct values that a
sl@0:  * 12-bit index can take.
sl@0:  */
sl@0: #define UCASE_EXC_SHIFT     4
sl@0: #define UCASE_EXC_MASK      0xfff0
sl@0: #define UCASE_MAX_EXCEPTIONS 0x1000
sl@0: 
sl@0: /* definitions for 16-bit main exceptions word ------------------------------ */
sl@0: 
sl@0: /*
sl@0:  * first 8 bits indicate values in optional slots
sl@0:  * The constants below are slot numbers 0..7; UCASE_EXC_ALL_SLOTS is 8.
sl@0:  * NOTE(review): presumably bit n of the exceptions word is set when slot n is
sl@0:  * present -- confirm in the implementation.
sl@0:  */
sl@0: enum {
sl@0:     UCASE_EXC_LOWER,        /* 0 */
sl@0:     UCASE_EXC_FOLD,         /* 1 */
sl@0:     UCASE_EXC_UPPER,        /* 2 */
sl@0:     UCASE_EXC_TITLE,        /* 3 */
sl@0:     UCASE_EXC_4,            /* reserved */
sl@0:     UCASE_EXC_5,            /* reserved */
sl@0:     UCASE_EXC_CLOSURE,      /* 6 */
sl@0:     UCASE_EXC_FULL_MAPPINGS, /* 7 */
sl@0:     UCASE_EXC_ALL_SLOTS     /* one past the last slot */
sl@0: };
sl@0: 
sl@0: /*
sl@0:  * each slot is 2 uint16_t instead of 1
sl@0:  * 0x100 is bit 8 of the exceptions word, the first bit above the eight
sl@0:  * slot bits.
sl@0:  */
sl@0: #define UCASE_EXC_DOUBLE_SLOTS      0x100
sl@0: 
sl@0: /* reserved: exception bits 11..9 */
sl@0: 
sl@0: /*
sl@0:  * UCASE_EXC_DOT_MASK=UCASE_DOT_MASK<<UCASE_EXC_DOT_SHIFT
sl@0:  * (0x30<<8 is 0x3000, i.e. bits 13..12 of the exceptions word)
sl@0:  */
sl@0: #define UCASE_EXC_DOT_SHIFT     8
sl@0: 
sl@0: /* normally stored in the main word, but pushed out for larger exception indexes */
sl@0: #define UCASE_EXC_DOT_MASK      0x3000
sl@0: enum {
sl@0:     UCASE_EXC_NO_DOT=0,             /* same as UCASE_NO_DOT<<UCASE_EXC_DOT_SHIFT */
sl@0:     UCASE_EXC_SOFT_DOTTED=0x1000,   /* same as UCASE_SOFT_DOTTED<<UCASE_EXC_DOT_SHIFT */
sl@0:     UCASE_EXC_ABOVE=0x2000,         /* "above" accents with cc=230 */
sl@0:     UCASE_EXC_OTHER_ACCENT=0x3000   /* other accent character (0<cc!=230) */
sl@0: };
sl@0: 
sl@0: /*
sl@0:  * complex/conditional mappings
sl@0:  * Single-bit flags in the exceptions word: 0x4000 is bit 14 and 0x8000 is
sl@0:  * bit 15.
sl@0:  * NOTE(review): the conditions that these flags stand for are not described
sl@0:  * in this header -- see the implementation.
sl@0:  */
sl@0: #define UCASE_EXC_CONDITIONAL_SPECIAL   0x4000
sl@0: #define UCASE_EXC_CONDITIONAL_FOLD      0x8000
sl@0: 
sl@0: /*
sl@0:  * definitions for lengths word for full case mappings
sl@0:  * Each mask selects one 4-bit field of the 16-bit lengths word, from the
sl@0:  * lowest nibble (lower) to the highest nibble (title).
sl@0:  */
sl@0: #define UCASE_FULL_LOWER    0xf     /* bits 3..0 */
sl@0: #define UCASE_FULL_FOLDING  0xf0    /* bits 7..4 */
sl@0: #define UCASE_FULL_UPPER    0xf00   /* bits 11..8 */
sl@0: #define UCASE_FULL_TITLE    0xf000  /* bits 15..12 */
sl@0: 
sl@0: /*
sl@0:  * maximum lengths
sl@0:  * UCASE_FULL_MAPPINGS_MAX_LENGTH is 4*0xf = 60 and UCASE_CLOSURE_MAX_LENGTH
sl@0:  * is 0xf = 15.
sl@0:  * NOTE(review): the factor 4 presumably corresponds to the four 4-bit length
sl@0:  * fields (lower, folding, upper, title) of the lengths word -- confirm.
sl@0:  */
sl@0: #define UCASE_FULL_MAPPINGS_MAX_LENGTH (4*0xf)
sl@0: #define UCASE_CLOSURE_MAX_LENGTH 0xf
sl@0: 
sl@0: /*
sl@0:  * constants for reverse case folding ("unfold") data
sl@0:  * The constants take the values 0, 1 and 2.
sl@0:  * NOTE(review): presumably these are positions of header values at the start
sl@0:  * of the unfold data (number of rows, row width, string width) -- confirm in
sl@0:  * the implementation.
sl@0:  */
sl@0: enum {
sl@0:     UCASE_UNFOLD_ROWS,          /* 0 */
sl@0:     UCASE_UNFOLD_ROW_WIDTH,     /* 1 */
sl@0:     UCASE_UNFOLD_STRING_WIDTH   /* 2 */
sl@0: };
sl@0: 
sl@0: U_CDECL_END
sl@0: 
sl@0: #endif