sl@0: /* sl@0: ** 2007 June 22 sl@0: ** sl@0: ** The author disclaims copyright to this source code. In place of sl@0: ** a legal notice, here is a blessing: sl@0: ** sl@0: ** May you do good and not evil. sl@0: ** May you find forgiveness for yourself and forgive others. sl@0: ** May you share freely, never taking more than you give. sl@0: ** sl@0: ************************************************************************* sl@0: ** This file implements a tokenizer for fts2 based on the ICU library. sl@0: ** sl@0: ** $Id: fts2_icu.c,v 1.2 2008/07/22 22:20:50 shess Exp $ sl@0: */ sl@0: sl@0: #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) sl@0: #ifdef SQLITE_ENABLE_ICU sl@0: sl@0: #include sl@0: #include sl@0: #include "fts2_tokenizer.h" sl@0: sl@0: #include sl@0: #include sl@0: #include sl@0: #include sl@0: sl@0: typedef struct IcuTokenizer IcuTokenizer; sl@0: typedef struct IcuCursor IcuCursor; sl@0: sl@0: struct IcuTokenizer { sl@0: sqlite3_tokenizer base; sl@0: char *zLocale; sl@0: }; sl@0: sl@0: struct IcuCursor { sl@0: sqlite3_tokenizer_cursor base; sl@0: sl@0: UBreakIterator *pIter; /* ICU break-iterator object */ sl@0: int nChar; /* Number of UChar elements in pInput */ sl@0: UChar *aChar; /* Copy of input using utf-16 encoding */ sl@0: int *aOffset; /* Offsets of each character in utf-8 input */ sl@0: sl@0: int nBuffer; sl@0: char *zBuffer; sl@0: sl@0: int iToken; sl@0: }; sl@0: sl@0: /* sl@0: ** Create a new tokenizer instance. sl@0: */ sl@0: static int icuCreate( sl@0: int argc, /* Number of entries in argv[] */ sl@0: const char * const *argv, /* Tokenizer creation arguments */ sl@0: sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */ sl@0: ){ sl@0: IcuTokenizer *p; sl@0: int n = 0; sl@0: sl@0: if( argc>0 ){ sl@0: n = strlen(argv[0])+1; sl@0: } sl@0: p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n); sl@0: if( !p ){ sl@0: return SQLITE_NOMEM; sl@0: } sl@0: memset(p, 0, sizeof(IcuTokenizer)); sl@0: sl@0: if( n ){ sl@0: p->zLocale = (char *)&p[1]; sl@0: memcpy(p->zLocale, argv[0], n); sl@0: } sl@0: sl@0: *ppTokenizer = (sqlite3_tokenizer *)p; sl@0: sl@0: return SQLITE_OK; sl@0: } sl@0: sl@0: /* sl@0: ** Destroy a tokenizer sl@0: */ sl@0: static int icuDestroy(sqlite3_tokenizer *pTokenizer){ sl@0: IcuTokenizer *p = (IcuTokenizer *)pTokenizer; sl@0: sqlite3_free(p); sl@0: return SQLITE_OK; sl@0: } sl@0: sl@0: /* sl@0: ** Prepare to begin tokenizing a particular string. The input sl@0: ** string to be tokenized is pInput[0..nBytes-1]. A cursor sl@0: ** used to incrementally tokenize this string is returned in sl@0: ** *ppCursor. sl@0: */ sl@0: static int icuOpen( sl@0: sqlite3_tokenizer *pTokenizer, /* The tokenizer */ sl@0: const char *zInput, /* Input string */ sl@0: int nInput, /* Length of zInput in bytes */ sl@0: sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */ sl@0: ){ sl@0: IcuTokenizer *p = (IcuTokenizer *)pTokenizer; sl@0: IcuCursor *pCsr; sl@0: sl@0: const int32_t opt = U_FOLD_CASE_DEFAULT; sl@0: UErrorCode status = U_ZERO_ERROR; sl@0: int nChar; sl@0: sl@0: UChar32 c; sl@0: int iInput = 0; sl@0: int iOut = 0; sl@0: sl@0: *ppCursor = 0; sl@0: sl@0: if( -1 == nInput ) nInput = strlen(nInput); sl@0: nChar = nInput+1; sl@0: pCsr = (IcuCursor *)sqlite3_malloc( sl@0: sizeof(IcuCursor) + /* IcuCursor */ sl@0: nChar * sizeof(UChar) + /* IcuCursor.aChar[] */ sl@0: (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */ sl@0: ); sl@0: if( !pCsr ){ sl@0: return SQLITE_NOMEM; sl@0: } sl@0: memset(pCsr, 0, sizeof(IcuCursor)); sl@0: pCsr->aChar = (UChar *)&pCsr[1]; sl@0: pCsr->aOffset = (int *)&pCsr->aChar[nChar]; sl@0: sl@0: pCsr->aOffset[iOut] = iInput; sl@0: U8_NEXT(zInput, iInput, nInput, c); sl@0: while( c>0 ){ sl@0: int isError = 0; sl@0: c = u_foldCase(c, opt); sl@0: U16_APPEND(pCsr->aChar, iOut, nChar, c, isError); sl@0: if( isError ){ sl@0: sqlite3_free(pCsr); sl@0: return SQLITE_ERROR; sl@0: } sl@0: pCsr->aOffset[iOut] = iInput; sl@0: sl@0: if( iInputpIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status); sl@0: if( !U_SUCCESS(status) ){ sl@0: sqlite3_free(pCsr); sl@0: return SQLITE_ERROR; sl@0: } sl@0: pCsr->nChar = iOut; sl@0: sl@0: ubrk_first(pCsr->pIter); sl@0: *ppCursor = (sqlite3_tokenizer_cursor *)pCsr; sl@0: return SQLITE_OK; sl@0: } sl@0: sl@0: /* sl@0: ** Close a tokenization cursor previously opened by a call to icuOpen(). sl@0: */ sl@0: static int icuClose(sqlite3_tokenizer_cursor *pCursor){ sl@0: IcuCursor *pCsr = (IcuCursor *)pCursor; sl@0: ubrk_close(pCsr->pIter); sl@0: sqlite3_free(pCsr->zBuffer); sl@0: sqlite3_free(pCsr); sl@0: return SQLITE_OK; sl@0: } sl@0: sl@0: /* sl@0: ** Extract the next token from a tokenization cursor. sl@0: */ sl@0: static int icuNext( sl@0: sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */ sl@0: const char **ppToken, /* OUT: *ppToken is the token text */ sl@0: int *pnBytes, /* OUT: Number of bytes in token */ sl@0: int *piStartOffset, /* OUT: Starting offset of token */ sl@0: int *piEndOffset, /* OUT: Ending offset of token */ sl@0: int *piPosition /* OUT: Position integer of token */ sl@0: ){ sl@0: IcuCursor *pCsr = (IcuCursor *)pCursor; sl@0: sl@0: int iStart = 0; sl@0: int iEnd = 0; sl@0: int nByte = 0; sl@0: sl@0: while( iStart==iEnd ){ sl@0: UChar32 c; sl@0: sl@0: iStart = ubrk_current(pCsr->pIter); sl@0: iEnd = ubrk_next(pCsr->pIter); sl@0: if( iEnd==UBRK_DONE ){ sl@0: return SQLITE_DONE; sl@0: } sl@0: sl@0: while( iStartaChar, iWhite, pCsr->nChar, c); sl@0: if( u_isspace(c) ){ sl@0: iStart = iWhite; sl@0: }else{ sl@0: break; sl@0: } sl@0: } sl@0: assert(iStart<=iEnd); sl@0: } sl@0: sl@0: do { sl@0: UErrorCode status = U_ZERO_ERROR; sl@0: if( nByte ){ sl@0: char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte); sl@0: if( !zNew ){ sl@0: return SQLITE_NOMEM; sl@0: } sl@0: pCsr->zBuffer = zNew; sl@0: pCsr->nBuffer = nByte; sl@0: } sl@0: sl@0: u_strToUTF8( sl@0: pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */ sl@0: &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */ sl@0: &status /* Output success/failure */ sl@0: ); sl@0: } while( nByte>pCsr->nBuffer ); sl@0: sl@0: *ppToken = pCsr->zBuffer; sl@0: *pnBytes = nByte; sl@0: *piStartOffset = pCsr->aOffset[iStart]; sl@0: *piEndOffset = pCsr->aOffset[iEnd]; sl@0: *piPosition = pCsr->iToken++; sl@0: sl@0: return SQLITE_OK; sl@0: } sl@0: sl@0: /* sl@0: ** The set of routines that implement the simple tokenizer sl@0: */ sl@0: static const sqlite3_tokenizer_module icuTokenizerModule = { sl@0: 0, /* iVersion */ sl@0: icuCreate, /* xCreate */ sl@0: icuDestroy, /* xCreate */ sl@0: icuOpen, /* xOpen */ sl@0: icuClose, /* xClose */ sl@0: icuNext, /* xNext */ sl@0: }; sl@0: sl@0: /* sl@0: ** Set *ppModule to point at the implementation of the ICU tokenizer. sl@0: */ sl@0: void sqlite3Fts2IcuTokenizerModule( sl@0: sqlite3_tokenizer_module const**ppModule sl@0: ){ sl@0: *ppModule = &icuTokenizerModule; sl@0: } sl@0: sl@0: #endif /* defined(SQLITE_ENABLE_ICU) */ sl@0: #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */