Update contrib.
4 ** The author disclaims copyright to this source code. In place of
5 ** a legal notice, here is a blessing:
7 ** May you do good and not evil.
8 ** May you find forgiveness for yourself and forgive others.
9 ** May you share freely, never taking more than you give.
11 *************************************************************************
12 ** This file implements a tokenizer for fts2 based on the ICU library.
14 ** $Id: fts2_icu.c,v 1.2 2008/07/22 22:20:50 shess Exp $
17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
18 #ifdef SQLITE_ENABLE_ICU
22 #include "fts2_tokenizer.h"
24 #include <unicode/ubrk.h>
25 #include <unicode/ucol.h>
26 #include <unicode/ustring.h>
27 #include <unicode/utf16.h>
29 typedef struct IcuTokenizer IcuTokenizer;
30 typedef struct IcuCursor IcuCursor;
33 sqlite3_tokenizer base;
38 sqlite3_tokenizer_cursor base;
40 UBreakIterator *pIter; /* ICU break-iterator object */
41 int nChar; /* Number of UChar elements in pInput */
42 UChar *aChar; /* Copy of input using utf-16 encoding */
43 int *aOffset; /* Offsets of each character in utf-8 input */
52 ** Create a new tokenizer instance.
55 int argc, /* Number of entries in argv[] */
56 const char * const *argv, /* Tokenizer creation arguments */
57 sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
63 n = strlen(argv[0])+1;
65 p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
69 memset(p, 0, sizeof(IcuTokenizer));
72 p->zLocale = (char *)&p[1];
73 memcpy(p->zLocale, argv[0], n);
76 *ppTokenizer = (sqlite3_tokenizer *)p;
82 ** Destroy a tokenizer
84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
85 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
91 ** Prepare to begin tokenizing a particular string. The input
92 ** string to be tokenized is zInput[0..nInput-1]. A cursor
93 ** used to incrementally tokenize this string is returned in
97 sqlite3_tokenizer *pTokenizer, /* The tokenizer */
98 const char *zInput, /* Input string */
99 int nInput, /* Length of zInput in bytes */
100 sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
102 IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
105 const int32_t opt = U_FOLD_CASE_DEFAULT;
106 UErrorCode status = U_ZERO_ERROR;
115 if( -1 == nInput ) nInput = (int)strlen(zInput); /* -1: measure the nul-terminated zInput */
117 pCsr = (IcuCursor *)sqlite3_malloc(
118 sizeof(IcuCursor) + /* IcuCursor */
119 nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
120 (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
125 memset(pCsr, 0, sizeof(IcuCursor));
126 pCsr->aChar = (UChar *)&pCsr[1];
127 pCsr->aOffset = (int *)&pCsr->aChar[nChar];
129 pCsr->aOffset[iOut] = iInput;
130 U8_NEXT(zInput, iInput, nInput, c);
133 c = u_foldCase(c, opt);
134 U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
139 pCsr->aOffset[iOut] = iInput;
142 U8_NEXT(zInput, iInput, nInput, c);
148 pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
149 if( !U_SUCCESS(status) ){
155 ubrk_first(pCsr->pIter);
156 *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
161 ** Close a tokenization cursor previously opened by a call to icuOpen().
163 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
164 IcuCursor *pCsr = (IcuCursor *)pCursor;
165 ubrk_close(pCsr->pIter);
166 sqlite3_free(pCsr->zBuffer);
172 ** Extract the next token from a tokenization cursor.
175 sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by icuOpen */
176 const char **ppToken, /* OUT: *ppToken is the token text */
177 int *pnBytes, /* OUT: Number of bytes in token */
178 int *piStartOffset, /* OUT: Starting offset of token in the utf-8 input */
179 int *piEndOffset, /* OUT: Ending offset of token in the utf-8 input */
180 int *piPosition /* OUT: Position integer of token */
182 IcuCursor *pCsr = (IcuCursor *)pCursor;
188 while( iStart==iEnd ){
191 iStart = ubrk_current(pCsr->pIter);
192 iEnd = ubrk_next(pCsr->pIter);
193 if( iEnd==UBRK_DONE ){
197 while( iStart<iEnd ){
199 U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c); /* aChar holds UTF-16 code units, not UTF-8 bytes */
206 assert(iStart<=iEnd);
210 UErrorCode status = U_ZERO_ERROR;
212 char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
216 pCsr->zBuffer = zNew;
217 pCsr->nBuffer = nByte;
221 pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
222 &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
223 &status /* Output success/failure */
225 } while( nByte>pCsr->nBuffer );
227 *ppToken = pCsr->zBuffer;
229 *piStartOffset = pCsr->aOffset[iStart];
230 *piEndOffset = pCsr->aOffset[iEnd];
231 *piPosition = pCsr->iToken++;
237 ** The set of routines that implement the ICU tokenizer
239 static const sqlite3_tokenizer_module icuTokenizerModule = {
241 icuCreate, /* xCreate */
242 icuDestroy, /* xDestroy */
244 icuClose, /* xClose */
249 ** Set *ppModule to point at the implementation of the ICU tokenizer.
251 void sqlite3Fts2IcuTokenizerModule(
252 sqlite3_tokenizer_module const**ppModule
254 *ppModule = &icuTokenizerModule;
257 #endif /* defined(SQLITE_ENABLE_ICU) */
258 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */