1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/persistentdata/persistentstorage/sqlite3api/SQLite/fts3_icu.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,260 @@
1.4 +/*
1.5 +** 2007 June 22
1.6 +**
1.7 +** The author disclaims copyright to this source code. In place of
1.8 +** a legal notice, here is a blessing:
1.9 +**
1.10 +** May you do good and not evil.
1.11 +** May you find forgiveness for yourself and forgive others.
1.12 +** May you share freely, never taking more than you give.
1.13 +**
1.14 +*************************************************************************
1.15 +** This file implements a tokenizer for fts3 based on the ICU library.
1.16 +**
1.17 +** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $
1.18 +*/
1.19 +
1.20 +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
1.21 +#ifdef SQLITE_ENABLE_ICU
1.22 +
1.23 +#include <assert.h>
1.24 +#include <string.h>
1.25 +#include "fts3_tokenizer.h"
1.26 +
1.27 +#include <unicode/ubrk.h>
1.28 +#include <unicode/ucol.h>
1.29 +#include <unicode/ustring.h>
1.30 +#include <unicode/utf16.h>
1.31 +
1.32 +typedef struct IcuTokenizer IcuTokenizer;
1.33 +typedef struct IcuCursor IcuCursor;
1.34 +
1.35 +struct IcuTokenizer {
1.36 + sqlite3_tokenizer base;
1.37 + char *zLocale;
1.38 +};
1.39 +
1.40 +struct IcuCursor {
1.41 + sqlite3_tokenizer_cursor base;
1.42 +
1.43 + UBreakIterator *pIter; /* ICU break-iterator object */
1.44 + int nChar; /* Number of UChar elements in pInput */
1.45 + UChar *aChar; /* Copy of input using utf-16 encoding */
1.46 + int *aOffset; /* Offsets of each character in utf-8 input */
1.47 +
1.48 + int nBuffer;
1.49 + char *zBuffer;
1.50 +
1.51 + int iToken;
1.52 +};
1.53 +
1.54 +/*
1.55 +** Create a new tokenizer instance.
1.56 +*/
1.57 +static int icuCreate(
1.58 + int argc, /* Number of entries in argv[] */
1.59 + const char * const *argv, /* Tokenizer creation arguments */
1.60 + sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
1.61 +){
1.62 + IcuTokenizer *p;
1.63 + int n = 0;
1.64 +
1.65 + if( argc>0 ){
1.66 + n = strlen(argv[0])+1;
1.67 + }
1.68 + p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
1.69 + if( !p ){
1.70 + return SQLITE_NOMEM;
1.71 + }
1.72 + memset(p, 0, sizeof(IcuTokenizer));
1.73 +
1.74 + if( n ){
1.75 + p->zLocale = (char *)&p[1];
1.76 + memcpy(p->zLocale, argv[0], n);
1.77 + }
1.78 +
1.79 + *ppTokenizer = (sqlite3_tokenizer *)p;
1.80 +
1.81 + return SQLITE_OK;
1.82 +}
1.83 +
1.84 +/*
1.85 +** Destroy a tokenizer
1.86 +*/
1.87 +static int icuDestroy(sqlite3_tokenizer *pTokenizer){
1.88 + IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
1.89 + sqlite3_free(p);
1.90 + return SQLITE_OK;
1.91 +}
1.92 +
1.93 +/*
1.94 +** Prepare to begin tokenizing a particular string. The input
1.95 +** string to be tokenized is pInput[0..nBytes-1]. A cursor
1.96 +** used to incrementally tokenize this string is returned in
1.97 +** *ppCursor.
1.98 +*/
1.99 +static int icuOpen(
1.100 + sqlite3_tokenizer *pTokenizer, /* The tokenizer */
1.101 + const char *zInput, /* Input string */
1.102 + int nInput, /* Length of zInput in bytes */
1.103 + sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
1.104 +){
1.105 + IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
1.106 + IcuCursor *pCsr;
1.107 +
1.108 + const int32_t opt = U_FOLD_CASE_DEFAULT;
1.109 + UErrorCode status = U_ZERO_ERROR;
1.110 + int nChar;
1.111 +
1.112 + UChar32 c;
1.113 + int iInput = 0;
1.114 + int iOut = 0;
1.115 +
1.116 + *ppCursor = 0;
1.117 +
1.118 + if( nInput<0 ){
1.119 + nInput = strlen(zInput);
1.120 + }
1.121 + nChar = nInput+1;
1.122 + pCsr = (IcuCursor *)sqlite3_malloc(
1.123 + sizeof(IcuCursor) + /* IcuCursor */
1.124 + nChar * sizeof(UChar) + /* IcuCursor.aChar[] */
1.125 + (nChar+1) * sizeof(int) /* IcuCursor.aOffset[] */
1.126 + );
1.127 + if( !pCsr ){
1.128 + return SQLITE_NOMEM;
1.129 + }
1.130 + memset(pCsr, 0, sizeof(IcuCursor));
1.131 + pCsr->aChar = (UChar *)&pCsr[1];
1.132 + pCsr->aOffset = (int *)&pCsr->aChar[nChar];
1.133 +
1.134 + pCsr->aOffset[iOut] = iInput;
1.135 + U8_NEXT(zInput, iInput, nInput, c);
1.136 + while( c>0 ){
1.137 + int isError = 0;
1.138 + c = u_foldCase(c, opt);
1.139 + U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
1.140 + if( isError ){
1.141 + sqlite3_free(pCsr);
1.142 + return SQLITE_ERROR;
1.143 + }
1.144 + pCsr->aOffset[iOut] = iInput;
1.145 +
1.146 + if( iInput<nInput ){
1.147 + U8_NEXT(zInput, iInput, nInput, c);
1.148 + }else{
1.149 + c = 0;
1.150 + }
1.151 + }
1.152 +
1.153 + pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
1.154 + if( !U_SUCCESS(status) ){
1.155 + sqlite3_free(pCsr);
1.156 + return SQLITE_ERROR;
1.157 + }
1.158 + pCsr->nChar = iOut;
1.159 +
1.160 + ubrk_first(pCsr->pIter);
1.161 + *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
1.162 + return SQLITE_OK;
1.163 +}
1.164 +
1.165 +/*
1.166 +** Close a tokenization cursor previously opened by a call to icuOpen().
1.167 +*/
1.168 +static int icuClose(sqlite3_tokenizer_cursor *pCursor){
1.169 + IcuCursor *pCsr = (IcuCursor *)pCursor;
1.170 + ubrk_close(pCsr->pIter);
1.171 + sqlite3_free(pCsr->zBuffer);
1.172 + sqlite3_free(pCsr);
1.173 + return SQLITE_OK;
1.174 +}
1.175 +
1.176 +/*
1.177 +** Extract the next token from a tokenization cursor.
1.178 +*/
1.179 +static int icuNext(
1.180 + sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
1.181 + const char **ppToken, /* OUT: *ppToken is the token text */
1.182 + int *pnBytes, /* OUT: Number of bytes in token */
1.183 + int *piStartOffset, /* OUT: Starting offset of token */
1.184 + int *piEndOffset, /* OUT: Ending offset of token */
1.185 + int *piPosition /* OUT: Position integer of token */
1.186 +){
1.187 + IcuCursor *pCsr = (IcuCursor *)pCursor;
1.188 +
1.189 + int iStart = 0;
1.190 + int iEnd = 0;
1.191 + int nByte = 0;
1.192 +
1.193 + while( iStart==iEnd ){
1.194 + UChar32 c;
1.195 +
1.196 + iStart = ubrk_current(pCsr->pIter);
1.197 + iEnd = ubrk_next(pCsr->pIter);
1.198 + if( iEnd==UBRK_DONE ){
1.199 + return SQLITE_DONE;
1.200 + }
1.201 +
1.202 + while( iStart<iEnd ){
1.203 + int iWhite = iStart;
1.204 + U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
1.205 + if( u_isspace(c) ){
1.206 + iStart = iWhite;
1.207 + }else{
1.208 + break;
1.209 + }
1.210 + }
1.211 + assert(iStart<=iEnd);
1.212 + }
1.213 +
1.214 + do {
1.215 + UErrorCode status = U_ZERO_ERROR;
1.216 + if( nByte ){
1.217 + char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
1.218 + if( !zNew ){
1.219 + return SQLITE_NOMEM;
1.220 + }
1.221 + pCsr->zBuffer = zNew;
1.222 + pCsr->nBuffer = nByte;
1.223 + }
1.224 +
1.225 + u_strToUTF8(
1.226 + pCsr->zBuffer, pCsr->nBuffer, &nByte, /* Output vars */
1.227 + &pCsr->aChar[iStart], iEnd-iStart, /* Input vars */
1.228 + &status /* Output success/failure */
1.229 + );
1.230 + } while( nByte>pCsr->nBuffer );
1.231 +
1.232 + *ppToken = pCsr->zBuffer;
1.233 + *pnBytes = nByte;
1.234 + *piStartOffset = pCsr->aOffset[iStart];
1.235 + *piEndOffset = pCsr->aOffset[iEnd];
1.236 + *piPosition = pCsr->iToken++;
1.237 +
1.238 + return SQLITE_OK;
1.239 +}
1.240 +
1.241 +/*
1.242 +** The set of routines that implement the simple tokenizer
1.243 +*/
1.244 +static const sqlite3_tokenizer_module icuTokenizerModule = {
1.245 + 0, /* iVersion */
1.246 + icuCreate, /* xCreate */
1.247 + icuDestroy, /* xCreate */
1.248 + icuOpen, /* xOpen */
1.249 + icuClose, /* xClose */
1.250 + icuNext, /* xNext */
1.251 +};
1.252 +
1.253 +/*
1.254 +** Set *ppModule to point at the implementation of the ICU tokenizer.
1.255 +*/
1.256 +void sqlite3Fts3IcuTokenizerModule(
1.257 + sqlite3_tokenizer_module const**ppModule
1.258 +){
1.259 + *ppModule = &icuTokenizerModule;
1.260 +}
1.261 +
1.262 +#endif /* defined(SQLITE_ENABLE_ICU) */
1.263 +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */