os/persistentdata/persistentstorage/sqlite3api/SQLite/fts3_icu.c
changeset 0 bde4ae8d615e
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/os/persistentdata/persistentstorage/sqlite3api/SQLite/fts3_icu.c	Fri Jun 15 03:10:57 2012 +0200
     1.3 @@ -0,0 +1,260 @@
     1.4 +/*
     1.5 +** 2007 June 22
     1.6 +**
     1.7 +** The author disclaims copyright to this source code.  In place of
     1.8 +** a legal notice, here is a blessing:
     1.9 +**
    1.10 +**    May you do good and not evil.
    1.11 +**    May you find forgiveness for yourself and forgive others.
    1.12 +**    May you share freely, never taking more than you give.
    1.13 +**
    1.14 +*************************************************************************
    1.15 +** This file implements a tokenizer for fts3 based on the ICU library.
    1.16 +** 
    1.17 +** $Id: fts3_icu.c,v 1.3 2008/09/01 18:34:20 danielk1977 Exp $
    1.18 +*/
    1.19 +
    1.20 +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
    1.21 +#ifdef SQLITE_ENABLE_ICU
    1.22 +
    1.23 +#include <assert.h>
    1.24 +#include <string.h>
    1.25 +#include "fts3_tokenizer.h"
    1.26 +
    1.27 +#include <unicode/ubrk.h>
    1.28 +#include <unicode/ucol.h>
    1.29 +#include <unicode/ustring.h>
    1.30 +#include <unicode/utf16.h>
    1.31 +
    1.32 +typedef struct IcuTokenizer IcuTokenizer;
    1.33 +typedef struct IcuCursor IcuCursor;
    1.34 +
    1.35 +struct IcuTokenizer {
    1.36 +  sqlite3_tokenizer base;     /* Base object; first so pointers can be cast */
    1.37 +  char *zLocale;              /* Locale handed to ubrk_open(); NULL if none given */
    1.38 +};
    1.39 +
    1.40 +struct IcuCursor {
    1.41 +  sqlite3_tokenizer_cursor base;  /* Base object; first so pointers can be cast */
    1.42 +
    1.43 +  UBreakIterator *pIter;      /* ICU break-iterator object */
    1.44 +  int nChar;                  /* Number of UChar elements stored in aChar[] */
    1.45 +  UChar *aChar;               /* Case-folded copy of input using utf-16 encoding */
    1.46 +  int *aOffset;               /* aOffset[i]: utf-8 byte offset matching aChar[i] */
    1.47 +
    1.48 +  int nBuffer;                /* Bytes allocated for zBuffer */
    1.49 +  char *zBuffer;              /* utf-8 text of the token last returned by icuNext() */
    1.50 +
    1.51 +  int iToken;                 /* Position value for the next token returned */
    1.52 +};
    1.53 +
    1.54 +/*
    1.55 +** Allocate an ICU tokenizer object.  When argc>0, argv[0] is copied into the
    1.56 +** same allocation and kept as the locale name (zLocale); else zLocale is NULL.
    1.57 +** Returns SQLITE_OK, or SQLITE_NOMEM if the allocation fails.
    1.58 +*/
    1.59 +static int icuCreate(
    1.60 +  int argc,                            /* Number of entries in argv[] */
    1.61 +  const char * const *argv,            /* Tokenizer creation arguments */
    1.62 +  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
    1.63 +){
    1.64 +  int nLocale = 0;           /* Bytes of locale name incl. terminator, or 0 */
    1.65 +  IcuTokenizer *pNew;        /* Tokenizer under construction */
    1.66 +
    1.67 +  if( argc>0 ) nLocale = strlen(argv[0])+1;
    1.68 +
    1.69 +  /* One block: the struct followed directly by the locale string. */
    1.70 +  pNew = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+nLocale);
    1.71 +  if( pNew==0 ) return SQLITE_NOMEM;
    1.72 +
    1.73 +  memset(pNew, 0, sizeof(IcuTokenizer));
    1.74 +  if( nLocale!=0 ){
    1.75 +    char *zCopy = (char *)&pNew[1];
    1.76 +    memcpy(zCopy, argv[0], nLocale);
    1.77 +    pNew->zLocale = zCopy;
    1.78 +  }
    1.79 +
    1.80 +  *ppTokenizer = &pNew->base;
    1.81 +  return SQLITE_OK;
    1.82 +}
    1.83 +
    1.84 +/*
    1.85 +** Release a tokenizer allocated by icuCreate().  The locale string lives in
    1.86 +** the same allocation, so a single sqlite3_free() releases everything.
    1.87 +*/
    1.88 +static int icuDestroy(sqlite3_tokenizer *pTokenizer){
    1.89 +  sqlite3_free((IcuTokenizer *)pTokenizer);
    1.90 +  return SQLITE_OK;
    1.91 +}
    1.92 +
    1.93 +/*
    1.94 +** Prepare to begin tokenizing a particular string.  The input string to be
    1.95 +** tokenized is zInput[0..nInput-1] (nInput<0 means zInput is nul-terminated).
    1.96 +** The input is case-folded into a utf-16 copy, and a cursor used to tokenize
    1.97 +** it incrementally is returned in *ppCursor (left 0 on SQLITE_NOMEM/ERROR).
    1.98 +*/
    1.99 +static int icuOpen(
   1.100 +  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
   1.101 +  const char *zInput,                    /* Input string */
   1.102 +  int nInput,                            /* Length of zInput in bytes */
   1.103 +  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
   1.104 +){
   1.105 +  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
   1.106 +  IcuCursor *pCsr;
   1.107 +
   1.108 +  const int32_t opt = U_FOLD_CASE_DEFAULT;
   1.109 +  UErrorCode status = U_ZERO_ERROR;
   1.110 +  int nChar;                  /* Capacity of aChar[] in UChar units */
   1.111 +  int nAlign;                 /* nChar rounded up so aOffset[] is int-aligned */
   1.112 +  UChar32 c;
   1.113 +  int iInput = 0;
   1.114 +  int iOut = 0;
   1.115 +
   1.116 +  *ppCursor = 0;
   1.117 +
   1.118 +  if( zInput==0 ){            /* Treat a NULL input as the empty string */
   1.119 +    nInput = 0;
   1.120 +    zInput = "";
   1.121 +  }else if( nInput<0 ){
   1.122 +    nInput = strlen(zInput);
   1.123 +  }
   1.124 +  nChar = nInput+1;
   1.125 +  nAlign = (nChar+3)&~3;
   1.126 +  pCsr = (IcuCursor *)sqlite3_malloc(
   1.127 +      sizeof(IcuCursor) +                /* IcuCursor */
   1.128 +      nAlign * sizeof(UChar) +           /* IcuCursor.aChar[] plus padding */
   1.129 +      (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
   1.130 +  );
   1.131 +  if( !pCsr ){
   1.132 +    return SQLITE_NOMEM;
   1.133 +  }
   1.134 +  memset(pCsr, 0, sizeof(IcuCursor));
   1.135 +  pCsr->aChar = (UChar *)&pCsr[1];
   1.136 +  pCsr->aOffset = (int *)&pCsr->aChar[nAlign];
   1.137 +
   1.138 +  /* Fold each code point to utf-16; aOffset[] maps back to utf-8 offsets. */
   1.139 +  pCsr->aOffset[iOut] = iInput;
   1.140 +  while( iInput<nInput ){     /* Never read at or beyond zInput[nInput] */
   1.141 +    int isError = 0;
   1.142 +    U8_NEXT(zInput, iInput, nInput, c);
   1.143 +    if( c<=0 ) break;         /* Stop at a NUL or at malformed utf-8 */
   1.144 +    c = u_foldCase(c, opt);
   1.145 +    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
   1.146 +    if( isError ){
   1.147 +      sqlite3_free(pCsr);
   1.148 +      return SQLITE_ERROR;
   1.149 +    }
   1.150 +    pCsr->aOffset[iOut] = iInput;
   1.151 +  }
   1.152 +
   1.153 +  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
   1.154 +  if( !U_SUCCESS(status) ){
   1.155 +    sqlite3_free(pCsr);
   1.156 +    return SQLITE_ERROR;
   1.157 +  }
   1.158 +  pCsr->nChar = iOut;
   1.159 +
   1.160 +  ubrk_first(pCsr->pIter);
   1.161 +  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
   1.162 +  return SQLITE_OK;
   1.163 +}
   1.164 +
   1.165 +/*
   1.166 +** Close a tokenization cursor previously opened by a call to icuOpen().
   1.167 +*/
   1.168 +static int icuClose(sqlite3_tokenizer_cursor *pCursor){
   1.169 +  IcuCursor *pCur = (IcuCursor *)pCursor;
   1.170 +  ubrk_close(pCur->pIter);       /* Release the ICU break iterator */
   1.171 +  sqlite3_free(pCur->zBuffer);   /* Token buffer grown by icuNext() */
   1.172 +  sqlite3_free(pCur);            /* aChar[]/aOffset[] share this allocation */
   1.173 +  return SQLITE_OK;
   1.174 +}
   1.175 +
   1.176 +/*
   1.177 +** Extract the next token from a tokenization cursor.  Returns SQLITE_OK with
   1.178 +** the OUT parameters set, SQLITE_DONE when the break iterator is exhausted,
   1.179 +** or SQLITE_NOMEM.  *ppToken points into a buffer owned by the cursor.
   1.180 +*/
   1.181 +static int icuNext(
   1.182 +  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by icuOpen */
   1.183 +  const char **ppToken,               /* OUT: *ppToken is the token text */
   1.184 +  int *pnBytes,                       /* OUT: Number of bytes in token */
   1.185 +  int *piStartOffset,                 /* OUT: Starting offset of token */
   1.186 +  int *piEndOffset,                   /* OUT: Ending offset of token */
   1.187 +  int *piPosition                     /* OUT: Position integer of token */
   1.188 +){
   1.189 +  IcuCursor *pCsr = (IcuCursor *)pCursor;
   1.190 +  int iStart = 0;                     /* aChar[] index of token start */
   1.191 +  int iEnd = 0;                       /* aChar[] index one past token end */
   1.192 +  int nByte = 0;                      /* Size of token as utf-8, in bytes */
   1.193 +
   1.194 +  while( iStart==iEnd ){
   1.195 +    UChar32 c;
   1.196 +    iStart = ubrk_current(pCsr->pIter);
   1.197 +    iEnd = ubrk_next(pCsr->pIter);
   1.198 +    if( iEnd==UBRK_DONE ){
   1.199 +      return SQLITE_DONE;
   1.200 +    }
   1.201 +    /* Skip leading white-space.  aChar[] is utf-16: decode with U16_NEXT. */
   1.202 +    while( iStart<iEnd ){
   1.203 +      int iWhite = iStart;
   1.204 +      U16_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
   1.205 +      if( u_isspace(c) ){
   1.206 +        iStart = iWhite;
   1.207 +      }else{
   1.208 +        break;
   1.209 +      }
   1.210 +    }
   1.211 +    assert(iStart<=iEnd);
   1.212 +  }
   1.213 +
   1.214 +  do {
   1.215 +    UErrorCode status = U_ZERO_ERROR;
   1.216 +    if( nByte ){
   1.217 +      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
   1.218 +      if( !zNew ){
   1.219 +        return SQLITE_NOMEM;
   1.220 +      }
   1.221 +      pCsr->zBuffer = zNew;
   1.222 +      pCsr->nBuffer = nByte;
   1.223 +    }
   1.224 +    /* If zBuffer is too small, nByte receives the size needed and we retry. */
   1.225 +    u_strToUTF8(
   1.226 +        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
   1.227 +        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
   1.228 +        &status                                  /* Output success/failure */
   1.229 +    );
   1.230 +  } while( nByte>pCsr->nBuffer );
   1.231 +
   1.232 +  *ppToken = pCsr->zBuffer;
   1.233 +  *pnBytes = nByte;
   1.234 +  *piStartOffset = pCsr->aOffset[iStart];
   1.235 +  *piEndOffset = pCsr->aOffset[iEnd];
   1.236 +  *piPosition = pCsr->iToken++;
   1.237 +
   1.238 +  return SQLITE_OK;
   1.239 +}
   1.240 +
   1.241 +/*
   1.242 +** The set of routines that implement the simple tokenizer
   1.243 +*/
   1.244 +static const sqlite3_tokenizer_module icuTokenizerModule = {
   1.245 +  0,                           /* iVersion */
   1.246 +  icuCreate,                   /* xCreate  */
   1.247 +  icuDestroy,                  /* xCreate  */
   1.248 +  icuOpen,                     /* xOpen    */
   1.249 +  icuClose,                    /* xClose   */
   1.250 +  icuNext,                     /* xNext    */
   1.251 +};
   1.252 +
   1.253 +/*
   1.254 +** Hand the caller a pointer to the ICU tokenizer's method table.  The table
   1.255 +** is a static constant, so the pointer stays valid and must not be freed.
   1.256 +*/
   1.257 +void sqlite3Fts3IcuTokenizerModule(const sqlite3_tokenizer_module **ppModule){
   1.258 +  const sqlite3_tokenizer_module *pModule = &icuTokenizerModule;
   1.259 +  *ppModule = pModule;
   1.260 +}
   1.261 +
   1.262 +#endif /* defined(SQLITE_ENABLE_ICU) */
   1.263 +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */