os/persistentdata/persistentstorage/sqlite3api/SQLite/fts2_icu.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 ** 2007 June 22
     3 **
     4 ** The author disclaims copyright to this source code.  In place of
     5 ** a legal notice, here is a blessing:
     6 **
     7 **    May you do good and not evil.
     8 **    May you find forgiveness for yourself and forgive others.
     9 **    May you share freely, never taking more than you give.
    10 **
    11 *************************************************************************
    12 ** This file implements a tokenizer for fts2 based on the ICU library.
    13 ** 
    14 ** $Id: fts2_icu.c,v 1.2 2008/07/22 22:20:50 shess Exp $
    15 */
    16 
    17 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
    18 #ifdef SQLITE_ENABLE_ICU
    19 
    20 #include <assert.h>
    21 #include <string.h>
    22 #include "fts2_tokenizer.h"
    23 
    24 #include <unicode/ubrk.h>
    25 #include <unicode/ucol.h>
    26 #include <unicode/ustring.h>
    27 #include <unicode/utf16.h>
    28 
    29 typedef struct IcuTokenizer IcuTokenizer;
    30 typedef struct IcuCursor IcuCursor;
    31 
    32 struct IcuTokenizer {
    33   sqlite3_tokenizer base;
    34   char *zLocale;
    35 };
    36 
    37 struct IcuCursor {
    38   sqlite3_tokenizer_cursor base;
    39 
    40   UBreakIterator *pIter;      /* ICU break-iterator object */
    41   int nChar;                  /* Number of UChar elements in pInput */
    42   UChar *aChar;               /* Copy of input using utf-16 encoding */
    43   int *aOffset;               /* Offsets of each character in utf-8 input */
    44 
    45   int nBuffer;
    46   char *zBuffer;
    47 
    48   int iToken;
    49 };
    50 
    51 /*
    52 ** Create a new tokenizer instance.
    53 */
    54 static int icuCreate(
    55   int argc,                            /* Number of entries in argv[] */
    56   const char * const *argv,            /* Tokenizer creation arguments */
    57   sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
    58 ){
    59   IcuTokenizer *p;
    60   int n = 0;
    61 
    62   if( argc>0 ){
    63     n = strlen(argv[0])+1;
    64   }
    65   p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
    66   if( !p ){
    67     return SQLITE_NOMEM;
    68   }
    69   memset(p, 0, sizeof(IcuTokenizer));
    70 
    71   if( n ){
    72     p->zLocale = (char *)&p[1];
    73     memcpy(p->zLocale, argv[0], n);
    74   }
    75 
    76   *ppTokenizer = (sqlite3_tokenizer *)p;
    77 
    78   return SQLITE_OK;
    79 }
    80 
    81 /*
    82 ** Destroy a tokenizer
    83 */
    84 static int icuDestroy(sqlite3_tokenizer *pTokenizer){
    85   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
    86   sqlite3_free(p);
    87   return SQLITE_OK;
    88 }
    89 
    90 /*
    91 ** Prepare to begin tokenizing a particular string.  The input
    92 ** string to be tokenized is pInput[0..nBytes-1].  A cursor
    93 ** used to incrementally tokenize this string is returned in 
    94 ** *ppCursor.
    95 */
    96 static int icuOpen(
    97   sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
    98   const char *zInput,                    /* Input string */
    99   int nInput,                            /* Length of zInput in bytes */
   100   sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
   101 ){
   102   IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
   103   IcuCursor *pCsr;
   104 
   105   const int32_t opt = U_FOLD_CASE_DEFAULT;
   106   UErrorCode status = U_ZERO_ERROR;
   107   int nChar;
   108 
   109   UChar32 c;
   110   int iInput = 0;
   111   int iOut = 0;
   112 
   113   *ppCursor = 0;
   114 
   115   if( -1 == nInput ) nInput = strlen(nInput);
   116   nChar = nInput+1;
   117   pCsr = (IcuCursor *)sqlite3_malloc(
   118       sizeof(IcuCursor) +                /* IcuCursor */
   119       nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
   120       (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
   121   );
   122   if( !pCsr ){
   123     return SQLITE_NOMEM;
   124   }
   125   memset(pCsr, 0, sizeof(IcuCursor));
   126   pCsr->aChar = (UChar *)&pCsr[1];
   127   pCsr->aOffset = (int *)&pCsr->aChar[nChar];
   128 
   129   pCsr->aOffset[iOut] = iInput;
   130   U8_NEXT(zInput, iInput, nInput, c); 
   131   while( c>0 ){
   132     int isError = 0;
   133     c = u_foldCase(c, opt);
   134     U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
   135     if( isError ){
   136       sqlite3_free(pCsr);
   137       return SQLITE_ERROR;
   138     }
   139     pCsr->aOffset[iOut] = iInput;
   140 
   141     if( iInput<nInput ){
   142       U8_NEXT(zInput, iInput, nInput, c);
   143     }else{
   144       c = 0;
   145     }
   146   }
   147 
   148   pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
   149   if( !U_SUCCESS(status) ){
   150     sqlite3_free(pCsr);
   151     return SQLITE_ERROR;
   152   }
   153   pCsr->nChar = iOut;
   154 
   155   ubrk_first(pCsr->pIter);
   156   *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
   157   return SQLITE_OK;
   158 }
   159 
   160 /*
   161 ** Close a tokenization cursor previously opened by a call to icuOpen().
   162 */
   163 static int icuClose(sqlite3_tokenizer_cursor *pCursor){
   164   IcuCursor *pCsr = (IcuCursor *)pCursor;
   165   ubrk_close(pCsr->pIter);
   166   sqlite3_free(pCsr->zBuffer);
   167   sqlite3_free(pCsr);
   168   return SQLITE_OK;
   169 }
   170 
   171 /*
   172 ** Extract the next token from a tokenization cursor.
   173 */
   174 static int icuNext(
   175   sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
   176   const char **ppToken,               /* OUT: *ppToken is the token text */
   177   int *pnBytes,                       /* OUT: Number of bytes in token */
   178   int *piStartOffset,                 /* OUT: Starting offset of token */
   179   int *piEndOffset,                   /* OUT: Ending offset of token */
   180   int *piPosition                     /* OUT: Position integer of token */
   181 ){
   182   IcuCursor *pCsr = (IcuCursor *)pCursor;
   183 
   184   int iStart = 0;
   185   int iEnd = 0;
   186   int nByte = 0;
   187 
   188   while( iStart==iEnd ){
   189     UChar32 c;
   190 
   191     iStart = ubrk_current(pCsr->pIter);
   192     iEnd = ubrk_next(pCsr->pIter);
   193     if( iEnd==UBRK_DONE ){
   194       return SQLITE_DONE;
   195     }
   196 
   197     while( iStart<iEnd ){
   198       int iWhite = iStart;
   199       U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
   200       if( u_isspace(c) ){
   201         iStart = iWhite;
   202       }else{
   203         break;
   204       }
   205     }
   206     assert(iStart<=iEnd);
   207   }
   208 
   209   do {
   210     UErrorCode status = U_ZERO_ERROR;
   211     if( nByte ){
   212       char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
   213       if( !zNew ){
   214         return SQLITE_NOMEM;
   215       }
   216       pCsr->zBuffer = zNew;
   217       pCsr->nBuffer = nByte;
   218     }
   219 
   220     u_strToUTF8(
   221         pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
   222         &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
   223         &status                                  /* Output success/failure */
   224     );
   225   } while( nByte>pCsr->nBuffer );
   226 
   227   *ppToken = pCsr->zBuffer;
   228   *pnBytes = nByte;
   229   *piStartOffset = pCsr->aOffset[iStart];
   230   *piEndOffset = pCsr->aOffset[iEnd];
   231   *piPosition = pCsr->iToken++;
   232 
   233   return SQLITE_OK;
   234 }
   235 
   236 /*
   237 ** The set of routines that implement the simple tokenizer
   238 */
   239 static const sqlite3_tokenizer_module icuTokenizerModule = {
   240   0,                           /* iVersion */
   241   icuCreate,                   /* xCreate  */
   242   icuDestroy,                  /* xCreate  */
   243   icuOpen,                     /* xOpen    */
   244   icuClose,                    /* xClose   */
   245   icuNext,                     /* xNext    */
   246 };
   247 
   248 /*
   249 ** Set *ppModule to point at the implementation of the ICU tokenizer.
   250 */
   251 void sqlite3Fts2IcuTokenizerModule(
   252   sqlite3_tokenizer_module const**ppModule
   253 ){
   254   *ppModule = &icuTokenizerModule;
   255 }
   256 
   257 #endif /* defined(SQLITE_ENABLE_ICU) */
   258 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */