os/persistentdata/persistentstorage/sqlite3api/SQLite/fts2_icu.c
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
sl@0
     2
** 2007 June 22
sl@0
     3
**
sl@0
     4
** The author disclaims copyright to this source code.  In place of
sl@0
     5
** a legal notice, here is a blessing:
sl@0
     6
**
sl@0
     7
**    May you do good and not evil.
sl@0
     8
**    May you find forgiveness for yourself and forgive others.
sl@0
     9
**    May you share freely, never taking more than you give.
sl@0
    10
**
sl@0
    11
*************************************************************************
sl@0
    12
** This file implements a tokenizer for fts2 based on the ICU library.
sl@0
    13
** 
sl@0
    14
** $Id: fts2_icu.c,v 1.2 2008/07/22 22:20:50 shess Exp $
sl@0
    15
*/
sl@0
    16
sl@0
    17
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
sl@0
    18
#ifdef SQLITE_ENABLE_ICU
sl@0
    19
sl@0
    20
#include <assert.h>
sl@0
    21
#include <string.h>
sl@0
    22
#include "fts2_tokenizer.h"
sl@0
    23
sl@0
    24
#include <unicode/ubrk.h>
sl@0
    25
#include <unicode/ucol.h>
sl@0
    26
#include <unicode/ustring.h>
sl@0
    27
#include <unicode/utf16.h>
sl@0
    28
sl@0
    29
/* Short names for the two structures defined by this file. */
typedef struct IcuCursor IcuCursor;
typedef struct IcuTokenizer IcuTokenizer;
sl@0
    31
sl@0
    32
struct IcuTokenizer {
sl@0
    33
  sqlite3_tokenizer base;
sl@0
    34
  char *zLocale;
sl@0
    35
};
sl@0
    36
sl@0
    37
struct IcuCursor {
sl@0
    38
  sqlite3_tokenizer_cursor base;
sl@0
    39
sl@0
    40
  UBreakIterator *pIter;      /* ICU break-iterator object */
sl@0
    41
  int nChar;                  /* Number of UChar elements in pInput */
sl@0
    42
  UChar *aChar;               /* Copy of input using utf-16 encoding */
sl@0
    43
  int *aOffset;               /* Offsets of each character in utf-8 input */
sl@0
    44
sl@0
    45
  int nBuffer;
sl@0
    46
  char *zBuffer;
sl@0
    47
sl@0
    48
  int iToken;
sl@0
    49
};
sl@0
    50
sl@0
    51
/*
sl@0
    52
** Create a new tokenizer instance.
sl@0
    53
*/
sl@0
    54
static int icuCreate(
sl@0
    55
  int argc,                            /* Number of entries in argv[] */
sl@0
    56
  const char * const *argv,            /* Tokenizer creation arguments */
sl@0
    57
  sqlite3_tokenizer **ppTokenizer      /* OUT: Created tokenizer */
sl@0
    58
){
sl@0
    59
  IcuTokenizer *p;
sl@0
    60
  int n = 0;
sl@0
    61
sl@0
    62
  if( argc>0 ){
sl@0
    63
    n = strlen(argv[0])+1;
sl@0
    64
  }
sl@0
    65
  p = (IcuTokenizer *)sqlite3_malloc(sizeof(IcuTokenizer)+n);
sl@0
    66
  if( !p ){
sl@0
    67
    return SQLITE_NOMEM;
sl@0
    68
  }
sl@0
    69
  memset(p, 0, sizeof(IcuTokenizer));
sl@0
    70
sl@0
    71
  if( n ){
sl@0
    72
    p->zLocale = (char *)&p[1];
sl@0
    73
    memcpy(p->zLocale, argv[0], n);
sl@0
    74
  }
sl@0
    75
sl@0
    76
  *ppTokenizer = (sqlite3_tokenizer *)p;
sl@0
    77
sl@0
    78
  return SQLITE_OK;
sl@0
    79
}
sl@0
    80
sl@0
    81
/*
sl@0
    82
** Destroy a tokenizer
sl@0
    83
*/
sl@0
    84
static int icuDestroy(sqlite3_tokenizer *pTokenizer){
sl@0
    85
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
sl@0
    86
  sqlite3_free(p);
sl@0
    87
  return SQLITE_OK;
sl@0
    88
}
sl@0
    89
sl@0
    90
/*
sl@0
    91
** Prepare to begin tokenizing a particular string.  The input
sl@0
    92
** string to be tokenized is pInput[0..nBytes-1].  A cursor
sl@0
    93
** used to incrementally tokenize this string is returned in 
sl@0
    94
** *ppCursor.
sl@0
    95
*/
sl@0
    96
static int icuOpen(
sl@0
    97
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
sl@0
    98
  const char *zInput,                    /* Input string */
sl@0
    99
  int nInput,                            /* Length of zInput in bytes */
sl@0
   100
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
sl@0
   101
){
sl@0
   102
  IcuTokenizer *p = (IcuTokenizer *)pTokenizer;
sl@0
   103
  IcuCursor *pCsr;
sl@0
   104
sl@0
   105
  const int32_t opt = U_FOLD_CASE_DEFAULT;
sl@0
   106
  UErrorCode status = U_ZERO_ERROR;
sl@0
   107
  int nChar;
sl@0
   108
sl@0
   109
  UChar32 c;
sl@0
   110
  int iInput = 0;
sl@0
   111
  int iOut = 0;
sl@0
   112
sl@0
   113
  *ppCursor = 0;
sl@0
   114
sl@0
   115
  if( -1 == nInput ) nInput = strlen(nInput);
sl@0
   116
  nChar = nInput+1;
sl@0
   117
  pCsr = (IcuCursor *)sqlite3_malloc(
sl@0
   118
      sizeof(IcuCursor) +                /* IcuCursor */
sl@0
   119
      nChar * sizeof(UChar) +            /* IcuCursor.aChar[] */
sl@0
   120
      (nChar+1) * sizeof(int)            /* IcuCursor.aOffset[] */
sl@0
   121
  );
sl@0
   122
  if( !pCsr ){
sl@0
   123
    return SQLITE_NOMEM;
sl@0
   124
  }
sl@0
   125
  memset(pCsr, 0, sizeof(IcuCursor));
sl@0
   126
  pCsr->aChar = (UChar *)&pCsr[1];
sl@0
   127
  pCsr->aOffset = (int *)&pCsr->aChar[nChar];
sl@0
   128
sl@0
   129
  pCsr->aOffset[iOut] = iInput;
sl@0
   130
  U8_NEXT(zInput, iInput, nInput, c); 
sl@0
   131
  while( c>0 ){
sl@0
   132
    int isError = 0;
sl@0
   133
    c = u_foldCase(c, opt);
sl@0
   134
    U16_APPEND(pCsr->aChar, iOut, nChar, c, isError);
sl@0
   135
    if( isError ){
sl@0
   136
      sqlite3_free(pCsr);
sl@0
   137
      return SQLITE_ERROR;
sl@0
   138
    }
sl@0
   139
    pCsr->aOffset[iOut] = iInput;
sl@0
   140
sl@0
   141
    if( iInput<nInput ){
sl@0
   142
      U8_NEXT(zInput, iInput, nInput, c);
sl@0
   143
    }else{
sl@0
   144
      c = 0;
sl@0
   145
    }
sl@0
   146
  }
sl@0
   147
sl@0
   148
  pCsr->pIter = ubrk_open(UBRK_WORD, p->zLocale, pCsr->aChar, iOut, &status);
sl@0
   149
  if( !U_SUCCESS(status) ){
sl@0
   150
    sqlite3_free(pCsr);
sl@0
   151
    return SQLITE_ERROR;
sl@0
   152
  }
sl@0
   153
  pCsr->nChar = iOut;
sl@0
   154
sl@0
   155
  ubrk_first(pCsr->pIter);
sl@0
   156
  *ppCursor = (sqlite3_tokenizer_cursor *)pCsr;
sl@0
   157
  return SQLITE_OK;
sl@0
   158
}
sl@0
   159
sl@0
   160
/*
sl@0
   161
** Close a tokenization cursor previously opened by a call to icuOpen().
sl@0
   162
*/
sl@0
   163
static int icuClose(sqlite3_tokenizer_cursor *pCursor){
sl@0
   164
  IcuCursor *pCsr = (IcuCursor *)pCursor;
sl@0
   165
  ubrk_close(pCsr->pIter);
sl@0
   166
  sqlite3_free(pCsr->zBuffer);
sl@0
   167
  sqlite3_free(pCsr);
sl@0
   168
  return SQLITE_OK;
sl@0
   169
}
sl@0
   170
sl@0
   171
/*
sl@0
   172
** Extract the next token from a tokenization cursor.
sl@0
   173
*/
sl@0
   174
static int icuNext(
sl@0
   175
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
sl@0
   176
  const char **ppToken,               /* OUT: *ppToken is the token text */
sl@0
   177
  int *pnBytes,                       /* OUT: Number of bytes in token */
sl@0
   178
  int *piStartOffset,                 /* OUT: Starting offset of token */
sl@0
   179
  int *piEndOffset,                   /* OUT: Ending offset of token */
sl@0
   180
  int *piPosition                     /* OUT: Position integer of token */
sl@0
   181
){
sl@0
   182
  IcuCursor *pCsr = (IcuCursor *)pCursor;
sl@0
   183
sl@0
   184
  int iStart = 0;
sl@0
   185
  int iEnd = 0;
sl@0
   186
  int nByte = 0;
sl@0
   187
sl@0
   188
  while( iStart==iEnd ){
sl@0
   189
    UChar32 c;
sl@0
   190
sl@0
   191
    iStart = ubrk_current(pCsr->pIter);
sl@0
   192
    iEnd = ubrk_next(pCsr->pIter);
sl@0
   193
    if( iEnd==UBRK_DONE ){
sl@0
   194
      return SQLITE_DONE;
sl@0
   195
    }
sl@0
   196
sl@0
   197
    while( iStart<iEnd ){
sl@0
   198
      int iWhite = iStart;
sl@0
   199
      U8_NEXT(pCsr->aChar, iWhite, pCsr->nChar, c);
sl@0
   200
      if( u_isspace(c) ){
sl@0
   201
        iStart = iWhite;
sl@0
   202
      }else{
sl@0
   203
        break;
sl@0
   204
      }
sl@0
   205
    }
sl@0
   206
    assert(iStart<=iEnd);
sl@0
   207
  }
sl@0
   208
sl@0
   209
  do {
sl@0
   210
    UErrorCode status = U_ZERO_ERROR;
sl@0
   211
    if( nByte ){
sl@0
   212
      char *zNew = sqlite3_realloc(pCsr->zBuffer, nByte);
sl@0
   213
      if( !zNew ){
sl@0
   214
        return SQLITE_NOMEM;
sl@0
   215
      }
sl@0
   216
      pCsr->zBuffer = zNew;
sl@0
   217
      pCsr->nBuffer = nByte;
sl@0
   218
    }
sl@0
   219
sl@0
   220
    u_strToUTF8(
sl@0
   221
        pCsr->zBuffer, pCsr->nBuffer, &nByte,    /* Output vars */
sl@0
   222
        &pCsr->aChar[iStart], iEnd-iStart,       /* Input vars */
sl@0
   223
        &status                                  /* Output success/failure */
sl@0
   224
    );
sl@0
   225
  } while( nByte>pCsr->nBuffer );
sl@0
   226
sl@0
   227
  *ppToken = pCsr->zBuffer;
sl@0
   228
  *pnBytes = nByte;
sl@0
   229
  *piStartOffset = pCsr->aOffset[iStart];
sl@0
   230
  *piEndOffset = pCsr->aOffset[iEnd];
sl@0
   231
  *piPosition = pCsr->iToken++;
sl@0
   232
sl@0
   233
  return SQLITE_OK;
sl@0
   234
}
sl@0
   235
sl@0
   236
/*
sl@0
   237
** The set of routines that implement the simple tokenizer
sl@0
   238
*/
sl@0
   239
static const sqlite3_tokenizer_module icuTokenizerModule = {
sl@0
   240
  0,                           /* iVersion */
sl@0
   241
  icuCreate,                   /* xCreate  */
sl@0
   242
  icuDestroy,                  /* xCreate  */
sl@0
   243
  icuOpen,                     /* xOpen    */
sl@0
   244
  icuClose,                    /* xClose   */
sl@0
   245
  icuNext,                     /* xNext    */
sl@0
   246
};
sl@0
   247
sl@0
   248
/*
sl@0
   249
** Set *ppModule to point at the implementation of the ICU tokenizer.
sl@0
   250
*/
sl@0
   251
void sqlite3Fts2IcuTokenizerModule(
sl@0
   252
  sqlite3_tokenizer_module const**ppModule
sl@0
   253
){
sl@0
   254
  *ppModule = &icuTokenizerModule;
sl@0
   255
}
sl@0
   256
sl@0
   257
#endif /* defined(SQLITE_ENABLE_ICU) */
sl@0
   258
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */