os/persistentdata/persistentstorage/sqlite3api/SQLite/fts2_porter.c
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
sl@0
     2
** 2006 September 30
sl@0
     3
**
sl@0
     4
** The author disclaims copyright to this source code.  In place of
sl@0
     5
** a legal notice, here is a blessing:
sl@0
     6
**
sl@0
     7
**    May you do good and not evil.
sl@0
     8
**    May you find forgiveness for yourself and forgive others.
sl@0
     9
**    May you share freely, never taking more than you give.
sl@0
    10
**
sl@0
    11
*************************************************************************
sl@0
    12
** Implementation of the full-text-search tokenizer that implements
sl@0
    13
** a Porter stemmer.
sl@0
    14
*/
sl@0
    15
sl@0
    16
/*
sl@0
    17
** The code in this file is only compiled if:
sl@0
    18
**
sl@0
    19
**     * The FTS2 module is being built as an extension
sl@0
    20
**       (in which case SQLITE_CORE is not defined), or
sl@0
    21
**
sl@0
    22
**     * The FTS2 module is being built into the core of
sl@0
    23
**       SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
sl@0
    24
*/
sl@0
    25
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
sl@0
    26
sl@0
    27
sl@0
    28
#include <assert.h>
sl@0
    29
#include <stdlib.h>
sl@0
    30
#include <stdio.h>
sl@0
    31
#include <string.h>
sl@0
    32
#include <ctype.h>
sl@0
    33
sl@0
    34
#include "fts2_tokenizer.h"
sl@0
    35
sl@0
    36
/*
sl@0
    37
** Class derived from sqlite3_tokenizer
sl@0
    38
*/
sl@0
    39
typedef struct porter_tokenizer {
sl@0
    40
  sqlite3_tokenizer base;      /* Base class */
sl@0
    41
} porter_tokenizer;
sl@0
    42
sl@0
    43
/*
sl@0
    44
** Class derived from sqlit3_tokenizer_cursor
sl@0
    45
*/
sl@0
    46
typedef struct porter_tokenizer_cursor {
sl@0
    47
  sqlite3_tokenizer_cursor base;
sl@0
    48
  const char *zInput;          /* input we are tokenizing */
sl@0
    49
  int nInput;                  /* size of the input */
sl@0
    50
  int iOffset;                 /* current position in zInput */
sl@0
    51
  int iToken;                  /* index of next token to be returned */
sl@0
    52
  char *zToken;                /* storage for current token */
sl@0
    53
  int nAllocated;              /* space allocated to zToken buffer */
sl@0
    54
} porter_tokenizer_cursor;
sl@0
    55
sl@0
    56
sl@0
    57
/* Forward declaration */
sl@0
    58
static const sqlite3_tokenizer_module porterTokenizerModule;
sl@0
    59
sl@0
    60
sl@0
    61
/*
sl@0
    62
** Create a new tokenizer instance.
sl@0
    63
*/
sl@0
    64
static int porterCreate(
sl@0
    65
  int argc, const char * const *argv,
sl@0
    66
  sqlite3_tokenizer **ppTokenizer
sl@0
    67
){
sl@0
    68
  porter_tokenizer *t;
sl@0
    69
  t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
sl@0
    70
  if( t==NULL ) return SQLITE_NOMEM;
sl@0
    71
  memset(t, 0, sizeof(*t));
sl@0
    72
  *ppTokenizer = &t->base;
sl@0
    73
  return SQLITE_OK;
sl@0
    74
}
sl@0
    75
sl@0
    76
/*
sl@0
    77
** Destroy a tokenizer
sl@0
    78
*/
sl@0
    79
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
sl@0
    80
  sqlite3_free(pTokenizer);
sl@0
    81
  return SQLITE_OK;
sl@0
    82
}
sl@0
    83
sl@0
    84
/*
sl@0
    85
** Prepare to begin tokenizing a particular string.  The input
sl@0
    86
** string to be tokenized is zInput[0..nInput-1].  A cursor
sl@0
    87
** used to incrementally tokenize this string is returned in 
sl@0
    88
** *ppCursor.
sl@0
    89
*/
sl@0
    90
static int porterOpen(
sl@0
    91
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
sl@0
    92
  const char *zInput, int nInput,        /* String to be tokenized */
sl@0
    93
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
sl@0
    94
){
sl@0
    95
  porter_tokenizer_cursor *c;
sl@0
    96
sl@0
    97
  c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
sl@0
    98
  if( c==NULL ) return SQLITE_NOMEM;
sl@0
    99
sl@0
   100
  c->zInput = zInput;
sl@0
   101
  if( zInput==0 ){
sl@0
   102
    c->nInput = 0;
sl@0
   103
  }else if( nInput<0 ){
sl@0
   104
    c->nInput = (int)strlen(zInput);
sl@0
   105
  }else{
sl@0
   106
    c->nInput = nInput;
sl@0
   107
  }
sl@0
   108
  c->iOffset = 0;                 /* start tokenizing at the beginning */
sl@0
   109
  c->iToken = 0;
sl@0
   110
  c->zToken = NULL;               /* no space allocated, yet. */
sl@0
   111
  c->nAllocated = 0;
sl@0
   112
sl@0
   113
  *ppCursor = &c->base;
sl@0
   114
  return SQLITE_OK;
sl@0
   115
}
sl@0
   116
sl@0
   117
/*
sl@0
   118
** Close a tokenization cursor previously opened by a call to
sl@0
   119
** porterOpen() above.
sl@0
   120
*/
sl@0
   121
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
sl@0
   122
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
sl@0
   123
  sqlite3_free(c->zToken);
sl@0
   124
  sqlite3_free(c);
sl@0
   125
  return SQLITE_OK;
sl@0
   126
}
sl@0
   127
/*
sl@0
   128
** Vowel or consonant
sl@0
   129
*/
sl@0
   130
static const char cType[] = {
sl@0
   131
   0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
sl@0
   132
   1, 1, 1, 2, 1
sl@0
   133
};
sl@0
   134
sl@0
   135
/*
sl@0
   136
** isConsonant() and isVowel() determine if their first character in
sl@0
   137
** the string they point to is a consonant or a vowel, according
sl@0
   138
** to Porter ruls.  
sl@0
   139
**
sl@0
   140
** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
sl@0
   141
** 'Y' is a consonant unless it follows another consonant,
sl@0
   142
** in which case it is a vowel.
sl@0
   143
**
sl@0
   144
** In these routine, the letters are in reverse order.  So the 'y' rule
sl@0
   145
** is that 'y' is a consonant unless it is followed by another
sl@0
   146
** consonent.
sl@0
   147
*/
sl@0
   148
static int isVowel(const char*);
sl@0
   149
static int isConsonant(const char *z){
sl@0
   150
  int j;
sl@0
   151
  char x = *z;
sl@0
   152
  if( x==0 ) return 0;
sl@0
   153
  assert( x>='a' && x<='z' );
sl@0
   154
  j = cType[x-'a'];
sl@0
   155
  if( j<2 ) return j;
sl@0
   156
  return z[1]==0 || isVowel(z + 1);
sl@0
   157
}
sl@0
   158
static int isVowel(const char *z){
sl@0
   159
  int j;
sl@0
   160
  char x = *z;
sl@0
   161
  if( x==0 ) return 0;
sl@0
   162
  assert( x>='a' && x<='z' );
sl@0
   163
  j = cType[x-'a'];
sl@0
   164
  if( j<2 ) return 1-j;
sl@0
   165
  return isConsonant(z + 1);
sl@0
   166
}
sl@0
   167
sl@0
   168
/*
sl@0
   169
** Let any sequence of one or more vowels be represented by V and let
sl@0
   170
** C be sequence of one or more consonants.  Then every word can be
sl@0
   171
** represented as:
sl@0
   172
**
sl@0
   173
**           [C] (VC){m} [V]
sl@0
   174
**
sl@0
   175
** In prose:  A word is an optional consonant followed by zero or
sl@0
   176
** vowel-consonant pairs followed by an optional vowel.  "m" is the
sl@0
   177
** number of vowel consonant pairs.  This routine computes the value
sl@0
   178
** of m for the first i bytes of a word.
sl@0
   179
**
sl@0
   180
** Return true if the m-value for z is 1 or more.  In other words,
sl@0
   181
** return true if z contains at least one vowel that is followed
sl@0
   182
** by a consonant.
sl@0
   183
**
sl@0
   184
** In this routine z[] is in reverse order.  So we are really looking
sl@0
   185
** for an instance of of a consonant followed by a vowel.
sl@0
   186
*/
sl@0
   187
static int m_gt_0(const char *z){
sl@0
   188
  while( isVowel(z) ){ z++; }
sl@0
   189
  if( *z==0 ) return 0;
sl@0
   190
  while( isConsonant(z) ){ z++; }
sl@0
   191
  return *z!=0;
sl@0
   192
}
sl@0
   193
sl@0
   194
/* Like mgt0 above except we are looking for a value of m which is
sl@0
   195
** exactly 1
sl@0
   196
*/
sl@0
   197
static int m_eq_1(const char *z){
sl@0
   198
  while( isVowel(z) ){ z++; }
sl@0
   199
  if( *z==0 ) return 0;
sl@0
   200
  while( isConsonant(z) ){ z++; }
sl@0
   201
  if( *z==0 ) return 0;
sl@0
   202
  while( isVowel(z) ){ z++; }
sl@0
   203
  if( *z==0 ) return 1;
sl@0
   204
  while( isConsonant(z) ){ z++; }
sl@0
   205
  return *z==0;
sl@0
   206
}
sl@0
   207
sl@0
   208
/* Like mgt0 above except we are looking for a value of m>1 instead
sl@0
   209
** or m>0
sl@0
   210
*/
sl@0
   211
static int m_gt_1(const char *z){
sl@0
   212
  while( isVowel(z) ){ z++; }
sl@0
   213
  if( *z==0 ) return 0;
sl@0
   214
  while( isConsonant(z) ){ z++; }
sl@0
   215
  if( *z==0 ) return 0;
sl@0
   216
  while( isVowel(z) ){ z++; }
sl@0
   217
  if( *z==0 ) return 0;
sl@0
   218
  while( isConsonant(z) ){ z++; }
sl@0
   219
  return *z!=0;
sl@0
   220
}
sl@0
   221
sl@0
   222
/*
sl@0
   223
** Return TRUE if there is a vowel anywhere within z[0..n-1]
sl@0
   224
*/
sl@0
   225
static int hasVowel(const char *z){
sl@0
   226
  while( isConsonant(z) ){ z++; }
sl@0
   227
  return *z!=0;
sl@0
   228
}
sl@0
   229
sl@0
   230
/*
sl@0
   231
** Return TRUE if the word ends in a double consonant.
sl@0
   232
**
sl@0
   233
** The text is reversed here. So we are really looking at
sl@0
   234
** the first two characters of z[].
sl@0
   235
*/
sl@0
   236
static int doubleConsonant(const char *z){
sl@0
   237
  return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
sl@0
   238
}
sl@0
   239
sl@0
   240
/*
sl@0
   241
** Return TRUE if the word ends with three letters which
sl@0
   242
** are consonant-vowel-consonent and where the final consonant
sl@0
   243
** is not 'w', 'x', or 'y'.
sl@0
   244
**
sl@0
   245
** The word is reversed here.  So we are really checking the
sl@0
   246
** first three letters and the first one cannot be in [wxy].
sl@0
   247
*/
sl@0
   248
static int star_oh(const char *z){
sl@0
   249
  return
sl@0
   250
    z[0]!=0 && isConsonant(z) &&
sl@0
   251
    z[0]!='w' && z[0]!='x' && z[0]!='y' &&
sl@0
   252
    z[1]!=0 && isVowel(z+1) &&
sl@0
   253
    z[2]!=0 && isConsonant(z+2);
sl@0
   254
}
sl@0
   255
sl@0
   256
/*
sl@0
   257
** If the word ends with zFrom and xCond() is true for the stem
sl@0
   258
** of the word that preceeds the zFrom ending, then change the 
sl@0
   259
** ending to zTo.
sl@0
   260
**
sl@0
   261
** The input word *pz and zFrom are both in reverse order.  zTo
sl@0
   262
** is in normal order. 
sl@0
   263
**
sl@0
   264
** Return TRUE if zFrom matches.  Return FALSE if zFrom does not
sl@0
   265
** match.  Not that TRUE is returned even if xCond() fails and
sl@0
   266
** no substitution occurs.
sl@0
   267
*/
sl@0
   268
static int stem(
sl@0
   269
  char **pz,             /* The word being stemmed (Reversed) */
sl@0
   270
  const char *zFrom,     /* If the ending matches this... (Reversed) */
sl@0
   271
  const char *zTo,       /* ... change the ending to this (not reversed) */
sl@0
   272
  int (*xCond)(const char*)   /* Condition that must be true */
sl@0
   273
){
sl@0
   274
  char *z = *pz;
sl@0
   275
  while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
sl@0
   276
  if( *zFrom!=0 ) return 0;
sl@0
   277
  if( xCond && !xCond(z) ) return 1;
sl@0
   278
  while( *zTo ){
sl@0
   279
    *(--z) = *(zTo++);
sl@0
   280
  }
sl@0
   281
  *pz = z;
sl@0
   282
  return 1;
sl@0
   283
}
sl@0
   284
sl@0
   285
/*
sl@0
   286
** This is the fallback stemmer used when the porter stemmer is
sl@0
   287
** inappropriate.  The input word is copied into the output with
sl@0
   288
** US-ASCII case folding.  If the input word is too long (more
sl@0
   289
** than 20 bytes if it contains no digits or more than 6 bytes if
sl@0
   290
** it contains digits) then word is truncated to 20 or 6 bytes
sl@0
   291
** by taking 10 or 3 bytes from the beginning and end.
sl@0
   292
*/
sl@0
   293
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
sl@0
   294
  int i, mx, j;
sl@0
   295
  int hasDigit = 0;
sl@0
   296
  for(i=0; i<nIn; i++){
sl@0
   297
    int c = zIn[i];
sl@0
   298
    if( c>='A' && c<='Z' ){
sl@0
   299
      zOut[i] = c - 'A' + 'a';
sl@0
   300
    }else{
sl@0
   301
      if( c>='0' && c<='9' ) hasDigit = 1;
sl@0
   302
      zOut[i] = c;
sl@0
   303
    }
sl@0
   304
  }
sl@0
   305
  mx = hasDigit ? 3 : 10;
sl@0
   306
  if( nIn>mx*2 ){
sl@0
   307
    for(j=mx, i=nIn-mx; i<nIn; i++, j++){
sl@0
   308
      zOut[j] = zOut[i];
sl@0
   309
    }
sl@0
   310
    i = j;
sl@0
   311
  }
sl@0
   312
  zOut[i] = 0;
sl@0
   313
  *pnOut = i;
sl@0
   314
}
sl@0
   315
sl@0
   316
sl@0
   317
/*
sl@0
   318
** Stem the input word zIn[0..nIn-1].  Store the output in zOut.
sl@0
   319
** zOut is at least big enough to hold nIn bytes.  Write the actual
sl@0
   320
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
sl@0
   321
**
sl@0
   322
** Any upper-case characters in the US-ASCII character set ([A-Z])
sl@0
   323
** are converted to lower case.  Upper-case UTF characters are
sl@0
   324
** unchanged.
sl@0
   325
**
sl@0
   326
** Words that are longer than about 20 bytes are stemmed by retaining
sl@0
   327
** a few bytes from the beginning and the end of the word.  If the
sl@0
   328
** word contains digits, 3 bytes are taken from the beginning and
sl@0
   329
** 3 bytes from the end.  For long words without digits, 10 bytes
sl@0
   330
** are taken from each end.  US-ASCII case folding still applies.
sl@0
   331
** 
sl@0
   332
** If the input word contains not digits but does characters not 
sl@0
   333
** in [a-zA-Z] then no stemming is attempted and this routine just 
sl@0
   334
** copies the input into the input into the output with US-ASCII
sl@0
   335
** case folding.
sl@0
   336
**
sl@0
   337
** Stemming never increases the length of the word.  So there is
sl@0
   338
** no chance of overflowing the zOut buffer.
sl@0
   339
*/
sl@0
   340
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
sl@0
   341
  int i, j, c;
sl@0
   342
  char zReverse[28];
sl@0
   343
  char *z, *z2;
sl@0
   344
  if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
sl@0
   345
    /* The word is too big or too small for the porter stemmer.
sl@0
   346
    ** Fallback to the copy stemmer */
sl@0
   347
    copy_stemmer(zIn, nIn, zOut, pnOut);
sl@0
   348
    return;
sl@0
   349
  }
sl@0
   350
  for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
sl@0
   351
    c = zIn[i];
sl@0
   352
    if( c>='A' && c<='Z' ){
sl@0
   353
      zReverse[j] = c + 'a' - 'A';
sl@0
   354
    }else if( c>='a' && c<='z' ){
sl@0
   355
      zReverse[j] = c;
sl@0
   356
    }else{
sl@0
   357
      /* The use of a character not in [a-zA-Z] means that we fallback
sl@0
   358
      ** to the copy stemmer */
sl@0
   359
      copy_stemmer(zIn, nIn, zOut, pnOut);
sl@0
   360
      return;
sl@0
   361
    }
sl@0
   362
  }
sl@0
   363
  memset(&zReverse[sizeof(zReverse)-5], 0, 5);
sl@0
   364
  z = &zReverse[j+1];
sl@0
   365
sl@0
   366
sl@0
   367
  /* Step 1a */
sl@0
   368
  if( z[0]=='s' ){
sl@0
   369
    if(
sl@0
   370
     !stem(&z, "sess", "ss", 0) &&
sl@0
   371
     !stem(&z, "sei", "i", 0)  &&
sl@0
   372
     !stem(&z, "ss", "ss", 0)
sl@0
   373
    ){
sl@0
   374
      z++;
sl@0
   375
    }
sl@0
   376
  }
sl@0
   377
sl@0
   378
  /* Step 1b */  
sl@0
   379
  z2 = z;
sl@0
   380
  if( stem(&z, "dee", "ee", m_gt_0) ){
sl@0
   381
    /* Do nothing.  The work was all in the test */
sl@0
   382
  }else if( 
sl@0
   383
     (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
sl@0
   384
      && z!=z2
sl@0
   385
  ){
sl@0
   386
     if( stem(&z, "ta", "ate", 0) ||
sl@0
   387
         stem(&z, "lb", "ble", 0) ||
sl@0
   388
         stem(&z, "zi", "ize", 0) ){
sl@0
   389
       /* Do nothing.  The work was all in the test */
sl@0
   390
     }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
sl@0
   391
       z++;
sl@0
   392
     }else if( m_eq_1(z) && star_oh(z) ){
sl@0
   393
       *(--z) = 'e';
sl@0
   394
     }
sl@0
   395
  }
sl@0
   396
sl@0
   397
  /* Step 1c */
sl@0
   398
  if( z[0]=='y' && hasVowel(z+1) ){
sl@0
   399
    z[0] = 'i';
sl@0
   400
  }
sl@0
   401
sl@0
   402
  /* Step 2 */
sl@0
   403
  switch( z[1] ){
sl@0
   404
   case 'a':
sl@0
   405
     stem(&z, "lanoita", "ate", m_gt_0) ||
sl@0
   406
     stem(&z, "lanoit", "tion", m_gt_0);
sl@0
   407
     break;
sl@0
   408
   case 'c':
sl@0
   409
     stem(&z, "icne", "ence", m_gt_0) ||
sl@0
   410
     stem(&z, "icna", "ance", m_gt_0);
sl@0
   411
     break;
sl@0
   412
   case 'e':
sl@0
   413
     stem(&z, "rezi", "ize", m_gt_0);
sl@0
   414
     break;
sl@0
   415
   case 'g':
sl@0
   416
     stem(&z, "igol", "log", m_gt_0);
sl@0
   417
     break;
sl@0
   418
   case 'l':
sl@0
   419
     stem(&z, "ilb", "ble", m_gt_0) ||
sl@0
   420
     stem(&z, "illa", "al", m_gt_0) ||
sl@0
   421
     stem(&z, "iltne", "ent", m_gt_0) ||
sl@0
   422
     stem(&z, "ile", "e", m_gt_0) ||
sl@0
   423
     stem(&z, "ilsuo", "ous", m_gt_0);
sl@0
   424
     break;
sl@0
   425
   case 'o':
sl@0
   426
     stem(&z, "noitazi", "ize", m_gt_0) ||
sl@0
   427
     stem(&z, "noita", "ate", m_gt_0) ||
sl@0
   428
     stem(&z, "rota", "ate", m_gt_0);
sl@0
   429
     break;
sl@0
   430
   case 's':
sl@0
   431
     stem(&z, "msila", "al", m_gt_0) ||
sl@0
   432
     stem(&z, "ssenevi", "ive", m_gt_0) ||
sl@0
   433
     stem(&z, "ssenluf", "ful", m_gt_0) ||
sl@0
   434
     stem(&z, "ssensuo", "ous", m_gt_0);
sl@0
   435
     break;
sl@0
   436
   case 't':
sl@0
   437
     stem(&z, "itila", "al", m_gt_0) ||
sl@0
   438
     stem(&z, "itivi", "ive", m_gt_0) ||
sl@0
   439
     stem(&z, "itilib", "ble", m_gt_0);
sl@0
   440
     break;
sl@0
   441
  }
sl@0
   442
sl@0
   443
  /* Step 3 */
sl@0
   444
  switch( z[0] ){
sl@0
   445
   case 'e':
sl@0
   446
     stem(&z, "etaci", "ic", m_gt_0) ||
sl@0
   447
     stem(&z, "evita", "", m_gt_0)   ||
sl@0
   448
     stem(&z, "ezila", "al", m_gt_0);
sl@0
   449
     break;
sl@0
   450
   case 'i':
sl@0
   451
     stem(&z, "itici", "ic", m_gt_0);
sl@0
   452
     break;
sl@0
   453
   case 'l':
sl@0
   454
     stem(&z, "laci", "ic", m_gt_0) ||
sl@0
   455
     stem(&z, "luf", "", m_gt_0);
sl@0
   456
     break;
sl@0
   457
   case 's':
sl@0
   458
     stem(&z, "ssen", "", m_gt_0);
sl@0
   459
     break;
sl@0
   460
  }
sl@0
   461
sl@0
   462
  /* Step 4 */
sl@0
   463
  switch( z[1] ){
sl@0
   464
   case 'a':
sl@0
   465
     if( z[0]=='l' && m_gt_1(z+2) ){
sl@0
   466
       z += 2;
sl@0
   467
     }
sl@0
   468
     break;
sl@0
   469
   case 'c':
sl@0
   470
     if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e')  && m_gt_1(z+4)  ){
sl@0
   471
       z += 4;
sl@0
   472
     }
sl@0
   473
     break;
sl@0
   474
   case 'e':
sl@0
   475
     if( z[0]=='r' && m_gt_1(z+2) ){
sl@0
   476
       z += 2;
sl@0
   477
     }
sl@0
   478
     break;
sl@0
   479
   case 'i':
sl@0
   480
     if( z[0]=='c' && m_gt_1(z+2) ){
sl@0
   481
       z += 2;
sl@0
   482
     }
sl@0
   483
     break;
sl@0
   484
   case 'l':
sl@0
   485
     if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
sl@0
   486
       z += 4;
sl@0
   487
     }
sl@0
   488
     break;
sl@0
   489
   case 'n':
sl@0
   490
     if( z[0]=='t' ){
sl@0
   491
       if( z[2]=='a' ){
sl@0
   492
         if( m_gt_1(z+3) ){
sl@0
   493
           z += 3;
sl@0
   494
         }
sl@0
   495
       }else if( z[2]=='e' ){
sl@0
   496
         stem(&z, "tneme", "", m_gt_1) ||
sl@0
   497
         stem(&z, "tnem", "", m_gt_1) ||
sl@0
   498
         stem(&z, "tne", "", m_gt_1);
sl@0
   499
       }
sl@0
   500
     }
sl@0
   501
     break;
sl@0
   502
   case 'o':
sl@0
   503
     if( z[0]=='u' ){
sl@0
   504
       if( m_gt_1(z+2) ){
sl@0
   505
         z += 2;
sl@0
   506
       }
sl@0
   507
     }else if( z[3]=='s' || z[3]=='t' ){
sl@0
   508
       stem(&z, "noi", "", m_gt_1);
sl@0
   509
     }
sl@0
   510
     break;
sl@0
   511
   case 's':
sl@0
   512
     if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
sl@0
   513
       z += 3;
sl@0
   514
     }
sl@0
   515
     break;
sl@0
   516
   case 't':
sl@0
   517
     stem(&z, "eta", "", m_gt_1) ||
sl@0
   518
     stem(&z, "iti", "", m_gt_1);
sl@0
   519
     break;
sl@0
   520
   case 'u':
sl@0
   521
     if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
sl@0
   522
       z += 3;
sl@0
   523
     }
sl@0
   524
     break;
sl@0
   525
   case 'v':
sl@0
   526
   case 'z':
sl@0
   527
     if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
sl@0
   528
       z += 3;
sl@0
   529
     }
sl@0
   530
     break;
sl@0
   531
  }
sl@0
   532
sl@0
   533
  /* Step 5a */
sl@0
   534
  if( z[0]=='e' ){
sl@0
   535
    if( m_gt_1(z+1) ){
sl@0
   536
      z++;
sl@0
   537
    }else if( m_eq_1(z+1) && !star_oh(z+1) ){
sl@0
   538
      z++;
sl@0
   539
    }
sl@0
   540
  }
sl@0
   541
sl@0
   542
  /* Step 5b */
sl@0
   543
  if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
sl@0
   544
    z++;
sl@0
   545
  }
sl@0
   546
sl@0
   547
  /* z[] is now the stemmed word in reverse order.  Flip it back
sl@0
   548
  ** around into forward order and return.
sl@0
   549
  */
sl@0
   550
  *pnOut = i = strlen(z);
sl@0
   551
  zOut[i] = 0;
sl@0
   552
  while( *z ){
sl@0
   553
    zOut[--i] = *(z++);
sl@0
   554
  }
sl@0
   555
}
sl@0
   556
sl@0
   557
/*
sl@0
   558
** Characters that can be part of a token.  We assume any character
sl@0
   559
** whose value is greater than 0x80 (any UTF character) can be
sl@0
   560
** part of a token.  In other words, delimiters all must have
sl@0
   561
** values of 0x7f or lower.
sl@0
   562
*/
sl@0
   563
static const char porterIdChar[] = {
sl@0
   564
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
sl@0
   565
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
sl@0
   566
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
sl@0
   567
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
sl@0
   568
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
sl@0
   569
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
sl@0
   570
};
sl@0
   571
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
sl@0
   572
sl@0
   573
/*
sl@0
   574
** Extract the next token from a tokenization cursor.  The cursor must
sl@0
   575
** have been opened by a prior call to porterOpen().
sl@0
   576
*/
sl@0
   577
static int porterNext(
sl@0
   578
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by porterOpen */
sl@0
   579
  const char **pzToken,               /* OUT: *pzToken is the token text */
sl@0
   580
  int *pnBytes,                       /* OUT: Number of bytes in token */
sl@0
   581
  int *piStartOffset,                 /* OUT: Starting offset of token */
sl@0
   582
  int *piEndOffset,                   /* OUT: Ending offset of token */
sl@0
   583
  int *piPosition                     /* OUT: Position integer of token */
sl@0
   584
){
sl@0
   585
  porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
sl@0
   586
  const char *z = c->zInput;
sl@0
   587
sl@0
   588
  while( c->iOffset<c->nInput ){
sl@0
   589
    int iStartOffset, ch;
sl@0
   590
sl@0
   591
    /* Scan past delimiter characters */
sl@0
   592
    while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
sl@0
   593
      c->iOffset++;
sl@0
   594
    }
sl@0
   595
sl@0
   596
    /* Count non-delimiter characters. */
sl@0
   597
    iStartOffset = c->iOffset;
sl@0
   598
    while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
sl@0
   599
      c->iOffset++;
sl@0
   600
    }
sl@0
   601
sl@0
   602
    if( c->iOffset>iStartOffset ){
sl@0
   603
      int n = c->iOffset-iStartOffset;
sl@0
   604
      if( n>c->nAllocated ){
sl@0
   605
        c->nAllocated = n+20;
sl@0
   606
        c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
sl@0
   607
        if( c->zToken==NULL ) return SQLITE_NOMEM;
sl@0
   608
      }
sl@0
   609
      porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
sl@0
   610
      *pzToken = c->zToken;
sl@0
   611
      *piStartOffset = iStartOffset;
sl@0
   612
      *piEndOffset = c->iOffset;
sl@0
   613
      *piPosition = c->iToken++;
sl@0
   614
      return SQLITE_OK;
sl@0
   615
    }
sl@0
   616
  }
sl@0
   617
  return SQLITE_DONE;
sl@0
   618
}
sl@0
   619
sl@0
   620
/*
sl@0
   621
** The set of routines that implement the porter-stemmer tokenizer
sl@0
   622
*/
sl@0
   623
static const sqlite3_tokenizer_module porterTokenizerModule = {
sl@0
   624
  0,
sl@0
   625
  porterCreate,
sl@0
   626
  porterDestroy,
sl@0
   627
  porterOpen,
sl@0
   628
  porterClose,
sl@0
   629
  porterNext,
sl@0
   630
};
sl@0
   631
sl@0
   632
/*
sl@0
   633
** Allocate a new porter tokenizer.  Return a pointer to the new
sl@0
   634
** tokenizer in *ppModule
sl@0
   635
*/
sl@0
   636
void sqlite3Fts2PorterTokenizerModule(
sl@0
   637
  sqlite3_tokenizer_module const**ppModule
sl@0
   638
){
sl@0
   639
  *ppModule = &porterTokenizerModule;
sl@0
   640
}
sl@0
   641
sl@0
   642
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */