1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/os/persistentdata/persistentstorage/sqlite3api/SQLite/fts1_porter.c Fri Jun 15 03:10:57 2012 +0200
1.3 @@ -0,0 +1,643 @@
1.4 +/*
1.5 +** 2006 September 30
1.6 +**
1.7 +** The author disclaims copyright to this source code. In place of
1.8 +** a legal notice, here is a blessing:
1.9 +**
1.10 +** May you do good and not evil.
1.11 +** May you find forgiveness for yourself and forgive others.
1.12 +** May you share freely, never taking more than you give.
1.13 +**
1.14 +*************************************************************************
1.15 +** Implementation of the full-text-search tokenizer that implements
1.16 +** a Porter stemmer.
1.17 +*/
1.18 +
1.19 +/*
1.20 +** The code in this file is only compiled if:
1.21 +**
1.22 +** * The FTS1 module is being built as an extension
1.23 +** (in which case SQLITE_CORE is not defined), or
1.24 +**
1.25 +** * The FTS1 module is being built into the core of
1.26 +** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
1.27 +*/
1.28 +#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
1.29 +
1.30 +
1.31 +#include <assert.h>
1.32 +#include <stdlib.h>
1.33 +#include <stdio.h>
1.34 +#include <string.h>
1.35 +#include <ctype.h>
1.36 +
1.37 +#include "fts1_tokenizer.h"
1.38 +
1.39 +/*
1.40 +** Class derived from sqlite3_tokenizer
1.41 +*/
1.42 +typedef struct porter_tokenizer {
1.43 + sqlite3_tokenizer base; /* Base class */
1.44 +} porter_tokenizer;
1.45 +
1.46 +/*
1.47 +** Class derived from sqlit3_tokenizer_cursor
1.48 +*/
1.49 +typedef struct porter_tokenizer_cursor {
1.50 + sqlite3_tokenizer_cursor base;
1.51 + const char *zInput; /* input we are tokenizing */
1.52 + int nInput; /* size of the input */
1.53 + int iOffset; /* current position in zInput */
1.54 + int iToken; /* index of next token to be returned */
1.55 + char *zToken; /* storage for current token */
1.56 + int nAllocated; /* space allocated to zToken buffer */
1.57 +} porter_tokenizer_cursor;
1.58 +
1.59 +
1.60 +/* Forward declaration */
1.61 +static const sqlite3_tokenizer_module porterTokenizerModule;
1.62 +
1.63 +
1.64 +/*
1.65 +** Create a new tokenizer instance.
1.66 +*/
1.67 +static int porterCreate(
1.68 + int argc, const char * const *argv,
1.69 + sqlite3_tokenizer **ppTokenizer
1.70 +){
1.71 + porter_tokenizer *t;
1.72 + t = (porter_tokenizer *) calloc(sizeof(*t), 1);
1.73 + if( t==NULL ) return SQLITE_NOMEM;
1.74 +
1.75 + *ppTokenizer = &t->base;
1.76 + return SQLITE_OK;
1.77 +}
1.78 +
1.79 +/*
1.80 +** Destroy a tokenizer
1.81 +*/
1.82 +static int porterDestroy(sqlite3_tokenizer *pTokenizer){
1.83 + free(pTokenizer);
1.84 + return SQLITE_OK;
1.85 +}
1.86 +
1.87 +/*
1.88 +** Prepare to begin tokenizing a particular string. The input
1.89 +** string to be tokenized is zInput[0..nInput-1]. A cursor
1.90 +** used to incrementally tokenize this string is returned in
1.91 +** *ppCursor.
1.92 +*/
1.93 +static int porterOpen(
1.94 + sqlite3_tokenizer *pTokenizer, /* The tokenizer */
1.95 + const char *zInput, int nInput, /* String to be tokenized */
1.96 + sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
1.97 +){
1.98 + porter_tokenizer_cursor *c;
1.99 +
1.100 + c = (porter_tokenizer_cursor *) malloc(sizeof(*c));
1.101 + if( c==NULL ) return SQLITE_NOMEM;
1.102 +
1.103 + c->zInput = zInput;
1.104 + if( zInput==0 ){
1.105 + c->nInput = 0;
1.106 + }else if( nInput<0 ){
1.107 + c->nInput = (int)strlen(zInput);
1.108 + }else{
1.109 + c->nInput = nInput;
1.110 + }
1.111 + c->iOffset = 0; /* start tokenizing at the beginning */
1.112 + c->iToken = 0;
1.113 + c->zToken = NULL; /* no space allocated, yet. */
1.114 + c->nAllocated = 0;
1.115 +
1.116 + *ppCursor = &c->base;
1.117 + return SQLITE_OK;
1.118 +}
1.119 +
1.120 +/*
1.121 +** Close a tokenization cursor previously opened by a call to
1.122 +** porterOpen() above.
1.123 +*/
1.124 +static int porterClose(sqlite3_tokenizer_cursor *pCursor){
1.125 + porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
1.126 + free(c->zToken);
1.127 + free(c);
1.128 + return SQLITE_OK;
1.129 +}
1.130 +/*
1.131 +** Vowel or consonant
1.132 +*/
1.133 +static const char cType[] = {
1.134 + 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
1.135 + 1, 1, 1, 2, 1
1.136 +};
1.137 +
1.138 +/*
1.139 +** isConsonant() and isVowel() determine if their first character in
1.140 +** the string they point to is a consonant or a vowel, according
1.141 +** to Porter ruls.
1.142 +**
1.143 +** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
1.144 +** 'Y' is a consonant unless it follows another consonant,
1.145 +** in which case it is a vowel.
1.146 +**
1.147 +** In these routine, the letters are in reverse order. So the 'y' rule
1.148 +** is that 'y' is a consonant unless it is followed by another
1.149 +** consonent.
1.150 +*/
1.151 +static int isVowel(const char*);
1.152 +static int isConsonant(const char *z){
1.153 + int j;
1.154 + char x = *z;
1.155 + if( x==0 ) return 0;
1.156 + assert( x>='a' && x<='z' );
1.157 + j = cType[x-'a'];
1.158 + if( j<2 ) return j;
1.159 + return z[1]==0 || isVowel(z + 1);
1.160 +}
1.161 +static int isVowel(const char *z){
1.162 + int j;
1.163 + char x = *z;
1.164 + if( x==0 ) return 0;
1.165 + assert( x>='a' && x<='z' );
1.166 + j = cType[x-'a'];
1.167 + if( j<2 ) return 1-j;
1.168 + return isConsonant(z + 1);
1.169 +}
1.170 +
1.171 +/*
1.172 +** Let any sequence of one or more vowels be represented by V and let
1.173 +** C be sequence of one or more consonants. Then every word can be
1.174 +** represented as:
1.175 +**
1.176 +** [C] (VC){m} [V]
1.177 +**
1.178 +** In prose: A word is an optional consonant followed by zero or
1.179 +** vowel-consonant pairs followed by an optional vowel. "m" is the
1.180 +** number of vowel consonant pairs. This routine computes the value
1.181 +** of m for the first i bytes of a word.
1.182 +**
1.183 +** Return true if the m-value for z is 1 or more. In other words,
1.184 +** return true if z contains at least one vowel that is followed
1.185 +** by a consonant.
1.186 +**
1.187 +** In this routine z[] is in reverse order. So we are really looking
1.188 +** for an instance of of a consonant followed by a vowel.
1.189 +*/
1.190 +static int m_gt_0(const char *z){
1.191 + while( isVowel(z) ){ z++; }
1.192 + if( *z==0 ) return 0;
1.193 + while( isConsonant(z) ){ z++; }
1.194 + return *z!=0;
1.195 +}
1.196 +
1.197 +/* Like mgt0 above except we are looking for a value of m which is
1.198 +** exactly 1
1.199 +*/
1.200 +static int m_eq_1(const char *z){
1.201 + while( isVowel(z) ){ z++; }
1.202 + if( *z==0 ) return 0;
1.203 + while( isConsonant(z) ){ z++; }
1.204 + if( *z==0 ) return 0;
1.205 + while( isVowel(z) ){ z++; }
1.206 + if( *z==0 ) return 1;
1.207 + while( isConsonant(z) ){ z++; }
1.208 + return *z==0;
1.209 +}
1.210 +
1.211 +/* Like mgt0 above except we are looking for a value of m>1 instead
1.212 +** or m>0
1.213 +*/
1.214 +static int m_gt_1(const char *z){
1.215 + while( isVowel(z) ){ z++; }
1.216 + if( *z==0 ) return 0;
1.217 + while( isConsonant(z) ){ z++; }
1.218 + if( *z==0 ) return 0;
1.219 + while( isVowel(z) ){ z++; }
1.220 + if( *z==0 ) return 0;
1.221 + while( isConsonant(z) ){ z++; }
1.222 + return *z!=0;
1.223 +}
1.224 +
1.225 +/*
1.226 +** Return TRUE if there is a vowel anywhere within z[0..n-1]
1.227 +*/
1.228 +static int hasVowel(const char *z){
1.229 + while( isConsonant(z) ){ z++; }
1.230 + return *z!=0;
1.231 +}
1.232 +
1.233 +/*
1.234 +** Return TRUE if the word ends in a double consonant.
1.235 +**
1.236 +** The text is reversed here. So we are really looking at
1.237 +** the first two characters of z[].
1.238 +*/
1.239 +static int doubleConsonant(const char *z){
1.240 + return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
1.241 +}
1.242 +
1.243 +/*
1.244 +** Return TRUE if the word ends with three letters which
1.245 +** are consonant-vowel-consonent and where the final consonant
1.246 +** is not 'w', 'x', or 'y'.
1.247 +**
1.248 +** The word is reversed here. So we are really checking the
1.249 +** first three letters and the first one cannot be in [wxy].
1.250 +*/
1.251 +static int star_oh(const char *z){
1.252 + return
1.253 + z[0]!=0 && isConsonant(z) &&
1.254 + z[0]!='w' && z[0]!='x' && z[0]!='y' &&
1.255 + z[1]!=0 && isVowel(z+1) &&
1.256 + z[2]!=0 && isConsonant(z+2);
1.257 +}
1.258 +
1.259 +/*
1.260 +** If the word ends with zFrom and xCond() is true for the stem
1.261 +** of the word that preceeds the zFrom ending, then change the
1.262 +** ending to zTo.
1.263 +**
1.264 +** The input word *pz and zFrom are both in reverse order. zTo
1.265 +** is in normal order.
1.266 +**
1.267 +** Return TRUE if zFrom matches. Return FALSE if zFrom does not
1.268 +** match. Not that TRUE is returned even if xCond() fails and
1.269 +** no substitution occurs.
1.270 +*/
1.271 +static int stem(
1.272 + char **pz, /* The word being stemmed (Reversed) */
1.273 + const char *zFrom, /* If the ending matches this... (Reversed) */
1.274 + const char *zTo, /* ... change the ending to this (not reversed) */
1.275 + int (*xCond)(const char*) /* Condition that must be true */
1.276 +){
1.277 + char *z = *pz;
1.278 + while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
1.279 + if( *zFrom!=0 ) return 0;
1.280 + if( xCond && !xCond(z) ) return 1;
1.281 + while( *zTo ){
1.282 + *(--z) = *(zTo++);
1.283 + }
1.284 + *pz = z;
1.285 + return 1;
1.286 +}
1.287 +
1.288 +/*
1.289 +** This is the fallback stemmer used when the porter stemmer is
1.290 +** inappropriate. The input word is copied into the output with
1.291 +** US-ASCII case folding. If the input word is too long (more
1.292 +** than 20 bytes if it contains no digits or more than 6 bytes if
1.293 +** it contains digits) then word is truncated to 20 or 6 bytes
1.294 +** by taking 10 or 3 bytes from the beginning and end.
1.295 +*/
1.296 +static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
1.297 + int i, mx, j;
1.298 + int hasDigit = 0;
1.299 + for(i=0; i<nIn; i++){
1.300 + int c = zIn[i];
1.301 + if( c>='A' && c<='Z' ){
1.302 + zOut[i] = c - 'A' + 'a';
1.303 + }else{
1.304 + if( c>='0' && c<='9' ) hasDigit = 1;
1.305 + zOut[i] = c;
1.306 + }
1.307 + }
1.308 + mx = hasDigit ? 3 : 10;
1.309 + if( nIn>mx*2 ){
1.310 + for(j=mx, i=nIn-mx; i<nIn; i++, j++){
1.311 + zOut[j] = zOut[i];
1.312 + }
1.313 + i = j;
1.314 + }
1.315 + zOut[i] = 0;
1.316 + *pnOut = i;
1.317 +}
1.318 +
1.319 +
1.320 +/*
1.321 +** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
1.322 +** zOut is at least big enough to hold nIn bytes. Write the actual
1.323 +** size of the output word (exclusive of the '\0' terminator) into *pnOut.
1.324 +**
1.325 +** Any upper-case characters in the US-ASCII character set ([A-Z])
1.326 +** are converted to lower case. Upper-case UTF characters are
1.327 +** unchanged.
1.328 +**
1.329 +** Words that are longer than about 20 bytes are stemmed by retaining
1.330 +** a few bytes from the beginning and the end of the word. If the
1.331 +** word contains digits, 3 bytes are taken from the beginning and
1.332 +** 3 bytes from the end. For long words without digits, 10 bytes
1.333 +** are taken from each end. US-ASCII case folding still applies.
1.334 +**
1.335 +** If the input word contains not digits but does characters not
1.336 +** in [a-zA-Z] then no stemming is attempted and this routine just
1.337 +** copies the input into the input into the output with US-ASCII
1.338 +** case folding.
1.339 +**
1.340 +** Stemming never increases the length of the word. So there is
1.341 +** no chance of overflowing the zOut buffer.
1.342 +*/
1.343 +static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
1.344 + int i, j, c;
1.345 + char zReverse[28];
1.346 + char *z, *z2;
1.347 + if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
1.348 + /* The word is too big or too small for the porter stemmer.
1.349 + ** Fallback to the copy stemmer */
1.350 + copy_stemmer(zIn, nIn, zOut, pnOut);
1.351 + return;
1.352 + }
1.353 + for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
1.354 + c = zIn[i];
1.355 + if( c>='A' && c<='Z' ){
1.356 + zReverse[j] = c + 'a' - 'A';
1.357 + }else if( c>='a' && c<='z' ){
1.358 + zReverse[j] = c;
1.359 + }else{
1.360 + /* The use of a character not in [a-zA-Z] means that we fallback
1.361 + ** to the copy stemmer */
1.362 + copy_stemmer(zIn, nIn, zOut, pnOut);
1.363 + return;
1.364 + }
1.365 + }
1.366 + memset(&zReverse[sizeof(zReverse)-5], 0, 5);
1.367 + z = &zReverse[j+1];
1.368 +
1.369 +
1.370 + /* Step 1a */
1.371 + if( z[0]=='s' ){
1.372 + if(
1.373 + !stem(&z, "sess", "ss", 0) &&
1.374 + !stem(&z, "sei", "i", 0) &&
1.375 + !stem(&z, "ss", "ss", 0)
1.376 + ){
1.377 + z++;
1.378 + }
1.379 + }
1.380 +
1.381 + /* Step 1b */
1.382 + z2 = z;
1.383 + if( stem(&z, "dee", "ee", m_gt_0) ){
1.384 + /* Do nothing. The work was all in the test */
1.385 + }else if(
1.386 + (stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
1.387 + && z!=z2
1.388 + ){
1.389 + if( stem(&z, "ta", "ate", 0) ||
1.390 + stem(&z, "lb", "ble", 0) ||
1.391 + stem(&z, "zi", "ize", 0) ){
1.392 + /* Do nothing. The work was all in the test */
1.393 + }else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
1.394 + z++;
1.395 + }else if( m_eq_1(z) && star_oh(z) ){
1.396 + *(--z) = 'e';
1.397 + }
1.398 + }
1.399 +
1.400 + /* Step 1c */
1.401 + if( z[0]=='y' && hasVowel(z+1) ){
1.402 + z[0] = 'i';
1.403 + }
1.404 +
1.405 + /* Step 2 */
1.406 + switch( z[1] ){
1.407 + case 'a':
1.408 + stem(&z, "lanoita", "ate", m_gt_0) ||
1.409 + stem(&z, "lanoit", "tion", m_gt_0);
1.410 + break;
1.411 + case 'c':
1.412 + stem(&z, "icne", "ence", m_gt_0) ||
1.413 + stem(&z, "icna", "ance", m_gt_0);
1.414 + break;
1.415 + case 'e':
1.416 + stem(&z, "rezi", "ize", m_gt_0);
1.417 + break;
1.418 + case 'g':
1.419 + stem(&z, "igol", "log", m_gt_0);
1.420 + break;
1.421 + case 'l':
1.422 + stem(&z, "ilb", "ble", m_gt_0) ||
1.423 + stem(&z, "illa", "al", m_gt_0) ||
1.424 + stem(&z, "iltne", "ent", m_gt_0) ||
1.425 + stem(&z, "ile", "e", m_gt_0) ||
1.426 + stem(&z, "ilsuo", "ous", m_gt_0);
1.427 + break;
1.428 + case 'o':
1.429 + stem(&z, "noitazi", "ize", m_gt_0) ||
1.430 + stem(&z, "noita", "ate", m_gt_0) ||
1.431 + stem(&z, "rota", "ate", m_gt_0);
1.432 + break;
1.433 + case 's':
1.434 + stem(&z, "msila", "al", m_gt_0) ||
1.435 + stem(&z, "ssenevi", "ive", m_gt_0) ||
1.436 + stem(&z, "ssenluf", "ful", m_gt_0) ||
1.437 + stem(&z, "ssensuo", "ous", m_gt_0);
1.438 + break;
1.439 + case 't':
1.440 + stem(&z, "itila", "al", m_gt_0) ||
1.441 + stem(&z, "itivi", "ive", m_gt_0) ||
1.442 + stem(&z, "itilib", "ble", m_gt_0);
1.443 + break;
1.444 + }
1.445 +
1.446 + /* Step 3 */
1.447 + switch( z[0] ){
1.448 + case 'e':
1.449 + stem(&z, "etaci", "ic", m_gt_0) ||
1.450 + stem(&z, "evita", "", m_gt_0) ||
1.451 + stem(&z, "ezila", "al", m_gt_0);
1.452 + break;
1.453 + case 'i':
1.454 + stem(&z, "itici", "ic", m_gt_0);
1.455 + break;
1.456 + case 'l':
1.457 + stem(&z, "laci", "ic", m_gt_0) ||
1.458 + stem(&z, "luf", "", m_gt_0);
1.459 + break;
1.460 + case 's':
1.461 + stem(&z, "ssen", "", m_gt_0);
1.462 + break;
1.463 + }
1.464 +
1.465 + /* Step 4 */
1.466 + switch( z[1] ){
1.467 + case 'a':
1.468 + if( z[0]=='l' && m_gt_1(z+2) ){
1.469 + z += 2;
1.470 + }
1.471 + break;
1.472 + case 'c':
1.473 + if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
1.474 + z += 4;
1.475 + }
1.476 + break;
1.477 + case 'e':
1.478 + if( z[0]=='r' && m_gt_1(z+2) ){
1.479 + z += 2;
1.480 + }
1.481 + break;
1.482 + case 'i':
1.483 + if( z[0]=='c' && m_gt_1(z+2) ){
1.484 + z += 2;
1.485 + }
1.486 + break;
1.487 + case 'l':
1.488 + if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
1.489 + z += 4;
1.490 + }
1.491 + break;
1.492 + case 'n':
1.493 + if( z[0]=='t' ){
1.494 + if( z[2]=='a' ){
1.495 + if( m_gt_1(z+3) ){
1.496 + z += 3;
1.497 + }
1.498 + }else if( z[2]=='e' ){
1.499 + stem(&z, "tneme", "", m_gt_1) ||
1.500 + stem(&z, "tnem", "", m_gt_1) ||
1.501 + stem(&z, "tne", "", m_gt_1);
1.502 + }
1.503 + }
1.504 + break;
1.505 + case 'o':
1.506 + if( z[0]=='u' ){
1.507 + if( m_gt_1(z+2) ){
1.508 + z += 2;
1.509 + }
1.510 + }else if( z[3]=='s' || z[3]=='t' ){
1.511 + stem(&z, "noi", "", m_gt_1);
1.512 + }
1.513 + break;
1.514 + case 's':
1.515 + if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
1.516 + z += 3;
1.517 + }
1.518 + break;
1.519 + case 't':
1.520 + stem(&z, "eta", "", m_gt_1) ||
1.521 + stem(&z, "iti", "", m_gt_1);
1.522 + break;
1.523 + case 'u':
1.524 + if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
1.525 + z += 3;
1.526 + }
1.527 + break;
1.528 + case 'v':
1.529 + case 'z':
1.530 + if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
1.531 + z += 3;
1.532 + }
1.533 + break;
1.534 + }
1.535 +
1.536 + /* Step 5a */
1.537 + if( z[0]=='e' ){
1.538 + if( m_gt_1(z+1) ){
1.539 + z++;
1.540 + }else if( m_eq_1(z+1) && !star_oh(z+1) ){
1.541 + z++;
1.542 + }
1.543 + }
1.544 +
1.545 + /* Step 5b */
1.546 + if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
1.547 + z++;
1.548 + }
1.549 +
1.550 + /* z[] is now the stemmed word in reverse order. Flip it back
1.551 + ** around into forward order and return.
1.552 + */
1.553 + *pnOut = i = strlen(z);
1.554 + zOut[i] = 0;
1.555 + while( *z ){
1.556 + zOut[--i] = *(z++);
1.557 + }
1.558 +}
1.559 +
1.560 +/*
1.561 +** Characters that can be part of a token. We assume any character
1.562 +** whose value is greater than 0x80 (any UTF character) can be
1.563 +** part of a token. In other words, delimiters all must have
1.564 +** values of 0x7f or lower.
1.565 +*/
1.566 +static const char isIdChar[] = {
1.567 +/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
1.568 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
1.569 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
1.570 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
1.571 + 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
1.572 + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
1.573 +};
1.574 +#define idChar(C) (((ch=C)&0x80)!=0 || (ch>0x2f && isIdChar[ch-0x30]))
1.575 +#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !isIdChar[ch-0x30]))
1.576 +
1.577 +/*
1.578 +** Extract the next token from a tokenization cursor. The cursor must
1.579 +** have been opened by a prior call to porterOpen().
1.580 +*/
1.581 +static int porterNext(
1.582 + sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
1.583 + const char **pzToken, /* OUT: *pzToken is the token text */
1.584 + int *pnBytes, /* OUT: Number of bytes in token */
1.585 + int *piStartOffset, /* OUT: Starting offset of token */
1.586 + int *piEndOffset, /* OUT: Ending offset of token */
1.587 + int *piPosition /* OUT: Position integer of token */
1.588 +){
1.589 + porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
1.590 + const char *z = c->zInput;
1.591 +
1.592 + while( c->iOffset<c->nInput ){
1.593 + int iStartOffset, ch;
1.594 +
1.595 + /* Scan past delimiter characters */
1.596 + while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
1.597 + c->iOffset++;
1.598 + }
1.599 +
1.600 + /* Count non-delimiter characters. */
1.601 + iStartOffset = c->iOffset;
1.602 + while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
1.603 + c->iOffset++;
1.604 + }
1.605 +
1.606 + if( c->iOffset>iStartOffset ){
1.607 + int n = c->iOffset-iStartOffset;
1.608 + if( n>c->nAllocated ){
1.609 + c->nAllocated = n+20;
1.610 + c->zToken = realloc(c->zToken, c->nAllocated);
1.611 + if( c->zToken==NULL ) return SQLITE_NOMEM;
1.612 + }
1.613 + porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
1.614 + *pzToken = c->zToken;
1.615 + *piStartOffset = iStartOffset;
1.616 + *piEndOffset = c->iOffset;
1.617 + *piPosition = c->iToken++;
1.618 + return SQLITE_OK;
1.619 + }
1.620 + }
1.621 + return SQLITE_DONE;
1.622 +}
1.623 +
1.624 +/*
1.625 +** The set of routines that implement the porter-stemmer tokenizer
1.626 +*/
1.627 +static const sqlite3_tokenizer_module porterTokenizerModule = {
1.628 + 0,
1.629 + porterCreate,
1.630 + porterDestroy,
1.631 + porterOpen,
1.632 + porterClose,
1.633 + porterNext,
1.634 +};
1.635 +
1.636 +/*
1.637 +** Allocate a new porter tokenizer. Return a pointer to the new
1.638 +** tokenizer in *ppModule
1.639 +*/
1.640 +void sqlite3Fts1PorterTokenizerModule(
1.641 + sqlite3_tokenizer_module const**ppModule
1.642 +){
1.643 + *ppModule = &porterTokenizerModule;
1.644 +}
1.645 +
1.646 +#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */