sl@0
|
1 |
/*
|
sl@0
|
2 |
** 2006 September 30
|
sl@0
|
3 |
**
|
sl@0
|
4 |
** The author disclaims copyright to this source code. In place of
|
sl@0
|
5 |
** a legal notice, here is a blessing:
|
sl@0
|
6 |
**
|
sl@0
|
7 |
** May you do good and not evil.
|
sl@0
|
8 |
** May you find forgiveness for yourself and forgive others.
|
sl@0
|
9 |
** May you share freely, never taking more than you give.
|
sl@0
|
10 |
**
|
sl@0
|
11 |
*************************************************************************
|
sl@0
|
12 |
** Implementation of the full-text-search tokenizer that implements
|
sl@0
|
13 |
** a Porter stemmer.
|
sl@0
|
14 |
*/
|
sl@0
|
15 |
|
sl@0
|
16 |
/*
|
sl@0
|
17 |
** The code in this file is only compiled if:
|
sl@0
|
18 |
**
|
sl@0
|
19 |
** * The FTS3 module is being built as an extension
|
sl@0
|
20 |
** (in which case SQLITE_CORE is not defined), or
|
sl@0
|
21 |
**
|
sl@0
|
22 |
** * The FTS3 module is being built into the core of
|
sl@0
|
23 |
** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
|
sl@0
|
24 |
*/
|
sl@0
|
25 |
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
|
sl@0
|
26 |
|
sl@0
|
27 |
|
sl@0
|
28 |
#include <assert.h>
|
sl@0
|
29 |
#include <stdlib.h>
|
sl@0
|
30 |
#include <stdio.h>
|
sl@0
|
31 |
#include <string.h>
|
sl@0
|
32 |
#include <ctype.h>
|
sl@0
|
33 |
|
sl@0
|
34 |
#include "fts3_tokenizer.h"
|
sl@0
|
35 |
|
sl@0
|
36 |
/*
|
sl@0
|
37 |
** Class derived from sqlite3_tokenizer
|
sl@0
|
38 |
*/
|
sl@0
|
39 |
typedef struct porter_tokenizer {
|
sl@0
|
40 |
sqlite3_tokenizer base; /* Base class */
|
sl@0
|
41 |
} porter_tokenizer;
|
sl@0
|
42 |
|
sl@0
|
43 |
/*
|
sl@0
|
44 |
** Class derived from sqlit3_tokenizer_cursor
|
sl@0
|
45 |
*/
|
sl@0
|
46 |
typedef struct porter_tokenizer_cursor {
|
sl@0
|
47 |
sqlite3_tokenizer_cursor base;
|
sl@0
|
48 |
const char *zInput; /* input we are tokenizing */
|
sl@0
|
49 |
int nInput; /* size of the input */
|
sl@0
|
50 |
int iOffset; /* current position in zInput */
|
sl@0
|
51 |
int iToken; /* index of next token to be returned */
|
sl@0
|
52 |
char *zToken; /* storage for current token */
|
sl@0
|
53 |
int nAllocated; /* space allocated to zToken buffer */
|
sl@0
|
54 |
} porter_tokenizer_cursor;
|
sl@0
|
55 |
|
sl@0
|
56 |
|
sl@0
|
57 |
/* Forward declaration */
|
sl@0
|
58 |
static const sqlite3_tokenizer_module porterTokenizerModule;
|
sl@0
|
59 |
|
sl@0
|
60 |
|
sl@0
|
61 |
/*
|
sl@0
|
62 |
** Create a new tokenizer instance.
|
sl@0
|
63 |
*/
|
sl@0
|
64 |
static int porterCreate(
|
sl@0
|
65 |
int argc, const char * const *argv,
|
sl@0
|
66 |
sqlite3_tokenizer **ppTokenizer
|
sl@0
|
67 |
){
|
sl@0
|
68 |
porter_tokenizer *t;
|
sl@0
|
69 |
t = (porter_tokenizer *) sqlite3_malloc(sizeof(*t));
|
sl@0
|
70 |
if( t==NULL ) return SQLITE_NOMEM;
|
sl@0
|
71 |
memset(t, 0, sizeof(*t));
|
sl@0
|
72 |
*ppTokenizer = &t->base;
|
sl@0
|
73 |
return SQLITE_OK;
|
sl@0
|
74 |
}
|
sl@0
|
75 |
|
sl@0
|
76 |
/*
|
sl@0
|
77 |
** Destroy a tokenizer
|
sl@0
|
78 |
*/
|
sl@0
|
79 |
static int porterDestroy(sqlite3_tokenizer *pTokenizer){
|
sl@0
|
80 |
sqlite3_free(pTokenizer);
|
sl@0
|
81 |
return SQLITE_OK;
|
sl@0
|
82 |
}
|
sl@0
|
83 |
|
sl@0
|
84 |
/*
|
sl@0
|
85 |
** Prepare to begin tokenizing a particular string. The input
|
sl@0
|
86 |
** string to be tokenized is zInput[0..nInput-1]. A cursor
|
sl@0
|
87 |
** used to incrementally tokenize this string is returned in
|
sl@0
|
88 |
** *ppCursor.
|
sl@0
|
89 |
*/
|
sl@0
|
90 |
static int porterOpen(
|
sl@0
|
91 |
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
|
sl@0
|
92 |
const char *zInput, int nInput, /* String to be tokenized */
|
sl@0
|
93 |
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
|
sl@0
|
94 |
){
|
sl@0
|
95 |
porter_tokenizer_cursor *c;
|
sl@0
|
96 |
|
sl@0
|
97 |
c = (porter_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
|
sl@0
|
98 |
if( c==NULL ) return SQLITE_NOMEM;
|
sl@0
|
99 |
|
sl@0
|
100 |
c->zInput = zInput;
|
sl@0
|
101 |
if( zInput==0 ){
|
sl@0
|
102 |
c->nInput = 0;
|
sl@0
|
103 |
}else if( nInput<0 ){
|
sl@0
|
104 |
c->nInput = (int)strlen(zInput);
|
sl@0
|
105 |
}else{
|
sl@0
|
106 |
c->nInput = nInput;
|
sl@0
|
107 |
}
|
sl@0
|
108 |
c->iOffset = 0; /* start tokenizing at the beginning */
|
sl@0
|
109 |
c->iToken = 0;
|
sl@0
|
110 |
c->zToken = NULL; /* no space allocated, yet. */
|
sl@0
|
111 |
c->nAllocated = 0;
|
sl@0
|
112 |
|
sl@0
|
113 |
*ppCursor = &c->base;
|
sl@0
|
114 |
return SQLITE_OK;
|
sl@0
|
115 |
}
|
sl@0
|
116 |
|
sl@0
|
117 |
/*
|
sl@0
|
118 |
** Close a tokenization cursor previously opened by a call to
|
sl@0
|
119 |
** porterOpen() above.
|
sl@0
|
120 |
*/
|
sl@0
|
121 |
static int porterClose(sqlite3_tokenizer_cursor *pCursor){
|
sl@0
|
122 |
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
|
sl@0
|
123 |
sqlite3_free(c->zToken);
|
sl@0
|
124 |
sqlite3_free(c);
|
sl@0
|
125 |
return SQLITE_OK;
|
sl@0
|
126 |
}
|
sl@0
|
127 |
/*
|
sl@0
|
128 |
** Vowel or consonant
|
sl@0
|
129 |
*/
|
sl@0
|
130 |
static const char cType[] = {
|
sl@0
|
131 |
0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0,
|
sl@0
|
132 |
1, 1, 1, 2, 1
|
sl@0
|
133 |
};
|
sl@0
|
134 |
|
sl@0
|
135 |
/*
|
sl@0
|
136 |
** isConsonant() and isVowel() determine if their first character in
|
sl@0
|
137 |
** the string they point to is a consonant or a vowel, according
|
sl@0
|
138 |
** to Porter ruls.
|
sl@0
|
139 |
**
|
sl@0
|
140 |
** A consonate is any letter other than 'a', 'e', 'i', 'o', or 'u'.
|
sl@0
|
141 |
** 'Y' is a consonant unless it follows another consonant,
|
sl@0
|
142 |
** in which case it is a vowel.
|
sl@0
|
143 |
**
|
sl@0
|
144 |
** In these routine, the letters are in reverse order. So the 'y' rule
|
sl@0
|
145 |
** is that 'y' is a consonant unless it is followed by another
|
sl@0
|
146 |
** consonent.
|
sl@0
|
147 |
*/
|
sl@0
|
148 |
static int isVowel(const char*);
|
sl@0
|
149 |
static int isConsonant(const char *z){
|
sl@0
|
150 |
int j;
|
sl@0
|
151 |
char x = *z;
|
sl@0
|
152 |
if( x==0 ) return 0;
|
sl@0
|
153 |
assert( x>='a' && x<='z' );
|
sl@0
|
154 |
j = cType[x-'a'];
|
sl@0
|
155 |
if( j<2 ) return j;
|
sl@0
|
156 |
return z[1]==0 || isVowel(z + 1);
|
sl@0
|
157 |
}
|
sl@0
|
158 |
static int isVowel(const char *z){
|
sl@0
|
159 |
int j;
|
sl@0
|
160 |
char x = *z;
|
sl@0
|
161 |
if( x==0 ) return 0;
|
sl@0
|
162 |
assert( x>='a' && x<='z' );
|
sl@0
|
163 |
j = cType[x-'a'];
|
sl@0
|
164 |
if( j<2 ) return 1-j;
|
sl@0
|
165 |
return isConsonant(z + 1);
|
sl@0
|
166 |
}
|
sl@0
|
167 |
|
sl@0
|
168 |
/*
|
sl@0
|
169 |
** Let any sequence of one or more vowels be represented by V and let
|
sl@0
|
170 |
** C be sequence of one or more consonants. Then every word can be
|
sl@0
|
171 |
** represented as:
|
sl@0
|
172 |
**
|
sl@0
|
173 |
** [C] (VC){m} [V]
|
sl@0
|
174 |
**
|
sl@0
|
175 |
** In prose: A word is an optional consonant followed by zero or
|
sl@0
|
176 |
** vowel-consonant pairs followed by an optional vowel. "m" is the
|
sl@0
|
177 |
** number of vowel consonant pairs. This routine computes the value
|
sl@0
|
178 |
** of m for the first i bytes of a word.
|
sl@0
|
179 |
**
|
sl@0
|
180 |
** Return true if the m-value for z is 1 or more. In other words,
|
sl@0
|
181 |
** return true if z contains at least one vowel that is followed
|
sl@0
|
182 |
** by a consonant.
|
sl@0
|
183 |
**
|
sl@0
|
184 |
** In this routine z[] is in reverse order. So we are really looking
|
sl@0
|
185 |
** for an instance of of a consonant followed by a vowel.
|
sl@0
|
186 |
*/
|
sl@0
|
187 |
static int m_gt_0(const char *z){
|
sl@0
|
188 |
while( isVowel(z) ){ z++; }
|
sl@0
|
189 |
if( *z==0 ) return 0;
|
sl@0
|
190 |
while( isConsonant(z) ){ z++; }
|
sl@0
|
191 |
return *z!=0;
|
sl@0
|
192 |
}
|
sl@0
|
193 |
|
sl@0
|
194 |
/* Like mgt0 above except we are looking for a value of m which is
|
sl@0
|
195 |
** exactly 1
|
sl@0
|
196 |
*/
|
sl@0
|
197 |
static int m_eq_1(const char *z){
|
sl@0
|
198 |
while( isVowel(z) ){ z++; }
|
sl@0
|
199 |
if( *z==0 ) return 0;
|
sl@0
|
200 |
while( isConsonant(z) ){ z++; }
|
sl@0
|
201 |
if( *z==0 ) return 0;
|
sl@0
|
202 |
while( isVowel(z) ){ z++; }
|
sl@0
|
203 |
if( *z==0 ) return 1;
|
sl@0
|
204 |
while( isConsonant(z) ){ z++; }
|
sl@0
|
205 |
return *z==0;
|
sl@0
|
206 |
}
|
sl@0
|
207 |
|
sl@0
|
208 |
/* Like mgt0 above except we are looking for a value of m>1 instead
|
sl@0
|
209 |
** or m>0
|
sl@0
|
210 |
*/
|
sl@0
|
211 |
static int m_gt_1(const char *z){
|
sl@0
|
212 |
while( isVowel(z) ){ z++; }
|
sl@0
|
213 |
if( *z==0 ) return 0;
|
sl@0
|
214 |
while( isConsonant(z) ){ z++; }
|
sl@0
|
215 |
if( *z==0 ) return 0;
|
sl@0
|
216 |
while( isVowel(z) ){ z++; }
|
sl@0
|
217 |
if( *z==0 ) return 0;
|
sl@0
|
218 |
while( isConsonant(z) ){ z++; }
|
sl@0
|
219 |
return *z!=0;
|
sl@0
|
220 |
}
|
sl@0
|
221 |
|
sl@0
|
222 |
/*
|
sl@0
|
223 |
** Return TRUE if there is a vowel anywhere within z[0..n-1]
|
sl@0
|
224 |
*/
|
sl@0
|
225 |
static int hasVowel(const char *z){
|
sl@0
|
226 |
while( isConsonant(z) ){ z++; }
|
sl@0
|
227 |
return *z!=0;
|
sl@0
|
228 |
}
|
sl@0
|
229 |
|
sl@0
|
230 |
/*
|
sl@0
|
231 |
** Return TRUE if the word ends in a double consonant.
|
sl@0
|
232 |
**
|
sl@0
|
233 |
** The text is reversed here. So we are really looking at
|
sl@0
|
234 |
** the first two characters of z[].
|
sl@0
|
235 |
*/
|
sl@0
|
236 |
static int doubleConsonant(const char *z){
|
sl@0
|
237 |
return isConsonant(z) && z[0]==z[1] && isConsonant(z+1);
|
sl@0
|
238 |
}
|
sl@0
|
239 |
|
sl@0
|
240 |
/*
|
sl@0
|
241 |
** Return TRUE if the word ends with three letters which
|
sl@0
|
242 |
** are consonant-vowel-consonent and where the final consonant
|
sl@0
|
243 |
** is not 'w', 'x', or 'y'.
|
sl@0
|
244 |
**
|
sl@0
|
245 |
** The word is reversed here. So we are really checking the
|
sl@0
|
246 |
** first three letters and the first one cannot be in [wxy].
|
sl@0
|
247 |
*/
|
sl@0
|
248 |
static int star_oh(const char *z){
|
sl@0
|
249 |
return
|
sl@0
|
250 |
z[0]!=0 && isConsonant(z) &&
|
sl@0
|
251 |
z[0]!='w' && z[0]!='x' && z[0]!='y' &&
|
sl@0
|
252 |
z[1]!=0 && isVowel(z+1) &&
|
sl@0
|
253 |
z[2]!=0 && isConsonant(z+2);
|
sl@0
|
254 |
}
|
sl@0
|
255 |
|
sl@0
|
256 |
/*
|
sl@0
|
257 |
** If the word ends with zFrom and xCond() is true for the stem
|
sl@0
|
258 |
** of the word that preceeds the zFrom ending, then change the
|
sl@0
|
259 |
** ending to zTo.
|
sl@0
|
260 |
**
|
sl@0
|
261 |
** The input word *pz and zFrom are both in reverse order. zTo
|
sl@0
|
262 |
** is in normal order.
|
sl@0
|
263 |
**
|
sl@0
|
264 |
** Return TRUE if zFrom matches. Return FALSE if zFrom does not
|
sl@0
|
265 |
** match. Not that TRUE is returned even if xCond() fails and
|
sl@0
|
266 |
** no substitution occurs.
|
sl@0
|
267 |
*/
|
sl@0
|
268 |
static int stem(
|
sl@0
|
269 |
char **pz, /* The word being stemmed (Reversed) */
|
sl@0
|
270 |
const char *zFrom, /* If the ending matches this... (Reversed) */
|
sl@0
|
271 |
const char *zTo, /* ... change the ending to this (not reversed) */
|
sl@0
|
272 |
int (*xCond)(const char*) /* Condition that must be true */
|
sl@0
|
273 |
){
|
sl@0
|
274 |
char *z = *pz;
|
sl@0
|
275 |
while( *zFrom && *zFrom==*z ){ z++; zFrom++; }
|
sl@0
|
276 |
if( *zFrom!=0 ) return 0;
|
sl@0
|
277 |
if( xCond && !xCond(z) ) return 1;
|
sl@0
|
278 |
while( *zTo ){
|
sl@0
|
279 |
*(--z) = *(zTo++);
|
sl@0
|
280 |
}
|
sl@0
|
281 |
*pz = z;
|
sl@0
|
282 |
return 1;
|
sl@0
|
283 |
}
|
sl@0
|
284 |
|
sl@0
|
285 |
/*
|
sl@0
|
286 |
** This is the fallback stemmer used when the porter stemmer is
|
sl@0
|
287 |
** inappropriate. The input word is copied into the output with
|
sl@0
|
288 |
** US-ASCII case folding. If the input word is too long (more
|
sl@0
|
289 |
** than 20 bytes if it contains no digits or more than 6 bytes if
|
sl@0
|
290 |
** it contains digits) then word is truncated to 20 or 6 bytes
|
sl@0
|
291 |
** by taking 10 or 3 bytes from the beginning and end.
|
sl@0
|
292 |
*/
|
sl@0
|
293 |
static void copy_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
|
sl@0
|
294 |
int i, mx, j;
|
sl@0
|
295 |
int hasDigit = 0;
|
sl@0
|
296 |
for(i=0; i<nIn; i++){
|
sl@0
|
297 |
int c = zIn[i];
|
sl@0
|
298 |
if( c>='A' && c<='Z' ){
|
sl@0
|
299 |
zOut[i] = c - 'A' + 'a';
|
sl@0
|
300 |
}else{
|
sl@0
|
301 |
if( c>='0' && c<='9' ) hasDigit = 1;
|
sl@0
|
302 |
zOut[i] = c;
|
sl@0
|
303 |
}
|
sl@0
|
304 |
}
|
sl@0
|
305 |
mx = hasDigit ? 3 : 10;
|
sl@0
|
306 |
if( nIn>mx*2 ){
|
sl@0
|
307 |
for(j=mx, i=nIn-mx; i<nIn; i++, j++){
|
sl@0
|
308 |
zOut[j] = zOut[i];
|
sl@0
|
309 |
}
|
sl@0
|
310 |
i = j;
|
sl@0
|
311 |
}
|
sl@0
|
312 |
zOut[i] = 0;
|
sl@0
|
313 |
*pnOut = i;
|
sl@0
|
314 |
}
|
sl@0
|
315 |
|
sl@0
|
316 |
|
sl@0
|
317 |
/*
|
sl@0
|
318 |
** Stem the input word zIn[0..nIn-1]. Store the output in zOut.
|
sl@0
|
319 |
** zOut is at least big enough to hold nIn bytes. Write the actual
|
sl@0
|
320 |
** size of the output word (exclusive of the '\0' terminator) into *pnOut.
|
sl@0
|
321 |
**
|
sl@0
|
322 |
** Any upper-case characters in the US-ASCII character set ([A-Z])
|
sl@0
|
323 |
** are converted to lower case. Upper-case UTF characters are
|
sl@0
|
324 |
** unchanged.
|
sl@0
|
325 |
**
|
sl@0
|
326 |
** Words that are longer than about 20 bytes are stemmed by retaining
|
sl@0
|
327 |
** a few bytes from the beginning and the end of the word. If the
|
sl@0
|
328 |
** word contains digits, 3 bytes are taken from the beginning and
|
sl@0
|
329 |
** 3 bytes from the end. For long words without digits, 10 bytes
|
sl@0
|
330 |
** are taken from each end. US-ASCII case folding still applies.
|
sl@0
|
331 |
**
|
sl@0
|
332 |
** If the input word contains not digits but does characters not
|
sl@0
|
333 |
** in [a-zA-Z] then no stemming is attempted and this routine just
|
sl@0
|
334 |
** copies the input into the input into the output with US-ASCII
|
sl@0
|
335 |
** case folding.
|
sl@0
|
336 |
**
|
sl@0
|
337 |
** Stemming never increases the length of the word. So there is
|
sl@0
|
338 |
** no chance of overflowing the zOut buffer.
|
sl@0
|
339 |
*/
|
sl@0
|
340 |
static void porter_stemmer(const char *zIn, int nIn, char *zOut, int *pnOut){
|
sl@0
|
341 |
int i, j, c;
|
sl@0
|
342 |
char zReverse[28];
|
sl@0
|
343 |
char *z, *z2;
|
sl@0
|
344 |
if( nIn<3 || nIn>=sizeof(zReverse)-7 ){
|
sl@0
|
345 |
/* The word is too big or too small for the porter stemmer.
|
sl@0
|
346 |
** Fallback to the copy stemmer */
|
sl@0
|
347 |
copy_stemmer(zIn, nIn, zOut, pnOut);
|
sl@0
|
348 |
return;
|
sl@0
|
349 |
}
|
sl@0
|
350 |
for(i=0, j=sizeof(zReverse)-6; i<nIn; i++, j--){
|
sl@0
|
351 |
c = zIn[i];
|
sl@0
|
352 |
if( c>='A' && c<='Z' ){
|
sl@0
|
353 |
zReverse[j] = c + 'a' - 'A';
|
sl@0
|
354 |
}else if( c>='a' && c<='z' ){
|
sl@0
|
355 |
zReverse[j] = c;
|
sl@0
|
356 |
}else{
|
sl@0
|
357 |
/* The use of a character not in [a-zA-Z] means that we fallback
|
sl@0
|
358 |
** to the copy stemmer */
|
sl@0
|
359 |
copy_stemmer(zIn, nIn, zOut, pnOut);
|
sl@0
|
360 |
return;
|
sl@0
|
361 |
}
|
sl@0
|
362 |
}
|
sl@0
|
363 |
memset(&zReverse[sizeof(zReverse)-5], 0, 5);
|
sl@0
|
364 |
z = &zReverse[j+1];
|
sl@0
|
365 |
|
sl@0
|
366 |
|
sl@0
|
367 |
/* Step 1a */
|
sl@0
|
368 |
if( z[0]=='s' ){
|
sl@0
|
369 |
if(
|
sl@0
|
370 |
!stem(&z, "sess", "ss", 0) &&
|
sl@0
|
371 |
!stem(&z, "sei", "i", 0) &&
|
sl@0
|
372 |
!stem(&z, "ss", "ss", 0)
|
sl@0
|
373 |
){
|
sl@0
|
374 |
z++;
|
sl@0
|
375 |
}
|
sl@0
|
376 |
}
|
sl@0
|
377 |
|
sl@0
|
378 |
/* Step 1b */
|
sl@0
|
379 |
z2 = z;
|
sl@0
|
380 |
if( stem(&z, "dee", "ee", m_gt_0) ){
|
sl@0
|
381 |
/* Do nothing. The work was all in the test */
|
sl@0
|
382 |
}else if(
|
sl@0
|
383 |
(stem(&z, "gni", "", hasVowel) || stem(&z, "de", "", hasVowel))
|
sl@0
|
384 |
&& z!=z2
|
sl@0
|
385 |
){
|
sl@0
|
386 |
if( stem(&z, "ta", "ate", 0) ||
|
sl@0
|
387 |
stem(&z, "lb", "ble", 0) ||
|
sl@0
|
388 |
stem(&z, "zi", "ize", 0) ){
|
sl@0
|
389 |
/* Do nothing. The work was all in the test */
|
sl@0
|
390 |
}else if( doubleConsonant(z) && (*z!='l' && *z!='s' && *z!='z') ){
|
sl@0
|
391 |
z++;
|
sl@0
|
392 |
}else if( m_eq_1(z) && star_oh(z) ){
|
sl@0
|
393 |
*(--z) = 'e';
|
sl@0
|
394 |
}
|
sl@0
|
395 |
}
|
sl@0
|
396 |
|
sl@0
|
397 |
/* Step 1c */
|
sl@0
|
398 |
if( z[0]=='y' && hasVowel(z+1) ){
|
sl@0
|
399 |
z[0] = 'i';
|
sl@0
|
400 |
}
|
sl@0
|
401 |
|
sl@0
|
402 |
/* Step 2 */
|
sl@0
|
403 |
switch( z[1] ){
|
sl@0
|
404 |
case 'a':
|
sl@0
|
405 |
stem(&z, "lanoita", "ate", m_gt_0) ||
|
sl@0
|
406 |
stem(&z, "lanoit", "tion", m_gt_0);
|
sl@0
|
407 |
break;
|
sl@0
|
408 |
case 'c':
|
sl@0
|
409 |
stem(&z, "icne", "ence", m_gt_0) ||
|
sl@0
|
410 |
stem(&z, "icna", "ance", m_gt_0);
|
sl@0
|
411 |
break;
|
sl@0
|
412 |
case 'e':
|
sl@0
|
413 |
stem(&z, "rezi", "ize", m_gt_0);
|
sl@0
|
414 |
break;
|
sl@0
|
415 |
case 'g':
|
sl@0
|
416 |
stem(&z, "igol", "log", m_gt_0);
|
sl@0
|
417 |
break;
|
sl@0
|
418 |
case 'l':
|
sl@0
|
419 |
stem(&z, "ilb", "ble", m_gt_0) ||
|
sl@0
|
420 |
stem(&z, "illa", "al", m_gt_0) ||
|
sl@0
|
421 |
stem(&z, "iltne", "ent", m_gt_0) ||
|
sl@0
|
422 |
stem(&z, "ile", "e", m_gt_0) ||
|
sl@0
|
423 |
stem(&z, "ilsuo", "ous", m_gt_0);
|
sl@0
|
424 |
break;
|
sl@0
|
425 |
case 'o':
|
sl@0
|
426 |
stem(&z, "noitazi", "ize", m_gt_0) ||
|
sl@0
|
427 |
stem(&z, "noita", "ate", m_gt_0) ||
|
sl@0
|
428 |
stem(&z, "rota", "ate", m_gt_0);
|
sl@0
|
429 |
break;
|
sl@0
|
430 |
case 's':
|
sl@0
|
431 |
stem(&z, "msila", "al", m_gt_0) ||
|
sl@0
|
432 |
stem(&z, "ssenevi", "ive", m_gt_0) ||
|
sl@0
|
433 |
stem(&z, "ssenluf", "ful", m_gt_0) ||
|
sl@0
|
434 |
stem(&z, "ssensuo", "ous", m_gt_0);
|
sl@0
|
435 |
break;
|
sl@0
|
436 |
case 't':
|
sl@0
|
437 |
stem(&z, "itila", "al", m_gt_0) ||
|
sl@0
|
438 |
stem(&z, "itivi", "ive", m_gt_0) ||
|
sl@0
|
439 |
stem(&z, "itilib", "ble", m_gt_0);
|
sl@0
|
440 |
break;
|
sl@0
|
441 |
}
|
sl@0
|
442 |
|
sl@0
|
443 |
/* Step 3 */
|
sl@0
|
444 |
switch( z[0] ){
|
sl@0
|
445 |
case 'e':
|
sl@0
|
446 |
stem(&z, "etaci", "ic", m_gt_0) ||
|
sl@0
|
447 |
stem(&z, "evita", "", m_gt_0) ||
|
sl@0
|
448 |
stem(&z, "ezila", "al", m_gt_0);
|
sl@0
|
449 |
break;
|
sl@0
|
450 |
case 'i':
|
sl@0
|
451 |
stem(&z, "itici", "ic", m_gt_0);
|
sl@0
|
452 |
break;
|
sl@0
|
453 |
case 'l':
|
sl@0
|
454 |
stem(&z, "laci", "ic", m_gt_0) ||
|
sl@0
|
455 |
stem(&z, "luf", "", m_gt_0);
|
sl@0
|
456 |
break;
|
sl@0
|
457 |
case 's':
|
sl@0
|
458 |
stem(&z, "ssen", "", m_gt_0);
|
sl@0
|
459 |
break;
|
sl@0
|
460 |
}
|
sl@0
|
461 |
|
sl@0
|
462 |
/* Step 4 */
|
sl@0
|
463 |
switch( z[1] ){
|
sl@0
|
464 |
case 'a':
|
sl@0
|
465 |
if( z[0]=='l' && m_gt_1(z+2) ){
|
sl@0
|
466 |
z += 2;
|
sl@0
|
467 |
}
|
sl@0
|
468 |
break;
|
sl@0
|
469 |
case 'c':
|
sl@0
|
470 |
if( z[0]=='e' && z[2]=='n' && (z[3]=='a' || z[3]=='e') && m_gt_1(z+4) ){
|
sl@0
|
471 |
z += 4;
|
sl@0
|
472 |
}
|
sl@0
|
473 |
break;
|
sl@0
|
474 |
case 'e':
|
sl@0
|
475 |
if( z[0]=='r' && m_gt_1(z+2) ){
|
sl@0
|
476 |
z += 2;
|
sl@0
|
477 |
}
|
sl@0
|
478 |
break;
|
sl@0
|
479 |
case 'i':
|
sl@0
|
480 |
if( z[0]=='c' && m_gt_1(z+2) ){
|
sl@0
|
481 |
z += 2;
|
sl@0
|
482 |
}
|
sl@0
|
483 |
break;
|
sl@0
|
484 |
case 'l':
|
sl@0
|
485 |
if( z[0]=='e' && z[2]=='b' && (z[3]=='a' || z[3]=='i') && m_gt_1(z+4) ){
|
sl@0
|
486 |
z += 4;
|
sl@0
|
487 |
}
|
sl@0
|
488 |
break;
|
sl@0
|
489 |
case 'n':
|
sl@0
|
490 |
if( z[0]=='t' ){
|
sl@0
|
491 |
if( z[2]=='a' ){
|
sl@0
|
492 |
if( m_gt_1(z+3) ){
|
sl@0
|
493 |
z += 3;
|
sl@0
|
494 |
}
|
sl@0
|
495 |
}else if( z[2]=='e' ){
|
sl@0
|
496 |
stem(&z, "tneme", "", m_gt_1) ||
|
sl@0
|
497 |
stem(&z, "tnem", "", m_gt_1) ||
|
sl@0
|
498 |
stem(&z, "tne", "", m_gt_1);
|
sl@0
|
499 |
}
|
sl@0
|
500 |
}
|
sl@0
|
501 |
break;
|
sl@0
|
502 |
case 'o':
|
sl@0
|
503 |
if( z[0]=='u' ){
|
sl@0
|
504 |
if( m_gt_1(z+2) ){
|
sl@0
|
505 |
z += 2;
|
sl@0
|
506 |
}
|
sl@0
|
507 |
}else if( z[3]=='s' || z[3]=='t' ){
|
sl@0
|
508 |
stem(&z, "noi", "", m_gt_1);
|
sl@0
|
509 |
}
|
sl@0
|
510 |
break;
|
sl@0
|
511 |
case 's':
|
sl@0
|
512 |
if( z[0]=='m' && z[2]=='i' && m_gt_1(z+3) ){
|
sl@0
|
513 |
z += 3;
|
sl@0
|
514 |
}
|
sl@0
|
515 |
break;
|
sl@0
|
516 |
case 't':
|
sl@0
|
517 |
stem(&z, "eta", "", m_gt_1) ||
|
sl@0
|
518 |
stem(&z, "iti", "", m_gt_1);
|
sl@0
|
519 |
break;
|
sl@0
|
520 |
case 'u':
|
sl@0
|
521 |
if( z[0]=='s' && z[2]=='o' && m_gt_1(z+3) ){
|
sl@0
|
522 |
z += 3;
|
sl@0
|
523 |
}
|
sl@0
|
524 |
break;
|
sl@0
|
525 |
case 'v':
|
sl@0
|
526 |
case 'z':
|
sl@0
|
527 |
if( z[0]=='e' && z[2]=='i' && m_gt_1(z+3) ){
|
sl@0
|
528 |
z += 3;
|
sl@0
|
529 |
}
|
sl@0
|
530 |
break;
|
sl@0
|
531 |
}
|
sl@0
|
532 |
|
sl@0
|
533 |
/* Step 5a */
|
sl@0
|
534 |
if( z[0]=='e' ){
|
sl@0
|
535 |
if( m_gt_1(z+1) ){
|
sl@0
|
536 |
z++;
|
sl@0
|
537 |
}else if( m_eq_1(z+1) && !star_oh(z+1) ){
|
sl@0
|
538 |
z++;
|
sl@0
|
539 |
}
|
sl@0
|
540 |
}
|
sl@0
|
541 |
|
sl@0
|
542 |
/* Step 5b */
|
sl@0
|
543 |
if( m_gt_1(z) && z[0]=='l' && z[1]=='l' ){
|
sl@0
|
544 |
z++;
|
sl@0
|
545 |
}
|
sl@0
|
546 |
|
sl@0
|
547 |
/* z[] is now the stemmed word in reverse order. Flip it back
|
sl@0
|
548 |
** around into forward order and return.
|
sl@0
|
549 |
*/
|
sl@0
|
550 |
*pnOut = i = strlen(z);
|
sl@0
|
551 |
zOut[i] = 0;
|
sl@0
|
552 |
while( *z ){
|
sl@0
|
553 |
zOut[--i] = *(z++);
|
sl@0
|
554 |
}
|
sl@0
|
555 |
}
|
sl@0
|
556 |
|
sl@0
|
557 |
/*
|
sl@0
|
558 |
** Characters that can be part of a token. We assume any character
|
sl@0
|
559 |
** whose value is greater than 0x80 (any UTF character) can be
|
sl@0
|
560 |
** part of a token. In other words, delimiters all must have
|
sl@0
|
561 |
** values of 0x7f or lower.
|
sl@0
|
562 |
*/
|
sl@0
|
563 |
static const char porterIdChar[] = {
|
sl@0
|
564 |
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
|
sl@0
|
565 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
|
sl@0
|
566 |
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
|
sl@0
|
567 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
|
sl@0
|
568 |
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
|
sl@0
|
569 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
|
sl@0
|
570 |
};
|
sl@0
|
571 |
#define isDelim(C) (((ch=C)&0x80)==0 && (ch<0x30 || !porterIdChar[ch-0x30]))
|
sl@0
|
572 |
|
sl@0
|
573 |
/*
|
sl@0
|
574 |
** Extract the next token from a tokenization cursor. The cursor must
|
sl@0
|
575 |
** have been opened by a prior call to porterOpen().
|
sl@0
|
576 |
*/
|
sl@0
|
577 |
static int porterNext(
|
sl@0
|
578 |
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by porterOpen */
|
sl@0
|
579 |
const char **pzToken, /* OUT: *pzToken is the token text */
|
sl@0
|
580 |
int *pnBytes, /* OUT: Number of bytes in token */
|
sl@0
|
581 |
int *piStartOffset, /* OUT: Starting offset of token */
|
sl@0
|
582 |
int *piEndOffset, /* OUT: Ending offset of token */
|
sl@0
|
583 |
int *piPosition /* OUT: Position integer of token */
|
sl@0
|
584 |
){
|
sl@0
|
585 |
porter_tokenizer_cursor *c = (porter_tokenizer_cursor *) pCursor;
|
sl@0
|
586 |
const char *z = c->zInput;
|
sl@0
|
587 |
|
sl@0
|
588 |
while( c->iOffset<c->nInput ){
|
sl@0
|
589 |
int iStartOffset, ch;
|
sl@0
|
590 |
|
sl@0
|
591 |
/* Scan past delimiter characters */
|
sl@0
|
592 |
while( c->iOffset<c->nInput && isDelim(z[c->iOffset]) ){
|
sl@0
|
593 |
c->iOffset++;
|
sl@0
|
594 |
}
|
sl@0
|
595 |
|
sl@0
|
596 |
/* Count non-delimiter characters. */
|
sl@0
|
597 |
iStartOffset = c->iOffset;
|
sl@0
|
598 |
while( c->iOffset<c->nInput && !isDelim(z[c->iOffset]) ){
|
sl@0
|
599 |
c->iOffset++;
|
sl@0
|
600 |
}
|
sl@0
|
601 |
|
sl@0
|
602 |
if( c->iOffset>iStartOffset ){
|
sl@0
|
603 |
int n = c->iOffset-iStartOffset;
|
sl@0
|
604 |
if( n>c->nAllocated ){
|
sl@0
|
605 |
c->nAllocated = n+20;
|
sl@0
|
606 |
c->zToken = sqlite3_realloc(c->zToken, c->nAllocated);
|
sl@0
|
607 |
if( c->zToken==NULL ) return SQLITE_NOMEM;
|
sl@0
|
608 |
}
|
sl@0
|
609 |
porter_stemmer(&z[iStartOffset], n, c->zToken, pnBytes);
|
sl@0
|
610 |
*pzToken = c->zToken;
|
sl@0
|
611 |
*piStartOffset = iStartOffset;
|
sl@0
|
612 |
*piEndOffset = c->iOffset;
|
sl@0
|
613 |
*piPosition = c->iToken++;
|
sl@0
|
614 |
return SQLITE_OK;
|
sl@0
|
615 |
}
|
sl@0
|
616 |
}
|
sl@0
|
617 |
return SQLITE_DONE;
|
sl@0
|
618 |
}
|
sl@0
|
619 |
|
sl@0
|
620 |
/*
|
sl@0
|
621 |
** The set of routines that implement the porter-stemmer tokenizer
|
sl@0
|
622 |
*/
|
sl@0
|
623 |
static const sqlite3_tokenizer_module porterTokenizerModule = {
|
sl@0
|
624 |
0,
|
sl@0
|
625 |
porterCreate,
|
sl@0
|
626 |
porterDestroy,
|
sl@0
|
627 |
porterOpen,
|
sl@0
|
628 |
porterClose,
|
sl@0
|
629 |
porterNext,
|
sl@0
|
630 |
};
|
sl@0
|
631 |
|
sl@0
|
632 |
/*
|
sl@0
|
633 |
** Allocate a new porter tokenizer. Return a pointer to the new
|
sl@0
|
634 |
** tokenizer in *ppModule
|
sl@0
|
635 |
*/
|
sl@0
|
636 |
void sqlite3Fts3PorterTokenizerModule(
|
sl@0
|
637 |
sqlite3_tokenizer_module const**ppModule
|
sl@0
|
638 |
){
|
sl@0
|
639 |
*ppModule = &porterTokenizerModule;
|
sl@0
|
640 |
}
|
sl@0
|
641 |
|
sl@0
|
642 |
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */
|