sl@0
|
1 |
/*
|
sl@0
|
2 |
** 2006 Oct 10
|
sl@0
|
3 |
**
|
sl@0
|
4 |
** The author disclaims copyright to this source code. In place of
|
sl@0
|
5 |
** a legal notice, here is a blessing:
|
sl@0
|
6 |
**
|
sl@0
|
7 |
** May you do good and not evil.
|
sl@0
|
8 |
** May you find forgiveness for yourself and forgive others.
|
sl@0
|
9 |
** May you share freely, never taking more than you give.
|
sl@0
|
10 |
**
|
sl@0
|
11 |
******************************************************************************
|
sl@0
|
12 |
**
|
sl@0
|
13 |
** Implementation of the "simple" full-text-search tokenizer.
|
sl@0
|
14 |
*/
|
sl@0
|
15 |
|
sl@0
|
16 |
/*
|
sl@0
|
17 |
** The code in this file is only compiled if:
|
sl@0
|
18 |
**
|
sl@0
|
19 |
** * The FTS2 module is being built as an extension
|
sl@0
|
20 |
** (in which case SQLITE_CORE is not defined), or
|
sl@0
|
21 |
**
|
sl@0
|
22 |
** * The FTS2 module is being built into the core of
|
sl@0
|
23 |
** SQLite (in which case SQLITE_ENABLE_FTS2 is defined).
|
sl@0
|
24 |
*/
|
sl@0
|
25 |
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2)
|
sl@0
|
26 |
|
sl@0
|
27 |
|
sl@0
|
28 |
#include <assert.h>
|
sl@0
|
29 |
#include <stdlib.h>
|
sl@0
|
30 |
#include <stdio.h>
|
sl@0
|
31 |
#include <string.h>
|
sl@0
|
32 |
#include <ctype.h>
|
sl@0
|
33 |
|
sl@0
|
34 |
#include "fts2_tokenizer.h"
|
sl@0
|
35 |
|
sl@0
|
36 |
typedef struct simple_tokenizer {
|
sl@0
|
37 |
sqlite3_tokenizer base;
|
sl@0
|
38 |
char delim[128]; /* flag ASCII delimiters */
|
sl@0
|
39 |
} simple_tokenizer;
|
sl@0
|
40 |
|
sl@0
|
41 |
typedef struct simple_tokenizer_cursor {
|
sl@0
|
42 |
sqlite3_tokenizer_cursor base;
|
sl@0
|
43 |
const char *pInput; /* input we are tokenizing */
|
sl@0
|
44 |
int nBytes; /* size of the input */
|
sl@0
|
45 |
int iOffset; /* current position in pInput */
|
sl@0
|
46 |
int iToken; /* index of next token to be returned */
|
sl@0
|
47 |
char *pToken; /* storage for current token */
|
sl@0
|
48 |
int nTokenAllocated; /* space allocated to zToken buffer */
|
sl@0
|
49 |
} simple_tokenizer_cursor;
|
sl@0
|
50 |
|
sl@0
|
51 |
|
sl@0
|
52 |
/* Forward declaration */
|
sl@0
|
53 |
/* Declared early so the method table can be named before its
** initializer, which follows the method implementations in this file. */
static const sqlite3_tokenizer_module simpleTokenizerModule;
|
sl@0
|
54 |
|
sl@0
|
55 |
/*
** Return 1 if byte c is a delimiter for tokenizer t, else 0.
** Bytes with the high bit set (>=0x80) are never delimiters.
*/
static int simpleDelim(simple_tokenizer *t, unsigned char c){
  if( c>=0x80 ){
    return 0;
  }
  return t->delim[c]!=0;
}
|
sl@0
|
58 |
|
sl@0
|
59 |
/*
|
sl@0
|
60 |
** Create a new tokenizer instance.
|
sl@0
|
61 |
*/
|
sl@0
|
62 |
static int simpleCreate(
|
sl@0
|
63 |
int argc, const char * const *argv,
|
sl@0
|
64 |
sqlite3_tokenizer **ppTokenizer
|
sl@0
|
65 |
){
|
sl@0
|
66 |
simple_tokenizer *t;
|
sl@0
|
67 |
|
sl@0
|
68 |
t = (simple_tokenizer *) sqlite3_malloc(sizeof(*t));
|
sl@0
|
69 |
if( t==NULL ) return SQLITE_NOMEM;
|
sl@0
|
70 |
memset(t, 0, sizeof(*t));
|
sl@0
|
71 |
|
sl@0
|
72 |
/* TODO(shess) Delimiters need to remain the same from run to run,
|
sl@0
|
73 |
** else we need to reindex. One solution would be a meta-table to
|
sl@0
|
74 |
** track such information in the database, then we'd only want this
|
sl@0
|
75 |
** information on the initial create.
|
sl@0
|
76 |
*/
|
sl@0
|
77 |
if( argc>1 ){
|
sl@0
|
78 |
int i, n = strlen(argv[1]);
|
sl@0
|
79 |
for(i=0; i<n; i++){
|
sl@0
|
80 |
unsigned char ch = argv[1][i];
|
sl@0
|
81 |
/* We explicitly don't support UTF-8 delimiters for now. */
|
sl@0
|
82 |
if( ch>=0x80 ){
|
sl@0
|
83 |
sqlite3_free(t);
|
sl@0
|
84 |
return SQLITE_ERROR;
|
sl@0
|
85 |
}
|
sl@0
|
86 |
t->delim[ch] = 1;
|
sl@0
|
87 |
}
|
sl@0
|
88 |
} else {
|
sl@0
|
89 |
/* Mark non-alphanumeric ASCII characters as delimiters */
|
sl@0
|
90 |
int i;
|
sl@0
|
91 |
for(i=1; i<0x80; i++){
|
sl@0
|
92 |
t->delim[i] = !isalnum(i);
|
sl@0
|
93 |
}
|
sl@0
|
94 |
}
|
sl@0
|
95 |
|
sl@0
|
96 |
*ppTokenizer = &t->base;
|
sl@0
|
97 |
return SQLITE_OK;
|
sl@0
|
98 |
}
|
sl@0
|
99 |
|
sl@0
|
100 |
/*
|
sl@0
|
101 |
** Destroy a tokenizer
|
sl@0
|
102 |
*/
|
sl@0
|
103 |
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
|
sl@0
|
104 |
sqlite3_free(pTokenizer);
|
sl@0
|
105 |
return SQLITE_OK;
|
sl@0
|
106 |
}
|
sl@0
|
107 |
|
sl@0
|
108 |
/*
|
sl@0
|
109 |
** Prepare to begin tokenizing a particular string. The input
|
sl@0
|
110 |
** string to be tokenized is pInput[0..nBytes-1]. A cursor
|
sl@0
|
111 |
** used to incrementally tokenize this string is returned in
|
sl@0
|
112 |
** *ppCursor.
|
sl@0
|
113 |
*/
|
sl@0
|
114 |
static int simpleOpen(
|
sl@0
|
115 |
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
|
sl@0
|
116 |
const char *pInput, int nBytes, /* String to be tokenized */
|
sl@0
|
117 |
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
|
sl@0
|
118 |
){
|
sl@0
|
119 |
simple_tokenizer_cursor *c;
|
sl@0
|
120 |
|
sl@0
|
121 |
c = (simple_tokenizer_cursor *) sqlite3_malloc(sizeof(*c));
|
sl@0
|
122 |
if( c==NULL ) return SQLITE_NOMEM;
|
sl@0
|
123 |
|
sl@0
|
124 |
c->pInput = pInput;
|
sl@0
|
125 |
if( pInput==0 ){
|
sl@0
|
126 |
c->nBytes = 0;
|
sl@0
|
127 |
}else if( nBytes<0 ){
|
sl@0
|
128 |
c->nBytes = (int)strlen(pInput);
|
sl@0
|
129 |
}else{
|
sl@0
|
130 |
c->nBytes = nBytes;
|
sl@0
|
131 |
}
|
sl@0
|
132 |
c->iOffset = 0; /* start tokenizing at the beginning */
|
sl@0
|
133 |
c->iToken = 0;
|
sl@0
|
134 |
c->pToken = NULL; /* no space allocated, yet. */
|
sl@0
|
135 |
c->nTokenAllocated = 0;
|
sl@0
|
136 |
|
sl@0
|
137 |
*ppCursor = &c->base;
|
sl@0
|
138 |
return SQLITE_OK;
|
sl@0
|
139 |
}
|
sl@0
|
140 |
|
sl@0
|
141 |
/*
|
sl@0
|
142 |
** Close a tokenization cursor previously opened by a call to
|
sl@0
|
143 |
** simpleOpen() above.
|
sl@0
|
144 |
*/
|
sl@0
|
145 |
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
|
sl@0
|
146 |
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
sl@0
|
147 |
sqlite3_free(c->pToken);
|
sl@0
|
148 |
sqlite3_free(c);
|
sl@0
|
149 |
return SQLITE_OK;
|
sl@0
|
150 |
}
|
sl@0
|
151 |
|
sl@0
|
152 |
/*
|
sl@0
|
153 |
** Extract the next token from a tokenization cursor. The cursor must
|
sl@0
|
154 |
** have been opened by a prior call to simpleOpen().
|
sl@0
|
155 |
*/
|
sl@0
|
156 |
static int simpleNext(
|
sl@0
|
157 |
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
|
sl@0
|
158 |
const char **ppToken, /* OUT: *ppToken is the token text */
|
sl@0
|
159 |
int *pnBytes, /* OUT: Number of bytes in token */
|
sl@0
|
160 |
int *piStartOffset, /* OUT: Starting offset of token */
|
sl@0
|
161 |
int *piEndOffset, /* OUT: Ending offset of token */
|
sl@0
|
162 |
int *piPosition /* OUT: Position integer of token */
|
sl@0
|
163 |
){
|
sl@0
|
164 |
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
sl@0
|
165 |
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
|
sl@0
|
166 |
unsigned char *p = (unsigned char *)c->pInput;
|
sl@0
|
167 |
|
sl@0
|
168 |
while( c->iOffset<c->nBytes ){
|
sl@0
|
169 |
int iStartOffset;
|
sl@0
|
170 |
|
sl@0
|
171 |
/* Scan past delimiter characters */
|
sl@0
|
172 |
while( c->iOffset<c->nBytes && simpleDelim(t, p[c->iOffset]) ){
|
sl@0
|
173 |
c->iOffset++;
|
sl@0
|
174 |
}
|
sl@0
|
175 |
|
sl@0
|
176 |
/* Count non-delimiter characters. */
|
sl@0
|
177 |
iStartOffset = c->iOffset;
|
sl@0
|
178 |
while( c->iOffset<c->nBytes && !simpleDelim(t, p[c->iOffset]) ){
|
sl@0
|
179 |
c->iOffset++;
|
sl@0
|
180 |
}
|
sl@0
|
181 |
|
sl@0
|
182 |
if( c->iOffset>iStartOffset ){
|
sl@0
|
183 |
int i, n = c->iOffset-iStartOffset;
|
sl@0
|
184 |
if( n>c->nTokenAllocated ){
|
sl@0
|
185 |
c->nTokenAllocated = n+20;
|
sl@0
|
186 |
c->pToken = sqlite3_realloc(c->pToken, c->nTokenAllocated);
|
sl@0
|
187 |
if( c->pToken==NULL ) return SQLITE_NOMEM;
|
sl@0
|
188 |
}
|
sl@0
|
189 |
for(i=0; i<n; i++){
|
sl@0
|
190 |
/* TODO(shess) This needs expansion to handle UTF-8
|
sl@0
|
191 |
** case-insensitivity.
|
sl@0
|
192 |
*/
|
sl@0
|
193 |
unsigned char ch = p[iStartOffset+i];
|
sl@0
|
194 |
c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
|
sl@0
|
195 |
}
|
sl@0
|
196 |
*ppToken = c->pToken;
|
sl@0
|
197 |
*pnBytes = n;
|
sl@0
|
198 |
*piStartOffset = iStartOffset;
|
sl@0
|
199 |
*piEndOffset = c->iOffset;
|
sl@0
|
200 |
*piPosition = c->iToken++;
|
sl@0
|
201 |
|
sl@0
|
202 |
return SQLITE_OK;
|
sl@0
|
203 |
}
|
sl@0
|
204 |
}
|
sl@0
|
205 |
return SQLITE_DONE;
|
sl@0
|
206 |
}
|
sl@0
|
207 |
|
sl@0
|
208 |
/*
|
sl@0
|
209 |
** The set of routines that implement the simple tokenizer
|
sl@0
|
210 |
*/
|
sl@0
|
211 |
static const sqlite3_tokenizer_module simpleTokenizerModule = {
|
sl@0
|
212 |
0,
|
sl@0
|
213 |
simpleCreate,
|
sl@0
|
214 |
simpleDestroy,
|
sl@0
|
215 |
simpleOpen,
|
sl@0
|
216 |
simpleClose,
|
sl@0
|
217 |
simpleNext,
|
sl@0
|
218 |
};
|
sl@0
|
219 |
|
sl@0
|
220 |
/*
|
sl@0
|
221 |
** Allocate a new simple tokenizer. Return a pointer to the new
|
sl@0
|
222 |
** tokenizer in *ppModule
|
sl@0
|
223 |
*/
|
sl@0
|
224 |
void sqlite3Fts2SimpleTokenizerModule(
|
sl@0
|
225 |
sqlite3_tokenizer_module const**ppModule
|
sl@0
|
226 |
){
|
sl@0
|
227 |
*ppModule = &simpleTokenizerModule;
|
sl@0
|
228 |
}
|
sl@0
|
229 |
|
sl@0
|
230 |
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS2) */
|