sl@0
|
1 |
/*
|
sl@0
|
2 |
** The author disclaims copyright to this source code.
|
sl@0
|
3 |
**
|
sl@0
|
4 |
*************************************************************************
|
sl@0
|
5 |
** Implementation of the "simple" full-text-search tokenizer.
|
sl@0
|
6 |
*/
|
sl@0
|
7 |
|
sl@0
|
8 |
/*
|
sl@0
|
9 |
** The code in this file is only compiled if:
|
sl@0
|
10 |
**
|
sl@0
|
11 |
** * The FTS1 module is being built as an extension
|
sl@0
|
12 |
** (in which case SQLITE_CORE is not defined), or
|
sl@0
|
13 |
**
|
sl@0
|
14 |
** * The FTS1 module is being built into the core of
|
sl@0
|
15 |
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
|
sl@0
|
16 |
*/
|
sl@0
|
17 |
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
|
sl@0
|
18 |
|
sl@0
|
19 |
|
sl@0
|
20 |
#include <assert.h>
|
sl@0
|
21 |
#include <stdlib.h>
|
sl@0
|
22 |
#include <stdio.h>
|
sl@0
|
23 |
#include <string.h>
|
sl@0
|
24 |
#include <ctype.h>
|
sl@0
|
25 |
|
sl@0
|
26 |
#include "fts1_tokenizer.h"
|
sl@0
|
27 |
|
sl@0
|
28 |
typedef struct simple_tokenizer {
|
sl@0
|
29 |
sqlite3_tokenizer base;
|
sl@0
|
30 |
char delim[128]; /* flag ASCII delimiters */
|
sl@0
|
31 |
} simple_tokenizer;
|
sl@0
|
32 |
|
sl@0
|
33 |
typedef struct simple_tokenizer_cursor {
|
sl@0
|
34 |
sqlite3_tokenizer_cursor base;
|
sl@0
|
35 |
const char *pInput; /* input we are tokenizing */
|
sl@0
|
36 |
int nBytes; /* size of the input */
|
sl@0
|
37 |
int iOffset; /* current position in pInput */
|
sl@0
|
38 |
int iToken; /* index of next token to be returned */
|
sl@0
|
39 |
char *pToken; /* storage for current token */
|
sl@0
|
40 |
int nTokenAllocated; /* space allocated to zToken buffer */
|
sl@0
|
41 |
} simple_tokenizer_cursor;
|
sl@0
|
42 |
|
sl@0
|
43 |
|
sl@0
|
44 |
/* Forward declaration */
|
sl@0
|
45 |
/* The initialized definition, listing the simple*() methods, appears
** after simpleNext() near the end of this file. */
static const sqlite3_tokenizer_module simpleTokenizerModule;
|
sl@0
|
46 |
|
sl@0
|
47 |
/*
** Return 1 if byte c is a token delimiter for tokenizer t, else 0.
** Only ASCII bytes can be delimiters; any byte >= 0x80 (for example
** part of a multi-byte UTF-8 sequence) is always token content.
*/
static int isDelim(simple_tokenizer *t, unsigned char c){
  if( c>=0x80 ){
    return 0;
  }
  return t->delim[c] ? 1 : 0;
}
|
sl@0
|
50 |
|
sl@0
|
51 |
/*
|
sl@0
|
52 |
** Create a new tokenizer instance.
|
sl@0
|
53 |
*/
|
sl@0
|
54 |
static int simpleCreate(
|
sl@0
|
55 |
int argc, const char * const *argv,
|
sl@0
|
56 |
sqlite3_tokenizer **ppTokenizer
|
sl@0
|
57 |
){
|
sl@0
|
58 |
simple_tokenizer *t;
|
sl@0
|
59 |
|
sl@0
|
60 |
t = (simple_tokenizer *) calloc(sizeof(*t), 1);
|
sl@0
|
61 |
if( t==NULL ) return SQLITE_NOMEM;
|
sl@0
|
62 |
|
sl@0
|
63 |
/* TODO(shess) Delimiters need to remain the same from run to run,
|
sl@0
|
64 |
** else we need to reindex. One solution would be a meta-table to
|
sl@0
|
65 |
** track such information in the database, then we'd only want this
|
sl@0
|
66 |
** information on the initial create.
|
sl@0
|
67 |
*/
|
sl@0
|
68 |
if( argc>1 ){
|
sl@0
|
69 |
int i, n = strlen(argv[1]);
|
sl@0
|
70 |
for(i=0; i<n; i++){
|
sl@0
|
71 |
unsigned char ch = argv[1][i];
|
sl@0
|
72 |
/* We explicitly don't support UTF-8 delimiters for now. */
|
sl@0
|
73 |
if( ch>=0x80 ){
|
sl@0
|
74 |
free(t);
|
sl@0
|
75 |
return SQLITE_ERROR;
|
sl@0
|
76 |
}
|
sl@0
|
77 |
t->delim[ch] = 1;
|
sl@0
|
78 |
}
|
sl@0
|
79 |
} else {
|
sl@0
|
80 |
/* Mark non-alphanumeric ASCII characters as delimiters */
|
sl@0
|
81 |
int i;
|
sl@0
|
82 |
for(i=1; i<0x80; i++){
|
sl@0
|
83 |
t->delim[i] = !isalnum(i);
|
sl@0
|
84 |
}
|
sl@0
|
85 |
}
|
sl@0
|
86 |
|
sl@0
|
87 |
*ppTokenizer = &t->base;
|
sl@0
|
88 |
return SQLITE_OK;
|
sl@0
|
89 |
}
|
sl@0
|
90 |
|
sl@0
|
91 |
/*
|
sl@0
|
92 |
** Destroy a tokenizer
|
sl@0
|
93 |
*/
|
sl@0
|
94 |
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
|
sl@0
|
95 |
free(pTokenizer);
|
sl@0
|
96 |
return SQLITE_OK;
|
sl@0
|
97 |
}
|
sl@0
|
98 |
|
sl@0
|
99 |
/*
|
sl@0
|
100 |
** Prepare to begin tokenizing a particular string. The input
|
sl@0
|
101 |
** string to be tokenized is pInput[0..nBytes-1]. A cursor
|
sl@0
|
102 |
** used to incrementally tokenize this string is returned in
|
sl@0
|
103 |
** *ppCursor.
|
sl@0
|
104 |
*/
|
sl@0
|
105 |
static int simpleOpen(
|
sl@0
|
106 |
sqlite3_tokenizer *pTokenizer, /* The tokenizer */
|
sl@0
|
107 |
const char *pInput, int nBytes, /* String to be tokenized */
|
sl@0
|
108 |
sqlite3_tokenizer_cursor **ppCursor /* OUT: Tokenization cursor */
|
sl@0
|
109 |
){
|
sl@0
|
110 |
simple_tokenizer_cursor *c;
|
sl@0
|
111 |
|
sl@0
|
112 |
c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
|
sl@0
|
113 |
if( c==NULL ) return SQLITE_NOMEM;
|
sl@0
|
114 |
|
sl@0
|
115 |
c->pInput = pInput;
|
sl@0
|
116 |
if( pInput==0 ){
|
sl@0
|
117 |
c->nBytes = 0;
|
sl@0
|
118 |
}else if( nBytes<0 ){
|
sl@0
|
119 |
c->nBytes = (int)strlen(pInput);
|
sl@0
|
120 |
}else{
|
sl@0
|
121 |
c->nBytes = nBytes;
|
sl@0
|
122 |
}
|
sl@0
|
123 |
c->iOffset = 0; /* start tokenizing at the beginning */
|
sl@0
|
124 |
c->iToken = 0;
|
sl@0
|
125 |
c->pToken = NULL; /* no space allocated, yet. */
|
sl@0
|
126 |
c->nTokenAllocated = 0;
|
sl@0
|
127 |
|
sl@0
|
128 |
*ppCursor = &c->base;
|
sl@0
|
129 |
return SQLITE_OK;
|
sl@0
|
130 |
}
|
sl@0
|
131 |
|
sl@0
|
132 |
/*
|
sl@0
|
133 |
** Close a tokenization cursor previously opened by a call to
|
sl@0
|
134 |
** simpleOpen() above.
|
sl@0
|
135 |
*/
|
sl@0
|
136 |
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
|
sl@0
|
137 |
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
sl@0
|
138 |
free(c->pToken);
|
sl@0
|
139 |
free(c);
|
sl@0
|
140 |
return SQLITE_OK;
|
sl@0
|
141 |
}
|
sl@0
|
142 |
|
sl@0
|
143 |
/*
|
sl@0
|
144 |
** Extract the next token from a tokenization cursor. The cursor must
|
sl@0
|
145 |
** have been opened by a prior call to simpleOpen().
|
sl@0
|
146 |
*/
|
sl@0
|
147 |
static int simpleNext(
|
sl@0
|
148 |
sqlite3_tokenizer_cursor *pCursor, /* Cursor returned by simpleOpen */
|
sl@0
|
149 |
const char **ppToken, /* OUT: *ppToken is the token text */
|
sl@0
|
150 |
int *pnBytes, /* OUT: Number of bytes in token */
|
sl@0
|
151 |
int *piStartOffset, /* OUT: Starting offset of token */
|
sl@0
|
152 |
int *piEndOffset, /* OUT: Ending offset of token */
|
sl@0
|
153 |
int *piPosition /* OUT: Position integer of token */
|
sl@0
|
154 |
){
|
sl@0
|
155 |
simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
|
sl@0
|
156 |
simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
|
sl@0
|
157 |
unsigned char *p = (unsigned char *)c->pInput;
|
sl@0
|
158 |
|
sl@0
|
159 |
while( c->iOffset<c->nBytes ){
|
sl@0
|
160 |
int iStartOffset;
|
sl@0
|
161 |
|
sl@0
|
162 |
/* Scan past delimiter characters */
|
sl@0
|
163 |
while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
|
sl@0
|
164 |
c->iOffset++;
|
sl@0
|
165 |
}
|
sl@0
|
166 |
|
sl@0
|
167 |
/* Count non-delimiter characters. */
|
sl@0
|
168 |
iStartOffset = c->iOffset;
|
sl@0
|
169 |
while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
|
sl@0
|
170 |
c->iOffset++;
|
sl@0
|
171 |
}
|
sl@0
|
172 |
|
sl@0
|
173 |
if( c->iOffset>iStartOffset ){
|
sl@0
|
174 |
int i, n = c->iOffset-iStartOffset;
|
sl@0
|
175 |
if( n>c->nTokenAllocated ){
|
sl@0
|
176 |
c->nTokenAllocated = n+20;
|
sl@0
|
177 |
c->pToken = realloc(c->pToken, c->nTokenAllocated);
|
sl@0
|
178 |
if( c->pToken==NULL ) return SQLITE_NOMEM;
|
sl@0
|
179 |
}
|
sl@0
|
180 |
for(i=0; i<n; i++){
|
sl@0
|
181 |
/* TODO(shess) This needs expansion to handle UTF-8
|
sl@0
|
182 |
** case-insensitivity.
|
sl@0
|
183 |
*/
|
sl@0
|
184 |
unsigned char ch = p[iStartOffset+i];
|
sl@0
|
185 |
c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
|
sl@0
|
186 |
}
|
sl@0
|
187 |
*ppToken = c->pToken;
|
sl@0
|
188 |
*pnBytes = n;
|
sl@0
|
189 |
*piStartOffset = iStartOffset;
|
sl@0
|
190 |
*piEndOffset = c->iOffset;
|
sl@0
|
191 |
*piPosition = c->iToken++;
|
sl@0
|
192 |
|
sl@0
|
193 |
return SQLITE_OK;
|
sl@0
|
194 |
}
|
sl@0
|
195 |
}
|
sl@0
|
196 |
return SQLITE_DONE;
|
sl@0
|
197 |
}
|
sl@0
|
198 |
|
sl@0
|
199 |
/*
|
sl@0
|
200 |
** The set of routines that implement the simple tokenizer
|
sl@0
|
201 |
*/
|
sl@0
|
202 |
static const sqlite3_tokenizer_module simpleTokenizerModule = {
|
sl@0
|
203 |
0,
|
sl@0
|
204 |
simpleCreate,
|
sl@0
|
205 |
simpleDestroy,
|
sl@0
|
206 |
simpleOpen,
|
sl@0
|
207 |
simpleClose,
|
sl@0
|
208 |
simpleNext,
|
sl@0
|
209 |
};
|
sl@0
|
210 |
|
sl@0
|
211 |
/*
|
sl@0
|
212 |
** Allocate a new simple tokenizer. Return a pointer to the new
|
sl@0
|
213 |
** tokenizer in *ppModule
|
sl@0
|
214 |
*/
|
sl@0
|
215 |
void sqlite3Fts1SimpleTokenizerModule(
|
sl@0
|
216 |
sqlite3_tokenizer_module const**ppModule
|
sl@0
|
217 |
){
|
sl@0
|
218 |
*ppModule = &simpleTokenizerModule;
|
sl@0
|
219 |
}
|
sl@0
|
220 |
|
sl@0
|
221 |
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
|