sl@0
|
1 |
/* fts1 has a design flaw which can lead to database corruption (see
|
sl@0
|
2 |
** below). It is recommended not to use it any longer, instead use
|
sl@0
|
3 |
** fts3 (or higher). If you believe that your use of fts1 is safe,
|
sl@0
|
4 |
** add -DSQLITE_ENABLE_BROKEN_FTS1=1 to your CFLAGS.
|
sl@0
|
5 |
*/
|
sl@0
|
6 |
#if (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)) \
|
sl@0
|
7 |
&& !defined(SQLITE_ENABLE_BROKEN_FTS1)
|
sl@0
|
8 |
#error fts1 has a design flaw and has been deprecated.
|
sl@0
|
9 |
#endif
|
sl@0
|
10 |
/* The flaw is that fts1 uses the content table's unaliased rowid as
|
sl@0
|
11 |
** the unique docid. fts1 embeds the rowid in the index it builds,
|
sl@0
|
12 |
** and expects the rowid to not change. The SQLite VACUUM operation
|
sl@0
|
13 |
** will renumber such rowids, thereby breaking fts1. If you are using
|
sl@0
|
14 |
** fts1 in a system which has disabled VACUUM, then you can continue
|
sl@0
|
15 |
** to use it safely. Note that PRAGMA auto_vacuum does NOT disable
|
sl@0
|
16 |
** VACUUM, though systems using auto_vacuum are unlikely to invoke
|
sl@0
|
17 |
** VACUUM.
|
sl@0
|
18 |
**
|
sl@0
|
19 |
** fts1 should be safe even across VACUUM if you only insert documents
|
sl@0
|
20 |
** and never delete.
|
sl@0
|
21 |
*/
|
sl@0
|
22 |
|
sl@0
|
23 |
/* The author disclaims copyright to this source code.
|
sl@0
|
24 |
*
|
sl@0
|
25 |
* This is an SQLite module implementing full-text search.
|
sl@0
|
26 |
*/
|
sl@0
|
27 |
|
sl@0
|
28 |
/*
|
sl@0
|
29 |
** The code in this file is only compiled if:
|
sl@0
|
30 |
**
|
sl@0
|
31 |
** * The FTS1 module is being built as an extension
|
sl@0
|
32 |
** (in which case SQLITE_CORE is not defined), or
|
sl@0
|
33 |
**
|
sl@0
|
34 |
** * The FTS1 module is being built into the core of
|
sl@0
|
35 |
** SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
|
sl@0
|
36 |
*/
|
sl@0
|
37 |
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)
|
sl@0
|
38 |
|
sl@0
|
39 |
#if defined(SQLITE_ENABLE_FTS1) && !defined(SQLITE_CORE)
|
sl@0
|
40 |
# define SQLITE_CORE 1
|
sl@0
|
41 |
#endif
|
sl@0
|
42 |
|
sl@0
|
43 |
#include <assert.h>
|
sl@0
|
44 |
#include <stdlib.h>
|
sl@0
|
45 |
#include <stdio.h>
|
sl@0
|
46 |
#include <string.h>
|
sl@0
|
47 |
#include <ctype.h>
|
sl@0
|
48 |
|
sl@0
|
49 |
#include "fts1.h"
|
sl@0
|
50 |
#include "fts1_hash.h"
|
sl@0
|
51 |
#include "fts1_tokenizer.h"
|
sl@0
|
52 |
#include "sqlite3.h"
|
sl@0
|
53 |
#include "sqlite3ext.h"
|
sl@0
|
54 |
SQLITE_EXTENSION_INIT1
|
sl@0
|
55 |
|
sl@0
|
56 |
|
sl@0
|
57 |
#if 0
|
sl@0
|
58 |
# define TRACE(A) printf A; fflush(stdout)
|
sl@0
|
59 |
#else
|
sl@0
|
60 |
# define TRACE(A)
|
sl@0
|
61 |
#endif
|
sl@0
|
62 |
|
sl@0
|
63 |
/* utility functions */
|
sl@0
|
64 |
|
sl@0
|
65 |
typedef struct StringBuffer {
|
sl@0
|
66 |
int len; /* length, not including null terminator */
|
sl@0
|
67 |
int alloced; /* Space allocated for s[] */
|
sl@0
|
68 |
char *s; /* Content of the string */
|
sl@0
|
69 |
} StringBuffer;
|
sl@0
|
70 |
|
sl@0
|
71 |
static void initStringBuffer(StringBuffer *sb){
|
sl@0
|
72 |
sb->len = 0;
|
sl@0
|
73 |
sb->alloced = 100;
|
sl@0
|
74 |
sb->s = malloc(100);
|
sl@0
|
75 |
sb->s[0] = '\0';
|
sl@0
|
76 |
}
|
sl@0
|
77 |
|
sl@0
|
78 |
static void nappend(StringBuffer *sb, const char *zFrom, int nFrom){
|
sl@0
|
79 |
if( sb->len + nFrom >= sb->alloced ){
|
sl@0
|
80 |
sb->alloced = sb->len + nFrom + 100;
|
sl@0
|
81 |
sb->s = realloc(sb->s, sb->alloced+1);
|
sl@0
|
82 |
if( sb->s==0 ){
|
sl@0
|
83 |
initStringBuffer(sb);
|
sl@0
|
84 |
return;
|
sl@0
|
85 |
}
|
sl@0
|
86 |
}
|
sl@0
|
87 |
memcpy(sb->s + sb->len, zFrom, nFrom);
|
sl@0
|
88 |
sb->len += nFrom;
|
sl@0
|
89 |
sb->s[sb->len] = 0;
|
sl@0
|
90 |
}
|
sl@0
|
91 |
static void append(StringBuffer *sb, const char *zFrom){
|
sl@0
|
92 |
nappend(sb, zFrom, strlen(zFrom));
|
sl@0
|
93 |
}
|
sl@0
|
94 |
|
sl@0
|
95 |
/* We encode variable-length integers in little-endian order using seven bits
|
sl@0
|
96 |
* per byte as follows:
|
sl@0
|
97 |
**
|
sl@0
|
98 |
** KEY:
|
sl@0
|
99 |
** A = 0xxxxxxx 7 bits of data and one flag bit
|
sl@0
|
100 |
** B = 1xxxxxxx 7 bits of data and one flag bit
|
sl@0
|
101 |
**
|
sl@0
|
102 |
** 7 bits - A
|
sl@0
|
103 |
** 14 bits - BA
|
sl@0
|
104 |
** 21 bits - BBA
|
sl@0
|
105 |
** and so on.
|
sl@0
|
106 |
*/
|
sl@0
|
107 |
|
sl@0
|
108 |
/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer. */
|
sl@0
|
109 |
#define VARINT_MAX 10
|
sl@0
|
110 |
|
sl@0
|
111 |
/* Write a 64-bit variable-length integer to memory starting at p[0].
|
sl@0
|
112 |
* The length of data written will be between 1 and VARINT_MAX bytes.
|
sl@0
|
113 |
* The number of bytes written is returned. */
|
sl@0
|
114 |
static int putVarint(char *p, sqlite_int64 v){
|
sl@0
|
115 |
unsigned char *q = (unsigned char *) p;
|
sl@0
|
116 |
sqlite_uint64 vu = v;
|
sl@0
|
117 |
do{
|
sl@0
|
118 |
*q++ = (unsigned char) ((vu & 0x7f) | 0x80);
|
sl@0
|
119 |
vu >>= 7;
|
sl@0
|
120 |
}while( vu!=0 );
|
sl@0
|
121 |
q[-1] &= 0x7f; /* turn off high bit in final byte */
|
sl@0
|
122 |
assert( q - (unsigned char *)p <= VARINT_MAX );
|
sl@0
|
123 |
return (int) (q - (unsigned char *)p);
|
sl@0
|
124 |
}
|
sl@0
|
125 |
|
sl@0
|
126 |
/* Read a 64-bit variable-length integer from memory starting at p[0].
|
sl@0
|
127 |
* Return the number of bytes read, or 0 on error.
|
sl@0
|
128 |
* The value is stored in *v. */
|
sl@0
|
129 |
static int getVarint(const char *p, sqlite_int64 *v){
|
sl@0
|
130 |
const unsigned char *q = (const unsigned char *) p;
|
sl@0
|
131 |
sqlite_uint64 x = 0, y = 1;
|
sl@0
|
132 |
while( (*q & 0x80) == 0x80 ){
|
sl@0
|
133 |
x += y * (*q++ & 0x7f);
|
sl@0
|
134 |
y <<= 7;
|
sl@0
|
135 |
if( q - (unsigned char *)p >= VARINT_MAX ){ /* bad data */
|
sl@0
|
136 |
assert( 0 );
|
sl@0
|
137 |
return 0;
|
sl@0
|
138 |
}
|
sl@0
|
139 |
}
|
sl@0
|
140 |
x += y * (*q++);
|
sl@0
|
141 |
*v = (sqlite_int64) x;
|
sl@0
|
142 |
return (int) (q - (unsigned char *)p);
|
sl@0
|
143 |
}
|
sl@0
|
144 |
|
sl@0
|
145 |
static int getVarint32(const char *p, int *pi){
|
sl@0
|
146 |
sqlite_int64 i;
|
sl@0
|
147 |
int ret = getVarint(p, &i);
|
sl@0
|
148 |
*pi = (int) i;
|
sl@0
|
149 |
assert( *pi==i );
|
sl@0
|
150 |
return ret;
|
sl@0
|
151 |
}
|
sl@0
|
152 |
|
sl@0
|
153 |
/*** Document lists ***
|
sl@0
|
154 |
*
|
sl@0
|
155 |
* A document list holds a sorted list of varint-encoded document IDs.
|
sl@0
|
156 |
*
|
sl@0
|
157 |
* A doclist with type DL_POSITIONS_OFFSETS is stored like this:
|
sl@0
|
158 |
*
|
sl@0
|
159 |
* array {
|
sl@0
|
160 |
* varint docid;
|
sl@0
|
161 |
* array {
|
sl@0
|
162 |
* varint position; (delta from previous position plus POS_BASE)
|
sl@0
|
163 |
* varint startOffset; (delta from previous startOffset)
|
sl@0
|
164 |
* varint endOffset; (delta from startOffset)
|
sl@0
|
165 |
* }
|
sl@0
|
166 |
* }
|
sl@0
|
167 |
*
|
sl@0
|
168 |
* Here, array { X } means zero or more occurrences of X, adjacent in memory.
|
sl@0
|
169 |
*
|
sl@0
|
170 |
* A position list may hold positions for text in multiple columns. A position
|
sl@0
|
171 |
* POS_COLUMN is followed by a varint containing the index of the column for
|
sl@0
|
172 |
* following positions in the list. Any positions appearing before any
|
sl@0
|
173 |
* occurrences of POS_COLUMN are for column 0.
|
sl@0
|
174 |
*
|
sl@0
|
175 |
* A doclist with type DL_POSITIONS is like the above, but holds only docids
|
sl@0
|
176 |
* and positions without offset information.
|
sl@0
|
177 |
*
|
sl@0
|
178 |
* A doclist with type DL_DOCIDS is like the above, but holds only docids
|
sl@0
|
179 |
* without positions or offset information.
|
sl@0
|
180 |
*
|
sl@0
|
181 |
* On disk, every document list has positions and offsets, so we don't bother
|
sl@0
|
182 |
* to serialize a doclist's type.
|
sl@0
|
183 |
*
|
sl@0
|
184 |
* We don't yet delta-encode document IDs; doing so will probably be a
|
sl@0
|
185 |
* modest win.
|
sl@0
|
186 |
*
|
sl@0
|
187 |
* NOTE(shess) I've thought of a slightly (1%) better offset encoding.
|
sl@0
|
188 |
* After the first offset, estimate the next offset by using the
|
sl@0
|
189 |
* current token position and the previous token position and offset,
|
sl@0
|
190 |
* offset to handle some variance. So the estimate would be
|
sl@0
|
191 |
* (iPosition*w->iStartOffset/w->iPosition-64), which is delta-encoded
|
sl@0
|
192 |
* as normal. Offsets more than 64 chars from the estimate are
|
sl@0
|
193 |
* encoded as the delta to the previous start offset + 128. An
|
sl@0
|
194 |
* additional tiny increment can be gained by using the end offset of
|
sl@0
|
195 |
* the previous token to make the estimate a tiny bit more precise.
|
sl@0
|
196 |
*/
|
sl@0
|
197 |
|
sl@0
|
198 |
/* It is not safe to call isspace(), tolower(), or isalnum() on
|
sl@0
|
199 |
** hi-bit-set characters. This is the same solution used in the
|
sl@0
|
200 |
** tokenizer.
|
sl@0
|
201 |
*/
|
sl@0
|
202 |
/* TODO(shess) The snippet-generation code should be using the
|
sl@0
|
203 |
** tokenizer-generated tokens rather than doing its own local
|
sl@0
|
204 |
** tokenization.
|
sl@0
|
205 |
*/
|
sl@0
|
206 |
/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
|
sl@0
|
207 |
static int safe_isspace(char c){
|
sl@0
|
208 |
return (c&0x80)==0 ? isspace(c) : 0;
|
sl@0
|
209 |
}
|
sl@0
|
210 |
static int safe_tolower(char c){
|
sl@0
|
211 |
return (c&0x80)==0 ? tolower(c) : c;
|
sl@0
|
212 |
}
|
sl@0
|
213 |
static int safe_isalnum(char c){
|
sl@0
|
214 |
return (c&0x80)==0 ? isalnum(c) : 0;
|
sl@0
|
215 |
}
|
sl@0
|
216 |
|
sl@0
|
217 |
typedef enum DocListType {
|
sl@0
|
218 |
DL_DOCIDS, /* docids only */
|
sl@0
|
219 |
DL_POSITIONS, /* docids + positions */
|
sl@0
|
220 |
DL_POSITIONS_OFFSETS /* docids + positions + offsets */
|
sl@0
|
221 |
} DocListType;
|
sl@0
|
222 |
|
sl@0
|
223 |
/*
|
sl@0
|
224 |
** By default, only positions and not offsets are stored in the doclists.
|
sl@0
|
225 |
** To change this so that offsets are stored too, compile with
|
sl@0
|
226 |
**
|
sl@0
|
227 |
** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
|
sl@0
|
228 |
**
|
sl@0
|
229 |
*/
|
sl@0
|
230 |
#ifndef DL_DEFAULT
|
sl@0
|
231 |
# define DL_DEFAULT DL_POSITIONS
|
sl@0
|
232 |
#endif
|
sl@0
|
233 |
|
sl@0
|
234 |
typedef struct DocList {
|
sl@0
|
235 |
char *pData;
|
sl@0
|
236 |
int nData;
|
sl@0
|
237 |
DocListType iType;
|
sl@0
|
238 |
int iLastColumn; /* the last column written */
|
sl@0
|
239 |
int iLastPos; /* the last position written */
|
sl@0
|
240 |
int iLastOffset; /* the last start offset written */
|
sl@0
|
241 |
} DocList;
|
sl@0
|
242 |
|
sl@0
|
243 |
enum {
|
sl@0
|
244 |
POS_END = 0, /* end of this position list */
|
sl@0
|
245 |
POS_COLUMN, /* followed by new column number */
|
sl@0
|
246 |
POS_BASE
|
sl@0
|
247 |
};
|
sl@0
|
248 |
|
sl@0
|
249 |
/* Initialize a new DocList to hold the given data. */
|
sl@0
|
250 |
static void docListInit(DocList *d, DocListType iType,
|
sl@0
|
251 |
const char *pData, int nData){
|
sl@0
|
252 |
d->nData = nData;
|
sl@0
|
253 |
if( nData>0 ){
|
sl@0
|
254 |
d->pData = malloc(nData);
|
sl@0
|
255 |
memcpy(d->pData, pData, nData);
|
sl@0
|
256 |
} else {
|
sl@0
|
257 |
d->pData = NULL;
|
sl@0
|
258 |
}
|
sl@0
|
259 |
d->iType = iType;
|
sl@0
|
260 |
d->iLastColumn = 0;
|
sl@0
|
261 |
d->iLastPos = d->iLastOffset = 0;
|
sl@0
|
262 |
}
|
sl@0
|
263 |
|
sl@0
|
264 |
/* Create a new dynamically-allocated DocList. */
|
sl@0
|
265 |
static DocList *docListNew(DocListType iType){
|
sl@0
|
266 |
DocList *d = (DocList *) malloc(sizeof(DocList));
|
sl@0
|
267 |
docListInit(d, iType, 0, 0);
|
sl@0
|
268 |
return d;
|
sl@0
|
269 |
}
|
sl@0
|
270 |
|
sl@0
|
271 |
static void docListDestroy(DocList *d){
|
sl@0
|
272 |
free(d->pData);
|
sl@0
|
273 |
#ifndef NDEBUG
|
sl@0
|
274 |
memset(d, 0x55, sizeof(*d));
|
sl@0
|
275 |
#endif
|
sl@0
|
276 |
}
|
sl@0
|
277 |
|
sl@0
|
278 |
static void docListDelete(DocList *d){
|
sl@0
|
279 |
docListDestroy(d);
|
sl@0
|
280 |
free(d);
|
sl@0
|
281 |
}
|
sl@0
|
282 |
|
sl@0
|
283 |
static char *docListEnd(DocList *d){
|
sl@0
|
284 |
return d->pData + d->nData;
|
sl@0
|
285 |
}
|
sl@0
|
286 |
|
sl@0
|
287 |
/* Append a varint to a DocList's data. */
|
sl@0
|
288 |
static void appendVarint(DocList *d, sqlite_int64 i){
|
sl@0
|
289 |
char c[VARINT_MAX];
|
sl@0
|
290 |
int n = putVarint(c, i);
|
sl@0
|
291 |
d->pData = realloc(d->pData, d->nData + n);
|
sl@0
|
292 |
memcpy(d->pData + d->nData, c, n);
|
sl@0
|
293 |
d->nData += n;
|
sl@0
|
294 |
}
|
sl@0
|
295 |
|
sl@0
|
296 |
static void docListAddDocid(DocList *d, sqlite_int64 iDocid){
|
sl@0
|
297 |
appendVarint(d, iDocid);
|
sl@0
|
298 |
if( d->iType>=DL_POSITIONS ){
|
sl@0
|
299 |
appendVarint(d, POS_END); /* initially empty position list */
|
sl@0
|
300 |
d->iLastColumn = 0;
|
sl@0
|
301 |
d->iLastPos = d->iLastOffset = 0;
|
sl@0
|
302 |
}
|
sl@0
|
303 |
}
|
sl@0
|
304 |
|
sl@0
|
305 |
/* helper function for docListAddPos and docListAddPosOffset */
|
sl@0
|
306 |
static void addPos(DocList *d, int iColumn, int iPos){
|
sl@0
|
307 |
assert( d->nData>0 );
|
sl@0
|
308 |
--d->nData; /* remove previous terminator */
|
sl@0
|
309 |
if( iColumn!=d->iLastColumn ){
|
sl@0
|
310 |
assert( iColumn>d->iLastColumn );
|
sl@0
|
311 |
appendVarint(d, POS_COLUMN);
|
sl@0
|
312 |
appendVarint(d, iColumn);
|
sl@0
|
313 |
d->iLastColumn = iColumn;
|
sl@0
|
314 |
d->iLastPos = d->iLastOffset = 0;
|
sl@0
|
315 |
}
|
sl@0
|
316 |
assert( iPos>=d->iLastPos );
|
sl@0
|
317 |
appendVarint(d, iPos-d->iLastPos+POS_BASE);
|
sl@0
|
318 |
d->iLastPos = iPos;
|
sl@0
|
319 |
}
|
sl@0
|
320 |
|
sl@0
|
321 |
/* Add a position to the last position list in a doclist. */
|
sl@0
|
322 |
static void docListAddPos(DocList *d, int iColumn, int iPos){
|
sl@0
|
323 |
assert( d->iType==DL_POSITIONS );
|
sl@0
|
324 |
addPos(d, iColumn, iPos);
|
sl@0
|
325 |
appendVarint(d, POS_END); /* add new terminator */
|
sl@0
|
326 |
}
|
sl@0
|
327 |
|
sl@0
|
328 |
/*
|
sl@0
|
329 |
** Add a position and starting and ending offsets to a doclist.
|
sl@0
|
330 |
**
|
sl@0
|
331 |
** If the doclist is setup to handle only positions, then insert
|
sl@0
|
332 |
** the position only and ignore the offsets.
|
sl@0
|
333 |
*/
|
sl@0
|
334 |
static void docListAddPosOffset(
|
sl@0
|
335 |
DocList *d, /* Doclist under construction */
|
sl@0
|
336 |
int iColumn, /* Column the inserted term is part of */
|
sl@0
|
337 |
int iPos, /* Position of the inserted term */
|
sl@0
|
338 |
int iStartOffset, /* Starting offset of inserted term */
|
sl@0
|
339 |
int iEndOffset /* Ending offset of inserted term */
|
sl@0
|
340 |
){
|
sl@0
|
341 |
assert( d->iType>=DL_POSITIONS );
|
sl@0
|
342 |
addPos(d, iColumn, iPos);
|
sl@0
|
343 |
if( d->iType==DL_POSITIONS_OFFSETS ){
|
sl@0
|
344 |
assert( iStartOffset>=d->iLastOffset );
|
sl@0
|
345 |
appendVarint(d, iStartOffset-d->iLastOffset);
|
sl@0
|
346 |
d->iLastOffset = iStartOffset;
|
sl@0
|
347 |
assert( iEndOffset>=iStartOffset );
|
sl@0
|
348 |
appendVarint(d, iEndOffset-iStartOffset);
|
sl@0
|
349 |
}
|
sl@0
|
350 |
appendVarint(d, POS_END); /* add new terminator */
|
sl@0
|
351 |
}
|
sl@0
|
352 |
|
sl@0
|
353 |
/*
|
sl@0
|
354 |
** A DocListReader object is a cursor into a doclist. Initialize
|
sl@0
|
355 |
** the cursor to the beginning of the doclist by calling readerInit().
|
sl@0
|
356 |
** Then use routines
|
sl@0
|
357 |
**
|
sl@0
|
358 |
** peekDocid()
|
sl@0
|
359 |
** readDocid()
|
sl@0
|
360 |
** readPosition()
|
sl@0
|
361 |
** skipPositionList()
|
sl@0
|
362 |
** and so forth...
|
sl@0
|
363 |
**
|
sl@0
|
364 |
** to read information out of the doclist. When we reach the end
|
sl@0
|
365 |
** of the doclist, atEnd() returns TRUE.
|
sl@0
|
366 |
*/
|
sl@0
|
367 |
typedef struct DocListReader {
|
sl@0
|
368 |
DocList *pDoclist; /* The document list we are stepping through */
|
sl@0
|
369 |
char *p; /* Pointer to next unread byte in the doclist */
|
sl@0
|
370 |
int iLastColumn;
|
sl@0
|
371 |
int iLastPos; /* the last position read, or -1 when not in a position list */
|
sl@0
|
372 |
} DocListReader;
|
sl@0
|
373 |
|
sl@0
|
374 |
/*
|
sl@0
|
375 |
** Initialize the DocListReader r to point to the beginning of pDoclist.
|
sl@0
|
376 |
*/
|
sl@0
|
377 |
static void readerInit(DocListReader *r, DocList *pDoclist){
|
sl@0
|
378 |
r->pDoclist = pDoclist;
|
sl@0
|
379 |
if( pDoclist!=NULL ){
|
sl@0
|
380 |
r->p = pDoclist->pData;
|
sl@0
|
381 |
}
|
sl@0
|
382 |
r->iLastColumn = -1;
|
sl@0
|
383 |
r->iLastPos = -1;
|
sl@0
|
384 |
}
|
sl@0
|
385 |
|
sl@0
|
386 |
/*
|
sl@0
|
387 |
** Return TRUE if we have reached then end of pReader and there is
|
sl@0
|
388 |
** nothing else left to read.
|
sl@0
|
389 |
*/
|
sl@0
|
390 |
static int atEnd(DocListReader *pReader){
|
sl@0
|
391 |
return pReader->pDoclist==0 || (pReader->p >= docListEnd(pReader->pDoclist));
|
sl@0
|
392 |
}
|
sl@0
|
393 |
|
sl@0
|
394 |
/* Peek at the next docid without advancing the read pointer.
|
sl@0
|
395 |
*/
|
sl@0
|
396 |
static sqlite_int64 peekDocid(DocListReader *pReader){
|
sl@0
|
397 |
sqlite_int64 ret;
|
sl@0
|
398 |
assert( !atEnd(pReader) );
|
sl@0
|
399 |
assert( pReader->iLastPos==-1 );
|
sl@0
|
400 |
getVarint(pReader->p, &ret);
|
sl@0
|
401 |
return ret;
|
sl@0
|
402 |
}
|
sl@0
|
403 |
|
sl@0
|
404 |
/* Read the next docid. See also nextDocid().
|
sl@0
|
405 |
*/
|
sl@0
|
406 |
static sqlite_int64 readDocid(DocListReader *pReader){
|
sl@0
|
407 |
sqlite_int64 ret;
|
sl@0
|
408 |
assert( !atEnd(pReader) );
|
sl@0
|
409 |
assert( pReader->iLastPos==-1 );
|
sl@0
|
410 |
pReader->p += getVarint(pReader->p, &ret);
|
sl@0
|
411 |
if( pReader->pDoclist->iType>=DL_POSITIONS ){
|
sl@0
|
412 |
pReader->iLastColumn = 0;
|
sl@0
|
413 |
pReader->iLastPos = 0;
|
sl@0
|
414 |
}
|
sl@0
|
415 |
return ret;
|
sl@0
|
416 |
}
|
sl@0
|
417 |
|
sl@0
|
418 |
/* Read the next position and column index from a position list.
|
sl@0
|
419 |
* Returns the position, or -1 at the end of the list. */
|
sl@0
|
420 |
static int readPosition(DocListReader *pReader, int *iColumn){
|
sl@0
|
421 |
int i;
|
sl@0
|
422 |
int iType = pReader->pDoclist->iType;
|
sl@0
|
423 |
|
sl@0
|
424 |
if( pReader->iLastPos==-1 ){
|
sl@0
|
425 |
return -1;
|
sl@0
|
426 |
}
|
sl@0
|
427 |
assert( !atEnd(pReader) );
|
sl@0
|
428 |
|
sl@0
|
429 |
if( iType<DL_POSITIONS ){
|
sl@0
|
430 |
return -1;
|
sl@0
|
431 |
}
|
sl@0
|
432 |
pReader->p += getVarint32(pReader->p, &i);
|
sl@0
|
433 |
if( i==POS_END ){
|
sl@0
|
434 |
pReader->iLastColumn = pReader->iLastPos = -1;
|
sl@0
|
435 |
*iColumn = -1;
|
sl@0
|
436 |
return -1;
|
sl@0
|
437 |
}
|
sl@0
|
438 |
if( i==POS_COLUMN ){
|
sl@0
|
439 |
pReader->p += getVarint32(pReader->p, &pReader->iLastColumn);
|
sl@0
|
440 |
pReader->iLastPos = 0;
|
sl@0
|
441 |
pReader->p += getVarint32(pReader->p, &i);
|
sl@0
|
442 |
assert( i>=POS_BASE );
|
sl@0
|
443 |
}
|
sl@0
|
444 |
pReader->iLastPos += ((int) i)-POS_BASE;
|
sl@0
|
445 |
if( iType>=DL_POSITIONS_OFFSETS ){
|
sl@0
|
446 |
/* Skip over offsets, ignoring them for now. */
|
sl@0
|
447 |
int iStart, iEnd;
|
sl@0
|
448 |
pReader->p += getVarint32(pReader->p, &iStart);
|
sl@0
|
449 |
pReader->p += getVarint32(pReader->p, &iEnd);
|
sl@0
|
450 |
}
|
sl@0
|
451 |
*iColumn = pReader->iLastColumn;
|
sl@0
|
452 |
return pReader->iLastPos;
|
sl@0
|
453 |
}
|
sl@0
|
454 |
|
sl@0
|
455 |
/* Skip past the end of a position list. */
|
sl@0
|
456 |
static void skipPositionList(DocListReader *pReader){
|
sl@0
|
457 |
DocList *p = pReader->pDoclist;
|
sl@0
|
458 |
if( p && p->iType>=DL_POSITIONS ){
|
sl@0
|
459 |
int iColumn;
|
sl@0
|
460 |
while( readPosition(pReader, &iColumn)!=-1 ){}
|
sl@0
|
461 |
}
|
sl@0
|
462 |
}
|
sl@0
|
463 |
|
sl@0
|
464 |
/* Skip over a docid, including its position list if the doclist has
|
sl@0
|
465 |
* positions. */
|
sl@0
|
466 |
static void skipDocument(DocListReader *pReader){
|
sl@0
|
467 |
readDocid(pReader);
|
sl@0
|
468 |
skipPositionList(pReader);
|
sl@0
|
469 |
}
|
sl@0
|
470 |
|
sl@0
|
471 |
/* Skip past all docids which are less than [iDocid]. Returns 1 if a docid
|
sl@0
|
472 |
* matching [iDocid] was found. */
|
sl@0
|
473 |
static int skipToDocid(DocListReader *pReader, sqlite_int64 iDocid){
|
sl@0
|
474 |
sqlite_int64 d = 0;
|
sl@0
|
475 |
while( !atEnd(pReader) && (d=peekDocid(pReader))<iDocid ){
|
sl@0
|
476 |
skipDocument(pReader);
|
sl@0
|
477 |
}
|
sl@0
|
478 |
return !atEnd(pReader) && d==iDocid;
|
sl@0
|
479 |
}
|
sl@0
|
480 |
|
sl@0
|
481 |
/* Return the first document in a document list.
|
sl@0
|
482 |
*/
|
sl@0
|
483 |
static sqlite_int64 firstDocid(DocList *d){
|
sl@0
|
484 |
DocListReader r;
|
sl@0
|
485 |
readerInit(&r, d);
|
sl@0
|
486 |
return readDocid(&r);
|
sl@0
|
487 |
}
|
sl@0
|
488 |
|
sl@0
|
489 |
#ifdef SQLITE_DEBUG
|
sl@0
|
490 |
/*
|
sl@0
|
491 |
** This routine is used for debugging purpose only.
|
sl@0
|
492 |
**
|
sl@0
|
493 |
** Write the content of a doclist to standard output.
|
sl@0
|
494 |
*/
|
sl@0
|
495 |
static void printDoclist(DocList *p){
|
sl@0
|
496 |
DocListReader r;
|
sl@0
|
497 |
const char *zSep = "";
|
sl@0
|
498 |
|
sl@0
|
499 |
readerInit(&r, p);
|
sl@0
|
500 |
while( !atEnd(&r) ){
|
sl@0
|
501 |
sqlite_int64 docid = readDocid(&r);
|
sl@0
|
502 |
if( docid==0 ){
|
sl@0
|
503 |
skipPositionList(&r);
|
sl@0
|
504 |
continue;
|
sl@0
|
505 |
}
|
sl@0
|
506 |
printf("%s%lld", zSep, docid);
|
sl@0
|
507 |
zSep = ",";
|
sl@0
|
508 |
if( p->iType>=DL_POSITIONS ){
|
sl@0
|
509 |
int iPos, iCol;
|
sl@0
|
510 |
const char *zDiv = "";
|
sl@0
|
511 |
printf("(");
|
sl@0
|
512 |
while( (iPos = readPosition(&r, &iCol))>=0 ){
|
sl@0
|
513 |
printf("%s%d:%d", zDiv, iCol, iPos);
|
sl@0
|
514 |
zDiv = ":";
|
sl@0
|
515 |
}
|
sl@0
|
516 |
printf(")");
|
sl@0
|
517 |
}
|
sl@0
|
518 |
}
|
sl@0
|
519 |
printf("\n");
|
sl@0
|
520 |
fflush(stdout);
|
sl@0
|
521 |
}
|
sl@0
|
522 |
#endif /* SQLITE_DEBUG */
|
sl@0
|
523 |
|
sl@0
|
524 |
/* Trim the given doclist to contain only positions in column
|
sl@0
|
525 |
* [iRestrictColumn]. */
|
sl@0
|
526 |
static void docListRestrictColumn(DocList *in, int iRestrictColumn){
|
sl@0
|
527 |
DocListReader r;
|
sl@0
|
528 |
DocList out;
|
sl@0
|
529 |
|
sl@0
|
530 |
assert( in->iType>=DL_POSITIONS );
|
sl@0
|
531 |
readerInit(&r, in);
|
sl@0
|
532 |
docListInit(&out, DL_POSITIONS, NULL, 0);
|
sl@0
|
533 |
|
sl@0
|
534 |
while( !atEnd(&r) ){
|
sl@0
|
535 |
sqlite_int64 iDocid = readDocid(&r);
|
sl@0
|
536 |
int iPos, iColumn;
|
sl@0
|
537 |
|
sl@0
|
538 |
docListAddDocid(&out, iDocid);
|
sl@0
|
539 |
while( (iPos = readPosition(&r, &iColumn)) != -1 ){
|
sl@0
|
540 |
if( iColumn==iRestrictColumn ){
|
sl@0
|
541 |
docListAddPos(&out, iColumn, iPos);
|
sl@0
|
542 |
}
|
sl@0
|
543 |
}
|
sl@0
|
544 |
}
|
sl@0
|
545 |
|
sl@0
|
546 |
docListDestroy(in);
|
sl@0
|
547 |
*in = out;
|
sl@0
|
548 |
}
|
sl@0
|
549 |
|
sl@0
|
550 |
/* Trim the given doclist by discarding any docids without any remaining
|
sl@0
|
551 |
* positions. */
|
sl@0
|
552 |
static void docListDiscardEmpty(DocList *in) {
|
sl@0
|
553 |
DocListReader r;
|
sl@0
|
554 |
DocList out;
|
sl@0
|
555 |
|
sl@0
|
556 |
/* TODO: It would be nice to implement this operation in place; that
|
sl@0
|
557 |
* could save a significant amount of memory in queries with long doclists. */
|
sl@0
|
558 |
assert( in->iType>=DL_POSITIONS );
|
sl@0
|
559 |
readerInit(&r, in);
|
sl@0
|
560 |
docListInit(&out, DL_POSITIONS, NULL, 0);
|
sl@0
|
561 |
|
sl@0
|
562 |
while( !atEnd(&r) ){
|
sl@0
|
563 |
sqlite_int64 iDocid = readDocid(&r);
|
sl@0
|
564 |
int match = 0;
|
sl@0
|
565 |
int iPos, iColumn;
|
sl@0
|
566 |
while( (iPos = readPosition(&r, &iColumn)) != -1 ){
|
sl@0
|
567 |
if( !match ){
|
sl@0
|
568 |
docListAddDocid(&out, iDocid);
|
sl@0
|
569 |
match = 1;
|
sl@0
|
570 |
}
|
sl@0
|
571 |
docListAddPos(&out, iColumn, iPos);
|
sl@0
|
572 |
}
|
sl@0
|
573 |
}
|
sl@0
|
574 |
|
sl@0
|
575 |
docListDestroy(in);
|
sl@0
|
576 |
*in = out;
|
sl@0
|
577 |
}
|
sl@0
|
578 |
|
sl@0
|
579 |
/* Helper function for docListUpdate() and docListAccumulate().
|
sl@0
|
580 |
** Splices a doclist element into the doclist represented by r,
|
sl@0
|
581 |
** leaving r pointing after the newly spliced element.
|
sl@0
|
582 |
*/
|
sl@0
|
583 |
static void docListSpliceElement(DocListReader *r, sqlite_int64 iDocid,
|
sl@0
|
584 |
const char *pSource, int nSource){
|
sl@0
|
585 |
DocList *d = r->pDoclist;
|
sl@0
|
586 |
char *pTarget;
|
sl@0
|
587 |
int nTarget, found;
|
sl@0
|
588 |
|
sl@0
|
589 |
found = skipToDocid(r, iDocid);
|
sl@0
|
590 |
|
sl@0
|
591 |
/* Describe slice in d to place pSource/nSource. */
|
sl@0
|
592 |
pTarget = r->p;
|
sl@0
|
593 |
if( found ){
|
sl@0
|
594 |
skipDocument(r);
|
sl@0
|
595 |
nTarget = r->p-pTarget;
|
sl@0
|
596 |
}else{
|
sl@0
|
597 |
nTarget = 0;
|
sl@0
|
598 |
}
|
sl@0
|
599 |
|
sl@0
|
600 |
/* The sense of the following is that there are three possibilities.
|
sl@0
|
601 |
** If nTarget==nSource, we should not move any memory nor realloc.
|
sl@0
|
602 |
** If nTarget>nSource, trim target and realloc.
|
sl@0
|
603 |
** If nTarget<nSource, realloc then expand target.
|
sl@0
|
604 |
*/
|
sl@0
|
605 |
if( nTarget>nSource ){
|
sl@0
|
606 |
memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
|
sl@0
|
607 |
}
|
sl@0
|
608 |
if( nTarget!=nSource ){
|
sl@0
|
609 |
int iDoclist = pTarget-d->pData;
|
sl@0
|
610 |
d->pData = realloc(d->pData, d->nData+nSource-nTarget);
|
sl@0
|
611 |
pTarget = d->pData+iDoclist;
|
sl@0
|
612 |
}
|
sl@0
|
613 |
if( nTarget<nSource ){
|
sl@0
|
614 |
memmove(pTarget+nSource, pTarget+nTarget, docListEnd(d)-(pTarget+nTarget));
|
sl@0
|
615 |
}
|
sl@0
|
616 |
|
sl@0
|
617 |
memcpy(pTarget, pSource, nSource);
|
sl@0
|
618 |
d->nData += nSource-nTarget;
|
sl@0
|
619 |
r->p = pTarget+nSource;
|
sl@0
|
620 |
}
|
sl@0
|
621 |
|
sl@0
|
622 |
/* Insert/update pUpdate into the doclist. */
|
sl@0
|
623 |
static void docListUpdate(DocList *d, DocList *pUpdate){
|
sl@0
|
624 |
DocListReader reader;
|
sl@0
|
625 |
|
sl@0
|
626 |
assert( d!=NULL && pUpdate!=NULL );
|
sl@0
|
627 |
assert( d->iType==pUpdate->iType);
|
sl@0
|
628 |
|
sl@0
|
629 |
readerInit(&reader, d);
|
sl@0
|
630 |
docListSpliceElement(&reader, firstDocid(pUpdate),
|
sl@0
|
631 |
pUpdate->pData, pUpdate->nData);
|
sl@0
|
632 |
}
|
sl@0
|
633 |
|
sl@0
|
634 |
/* Propagate elements from pUpdate to pAcc, overwriting elements with
|
sl@0
|
635 |
** matching docids.
|
sl@0
|
636 |
*/
|
sl@0
|
637 |
static void docListAccumulate(DocList *pAcc, DocList *pUpdate){
|
sl@0
|
638 |
DocListReader accReader, updateReader;
|
sl@0
|
639 |
|
sl@0
|
640 |
/* Handle edge cases where one doclist is empty. */
|
sl@0
|
641 |
assert( pAcc!=NULL );
|
sl@0
|
642 |
if( pUpdate==NULL || pUpdate->nData==0 ) return;
|
sl@0
|
643 |
if( pAcc->nData==0 ){
|
sl@0
|
644 |
pAcc->pData = malloc(pUpdate->nData);
|
sl@0
|
645 |
memcpy(pAcc->pData, pUpdate->pData, pUpdate->nData);
|
sl@0
|
646 |
pAcc->nData = pUpdate->nData;
|
sl@0
|
647 |
return;
|
sl@0
|
648 |
}
|
sl@0
|
649 |
|
sl@0
|
650 |
readerInit(&accReader, pAcc);
|
sl@0
|
651 |
readerInit(&updateReader, pUpdate);
|
sl@0
|
652 |
|
sl@0
|
653 |
while( !atEnd(&updateReader) ){
|
sl@0
|
654 |
char *pSource = updateReader.p;
|
sl@0
|
655 |
sqlite_int64 iDocid = readDocid(&updateReader);
|
sl@0
|
656 |
skipPositionList(&updateReader);
|
sl@0
|
657 |
docListSpliceElement(&accReader, iDocid, pSource, updateReader.p-pSource);
|
sl@0
|
658 |
}
|
sl@0
|
659 |
}
|
sl@0
|
660 |
|
sl@0
|
661 |
/*
|
sl@0
|
662 |
** Read the next docid off of pIn. Return 0 if we reach the end.
|
sl@0
|
663 |
*
|
sl@0
|
664 |
* TODO: This assumes that docids are never 0, but they may actually be 0 since
|
sl@0
|
665 |
* users can choose docids when inserting into a full-text table. Fix this.
|
sl@0
|
666 |
*/
|
sl@0
|
667 |
static sqlite_int64 nextDocid(DocListReader *pIn){
|
sl@0
|
668 |
skipPositionList(pIn);
|
sl@0
|
669 |
return atEnd(pIn) ? 0 : readDocid(pIn);
|
sl@0
|
670 |
}
|
sl@0
|
671 |
|
sl@0
|
672 |
/*
|
sl@0
|
673 |
** pLeft and pRight are two DocListReaders that are pointing to
|
sl@0
|
674 |
** positions lists of the same document: iDocid.
|
sl@0
|
675 |
**
|
sl@0
|
676 |
** If there are no instances in pLeft or pRight where the position
|
sl@0
|
677 |
** of pLeft is one less than the position of pRight, then this
|
sl@0
|
678 |
** routine adds nothing to pOut.
|
sl@0
|
679 |
**
|
sl@0
|
680 |
** If there are one or more instances where positions from pLeft
|
sl@0
|
681 |
** are exactly one less than positions from pRight, then add a new
|
sl@0
|
682 |
** document record to pOut. If pOut wants to hold positions, then
|
sl@0
|
683 |
** include the positions from pRight that are one more than a
|
sl@0
|
684 |
** position in pLeft. In other words: pRight.iPos==pLeft.iPos+1.
|
sl@0
|
685 |
**
|
sl@0
|
686 |
** pLeft and pRight are left pointing at the next document record.
|
sl@0
|
687 |
*/
|
sl@0
|
688 |
static void mergePosList(
|
sl@0
|
689 |
DocListReader *pLeft, /* Left position list */
|
sl@0
|
690 |
DocListReader *pRight, /* Right position list */
|
sl@0
|
691 |
sqlite_int64 iDocid, /* The docid from pLeft and pRight */
|
sl@0
|
692 |
DocList *pOut /* Write the merged document record here */
|
sl@0
|
693 |
){
|
sl@0
|
694 |
int iLeftCol, iLeftPos = readPosition(pLeft, &iLeftCol);
|
sl@0
|
695 |
int iRightCol, iRightPos = readPosition(pRight, &iRightCol);
|
sl@0
|
696 |
int match = 0;
|
sl@0
|
697 |
|
sl@0
|
698 |
/* Loop until we've reached the end of both position lists. */
|
sl@0
|
699 |
while( iLeftPos!=-1 && iRightPos!=-1 ){
|
sl@0
|
700 |
if( iLeftCol==iRightCol && iLeftPos+1==iRightPos ){
|
sl@0
|
701 |
if( !match ){
|
sl@0
|
702 |
docListAddDocid(pOut, iDocid);
|
sl@0
|
703 |
match = 1;
|
sl@0
|
704 |
}
|
sl@0
|
705 |
if( pOut->iType>=DL_POSITIONS ){
|
sl@0
|
706 |
docListAddPos(pOut, iRightCol, iRightPos);
|
sl@0
|
707 |
}
|
sl@0
|
708 |
iLeftPos = readPosition(pLeft, &iLeftCol);
|
sl@0
|
709 |
iRightPos = readPosition(pRight, &iRightCol);
|
sl@0
|
710 |
}else if( iRightCol<iLeftCol ||
|
sl@0
|
711 |
(iRightCol==iLeftCol && iRightPos<iLeftPos+1) ){
|
sl@0
|
712 |
iRightPos = readPosition(pRight, &iRightCol);
|
sl@0
|
713 |
}else{
|
sl@0
|
714 |
iLeftPos = readPosition(pLeft, &iLeftCol);
|
sl@0
|
715 |
}
|
sl@0
|
716 |
}
|
sl@0
|
717 |
if( iLeftPos>=0 ) skipPositionList(pLeft);
|
sl@0
|
718 |
if( iRightPos>=0 ) skipPositionList(pRight);
|
sl@0
|
719 |
}
|
sl@0
|
720 |
|
sl@0
|
721 |
/* We have two doclists: pLeft and pRight.
|
sl@0
|
722 |
** Write the phrase intersection of these two doclists into pOut.
|
sl@0
|
723 |
**
|
sl@0
|
724 |
** A phrase intersection means that two documents only match
|
sl@0
|
725 |
** if pLeft.iPos+1==pRight.iPos.
|
sl@0
|
726 |
**
|
sl@0
|
727 |
** The output pOut may or may not contain positions. If pOut
|
sl@0
|
728 |
** does contain positions, they are the positions of pRight.
|
sl@0
|
729 |
*/
|
sl@0
|
730 |
static void docListPhraseMerge(
|
sl@0
|
731 |
DocList *pLeft, /* Doclist resulting from the words on the left */
|
sl@0
|
732 |
DocList *pRight, /* Doclist for the next word to the right */
|
sl@0
|
733 |
DocList *pOut /* Write the combined doclist here */
|
sl@0
|
734 |
){
|
sl@0
|
735 |
DocListReader left, right;
|
sl@0
|
736 |
sqlite_int64 docidLeft, docidRight;
|
sl@0
|
737 |
|
sl@0
|
738 |
readerInit(&left, pLeft);
|
sl@0
|
739 |
readerInit(&right, pRight);
|
sl@0
|
740 |
docidLeft = nextDocid(&left);
|
sl@0
|
741 |
docidRight = nextDocid(&right);
|
sl@0
|
742 |
|
sl@0
|
743 |
while( docidLeft>0 && docidRight>0 ){
|
sl@0
|
744 |
if( docidLeft<docidRight ){
|
sl@0
|
745 |
docidLeft = nextDocid(&left);
|
sl@0
|
746 |
}else if( docidRight<docidLeft ){
|
sl@0
|
747 |
docidRight = nextDocid(&right);
|
sl@0
|
748 |
}else{
|
sl@0
|
749 |
mergePosList(&left, &right, docidLeft, pOut);
|
sl@0
|
750 |
docidLeft = nextDocid(&left);
|
sl@0
|
751 |
docidRight = nextDocid(&right);
|
sl@0
|
752 |
}
|
sl@0
|
753 |
}
|
sl@0
|
754 |
}
|
sl@0
|
755 |
|
sl@0
|
756 |
/* We have two doclists: pLeft and pRight.
|
sl@0
|
757 |
** Write the intersection of these two doclists into pOut.
|
sl@0
|
758 |
** Only docids are matched. Position information is ignored.
|
sl@0
|
759 |
**
|
sl@0
|
760 |
** The output pOut never holds positions.
|
sl@0
|
761 |
*/
|
sl@0
|
762 |
static void docListAndMerge(
|
sl@0
|
763 |
DocList *pLeft, /* Doclist resulting from the words on the left */
|
sl@0
|
764 |
DocList *pRight, /* Doclist for the next word to the right */
|
sl@0
|
765 |
DocList *pOut /* Write the combined doclist here */
|
sl@0
|
766 |
){
|
sl@0
|
767 |
DocListReader left, right;
|
sl@0
|
768 |
sqlite_int64 docidLeft, docidRight;
|
sl@0
|
769 |
|
sl@0
|
770 |
assert( pOut->iType<DL_POSITIONS );
|
sl@0
|
771 |
|
sl@0
|
772 |
readerInit(&left, pLeft);
|
sl@0
|
773 |
readerInit(&right, pRight);
|
sl@0
|
774 |
docidLeft = nextDocid(&left);
|
sl@0
|
775 |
docidRight = nextDocid(&right);
|
sl@0
|
776 |
|
sl@0
|
777 |
while( docidLeft>0 && docidRight>0 ){
|
sl@0
|
778 |
if( docidLeft<docidRight ){
|
sl@0
|
779 |
docidLeft = nextDocid(&left);
|
sl@0
|
780 |
}else if( docidRight<docidLeft ){
|
sl@0
|
781 |
docidRight = nextDocid(&right);
|
sl@0
|
782 |
}else{
|
sl@0
|
783 |
docListAddDocid(pOut, docidLeft);
|
sl@0
|
784 |
docidLeft = nextDocid(&left);
|
sl@0
|
785 |
docidRight = nextDocid(&right);
|
sl@0
|
786 |
}
|
sl@0
|
787 |
}
|
sl@0
|
788 |
}
|
sl@0
|
789 |
|
sl@0
|
790 |
/* We have two doclists: pLeft and pRight.
|
sl@0
|
791 |
** Write the union of these two doclists into pOut.
|
sl@0
|
792 |
** Only docids are matched. Position information is ignored.
|
sl@0
|
793 |
**
|
sl@0
|
794 |
** The output pOut never holds positions.
|
sl@0
|
795 |
*/
|
sl@0
|
796 |
static void docListOrMerge(
|
sl@0
|
797 |
DocList *pLeft, /* Doclist resulting from the words on the left */
|
sl@0
|
798 |
DocList *pRight, /* Doclist for the next word to the right */
|
sl@0
|
799 |
DocList *pOut /* Write the combined doclist here */
|
sl@0
|
800 |
){
|
sl@0
|
801 |
DocListReader left, right;
|
sl@0
|
802 |
sqlite_int64 docidLeft, docidRight, priorLeft;
|
sl@0
|
803 |
|
sl@0
|
804 |
readerInit(&left, pLeft);
|
sl@0
|
805 |
readerInit(&right, pRight);
|
sl@0
|
806 |
docidLeft = nextDocid(&left);
|
sl@0
|
807 |
docidRight = nextDocid(&right);
|
sl@0
|
808 |
|
sl@0
|
809 |
while( docidLeft>0 && docidRight>0 ){
|
sl@0
|
810 |
if( docidLeft<=docidRight ){
|
sl@0
|
811 |
docListAddDocid(pOut, docidLeft);
|
sl@0
|
812 |
}else{
|
sl@0
|
813 |
docListAddDocid(pOut, docidRight);
|
sl@0
|
814 |
}
|
sl@0
|
815 |
priorLeft = docidLeft;
|
sl@0
|
816 |
if( docidLeft<=docidRight ){
|
sl@0
|
817 |
docidLeft = nextDocid(&left);
|
sl@0
|
818 |
}
|
sl@0
|
819 |
if( docidRight>0 && docidRight<=priorLeft ){
|
sl@0
|
820 |
docidRight = nextDocid(&right);
|
sl@0
|
821 |
}
|
sl@0
|
822 |
}
|
sl@0
|
823 |
while( docidLeft>0 ){
|
sl@0
|
824 |
docListAddDocid(pOut, docidLeft);
|
sl@0
|
825 |
docidLeft = nextDocid(&left);
|
sl@0
|
826 |
}
|
sl@0
|
827 |
while( docidRight>0 ){
|
sl@0
|
828 |
docListAddDocid(pOut, docidRight);
|
sl@0
|
829 |
docidRight = nextDocid(&right);
|
sl@0
|
830 |
}
|
sl@0
|
831 |
}
|
sl@0
|
832 |
|
sl@0
|
833 |
/* We have two doclists: pLeft and pRight.
|
sl@0
|
834 |
** Write into pOut all documents that occur in pLeft but not
|
sl@0
|
835 |
** in pRight.
|
sl@0
|
836 |
**
|
sl@0
|
837 |
** Only docids are matched. Position information is ignored.
|
sl@0
|
838 |
**
|
sl@0
|
839 |
** The output pOut never holds positions.
|
sl@0
|
840 |
*/
|
sl@0
|
841 |
static void docListExceptMerge(
|
sl@0
|
842 |
DocList *pLeft, /* Doclist resulting from the words on the left */
|
sl@0
|
843 |
DocList *pRight, /* Doclist for the next word to the right */
|
sl@0
|
844 |
DocList *pOut /* Write the combined doclist here */
|
sl@0
|
845 |
){
|
sl@0
|
846 |
DocListReader left, right;
|
sl@0
|
847 |
sqlite_int64 docidLeft, docidRight, priorLeft;
|
sl@0
|
848 |
|
sl@0
|
849 |
readerInit(&left, pLeft);
|
sl@0
|
850 |
readerInit(&right, pRight);
|
sl@0
|
851 |
docidLeft = nextDocid(&left);
|
sl@0
|
852 |
docidRight = nextDocid(&right);
|
sl@0
|
853 |
|
sl@0
|
854 |
while( docidLeft>0 && docidRight>0 ){
|
sl@0
|
855 |
priorLeft = docidLeft;
|
sl@0
|
856 |
if( docidLeft<docidRight ){
|
sl@0
|
857 |
docListAddDocid(pOut, docidLeft);
|
sl@0
|
858 |
}
|
sl@0
|
859 |
if( docidLeft<=docidRight ){
|
sl@0
|
860 |
docidLeft = nextDocid(&left);
|
sl@0
|
861 |
}
|
sl@0
|
862 |
if( docidRight>0 && docidRight<=priorLeft ){
|
sl@0
|
863 |
docidRight = nextDocid(&right);
|
sl@0
|
864 |
}
|
sl@0
|
865 |
}
|
sl@0
|
866 |
while( docidLeft>0 ){
|
sl@0
|
867 |
docListAddDocid(pOut, docidLeft);
|
sl@0
|
868 |
docidLeft = nextDocid(&left);
|
sl@0
|
869 |
}
|
sl@0
|
870 |
}
|
sl@0
|
871 |
|
sl@0
|
872 |
static char *string_dup_n(const char *s, int n){
|
sl@0
|
873 |
char *str = malloc(n + 1);
|
sl@0
|
874 |
memcpy(str, s, n);
|
sl@0
|
875 |
str[n] = '\0';
|
sl@0
|
876 |
return str;
|
sl@0
|
877 |
}
|
sl@0
|
878 |
|
sl@0
|
879 |
/* Duplicate a string; the caller must free() the returned string.
|
sl@0
|
880 |
* (We don't use strdup() since it is not part of the standard C library and
|
sl@0
|
881 |
* may not be available everywhere.) */
|
sl@0
|
882 |
static char *string_dup(const char *s){
|
sl@0
|
883 |
return string_dup_n(s, strlen(s));
|
sl@0
|
884 |
}
|
sl@0
|
885 |
|
sl@0
|
886 |
/* Format a string, replacing each occurrence of the % character with
|
sl@0
|
887 |
* zDb.zName. This may be more convenient than sqlite_mprintf()
|
sl@0
|
888 |
* when one string is used repeatedly in a format string.
|
sl@0
|
889 |
* The caller must free() the returned string. */
|
sl@0
|
890 |
static char *string_format(const char *zFormat,
|
sl@0
|
891 |
const char *zDb, const char *zName){
|
sl@0
|
892 |
const char *p;
|
sl@0
|
893 |
size_t len = 0;
|
sl@0
|
894 |
size_t nDb = strlen(zDb);
|
sl@0
|
895 |
size_t nName = strlen(zName);
|
sl@0
|
896 |
size_t nFullTableName = nDb+1+nName;
|
sl@0
|
897 |
char *result;
|
sl@0
|
898 |
char *r;
|
sl@0
|
899 |
|
sl@0
|
900 |
/* first compute length needed */
|
sl@0
|
901 |
for(p = zFormat ; *p ; ++p){
|
sl@0
|
902 |
len += (*p=='%' ? nFullTableName : 1);
|
sl@0
|
903 |
}
|
sl@0
|
904 |
len += 1; /* for null terminator */
|
sl@0
|
905 |
|
sl@0
|
906 |
r = result = malloc(len);
|
sl@0
|
907 |
for(p = zFormat; *p; ++p){
|
sl@0
|
908 |
if( *p=='%' ){
|
sl@0
|
909 |
memcpy(r, zDb, nDb);
|
sl@0
|
910 |
r += nDb;
|
sl@0
|
911 |
*r++ = '.';
|
sl@0
|
912 |
memcpy(r, zName, nName);
|
sl@0
|
913 |
r += nName;
|
sl@0
|
914 |
} else {
|
sl@0
|
915 |
*r++ = *p;
|
sl@0
|
916 |
}
|
sl@0
|
917 |
}
|
sl@0
|
918 |
*r++ = '\0';
|
sl@0
|
919 |
assert( r == result + len );
|
sl@0
|
920 |
return result;
|
sl@0
|
921 |
}
|
sl@0
|
922 |
|
sl@0
|
923 |
static int sql_exec(sqlite3 *db, const char *zDb, const char *zName,
|
sl@0
|
924 |
const char *zFormat){
|
sl@0
|
925 |
char *zCommand = string_format(zFormat, zDb, zName);
|
sl@0
|
926 |
int rc;
|
sl@0
|
927 |
TRACE(("FTS1 sql: %s\n", zCommand));
|
sl@0
|
928 |
rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
|
sl@0
|
929 |
free(zCommand);
|
sl@0
|
930 |
return rc;
|
sl@0
|
931 |
}
|
sl@0
|
932 |
|
sl@0
|
933 |
static int sql_prepare(sqlite3 *db, const char *zDb, const char *zName,
|
sl@0
|
934 |
sqlite3_stmt **ppStmt, const char *zFormat){
|
sl@0
|
935 |
char *zCommand = string_format(zFormat, zDb, zName);
|
sl@0
|
936 |
int rc;
|
sl@0
|
937 |
TRACE(("FTS1 prepare: %s\n", zCommand));
|
sl@0
|
938 |
rc = sqlite3_prepare(db, zCommand, -1, ppStmt, NULL);
|
sl@0
|
939 |
free(zCommand);
|
sl@0
|
940 |
return rc;
|
sl@0
|
941 |
}
|
sl@0
|
942 |
|
sl@0
|
943 |
/* end utility functions */
|
sl@0
|
944 |
|
sl@0
|
945 |
/* Forward reference */
|
sl@0
|
946 |
typedef struct fulltext_vtab fulltext_vtab;
|
sl@0
|
947 |
|
sl@0
|
948 |
/* A single term in a query is represented by an instances of
|
sl@0
|
949 |
** the following structure.
|
sl@0
|
950 |
*/
|
sl@0
|
951 |
typedef struct QueryTerm {
|
sl@0
|
952 |
short int nPhrase; /* How many following terms are part of the same phrase */
|
sl@0
|
953 |
short int iPhrase; /* This is the i-th term of a phrase. */
|
sl@0
|
954 |
short int iColumn; /* Column of the index that must match this term */
|
sl@0
|
955 |
signed char isOr; /* this term is preceded by "OR" */
|
sl@0
|
956 |
signed char isNot; /* this term is preceded by "-" */
|
sl@0
|
957 |
char *pTerm; /* text of the term. '\000' terminated. malloced */
|
sl@0
|
958 |
int nTerm; /* Number of bytes in pTerm[] */
|
sl@0
|
959 |
} QueryTerm;
|
sl@0
|
960 |
|
sl@0
|
961 |
|
sl@0
|
962 |
/* A query string is parsed into a Query structure.
|
sl@0
|
963 |
*
|
sl@0
|
964 |
* We could, in theory, allow query strings to be complicated
|
sl@0
|
965 |
* nested expressions with precedence determined by parentheses.
|
sl@0
|
966 |
* But none of the major search engines do this. (Perhaps the
|
sl@0
|
967 |
* feeling is that an parenthesized expression is two complex of
|
sl@0
|
968 |
* an idea for the average user to grasp.) Taking our lead from
|
sl@0
|
969 |
* the major search engines, we will allow queries to be a list
|
sl@0
|
970 |
* of terms (with an implied AND operator) or phrases in double-quotes,
|
sl@0
|
971 |
* with a single optional "-" before each non-phrase term to designate
|
sl@0
|
972 |
* negation and an optional OR connector.
|
sl@0
|
973 |
*
|
sl@0
|
974 |
* OR binds more tightly than the implied AND, which is what the
|
sl@0
|
975 |
* major search engines seem to do. So, for example:
|
sl@0
|
976 |
*
|
sl@0
|
977 |
* [one two OR three] ==> one AND (two OR three)
|
sl@0
|
978 |
* [one OR two three] ==> (one OR two) AND three
|
sl@0
|
979 |
*
|
sl@0
|
980 |
* A "-" before a term matches all entries that lack that term.
|
sl@0
|
981 |
* The "-" must occur immediately before the term with in intervening
|
sl@0
|
982 |
* space. This is how the search engines do it.
|
sl@0
|
983 |
*
|
sl@0
|
984 |
* A NOT term cannot be the right-hand operand of an OR. If this
|
sl@0
|
985 |
* occurs in the query string, the NOT is ignored:
|
sl@0
|
986 |
*
|
sl@0
|
987 |
* [one OR -two] ==> one OR two
|
sl@0
|
988 |
*
|
sl@0
|
989 |
*/
|
sl@0
|
990 |
typedef struct Query {
|
sl@0
|
991 |
fulltext_vtab *pFts; /* The full text index */
|
sl@0
|
992 |
int nTerms; /* Number of terms in the query */
|
sl@0
|
993 |
QueryTerm *pTerms; /* Array of terms. Space obtained from malloc() */
|
sl@0
|
994 |
int nextIsOr; /* Set the isOr flag on the next inserted term */
|
sl@0
|
995 |
int nextColumn; /* Next word parsed must be in this column */
|
sl@0
|
996 |
int dfltColumn; /* The default column */
|
sl@0
|
997 |
} Query;
|
sl@0
|
998 |
|
sl@0
|
999 |
|
sl@0
|
1000 |
/*
|
sl@0
|
1001 |
** An instance of the following structure keeps track of generated
|
sl@0
|
1002 |
** matching-word offset information and snippets.
|
sl@0
|
1003 |
*/
|
sl@0
|
1004 |
typedef struct Snippet {
|
sl@0
|
1005 |
int nMatch; /* Total number of matches */
|
sl@0
|
1006 |
int nAlloc; /* Space allocated for aMatch[] */
|
sl@0
|
1007 |
struct snippetMatch { /* One entry for each matching term */
|
sl@0
|
1008 |
char snStatus; /* Status flag for use while constructing snippets */
|
sl@0
|
1009 |
short int iCol; /* The column that contains the match */
|
sl@0
|
1010 |
short int iTerm; /* The index in Query.pTerms[] of the matching term */
|
sl@0
|
1011 |
short int nByte; /* Number of bytes in the term */
|
sl@0
|
1012 |
int iStart; /* The offset to the first character of the term */
|
sl@0
|
1013 |
} *aMatch; /* Points to space obtained from malloc */
|
sl@0
|
1014 |
char *zOffset; /* Text rendering of aMatch[] */
|
sl@0
|
1015 |
int nOffset; /* strlen(zOffset) */
|
sl@0
|
1016 |
char *zSnippet; /* Snippet text */
|
sl@0
|
1017 |
int nSnippet; /* strlen(zSnippet) */
|
sl@0
|
1018 |
} Snippet;
|
sl@0
|
1019 |
|
sl@0
|
1020 |
|
sl@0
|
1021 |
typedef enum QueryType {
|
sl@0
|
1022 |
QUERY_GENERIC, /* table scan */
|
sl@0
|
1023 |
QUERY_ROWID, /* lookup by rowid */
|
sl@0
|
1024 |
QUERY_FULLTEXT /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
|
sl@0
|
1025 |
} QueryType;
|
sl@0
|
1026 |
|
sl@0
|
1027 |
/* TODO(shess) CHUNK_MAX controls how much data we allow in segment 0
|
sl@0
|
1028 |
** before we start aggregating into larger segments. Lower CHUNK_MAX
|
sl@0
|
1029 |
** means that for a given input we have more individual segments per
|
sl@0
|
1030 |
** term, which means more rows in the table and a bigger index (due to
|
sl@0
|
1031 |
** both more rows and bigger rowids). But it also reduces the average
|
sl@0
|
1032 |
** cost of adding new elements to the segment 0 doclist, and it seems
|
sl@0
|
1033 |
** to reduce the number of pages read and written during inserts. 256
|
sl@0
|
1034 |
** was chosen by measuring insertion times for a certain input (first
|
sl@0
|
1035 |
** 10k documents of Enron corpus), though including query performance
|
sl@0
|
1036 |
** in the decision may argue for a larger value.
|
sl@0
|
1037 |
*/
|
sl@0
|
1038 |
#define CHUNK_MAX 256
|
sl@0
|
1039 |
|
sl@0
|
1040 |
typedef enum fulltext_statement {
|
sl@0
|
1041 |
CONTENT_INSERT_STMT,
|
sl@0
|
1042 |
CONTENT_SELECT_STMT,
|
sl@0
|
1043 |
CONTENT_UPDATE_STMT,
|
sl@0
|
1044 |
CONTENT_DELETE_STMT,
|
sl@0
|
1045 |
|
sl@0
|
1046 |
TERM_SELECT_STMT,
|
sl@0
|
1047 |
TERM_SELECT_ALL_STMT,
|
sl@0
|
1048 |
TERM_INSERT_STMT,
|
sl@0
|
1049 |
TERM_UPDATE_STMT,
|
sl@0
|
1050 |
TERM_DELETE_STMT,
|
sl@0
|
1051 |
|
sl@0
|
1052 |
MAX_STMT /* Always at end! */
|
sl@0
|
1053 |
} fulltext_statement;
|
sl@0
|
1054 |
|
sl@0
|
1055 |
/* These must exactly match the enum above. */
|
sl@0
|
1056 |
/* TODO(adam): Is there some risk that a statement (in particular,
|
sl@0
|
1057 |
** pTermSelectStmt) will be used in two cursors at once, e.g. if a
|
sl@0
|
1058 |
** query joins a virtual table to itself? If so perhaps we should
|
sl@0
|
1059 |
** move some of these to the cursor object.
|
sl@0
|
1060 |
*/
|
sl@0
|
1061 |
static const char *const fulltext_zStatement[MAX_STMT] = {
|
sl@0
|
1062 |
/* CONTENT_INSERT */ NULL, /* generated in contentInsertStatement() */
|
sl@0
|
1063 |
/* CONTENT_SELECT */ "select * from %_content where rowid = ?",
|
sl@0
|
1064 |
/* CONTENT_UPDATE */ NULL, /* generated in contentUpdateStatement() */
|
sl@0
|
1065 |
/* CONTENT_DELETE */ "delete from %_content where rowid = ?",
|
sl@0
|
1066 |
|
sl@0
|
1067 |
/* TERM_SELECT */
|
sl@0
|
1068 |
"select rowid, doclist from %_term where term = ? and segment = ?",
|
sl@0
|
1069 |
/* TERM_SELECT_ALL */
|
sl@0
|
1070 |
"select doclist from %_term where term = ? order by segment",
|
sl@0
|
1071 |
/* TERM_INSERT */
|
sl@0
|
1072 |
"insert into %_term (rowid, term, segment, doclist) values (?, ?, ?, ?)",
|
sl@0
|
1073 |
/* TERM_UPDATE */ "update %_term set doclist = ? where rowid = ?",
|
sl@0
|
1074 |
/* TERM_DELETE */ "delete from %_term where rowid = ?",
|
sl@0
|
1075 |
};
|
sl@0
|
1076 |
|
sl@0
|
1077 |
/*
|
sl@0
|
1078 |
** A connection to a fulltext index is an instance of the following
|
sl@0
|
1079 |
** structure. The xCreate and xConnect methods create an instance
|
sl@0
|
1080 |
** of this structure and xDestroy and xDisconnect free that instance.
|
sl@0
|
1081 |
** All other methods receive a pointer to the structure as one of their
|
sl@0
|
1082 |
** arguments.
|
sl@0
|
1083 |
*/
|
sl@0
|
1084 |
struct fulltext_vtab {
|
sl@0
|
1085 |
sqlite3_vtab base; /* Base class used by SQLite core */
|
sl@0
|
1086 |
sqlite3 *db; /* The database connection */
|
sl@0
|
1087 |
const char *zDb; /* logical database name */
|
sl@0
|
1088 |
const char *zName; /* virtual table name */
|
sl@0
|
1089 |
int nColumn; /* number of columns in virtual table */
|
sl@0
|
1090 |
char **azColumn; /* column names. malloced */
|
sl@0
|
1091 |
char **azContentColumn; /* column names in content table; malloced */
|
sl@0
|
1092 |
sqlite3_tokenizer *pTokenizer; /* tokenizer for inserts and queries */
|
sl@0
|
1093 |
|
sl@0
|
1094 |
/* Precompiled statements which we keep as long as the table is
|
sl@0
|
1095 |
** open.
|
sl@0
|
1096 |
*/
|
sl@0
|
1097 |
sqlite3_stmt *pFulltextStatements[MAX_STMT];
|
sl@0
|
1098 |
};
|
sl@0
|
1099 |
|
sl@0
|
1100 |
/*
|
sl@0
|
1101 |
** When the core wants to do a query, it create a cursor using a
|
sl@0
|
1102 |
** call to xOpen. This structure is an instance of a cursor. It
|
sl@0
|
1103 |
** is destroyed by xClose.
|
sl@0
|
1104 |
*/
|
sl@0
|
1105 |
typedef struct fulltext_cursor {
|
sl@0
|
1106 |
sqlite3_vtab_cursor base; /* Base class used by SQLite core */
|
sl@0
|
1107 |
QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
|
sl@0
|
1108 |
sqlite3_stmt *pStmt; /* Prepared statement in use by the cursor */
|
sl@0
|
1109 |
int eof; /* True if at End Of Results */
|
sl@0
|
1110 |
Query q; /* Parsed query string */
|
sl@0
|
1111 |
Snippet snippet; /* Cached snippet for the current row */
|
sl@0
|
1112 |
int iColumn; /* Column being searched */
|
sl@0
|
1113 |
DocListReader result; /* used when iCursorType == QUERY_FULLTEXT */
|
sl@0
|
1114 |
} fulltext_cursor;
|
sl@0
|
1115 |
|
sl@0
|
1116 |
static struct fulltext_vtab *cursor_vtab(fulltext_cursor *c){
|
sl@0
|
1117 |
return (fulltext_vtab *) c->base.pVtab;
|
sl@0
|
1118 |
}
|
sl@0
|
1119 |
|
sl@0
|
1120 |
static const sqlite3_module fulltextModule; /* forward declaration */
|
sl@0
|
1121 |
|
sl@0
|
1122 |
/* Append a list of strings separated by commas to a StringBuffer. */
|
sl@0
|
1123 |
static void appendList(StringBuffer *sb, int nString, char **azString){
|
sl@0
|
1124 |
int i;
|
sl@0
|
1125 |
for(i=0; i<nString; ++i){
|
sl@0
|
1126 |
if( i>0 ) append(sb, ", ");
|
sl@0
|
1127 |
append(sb, azString[i]);
|
sl@0
|
1128 |
}
|
sl@0
|
1129 |
}
|
sl@0
|
1130 |
|
sl@0
|
1131 |
/* Return a dynamically generated statement of the form
|
sl@0
|
1132 |
* insert into %_content (rowid, ...) values (?, ...)
|
sl@0
|
1133 |
*/
|
sl@0
|
1134 |
static const char *contentInsertStatement(fulltext_vtab *v){
|
sl@0
|
1135 |
StringBuffer sb;
|
sl@0
|
1136 |
int i;
|
sl@0
|
1137 |
|
sl@0
|
1138 |
initStringBuffer(&sb);
|
sl@0
|
1139 |
append(&sb, "insert into %_content (rowid, ");
|
sl@0
|
1140 |
appendList(&sb, v->nColumn, v->azContentColumn);
|
sl@0
|
1141 |
append(&sb, ") values (?");
|
sl@0
|
1142 |
for(i=0; i<v->nColumn; ++i)
|
sl@0
|
1143 |
append(&sb, ", ?");
|
sl@0
|
1144 |
append(&sb, ")");
|
sl@0
|
1145 |
return sb.s;
|
sl@0
|
1146 |
}
|
sl@0
|
1147 |
|
sl@0
|
1148 |
/* Return a dynamically generated statement of the form
|
sl@0
|
1149 |
* update %_content set [col_0] = ?, [col_1] = ?, ...
|
sl@0
|
1150 |
* where rowid = ?
|
sl@0
|
1151 |
*/
|
sl@0
|
1152 |
static const char *contentUpdateStatement(fulltext_vtab *v){
|
sl@0
|
1153 |
StringBuffer sb;
|
sl@0
|
1154 |
int i;
|
sl@0
|
1155 |
|
sl@0
|
1156 |
initStringBuffer(&sb);
|
sl@0
|
1157 |
append(&sb, "update %_content set ");
|
sl@0
|
1158 |
for(i=0; i<v->nColumn; ++i) {
|
sl@0
|
1159 |
if( i>0 ){
|
sl@0
|
1160 |
append(&sb, ", ");
|
sl@0
|
1161 |
}
|
sl@0
|
1162 |
append(&sb, v->azContentColumn[i]);
|
sl@0
|
1163 |
append(&sb, " = ?");
|
sl@0
|
1164 |
}
|
sl@0
|
1165 |
append(&sb, " where rowid = ?");
|
sl@0
|
1166 |
return sb.s;
|
sl@0
|
1167 |
}
|
sl@0
|
1168 |
|
sl@0
|
1169 |
/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
|
sl@0
|
1170 |
** If the indicated statement has never been prepared, it is prepared
|
sl@0
|
1171 |
** and cached, otherwise the cached version is reset.
|
sl@0
|
1172 |
*/
|
sl@0
|
1173 |
static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
|
sl@0
|
1174 |
sqlite3_stmt **ppStmt){
|
sl@0
|
1175 |
assert( iStmt<MAX_STMT );
|
sl@0
|
1176 |
if( v->pFulltextStatements[iStmt]==NULL ){
|
sl@0
|
1177 |
const char *zStmt;
|
sl@0
|
1178 |
int rc;
|
sl@0
|
1179 |
switch( iStmt ){
|
sl@0
|
1180 |
case CONTENT_INSERT_STMT:
|
sl@0
|
1181 |
zStmt = contentInsertStatement(v); break;
|
sl@0
|
1182 |
case CONTENT_UPDATE_STMT:
|
sl@0
|
1183 |
zStmt = contentUpdateStatement(v); break;
|
sl@0
|
1184 |
default:
|
sl@0
|
1185 |
zStmt = fulltext_zStatement[iStmt];
|
sl@0
|
1186 |
}
|
sl@0
|
1187 |
rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
|
sl@0
|
1188 |
zStmt);
|
sl@0
|
1189 |
if( zStmt != fulltext_zStatement[iStmt]) free((void *) zStmt);
|
sl@0
|
1190 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1191 |
} else {
|
sl@0
|
1192 |
int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
|
sl@0
|
1193 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1194 |
}
|
sl@0
|
1195 |
|
sl@0
|
1196 |
*ppStmt = v->pFulltextStatements[iStmt];
|
sl@0
|
1197 |
return SQLITE_OK;
|
sl@0
|
1198 |
}
|
sl@0
|
1199 |
|
sl@0
|
1200 |
/* Step the indicated statement, handling errors SQLITE_BUSY (by
|
sl@0
|
1201 |
** retrying) and SQLITE_SCHEMA (by re-preparing and transferring
|
sl@0
|
1202 |
** bindings to the new statement).
|
sl@0
|
1203 |
** TODO(adam): We should extend this function so that it can work with
|
sl@0
|
1204 |
** statements declared locally, not only globally cached statements.
|
sl@0
|
1205 |
*/
|
sl@0
|
1206 |
static int sql_step_statement(fulltext_vtab *v, fulltext_statement iStmt,
|
sl@0
|
1207 |
sqlite3_stmt **ppStmt){
|
sl@0
|
1208 |
int rc;
|
sl@0
|
1209 |
sqlite3_stmt *s = *ppStmt;
|
sl@0
|
1210 |
assert( iStmt<MAX_STMT );
|
sl@0
|
1211 |
assert( s==v->pFulltextStatements[iStmt] );
|
sl@0
|
1212 |
|
sl@0
|
1213 |
while( (rc=sqlite3_step(s))!=SQLITE_DONE && rc!=SQLITE_ROW ){
|
sl@0
|
1214 |
if( rc==SQLITE_BUSY ) continue;
|
sl@0
|
1215 |
if( rc!=SQLITE_ERROR ) return rc;
|
sl@0
|
1216 |
|
sl@0
|
1217 |
/* If an SQLITE_SCHEMA error has occured, then finalizing this
|
sl@0
|
1218 |
* statement is going to delete the fulltext_vtab structure. If
|
sl@0
|
1219 |
* the statement just executed is in the pFulltextStatements[]
|
sl@0
|
1220 |
* array, it will be finalized twice. So remove it before
|
sl@0
|
1221 |
* calling sqlite3_finalize().
|
sl@0
|
1222 |
*/
|
sl@0
|
1223 |
v->pFulltextStatements[iStmt] = NULL;
|
sl@0
|
1224 |
rc = sqlite3_finalize(s);
|
sl@0
|
1225 |
break;
|
sl@0
|
1226 |
}
|
sl@0
|
1227 |
return rc;
|
sl@0
|
1228 |
|
sl@0
|
1229 |
err:
|
sl@0
|
1230 |
sqlite3_finalize(s);
|
sl@0
|
1231 |
return rc;
|
sl@0
|
1232 |
}
|
sl@0
|
1233 |
|
sl@0
|
1234 |
/* Like sql_step_statement(), but convert SQLITE_DONE to SQLITE_OK.
|
sl@0
|
1235 |
** Useful for statements like UPDATE, where we expect no results.
|
sl@0
|
1236 |
*/
|
sl@0
|
1237 |
static int sql_single_step_statement(fulltext_vtab *v,
|
sl@0
|
1238 |
fulltext_statement iStmt,
|
sl@0
|
1239 |
sqlite3_stmt **ppStmt){
|
sl@0
|
1240 |
int rc = sql_step_statement(v, iStmt, ppStmt);
|
sl@0
|
1241 |
return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
|
sl@0
|
1242 |
}
|
sl@0
|
1243 |
|
sl@0
|
1244 |
/* insert into %_content (rowid, ...) values ([rowid], [pValues]) */
|
sl@0
|
1245 |
static int content_insert(fulltext_vtab *v, sqlite3_value *rowid,
|
sl@0
|
1246 |
sqlite3_value **pValues){
|
sl@0
|
1247 |
sqlite3_stmt *s;
|
sl@0
|
1248 |
int i;
|
sl@0
|
1249 |
int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
|
sl@0
|
1250 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1251 |
|
sl@0
|
1252 |
rc = sqlite3_bind_value(s, 1, rowid);
|
sl@0
|
1253 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1254 |
|
sl@0
|
1255 |
for(i=0; i<v->nColumn; ++i){
|
sl@0
|
1256 |
rc = sqlite3_bind_value(s, 2+i, pValues[i]);
|
sl@0
|
1257 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1258 |
}
|
sl@0
|
1259 |
|
sl@0
|
1260 |
return sql_single_step_statement(v, CONTENT_INSERT_STMT, &s);
|
sl@0
|
1261 |
}
|
sl@0
|
1262 |
|
sl@0
|
1263 |
/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
|
sl@0
|
1264 |
* where rowid = [iRowid] */
|
sl@0
|
1265 |
static int content_update(fulltext_vtab *v, sqlite3_value **pValues,
|
sl@0
|
1266 |
sqlite_int64 iRowid){
|
sl@0
|
1267 |
sqlite3_stmt *s;
|
sl@0
|
1268 |
int i;
|
sl@0
|
1269 |
int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
|
sl@0
|
1270 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1271 |
|
sl@0
|
1272 |
for(i=0; i<v->nColumn; ++i){
|
sl@0
|
1273 |
rc = sqlite3_bind_value(s, 1+i, pValues[i]);
|
sl@0
|
1274 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1275 |
}
|
sl@0
|
1276 |
|
sl@0
|
1277 |
rc = sqlite3_bind_int64(s, 1+v->nColumn, iRowid);
|
sl@0
|
1278 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1279 |
|
sl@0
|
1280 |
return sql_single_step_statement(v, CONTENT_UPDATE_STMT, &s);
|
sl@0
|
1281 |
}
|
sl@0
|
1282 |
|
sl@0
|
1283 |
static void freeStringArray(int nString, const char **pString){
|
sl@0
|
1284 |
int i;
|
sl@0
|
1285 |
|
sl@0
|
1286 |
for (i=0 ; i < nString ; ++i) {
|
sl@0
|
1287 |
if( pString[i]!=NULL ) free((void *) pString[i]);
|
sl@0
|
1288 |
}
|
sl@0
|
1289 |
free((void *) pString);
|
sl@0
|
1290 |
}
|
sl@0
|
1291 |
|
sl@0
|
1292 |
/* select * from %_content where rowid = [iRow]
|
sl@0
|
1293 |
* The caller must delete the returned array and all strings in it.
|
sl@0
|
1294 |
* null fields will be NULL in the returned array.
|
sl@0
|
1295 |
*
|
sl@0
|
1296 |
* TODO: Perhaps we should return pointer/length strings here for consistency
|
sl@0
|
1297 |
* with other code which uses pointer/length. */
|
sl@0
|
1298 |
static int content_select(fulltext_vtab *v, sqlite_int64 iRow,
|
sl@0
|
1299 |
const char ***pValues){
|
sl@0
|
1300 |
sqlite3_stmt *s;
|
sl@0
|
1301 |
const char **values;
|
sl@0
|
1302 |
int i;
|
sl@0
|
1303 |
int rc;
|
sl@0
|
1304 |
|
sl@0
|
1305 |
*pValues = NULL;
|
sl@0
|
1306 |
|
sl@0
|
1307 |
rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
|
sl@0
|
1308 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1309 |
|
sl@0
|
1310 |
rc = sqlite3_bind_int64(s, 1, iRow);
|
sl@0
|
1311 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1312 |
|
sl@0
|
1313 |
rc = sql_step_statement(v, CONTENT_SELECT_STMT, &s);
|
sl@0
|
1314 |
if( rc!=SQLITE_ROW ) return rc;
|
sl@0
|
1315 |
|
sl@0
|
1316 |
values = (const char **) malloc(v->nColumn * sizeof(const char *));
|
sl@0
|
1317 |
for(i=0; i<v->nColumn; ++i){
|
sl@0
|
1318 |
if( sqlite3_column_type(s, i)==SQLITE_NULL ){
|
sl@0
|
1319 |
values[i] = NULL;
|
sl@0
|
1320 |
}else{
|
sl@0
|
1321 |
values[i] = string_dup((char*)sqlite3_column_text(s, i));
|
sl@0
|
1322 |
}
|
sl@0
|
1323 |
}
|
sl@0
|
1324 |
|
sl@0
|
1325 |
/* We expect only one row. We must execute another sqlite3_step()
|
sl@0
|
1326 |
* to complete the iteration; otherwise the table will remain locked. */
|
sl@0
|
1327 |
rc = sqlite3_step(s);
|
sl@0
|
1328 |
if( rc==SQLITE_DONE ){
|
sl@0
|
1329 |
*pValues = values;
|
sl@0
|
1330 |
return SQLITE_OK;
|
sl@0
|
1331 |
}
|
sl@0
|
1332 |
|
sl@0
|
1333 |
freeStringArray(v->nColumn, values);
|
sl@0
|
1334 |
return rc;
|
sl@0
|
1335 |
}
|
sl@0
|
1336 |
|
sl@0
|
1337 |
/* delete from %_content where rowid = [iRow ] */
|
sl@0
|
1338 |
static int content_delete(fulltext_vtab *v, sqlite_int64 iRow){
|
sl@0
|
1339 |
sqlite3_stmt *s;
|
sl@0
|
1340 |
int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
|
sl@0
|
1341 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1342 |
|
sl@0
|
1343 |
rc = sqlite3_bind_int64(s, 1, iRow);
|
sl@0
|
1344 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1345 |
|
sl@0
|
1346 |
return sql_single_step_statement(v, CONTENT_DELETE_STMT, &s);
|
sl@0
|
1347 |
}
|
sl@0
|
1348 |
|
sl@0
|
1349 |
/* select rowid, doclist from %_term
|
sl@0
|
1350 |
* where term = [pTerm] and segment = [iSegment]
|
sl@0
|
1351 |
* If found, returns SQLITE_ROW; the caller must free the
|
sl@0
|
1352 |
* returned doclist. If no rows found, returns SQLITE_DONE. */
|
sl@0
|
1353 |
static int term_select(fulltext_vtab *v, const char *pTerm, int nTerm,
|
sl@0
|
1354 |
int iSegment,
|
sl@0
|
1355 |
sqlite_int64 *rowid, DocList *out){
|
sl@0
|
1356 |
sqlite3_stmt *s;
|
sl@0
|
1357 |
int rc = sql_get_statement(v, TERM_SELECT_STMT, &s);
|
sl@0
|
1358 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1359 |
|
sl@0
|
1360 |
rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
|
sl@0
|
1361 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1362 |
|
sl@0
|
1363 |
rc = sqlite3_bind_int(s, 2, iSegment);
|
sl@0
|
1364 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1365 |
|
sl@0
|
1366 |
rc = sql_step_statement(v, TERM_SELECT_STMT, &s);
|
sl@0
|
1367 |
if( rc!=SQLITE_ROW ) return rc;
|
sl@0
|
1368 |
|
sl@0
|
1369 |
*rowid = sqlite3_column_int64(s, 0);
|
sl@0
|
1370 |
docListInit(out, DL_DEFAULT,
|
sl@0
|
1371 |
sqlite3_column_blob(s, 1), sqlite3_column_bytes(s, 1));
|
sl@0
|
1372 |
|
sl@0
|
1373 |
/* We expect only one row. We must execute another sqlite3_step()
|
sl@0
|
1374 |
* to complete the iteration; otherwise the table will remain locked. */
|
sl@0
|
1375 |
rc = sqlite3_step(s);
|
sl@0
|
1376 |
return rc==SQLITE_DONE ? SQLITE_ROW : rc;
|
sl@0
|
1377 |
}
|
sl@0
|
1378 |
|
sl@0
|
1379 |
/* Load the segment doclists for term pTerm and merge them in
|
sl@0
|
1380 |
** appropriate order into out. Returns SQLITE_OK if successful. If
|
sl@0
|
1381 |
** there are no segments for pTerm, successfully returns an empty
|
sl@0
|
1382 |
** doclist in out.
|
sl@0
|
1383 |
**
|
sl@0
|
1384 |
** Each document consists of 1 or more "columns". The number of
|
sl@0
|
1385 |
** columns is v->nColumn. If iColumn==v->nColumn, then return
|
sl@0
|
1386 |
** position information about all columns. If iColumn<v->nColumn,
|
sl@0
|
1387 |
** then only return position information about the iColumn-th column
|
sl@0
|
1388 |
** (where the first column is 0).
|
sl@0
|
1389 |
*/
|
sl@0
|
1390 |
static int term_select_all(
|
sl@0
|
1391 |
fulltext_vtab *v, /* The fulltext index we are querying against */
|
sl@0
|
1392 |
int iColumn, /* If <nColumn, only look at the iColumn-th column */
|
sl@0
|
1393 |
const char *pTerm, /* The term whose posting lists we want */
|
sl@0
|
1394 |
int nTerm, /* Number of bytes in pTerm */
|
sl@0
|
1395 |
DocList *out /* Write the resulting doclist here */
|
sl@0
|
1396 |
){
|
sl@0
|
1397 |
DocList doclist;
|
sl@0
|
1398 |
sqlite3_stmt *s;
|
sl@0
|
1399 |
int rc = sql_get_statement(v, TERM_SELECT_ALL_STMT, &s);
|
sl@0
|
1400 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1401 |
|
sl@0
|
1402 |
rc = sqlite3_bind_text(s, 1, pTerm, nTerm, SQLITE_STATIC);
|
sl@0
|
1403 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1404 |
|
sl@0
|
1405 |
docListInit(&doclist, DL_DEFAULT, 0, 0);
|
sl@0
|
1406 |
|
sl@0
|
1407 |
/* TODO(shess) Handle schema and busy errors. */
|
sl@0
|
1408 |
while( (rc=sql_step_statement(v, TERM_SELECT_ALL_STMT, &s))==SQLITE_ROW ){
|
sl@0
|
1409 |
DocList old;
|
sl@0
|
1410 |
|
sl@0
|
1411 |
/* TODO(shess) If we processed doclists from oldest to newest, we
|
sl@0
|
1412 |
** could skip the malloc() involved with the following call. For
|
sl@0
|
1413 |
** now, I'd rather keep this logic similar to index_insert_term().
|
sl@0
|
1414 |
** We could additionally drop elements when we see deletes, but
|
sl@0
|
1415 |
** that would require a distinct version of docListAccumulate().
|
sl@0
|
1416 |
*/
|
sl@0
|
1417 |
docListInit(&old, DL_DEFAULT,
|
sl@0
|
1418 |
sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0));
|
sl@0
|
1419 |
|
sl@0
|
1420 |
if( iColumn<v->nColumn ){ /* querying a single column */
|
sl@0
|
1421 |
docListRestrictColumn(&old, iColumn);
|
sl@0
|
1422 |
}
|
sl@0
|
1423 |
|
sl@0
|
1424 |
/* doclist contains the newer data, so write it over old. Then
|
sl@0
|
1425 |
** steal accumulated result for doclist.
|
sl@0
|
1426 |
*/
|
sl@0
|
1427 |
docListAccumulate(&old, &doclist);
|
sl@0
|
1428 |
docListDestroy(&doclist);
|
sl@0
|
1429 |
doclist = old;
|
sl@0
|
1430 |
}
|
sl@0
|
1431 |
if( rc!=SQLITE_DONE ){
|
sl@0
|
1432 |
docListDestroy(&doclist);
|
sl@0
|
1433 |
return rc;
|
sl@0
|
1434 |
}
|
sl@0
|
1435 |
|
sl@0
|
1436 |
docListDiscardEmpty(&doclist);
|
sl@0
|
1437 |
*out = doclist;
|
sl@0
|
1438 |
return SQLITE_OK;
|
sl@0
|
1439 |
}
|
sl@0
|
1440 |
|
sl@0
|
1441 |
/* insert into %_term (rowid, term, segment, doclist)
|
sl@0
|
1442 |
values ([piRowid], [pTerm], [iSegment], [doclist])
|
sl@0
|
1443 |
** Lets sqlite select rowid if piRowid is NULL, else uses *piRowid.
|
sl@0
|
1444 |
**
|
sl@0
|
1445 |
** NOTE(shess) piRowid is IN, with values of "space of int64" plus
|
sl@0
|
1446 |
** null, it is not used to pass data back to the caller.
|
sl@0
|
1447 |
*/
|
sl@0
|
1448 |
static int term_insert(fulltext_vtab *v, sqlite_int64 *piRowid,
|
sl@0
|
1449 |
const char *pTerm, int nTerm,
|
sl@0
|
1450 |
int iSegment, DocList *doclist){
|
sl@0
|
1451 |
sqlite3_stmt *s;
|
sl@0
|
1452 |
int rc = sql_get_statement(v, TERM_INSERT_STMT, &s);
|
sl@0
|
1453 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1454 |
|
sl@0
|
1455 |
if( piRowid==NULL ){
|
sl@0
|
1456 |
rc = sqlite3_bind_null(s, 1);
|
sl@0
|
1457 |
}else{
|
sl@0
|
1458 |
rc = sqlite3_bind_int64(s, 1, *piRowid);
|
sl@0
|
1459 |
}
|
sl@0
|
1460 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1461 |
|
sl@0
|
1462 |
rc = sqlite3_bind_text(s, 2, pTerm, nTerm, SQLITE_STATIC);
|
sl@0
|
1463 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1464 |
|
sl@0
|
1465 |
rc = sqlite3_bind_int(s, 3, iSegment);
|
sl@0
|
1466 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1467 |
|
sl@0
|
1468 |
rc = sqlite3_bind_blob(s, 4, doclist->pData, doclist->nData, SQLITE_STATIC);
|
sl@0
|
1469 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1470 |
|
sl@0
|
1471 |
return sql_single_step_statement(v, TERM_INSERT_STMT, &s);
|
sl@0
|
1472 |
}
|
sl@0
|
1473 |
|
sl@0
|
1474 |
/* update %_term set doclist = [doclist] where rowid = [rowid] */
|
sl@0
|
1475 |
static int term_update(fulltext_vtab *v, sqlite_int64 rowid,
|
sl@0
|
1476 |
DocList *doclist){
|
sl@0
|
1477 |
sqlite3_stmt *s;
|
sl@0
|
1478 |
int rc = sql_get_statement(v, TERM_UPDATE_STMT, &s);
|
sl@0
|
1479 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1480 |
|
sl@0
|
1481 |
rc = sqlite3_bind_blob(s, 1, doclist->pData, doclist->nData, SQLITE_STATIC);
|
sl@0
|
1482 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1483 |
|
sl@0
|
1484 |
rc = sqlite3_bind_int64(s, 2, rowid);
|
sl@0
|
1485 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1486 |
|
sl@0
|
1487 |
return sql_single_step_statement(v, TERM_UPDATE_STMT, &s);
|
sl@0
|
1488 |
}
|
sl@0
|
1489 |
|
sl@0
|
1490 |
static int term_delete(fulltext_vtab *v, sqlite_int64 rowid){
|
sl@0
|
1491 |
sqlite3_stmt *s;
|
sl@0
|
1492 |
int rc = sql_get_statement(v, TERM_DELETE_STMT, &s);
|
sl@0
|
1493 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1494 |
|
sl@0
|
1495 |
rc = sqlite3_bind_int64(s, 1, rowid);
|
sl@0
|
1496 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
1497 |
|
sl@0
|
1498 |
return sql_single_step_statement(v, TERM_DELETE_STMT, &s);
|
sl@0
|
1499 |
}
|
sl@0
|
1500 |
|
sl@0
|
1501 |
/*
|
sl@0
|
1502 |
** Free the memory used to contain a fulltext_vtab structure.
|
sl@0
|
1503 |
*/
|
sl@0
|
1504 |
static void fulltext_vtab_destroy(fulltext_vtab *v){
|
sl@0
|
1505 |
int iStmt, i;
|
sl@0
|
1506 |
|
sl@0
|
1507 |
TRACE(("FTS1 Destroy %p\n", v));
|
sl@0
|
1508 |
for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
|
sl@0
|
1509 |
if( v->pFulltextStatements[iStmt]!=NULL ){
|
sl@0
|
1510 |
sqlite3_finalize(v->pFulltextStatements[iStmt]);
|
sl@0
|
1511 |
v->pFulltextStatements[iStmt] = NULL;
|
sl@0
|
1512 |
}
|
sl@0
|
1513 |
}
|
sl@0
|
1514 |
|
sl@0
|
1515 |
if( v->pTokenizer!=NULL ){
|
sl@0
|
1516 |
v->pTokenizer->pModule->xDestroy(v->pTokenizer);
|
sl@0
|
1517 |
v->pTokenizer = NULL;
|
sl@0
|
1518 |
}
|
sl@0
|
1519 |
|
sl@0
|
1520 |
free(v->azColumn);
|
sl@0
|
1521 |
for(i = 0; i < v->nColumn; ++i) {
|
sl@0
|
1522 |
sqlite3_free(v->azContentColumn[i]);
|
sl@0
|
1523 |
}
|
sl@0
|
1524 |
free(v->azContentColumn);
|
sl@0
|
1525 |
free(v);
|
sl@0
|
1526 |
}
|
sl@0
|
1527 |
|
sl@0
|
1528 |
/*
|
sl@0
|
1529 |
** Token types for parsing the arguments to xConnect or xCreate.
|
sl@0
|
1530 |
*/
|
sl@0
|
1531 |
#define TOKEN_EOF 0 /* End of file */
|
sl@0
|
1532 |
#define TOKEN_SPACE 1 /* Any kind of whitespace */
|
sl@0
|
1533 |
#define TOKEN_ID 2 /* An identifier */
|
sl@0
|
1534 |
#define TOKEN_STRING 3 /* A string literal */
|
sl@0
|
1535 |
#define TOKEN_PUNCT 4 /* A single punctuation character */
|
sl@0
|
1536 |
|
sl@0
|
1537 |
/*
|
sl@0
|
1538 |
** If X is a character that can be used in an identifier then
|
sl@0
|
1539 |
** IdChar(X) will be true. Otherwise it is false.
|
sl@0
|
1540 |
**
|
sl@0
|
1541 |
** For ASCII, any character with the high-order bit set is
|
sl@0
|
1542 |
** allowed in an identifier. For 7-bit characters,
|
sl@0
|
1543 |
** sqlite3IsIdChar[X] must be 1.
|
sl@0
|
1544 |
**
|
sl@0
|
1545 |
** Ticket #1066. the SQL standard does not allow '$' in the
|
sl@0
|
1546 |
** middle of identfiers. But many SQL implementations do.
|
sl@0
|
1547 |
** SQLite will allow '$' in identifiers for compatibility.
|
sl@0
|
1548 |
** But the feature is undocumented.
|
sl@0
|
1549 |
*/
|
sl@0
|
1550 |
static const char isIdChar[] = {
|
sl@0
|
1551 |
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
|
sl@0
|
1552 |
0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 2x */
|
sl@0
|
1553 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, /* 3x */
|
sl@0
|
1554 |
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 4x */
|
sl@0
|
1555 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, /* 5x */
|
sl@0
|
1556 |
0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 6x */
|
sl@0
|
1557 |
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, /* 7x */
|
sl@0
|
1558 |
};
|
sl@0
|
1559 |
#define IdChar(C) (((c=C)&0x80)!=0 || (c>0x1f && isIdChar[c-0x20]))
|
sl@0
|
1560 |
|
sl@0
|
1561 |
|
sl@0
|
1562 |
/*
|
sl@0
|
1563 |
** Return the length of the token that begins at z[0].
|
sl@0
|
1564 |
** Store the token type in *tokenType before returning.
|
sl@0
|
1565 |
*/
|
sl@0
|
1566 |
static int getToken(const char *z, int *tokenType){
|
sl@0
|
1567 |
int i, c;
|
sl@0
|
1568 |
switch( *z ){
|
sl@0
|
1569 |
case 0: {
|
sl@0
|
1570 |
*tokenType = TOKEN_EOF;
|
sl@0
|
1571 |
return 0;
|
sl@0
|
1572 |
}
|
sl@0
|
1573 |
case ' ': case '\t': case '\n': case '\f': case '\r': {
|
sl@0
|
1574 |
for(i=1; safe_isspace(z[i]); i++){}
|
sl@0
|
1575 |
*tokenType = TOKEN_SPACE;
|
sl@0
|
1576 |
return i;
|
sl@0
|
1577 |
}
|
sl@0
|
1578 |
case '`':
|
sl@0
|
1579 |
case '\'':
|
sl@0
|
1580 |
case '"': {
|
sl@0
|
1581 |
int delim = z[0];
|
sl@0
|
1582 |
for(i=1; (c=z[i])!=0; i++){
|
sl@0
|
1583 |
if( c==delim ){
|
sl@0
|
1584 |
if( z[i+1]==delim ){
|
sl@0
|
1585 |
i++;
|
sl@0
|
1586 |
}else{
|
sl@0
|
1587 |
break;
|
sl@0
|
1588 |
}
|
sl@0
|
1589 |
}
|
sl@0
|
1590 |
}
|
sl@0
|
1591 |
*tokenType = TOKEN_STRING;
|
sl@0
|
1592 |
return i + (c!=0);
|
sl@0
|
1593 |
}
|
sl@0
|
1594 |
case '[': {
|
sl@0
|
1595 |
for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
|
sl@0
|
1596 |
*tokenType = TOKEN_ID;
|
sl@0
|
1597 |
return i;
|
sl@0
|
1598 |
}
|
sl@0
|
1599 |
default: {
|
sl@0
|
1600 |
if( !IdChar(*z) ){
|
sl@0
|
1601 |
break;
|
sl@0
|
1602 |
}
|
sl@0
|
1603 |
for(i=1; IdChar(z[i]); i++){}
|
sl@0
|
1604 |
*tokenType = TOKEN_ID;
|
sl@0
|
1605 |
return i;
|
sl@0
|
1606 |
}
|
sl@0
|
1607 |
}
|
sl@0
|
1608 |
*tokenType = TOKEN_PUNCT;
|
sl@0
|
1609 |
return 1;
|
sl@0
|
1610 |
}
|
sl@0
|
1611 |
|
sl@0
|
1612 |
/*
|
sl@0
|
1613 |
** A token extracted from a string is an instance of the following
|
sl@0
|
1614 |
** structure.
|
sl@0
|
1615 |
*/
|
sl@0
|
1616 |
typedef struct Token {
|
sl@0
|
1617 |
const char *z; /* Pointer to token text. Not '\000' terminated */
|
sl@0
|
1618 |
short int n; /* Length of the token text in bytes. */
|
sl@0
|
1619 |
} Token;
|
sl@0
|
1620 |
|
sl@0
|
1621 |
/*
|
sl@0
|
1622 |
** Given a input string (which is really one of the argv[] parameters
|
sl@0
|
1623 |
** passed into xConnect or xCreate) split the string up into tokens.
|
sl@0
|
1624 |
** Return an array of pointers to '\000' terminated strings, one string
|
sl@0
|
1625 |
** for each non-whitespace token.
|
sl@0
|
1626 |
**
|
sl@0
|
1627 |
** The returned array is terminated by a single NULL pointer.
|
sl@0
|
1628 |
**
|
sl@0
|
1629 |
** Space to hold the returned array is obtained from a single
|
sl@0
|
1630 |
** malloc and should be freed by passing the return value to free().
|
sl@0
|
1631 |
** The individual strings within the token list are all a part of
|
sl@0
|
1632 |
** the single memory allocation and will all be freed at once.
|
sl@0
|
1633 |
*/
|
sl@0
|
1634 |
static char **tokenizeString(const char *z, int *pnToken){
|
sl@0
|
1635 |
int nToken = 0;
|
sl@0
|
1636 |
Token *aToken = malloc( strlen(z) * sizeof(aToken[0]) );
|
sl@0
|
1637 |
int n = 1;
|
sl@0
|
1638 |
int e, i;
|
sl@0
|
1639 |
int totalSize = 0;
|
sl@0
|
1640 |
char **azToken;
|
sl@0
|
1641 |
char *zCopy;
|
sl@0
|
1642 |
while( n>0 ){
|
sl@0
|
1643 |
n = getToken(z, &e);
|
sl@0
|
1644 |
if( e!=TOKEN_SPACE ){
|
sl@0
|
1645 |
aToken[nToken].z = z;
|
sl@0
|
1646 |
aToken[nToken].n = n;
|
sl@0
|
1647 |
nToken++;
|
sl@0
|
1648 |
totalSize += n+1;
|
sl@0
|
1649 |
}
|
sl@0
|
1650 |
z += n;
|
sl@0
|
1651 |
}
|
sl@0
|
1652 |
azToken = (char**)malloc( nToken*sizeof(char*) + totalSize );
|
sl@0
|
1653 |
zCopy = (char*)&azToken[nToken];
|
sl@0
|
1654 |
nToken--;
|
sl@0
|
1655 |
for(i=0; i<nToken; i++){
|
sl@0
|
1656 |
azToken[i] = zCopy;
|
sl@0
|
1657 |
n = aToken[i].n;
|
sl@0
|
1658 |
memcpy(zCopy, aToken[i].z, n);
|
sl@0
|
1659 |
zCopy[n] = 0;
|
sl@0
|
1660 |
zCopy += n+1;
|
sl@0
|
1661 |
}
|
sl@0
|
1662 |
azToken[nToken] = 0;
|
sl@0
|
1663 |
free(aToken);
|
sl@0
|
1664 |
*pnToken = nToken;
|
sl@0
|
1665 |
return azToken;
|
sl@0
|
1666 |
}
|
sl@0
|
1667 |
|
sl@0
|
1668 |
/*
|
sl@0
|
1669 |
** Convert an SQL-style quoted string into a normal string by removing
|
sl@0
|
1670 |
** the quote characters. The conversion is done in-place. If the
|
sl@0
|
1671 |
** input does not begin with a quote character, then this routine
|
sl@0
|
1672 |
** is a no-op.
|
sl@0
|
1673 |
**
|
sl@0
|
1674 |
** Examples:
|
sl@0
|
1675 |
**
|
sl@0
|
1676 |
** "abc" becomes abc
|
sl@0
|
1677 |
** 'xyz' becomes xyz
|
sl@0
|
1678 |
** [pqr] becomes pqr
|
sl@0
|
1679 |
** `mno` becomes mno
|
sl@0
|
1680 |
*/
|
sl@0
|
1681 |
static void dequoteString(char *z){
|
sl@0
|
1682 |
int quote;
|
sl@0
|
1683 |
int i, j;
|
sl@0
|
1684 |
if( z==0 ) return;
|
sl@0
|
1685 |
quote = z[0];
|
sl@0
|
1686 |
switch( quote ){
|
sl@0
|
1687 |
case '\'': break;
|
sl@0
|
1688 |
case '"': break;
|
sl@0
|
1689 |
case '`': break; /* For MySQL compatibility */
|
sl@0
|
1690 |
case '[': quote = ']'; break; /* For MS SqlServer compatibility */
|
sl@0
|
1691 |
default: return;
|
sl@0
|
1692 |
}
|
sl@0
|
1693 |
for(i=1, j=0; z[i]; i++){
|
sl@0
|
1694 |
if( z[i]==quote ){
|
sl@0
|
1695 |
if( z[i+1]==quote ){
|
sl@0
|
1696 |
z[j++] = quote;
|
sl@0
|
1697 |
i++;
|
sl@0
|
1698 |
}else{
|
sl@0
|
1699 |
z[j++] = 0;
|
sl@0
|
1700 |
break;
|
sl@0
|
1701 |
}
|
sl@0
|
1702 |
}else{
|
sl@0
|
1703 |
z[j++] = z[i];
|
sl@0
|
1704 |
}
|
sl@0
|
1705 |
}
|
sl@0
|
1706 |
}
|
sl@0
|
1707 |
|
sl@0
|
1708 |
/*
|
sl@0
|
1709 |
** The input azIn is a NULL-terminated list of tokens. Remove the first
|
sl@0
|
1710 |
** token and all punctuation tokens. Remove the quotes from
|
sl@0
|
1711 |
** around string literal tokens.
|
sl@0
|
1712 |
**
|
sl@0
|
1713 |
** Example:
|
sl@0
|
1714 |
**
|
sl@0
|
1715 |
** input: tokenize chinese ( 'simplifed' , 'mixed' )
|
sl@0
|
1716 |
** output: chinese simplifed mixed
|
sl@0
|
1717 |
**
|
sl@0
|
1718 |
** Another example:
|
sl@0
|
1719 |
**
|
sl@0
|
1720 |
** input: delimiters ( '[' , ']' , '...' )
|
sl@0
|
1721 |
** output: [ ] ...
|
sl@0
|
1722 |
*/
|
sl@0
|
1723 |
static void tokenListToIdList(char **azIn){
|
sl@0
|
1724 |
int i, j;
|
sl@0
|
1725 |
if( azIn ){
|
sl@0
|
1726 |
for(i=0, j=-1; azIn[i]; i++){
|
sl@0
|
1727 |
if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
|
sl@0
|
1728 |
dequoteString(azIn[i]);
|
sl@0
|
1729 |
if( j>=0 ){
|
sl@0
|
1730 |
azIn[j] = azIn[i];
|
sl@0
|
1731 |
}
|
sl@0
|
1732 |
j++;
|
sl@0
|
1733 |
}
|
sl@0
|
1734 |
}
|
sl@0
|
1735 |
azIn[j] = 0;
|
sl@0
|
1736 |
}
|
sl@0
|
1737 |
}
|
sl@0
|
1738 |
|
sl@0
|
1739 |
|
sl@0
|
1740 |
/*
|
sl@0
|
1741 |
** Find the first alphanumeric token in the string zIn. Null-terminate
|
sl@0
|
1742 |
** this token. Remove any quotation marks. And return a pointer to
|
sl@0
|
1743 |
** the result.
|
sl@0
|
1744 |
*/
|
sl@0
|
1745 |
static char *firstToken(char *zIn, char **pzTail){
|
sl@0
|
1746 |
int n, ttype;
|
sl@0
|
1747 |
while(1){
|
sl@0
|
1748 |
n = getToken(zIn, &ttype);
|
sl@0
|
1749 |
if( ttype==TOKEN_SPACE ){
|
sl@0
|
1750 |
zIn += n;
|
sl@0
|
1751 |
}else if( ttype==TOKEN_EOF ){
|
sl@0
|
1752 |
*pzTail = zIn;
|
sl@0
|
1753 |
return 0;
|
sl@0
|
1754 |
}else{
|
sl@0
|
1755 |
zIn[n] = 0;
|
sl@0
|
1756 |
*pzTail = &zIn[1];
|
sl@0
|
1757 |
dequoteString(zIn);
|
sl@0
|
1758 |
return zIn;
|
sl@0
|
1759 |
}
|
sl@0
|
1760 |
}
|
sl@0
|
1761 |
/*NOTREACHED*/
|
sl@0
|
1762 |
}
|
sl@0
|
1763 |
|
sl@0
|
1764 |
/* Return true if...
|
sl@0
|
1765 |
**
|
sl@0
|
1766 |
** * s begins with the string t, ignoring case
|
sl@0
|
1767 |
** * s is longer than t
|
sl@0
|
1768 |
** * The first character of s beyond t is not a alphanumeric
|
sl@0
|
1769 |
**
|
sl@0
|
1770 |
** Ignore leading space in *s.
|
sl@0
|
1771 |
**
|
sl@0
|
1772 |
** To put it another way, return true if the first token of
|
sl@0
|
1773 |
** s[] is t[].
|
sl@0
|
1774 |
*/
|
sl@0
|
1775 |
static int startsWith(const char *s, const char *t){
|
sl@0
|
1776 |
while( safe_isspace(*s) ){ s++; }
|
sl@0
|
1777 |
while( *t ){
|
sl@0
|
1778 |
if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
|
sl@0
|
1779 |
}
|
sl@0
|
1780 |
return *s!='_' && !safe_isalnum(*s);
|
sl@0
|
1781 |
}
|
sl@0
|
1782 |
|
sl@0
|
1783 |
/*
|
sl@0
|
1784 |
** An instance of this structure defines the "spec" of a
|
sl@0
|
1785 |
** full text index. This structure is populated by parseSpec
|
sl@0
|
1786 |
** and use by fulltextConnect and fulltextCreate.
|
sl@0
|
1787 |
*/
|
sl@0
|
1788 |
typedef struct TableSpec {
|
sl@0
|
1789 |
const char *zDb; /* Logical database name */
|
sl@0
|
1790 |
const char *zName; /* Name of the full-text index */
|
sl@0
|
1791 |
int nColumn; /* Number of columns to be indexed */
|
sl@0
|
1792 |
char **azColumn; /* Original names of columns to be indexed */
|
sl@0
|
1793 |
char **azContentColumn; /* Column names for %_content */
|
sl@0
|
1794 |
char **azTokenizer; /* Name of tokenizer and its arguments */
|
sl@0
|
1795 |
} TableSpec;
|
sl@0
|
1796 |
|
sl@0
|
1797 |
/*
|
sl@0
|
1798 |
** Reclaim all of the memory used by a TableSpec
|
sl@0
|
1799 |
*/
|
sl@0
|
1800 |
static void clearTableSpec(TableSpec *p) {
|
sl@0
|
1801 |
free(p->azColumn);
|
sl@0
|
1802 |
free(p->azContentColumn);
|
sl@0
|
1803 |
free(p->azTokenizer);
|
sl@0
|
1804 |
}
|
sl@0
|
1805 |
|
sl@0
|
1806 |
/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
|
sl@0
|
1807 |
*
|
sl@0
|
1808 |
* CREATE VIRTUAL TABLE email
|
sl@0
|
1809 |
* USING fts1(subject, body, tokenize mytokenizer(myarg))
|
sl@0
|
1810 |
*
|
sl@0
|
1811 |
* We return parsed information in a TableSpec structure.
|
sl@0
|
1812 |
*
|
sl@0
|
1813 |
*/
|
sl@0
|
1814 |
static int parseSpec(TableSpec *pSpec, int argc, const char *const*argv,
|
sl@0
|
1815 |
char**pzErr){
|
sl@0
|
1816 |
int i, n;
|
sl@0
|
1817 |
char *z, *zDummy;
|
sl@0
|
1818 |
char **azArg;
|
sl@0
|
1819 |
const char *zTokenizer = 0; /* argv[] entry describing the tokenizer */
|
sl@0
|
1820 |
|
sl@0
|
1821 |
assert( argc>=3 );
|
sl@0
|
1822 |
/* Current interface:
|
sl@0
|
1823 |
** argv[0] - module name
|
sl@0
|
1824 |
** argv[1] - database name
|
sl@0
|
1825 |
** argv[2] - table name
|
sl@0
|
1826 |
** argv[3..] - columns, optionally followed by tokenizer specification
|
sl@0
|
1827 |
** and snippet delimiters specification.
|
sl@0
|
1828 |
*/
|
sl@0
|
1829 |
|
sl@0
|
1830 |
/* Make a copy of the complete argv[][] array in a single allocation.
|
sl@0
|
1831 |
** The argv[][] array is read-only and transient. We can write to the
|
sl@0
|
1832 |
** copy in order to modify things and the copy is persistent.
|
sl@0
|
1833 |
*/
|
sl@0
|
1834 |
memset(pSpec, 0, sizeof(*pSpec));
|
sl@0
|
1835 |
for(i=n=0; i<argc; i++){
|
sl@0
|
1836 |
n += strlen(argv[i]) + 1;
|
sl@0
|
1837 |
}
|
sl@0
|
1838 |
azArg = malloc( sizeof(char*)*argc + n );
|
sl@0
|
1839 |
if( azArg==0 ){
|
sl@0
|
1840 |
return SQLITE_NOMEM;
|
sl@0
|
1841 |
}
|
sl@0
|
1842 |
z = (char*)&azArg[argc];
|
sl@0
|
1843 |
for(i=0; i<argc; i++){
|
sl@0
|
1844 |
azArg[i] = z;
|
sl@0
|
1845 |
strcpy(z, argv[i]);
|
sl@0
|
1846 |
z += strlen(z)+1;
|
sl@0
|
1847 |
}
|
sl@0
|
1848 |
|
sl@0
|
1849 |
/* Identify the column names and the tokenizer and delimiter arguments
|
sl@0
|
1850 |
** in the argv[][] array.
|
sl@0
|
1851 |
*/
|
sl@0
|
1852 |
pSpec->zDb = azArg[1];
|
sl@0
|
1853 |
pSpec->zName = azArg[2];
|
sl@0
|
1854 |
pSpec->nColumn = 0;
|
sl@0
|
1855 |
pSpec->azColumn = azArg;
|
sl@0
|
1856 |
zTokenizer = "tokenize simple";
|
sl@0
|
1857 |
for(i=3; i<argc; ++i){
|
sl@0
|
1858 |
if( startsWith(azArg[i],"tokenize") ){
|
sl@0
|
1859 |
zTokenizer = azArg[i];
|
sl@0
|
1860 |
}else{
|
sl@0
|
1861 |
z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
|
sl@0
|
1862 |
pSpec->nColumn++;
|
sl@0
|
1863 |
}
|
sl@0
|
1864 |
}
|
sl@0
|
1865 |
if( pSpec->nColumn==0 ){
|
sl@0
|
1866 |
azArg[0] = "content";
|
sl@0
|
1867 |
pSpec->nColumn = 1;
|
sl@0
|
1868 |
}
|
sl@0
|
1869 |
|
sl@0
|
1870 |
/*
|
sl@0
|
1871 |
** Construct the list of content column names.
|
sl@0
|
1872 |
**
|
sl@0
|
1873 |
** Each content column name will be of the form cNNAAAA
|
sl@0
|
1874 |
** where NN is the column number and AAAA is the sanitized
|
sl@0
|
1875 |
** column name. "sanitized" means that special characters are
|
sl@0
|
1876 |
** converted to "_". The cNN prefix guarantees that all column
|
sl@0
|
1877 |
** names are unique.
|
sl@0
|
1878 |
**
|
sl@0
|
1879 |
** The AAAA suffix is not strictly necessary. It is included
|
sl@0
|
1880 |
** for the convenience of people who might examine the generated
|
sl@0
|
1881 |
** %_content table and wonder what the columns are used for.
|
sl@0
|
1882 |
*/
|
sl@0
|
1883 |
pSpec->azContentColumn = malloc( pSpec->nColumn * sizeof(char *) );
|
sl@0
|
1884 |
if( pSpec->azContentColumn==0 ){
|
sl@0
|
1885 |
clearTableSpec(pSpec);
|
sl@0
|
1886 |
return SQLITE_NOMEM;
|
sl@0
|
1887 |
}
|
sl@0
|
1888 |
for(i=0; i<pSpec->nColumn; i++){
|
sl@0
|
1889 |
char *p;
|
sl@0
|
1890 |
pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
|
sl@0
|
1891 |
for (p = pSpec->azContentColumn[i]; *p ; ++p) {
|
sl@0
|
1892 |
if( !safe_isalnum(*p) ) *p = '_';
|
sl@0
|
1893 |
}
|
sl@0
|
1894 |
}
|
sl@0
|
1895 |
|
sl@0
|
1896 |
/*
|
sl@0
|
1897 |
** Parse the tokenizer specification string.
|
sl@0
|
1898 |
*/
|
sl@0
|
1899 |
pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
|
sl@0
|
1900 |
tokenListToIdList(pSpec->azTokenizer);
|
sl@0
|
1901 |
|
sl@0
|
1902 |
return SQLITE_OK;
|
sl@0
|
1903 |
}
|
sl@0
|
1904 |
|
sl@0
|
1905 |
/*
|
sl@0
|
1906 |
** Generate a CREATE TABLE statement that describes the schema of
|
sl@0
|
1907 |
** the virtual table. Return a pointer to this schema string.
|
sl@0
|
1908 |
**
|
sl@0
|
1909 |
** Space is obtained from sqlite3_mprintf() and should be freed
|
sl@0
|
1910 |
** using sqlite3_free().
|
sl@0
|
1911 |
*/
|
sl@0
|
1912 |
static char *fulltextSchema(
|
sl@0
|
1913 |
int nColumn, /* Number of columns */
|
sl@0
|
1914 |
const char *const* azColumn, /* List of columns */
|
sl@0
|
1915 |
const char *zTableName /* Name of the table */
|
sl@0
|
1916 |
){
|
sl@0
|
1917 |
int i;
|
sl@0
|
1918 |
char *zSchema, *zNext;
|
sl@0
|
1919 |
const char *zSep = "(";
|
sl@0
|
1920 |
zSchema = sqlite3_mprintf("CREATE TABLE x");
|
sl@0
|
1921 |
for(i=0; i<nColumn; i++){
|
sl@0
|
1922 |
zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
|
sl@0
|
1923 |
sqlite3_free(zSchema);
|
sl@0
|
1924 |
zSchema = zNext;
|
sl@0
|
1925 |
zSep = ",";
|
sl@0
|
1926 |
}
|
sl@0
|
1927 |
zNext = sqlite3_mprintf("%s,%Q)", zSchema, zTableName);
|
sl@0
|
1928 |
sqlite3_free(zSchema);
|
sl@0
|
1929 |
return zNext;
|
sl@0
|
1930 |
}
|
sl@0
|
1931 |
|
sl@0
|
1932 |
/*
|
sl@0
|
1933 |
** Build a new sqlite3_vtab structure that will describe the
|
sl@0
|
1934 |
** fulltext index defined by spec.
|
sl@0
|
1935 |
*/
|
sl@0
|
1936 |
static int constructVtab(
|
sl@0
|
1937 |
sqlite3 *db, /* The SQLite database connection */
|
sl@0
|
1938 |
TableSpec *spec, /* Parsed spec information from parseSpec() */
|
sl@0
|
1939 |
sqlite3_vtab **ppVTab, /* Write the resulting vtab structure here */
|
sl@0
|
1940 |
char **pzErr /* Write any error message here */
|
sl@0
|
1941 |
){
|
sl@0
|
1942 |
int rc;
|
sl@0
|
1943 |
int n;
|
sl@0
|
1944 |
fulltext_vtab *v = 0;
|
sl@0
|
1945 |
const sqlite3_tokenizer_module *m = NULL;
|
sl@0
|
1946 |
char *schema;
|
sl@0
|
1947 |
|
sl@0
|
1948 |
v = (fulltext_vtab *) malloc(sizeof(fulltext_vtab));
|
sl@0
|
1949 |
if( v==0 ) return SQLITE_NOMEM;
|
sl@0
|
1950 |
memset(v, 0, sizeof(*v));
|
sl@0
|
1951 |
/* sqlite will initialize v->base */
|
sl@0
|
1952 |
v->db = db;
|
sl@0
|
1953 |
v->zDb = spec->zDb; /* Freed when azColumn is freed */
|
sl@0
|
1954 |
v->zName = spec->zName; /* Freed when azColumn is freed */
|
sl@0
|
1955 |
v->nColumn = spec->nColumn;
|
sl@0
|
1956 |
v->azContentColumn = spec->azContentColumn;
|
sl@0
|
1957 |
spec->azContentColumn = 0;
|
sl@0
|
1958 |
v->azColumn = spec->azColumn;
|
sl@0
|
1959 |
spec->azColumn = 0;
|
sl@0
|
1960 |
|
sl@0
|
1961 |
if( spec->azTokenizer==0 ){
|
sl@0
|
1962 |
return SQLITE_NOMEM;
|
sl@0
|
1963 |
}
|
sl@0
|
1964 |
/* TODO(shess) For now, add new tokenizers as else if clauses. */
|
sl@0
|
1965 |
if( spec->azTokenizer[0]==0 || startsWith(spec->azTokenizer[0], "simple") ){
|
sl@0
|
1966 |
sqlite3Fts1SimpleTokenizerModule(&m);
|
sl@0
|
1967 |
}else if( startsWith(spec->azTokenizer[0], "porter") ){
|
sl@0
|
1968 |
sqlite3Fts1PorterTokenizerModule(&m);
|
sl@0
|
1969 |
}else{
|
sl@0
|
1970 |
*pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
|
sl@0
|
1971 |
rc = SQLITE_ERROR;
|
sl@0
|
1972 |
goto err;
|
sl@0
|
1973 |
}
|
sl@0
|
1974 |
for(n=0; spec->azTokenizer[n]; n++){}
|
sl@0
|
1975 |
if( n ){
|
sl@0
|
1976 |
rc = m->xCreate(n-1, (const char*const*)&spec->azTokenizer[1],
|
sl@0
|
1977 |
&v->pTokenizer);
|
sl@0
|
1978 |
}else{
|
sl@0
|
1979 |
rc = m->xCreate(0, 0, &v->pTokenizer);
|
sl@0
|
1980 |
}
|
sl@0
|
1981 |
if( rc!=SQLITE_OK ) goto err;
|
sl@0
|
1982 |
v->pTokenizer->pModule = m;
|
sl@0
|
1983 |
|
sl@0
|
1984 |
/* TODO: verify the existence of backing tables foo_content, foo_term */
|
sl@0
|
1985 |
|
sl@0
|
1986 |
schema = fulltextSchema(v->nColumn, (const char*const*)v->azColumn,
|
sl@0
|
1987 |
spec->zName);
|
sl@0
|
1988 |
rc = sqlite3_declare_vtab(db, schema);
|
sl@0
|
1989 |
sqlite3_free(schema);
|
sl@0
|
1990 |
if( rc!=SQLITE_OK ) goto err;
|
sl@0
|
1991 |
|
sl@0
|
1992 |
memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
|
sl@0
|
1993 |
|
sl@0
|
1994 |
*ppVTab = &v->base;
|
sl@0
|
1995 |
TRACE(("FTS1 Connect %p\n", v));
|
sl@0
|
1996 |
|
sl@0
|
1997 |
return rc;
|
sl@0
|
1998 |
|
sl@0
|
1999 |
err:
|
sl@0
|
2000 |
fulltext_vtab_destroy(v);
|
sl@0
|
2001 |
return rc;
|
sl@0
|
2002 |
}
|
sl@0
|
2003 |
|
sl@0
|
2004 |
static int fulltextConnect(
|
sl@0
|
2005 |
sqlite3 *db,
|
sl@0
|
2006 |
void *pAux,
|
sl@0
|
2007 |
int argc, const char *const*argv,
|
sl@0
|
2008 |
sqlite3_vtab **ppVTab,
|
sl@0
|
2009 |
char **pzErr
|
sl@0
|
2010 |
){
|
sl@0
|
2011 |
TableSpec spec;
|
sl@0
|
2012 |
int rc = parseSpec(&spec, argc, argv, pzErr);
|
sl@0
|
2013 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2014 |
|
sl@0
|
2015 |
rc = constructVtab(db, &spec, ppVTab, pzErr);
|
sl@0
|
2016 |
clearTableSpec(&spec);
|
sl@0
|
2017 |
return rc;
|
sl@0
|
2018 |
}
|
sl@0
|
2019 |
|
sl@0
|
2020 |
/* The %_content table holds the text of each document, with
|
sl@0
|
2021 |
** the rowid used as the docid.
|
sl@0
|
2022 |
**
|
sl@0
|
2023 |
** The %_term table maps each term to a document list blob
|
sl@0
|
2024 |
** containing elements sorted by ascending docid, each element
|
sl@0
|
2025 |
** encoded as:
|
sl@0
|
2026 |
**
|
sl@0
|
2027 |
** docid varint-encoded
|
sl@0
|
2028 |
** token elements:
|
sl@0
|
2029 |
** position+1 varint-encoded as delta from previous position
|
sl@0
|
2030 |
** start offset varint-encoded as delta from previous start offset
|
sl@0
|
2031 |
** end offset varint-encoded as delta from start offset
|
sl@0
|
2032 |
**
|
sl@0
|
2033 |
** The sentinel position of 0 indicates the end of the token list.
|
sl@0
|
2034 |
**
|
sl@0
|
2035 |
** Additionally, doclist blobs are chunked into multiple segments,
|
sl@0
|
2036 |
** using segment to order the segments. New elements are added to
|
sl@0
|
2037 |
** the segment at segment 0, until it exceeds CHUNK_MAX. Then
|
sl@0
|
2038 |
** segment 0 is deleted, and the doclist is inserted at segment 1.
|
sl@0
|
2039 |
** If there is already a doclist at segment 1, the segment 0 doclist
|
sl@0
|
2040 |
** is merged with it, the segment 1 doclist is deleted, and the
|
sl@0
|
2041 |
** merged doclist is inserted at segment 2, repeating those
|
sl@0
|
2042 |
** operations until an insert succeeds.
|
sl@0
|
2043 |
**
|
sl@0
|
2044 |
** Since this structure doesn't allow us to update elements in place
|
sl@0
|
2045 |
** in case of deletion or update, these are simply written to
|
sl@0
|
2046 |
** segment 0 (with an empty token list in case of deletion), with
|
sl@0
|
2047 |
** docListAccumulate() taking care to retain lower-segment
|
sl@0
|
2048 |
** information in preference to higher-segment information.
|
sl@0
|
2049 |
*/
|
sl@0
|
2050 |
/* TODO(shess) Provide a VACUUM type operation which both removes
|
sl@0
|
2051 |
** deleted elements which are no longer necessary, and duplicated
|
sl@0
|
2052 |
** elements. I suspect this will probably not be necessary in
|
sl@0
|
2053 |
** practice, though.
|
sl@0
|
2054 |
*/
|
sl@0
|
2055 |
static int fulltextCreate(sqlite3 *db, void *pAux,
|
sl@0
|
2056 |
int argc, const char * const *argv,
|
sl@0
|
2057 |
sqlite3_vtab **ppVTab, char **pzErr){
|
sl@0
|
2058 |
int rc;
|
sl@0
|
2059 |
TableSpec spec;
|
sl@0
|
2060 |
StringBuffer schema;
|
sl@0
|
2061 |
TRACE(("FTS1 Create\n"));
|
sl@0
|
2062 |
|
sl@0
|
2063 |
rc = parseSpec(&spec, argc, argv, pzErr);
|
sl@0
|
2064 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2065 |
|
sl@0
|
2066 |
initStringBuffer(&schema);
|
sl@0
|
2067 |
append(&schema, "CREATE TABLE %_content(");
|
sl@0
|
2068 |
appendList(&schema, spec.nColumn, spec.azContentColumn);
|
sl@0
|
2069 |
append(&schema, ")");
|
sl@0
|
2070 |
rc = sql_exec(db, spec.zDb, spec.zName, schema.s);
|
sl@0
|
2071 |
free(schema.s);
|
sl@0
|
2072 |
if( rc!=SQLITE_OK ) goto out;
|
sl@0
|
2073 |
|
sl@0
|
2074 |
rc = sql_exec(db, spec.zDb, spec.zName,
|
sl@0
|
2075 |
"create table %_term(term text, segment integer, doclist blob, "
|
sl@0
|
2076 |
"primary key(term, segment));");
|
sl@0
|
2077 |
if( rc!=SQLITE_OK ) goto out;
|
sl@0
|
2078 |
|
sl@0
|
2079 |
rc = constructVtab(db, &spec, ppVTab, pzErr);
|
sl@0
|
2080 |
|
sl@0
|
2081 |
out:
|
sl@0
|
2082 |
clearTableSpec(&spec);
|
sl@0
|
2083 |
return rc;
|
sl@0
|
2084 |
}
|
sl@0
|
2085 |
|
sl@0
|
2086 |
/* Decide how to handle an SQL query. */
|
sl@0
|
2087 |
static int fulltextBestIndex(sqlite3_vtab *pVTab, sqlite3_index_info *pInfo){
|
sl@0
|
2088 |
int i;
|
sl@0
|
2089 |
TRACE(("FTS1 BestIndex\n"));
|
sl@0
|
2090 |
|
sl@0
|
2091 |
for(i=0; i<pInfo->nConstraint; ++i){
|
sl@0
|
2092 |
const struct sqlite3_index_constraint *pConstraint;
|
sl@0
|
2093 |
pConstraint = &pInfo->aConstraint[i];
|
sl@0
|
2094 |
if( pConstraint->usable ) {
|
sl@0
|
2095 |
if( pConstraint->iColumn==-1 &&
|
sl@0
|
2096 |
pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
|
sl@0
|
2097 |
pInfo->idxNum = QUERY_ROWID; /* lookup by rowid */
|
sl@0
|
2098 |
TRACE(("FTS1 QUERY_ROWID\n"));
|
sl@0
|
2099 |
} else if( pConstraint->iColumn>=0 &&
|
sl@0
|
2100 |
pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
|
sl@0
|
2101 |
/* full-text search */
|
sl@0
|
2102 |
pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
|
sl@0
|
2103 |
TRACE(("FTS1 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
|
sl@0
|
2104 |
} else continue;
|
sl@0
|
2105 |
|
sl@0
|
2106 |
pInfo->aConstraintUsage[i].argvIndex = 1;
|
sl@0
|
2107 |
pInfo->aConstraintUsage[i].omit = 1;
|
sl@0
|
2108 |
|
sl@0
|
2109 |
/* An arbitrary value for now.
|
sl@0
|
2110 |
* TODO: Perhaps rowid matches should be considered cheaper than
|
sl@0
|
2111 |
* full-text searches. */
|
sl@0
|
2112 |
pInfo->estimatedCost = 1.0;
|
sl@0
|
2113 |
|
sl@0
|
2114 |
return SQLITE_OK;
|
sl@0
|
2115 |
}
|
sl@0
|
2116 |
}
|
sl@0
|
2117 |
pInfo->idxNum = QUERY_GENERIC;
|
sl@0
|
2118 |
return SQLITE_OK;
|
sl@0
|
2119 |
}
|
sl@0
|
2120 |
|
sl@0
|
2121 |
static int fulltextDisconnect(sqlite3_vtab *pVTab){
|
sl@0
|
2122 |
TRACE(("FTS1 Disconnect %p\n", pVTab));
|
sl@0
|
2123 |
fulltext_vtab_destroy((fulltext_vtab *)pVTab);
|
sl@0
|
2124 |
return SQLITE_OK;
|
sl@0
|
2125 |
}
|
sl@0
|
2126 |
|
sl@0
|
2127 |
static int fulltextDestroy(sqlite3_vtab *pVTab){
|
sl@0
|
2128 |
fulltext_vtab *v = (fulltext_vtab *)pVTab;
|
sl@0
|
2129 |
int rc;
|
sl@0
|
2130 |
|
sl@0
|
2131 |
TRACE(("FTS1 Destroy %p\n", pVTab));
|
sl@0
|
2132 |
rc = sql_exec(v->db, v->zDb, v->zName,
|
sl@0
|
2133 |
"drop table if exists %_content;"
|
sl@0
|
2134 |
"drop table if exists %_term;"
|
sl@0
|
2135 |
);
|
sl@0
|
2136 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2137 |
|
sl@0
|
2138 |
fulltext_vtab_destroy((fulltext_vtab *)pVTab);
|
sl@0
|
2139 |
return SQLITE_OK;
|
sl@0
|
2140 |
}
|
sl@0
|
2141 |
|
sl@0
|
2142 |
static int fulltextOpen(sqlite3_vtab *pVTab, sqlite3_vtab_cursor **ppCursor){
|
sl@0
|
2143 |
fulltext_cursor *c;
|
sl@0
|
2144 |
|
sl@0
|
2145 |
c = (fulltext_cursor *) calloc(sizeof(fulltext_cursor), 1);
|
sl@0
|
2146 |
/* sqlite will initialize c->base */
|
sl@0
|
2147 |
*ppCursor = &c->base;
|
sl@0
|
2148 |
TRACE(("FTS1 Open %p: %p\n", pVTab, c));
|
sl@0
|
2149 |
|
sl@0
|
2150 |
return SQLITE_OK;
|
sl@0
|
2151 |
}
|
sl@0
|
2152 |
|
sl@0
|
2153 |
|
sl@0
|
2154 |
/* Free all of the dynamically allocated memory held by *q
|
sl@0
|
2155 |
*/
|
sl@0
|
2156 |
static void queryClear(Query *q){
|
sl@0
|
2157 |
int i;
|
sl@0
|
2158 |
for(i = 0; i < q->nTerms; ++i){
|
sl@0
|
2159 |
free(q->pTerms[i].pTerm);
|
sl@0
|
2160 |
}
|
sl@0
|
2161 |
free(q->pTerms);
|
sl@0
|
2162 |
memset(q, 0, sizeof(*q));
|
sl@0
|
2163 |
}
|
sl@0
|
2164 |
|
sl@0
|
2165 |
/* Free all of the dynamically allocated memory held by the
|
sl@0
|
2166 |
** Snippet
|
sl@0
|
2167 |
*/
|
sl@0
|
2168 |
static void snippetClear(Snippet *p){
|
sl@0
|
2169 |
free(p->aMatch);
|
sl@0
|
2170 |
free(p->zOffset);
|
sl@0
|
2171 |
free(p->zSnippet);
|
sl@0
|
2172 |
memset(p, 0, sizeof(*p));
|
sl@0
|
2173 |
}
|
sl@0
|
2174 |
/*
|
sl@0
|
2175 |
** Append a single entry to the p->aMatch[] log.
|
sl@0
|
2176 |
*/
|
sl@0
|
2177 |
static void snippetAppendMatch(
|
sl@0
|
2178 |
Snippet *p, /* Append the entry to this snippet */
|
sl@0
|
2179 |
int iCol, int iTerm, /* The column and query term */
|
sl@0
|
2180 |
int iStart, int nByte /* Offset and size of the match */
|
sl@0
|
2181 |
){
|
sl@0
|
2182 |
int i;
|
sl@0
|
2183 |
struct snippetMatch *pMatch;
|
sl@0
|
2184 |
if( p->nMatch+1>=p->nAlloc ){
|
sl@0
|
2185 |
p->nAlloc = p->nAlloc*2 + 10;
|
sl@0
|
2186 |
p->aMatch = realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
|
sl@0
|
2187 |
if( p->aMatch==0 ){
|
sl@0
|
2188 |
p->nMatch = 0;
|
sl@0
|
2189 |
p->nAlloc = 0;
|
sl@0
|
2190 |
return;
|
sl@0
|
2191 |
}
|
sl@0
|
2192 |
}
|
sl@0
|
2193 |
i = p->nMatch++;
|
sl@0
|
2194 |
pMatch = &p->aMatch[i];
|
sl@0
|
2195 |
pMatch->iCol = iCol;
|
sl@0
|
2196 |
pMatch->iTerm = iTerm;
|
sl@0
|
2197 |
pMatch->iStart = iStart;
|
sl@0
|
2198 |
pMatch->nByte = nByte;
|
sl@0
|
2199 |
}
|
sl@0
|
2200 |
|
sl@0
|
2201 |
/*
|
sl@0
|
2202 |
** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
|
sl@0
|
2203 |
*/
|
sl@0
|
2204 |
#define FTS1_ROTOR_SZ (32)
|
sl@0
|
2205 |
#define FTS1_ROTOR_MASK (FTS1_ROTOR_SZ-1)
|
sl@0
|
2206 |
|
sl@0
|
2207 |
/*
|
sl@0
|
2208 |
** Add entries to pSnippet->aMatch[] for every match that occurs against
|
sl@0
|
2209 |
** document zDoc[0..nDoc-1] which is stored in column iColumn.
|
sl@0
|
2210 |
*/
|
sl@0
|
2211 |
static void snippetOffsetsOfColumn(
|
sl@0
|
2212 |
Query *pQuery,
|
sl@0
|
2213 |
Snippet *pSnippet,
|
sl@0
|
2214 |
int iColumn,
|
sl@0
|
2215 |
const char *zDoc,
|
sl@0
|
2216 |
int nDoc
|
sl@0
|
2217 |
){
|
sl@0
|
2218 |
const sqlite3_tokenizer_module *pTModule; /* The tokenizer module */
|
sl@0
|
2219 |
sqlite3_tokenizer *pTokenizer; /* The specific tokenizer */
|
sl@0
|
2220 |
sqlite3_tokenizer_cursor *pTCursor; /* Tokenizer cursor */
|
sl@0
|
2221 |
fulltext_vtab *pVtab; /* The full text index */
|
sl@0
|
2222 |
int nColumn; /* Number of columns in the index */
|
sl@0
|
2223 |
const QueryTerm *aTerm; /* Query string terms */
|
sl@0
|
2224 |
int nTerm; /* Number of query string terms */
|
sl@0
|
2225 |
int i, j; /* Loop counters */
|
sl@0
|
2226 |
int rc; /* Return code */
|
sl@0
|
2227 |
unsigned int match, prevMatch; /* Phrase search bitmasks */
|
sl@0
|
2228 |
const char *zToken; /* Next token from the tokenizer */
|
sl@0
|
2229 |
int nToken; /* Size of zToken */
|
sl@0
|
2230 |
int iBegin, iEnd, iPos; /* Offsets of beginning and end */
|
sl@0
|
2231 |
|
sl@0
|
2232 |
/* The following variables keep a circular buffer of the last
|
sl@0
|
2233 |
** few tokens */
|
sl@0
|
2234 |
unsigned int iRotor = 0; /* Index of current token */
|
sl@0
|
2235 |
int iRotorBegin[FTS1_ROTOR_SZ]; /* Beginning offset of token */
|
sl@0
|
2236 |
int iRotorLen[FTS1_ROTOR_SZ]; /* Length of token */
|
sl@0
|
2237 |
|
sl@0
|
2238 |
pVtab = pQuery->pFts;
|
sl@0
|
2239 |
nColumn = pVtab->nColumn;
|
sl@0
|
2240 |
pTokenizer = pVtab->pTokenizer;
|
sl@0
|
2241 |
pTModule = pTokenizer->pModule;
|
sl@0
|
2242 |
rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
|
sl@0
|
2243 |
if( rc ) return;
|
sl@0
|
2244 |
pTCursor->pTokenizer = pTokenizer;
|
sl@0
|
2245 |
aTerm = pQuery->pTerms;
|
sl@0
|
2246 |
nTerm = pQuery->nTerms;
|
sl@0
|
2247 |
if( nTerm>=FTS1_ROTOR_SZ ){
|
sl@0
|
2248 |
nTerm = FTS1_ROTOR_SZ - 1;
|
sl@0
|
2249 |
}
|
sl@0
|
2250 |
prevMatch = 0;
|
sl@0
|
2251 |
while(1){
|
sl@0
|
2252 |
rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
|
sl@0
|
2253 |
if( rc ) break;
|
sl@0
|
2254 |
iRotorBegin[iRotor&FTS1_ROTOR_MASK] = iBegin;
|
sl@0
|
2255 |
iRotorLen[iRotor&FTS1_ROTOR_MASK] = iEnd-iBegin;
|
sl@0
|
2256 |
match = 0;
|
sl@0
|
2257 |
for(i=0; i<nTerm; i++){
|
sl@0
|
2258 |
int iCol;
|
sl@0
|
2259 |
iCol = aTerm[i].iColumn;
|
sl@0
|
2260 |
if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
|
sl@0
|
2261 |
if( aTerm[i].nTerm!=nToken ) continue;
|
sl@0
|
2262 |
if( memcmp(aTerm[i].pTerm, zToken, nToken) ) continue;
|
sl@0
|
2263 |
if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
|
sl@0
|
2264 |
match |= 1<<i;
|
sl@0
|
2265 |
if( i==nTerm-1 || aTerm[i+1].iPhrase==1 ){
|
sl@0
|
2266 |
for(j=aTerm[i].iPhrase-1; j>=0; j--){
|
sl@0
|
2267 |
int k = (iRotor-j) & FTS1_ROTOR_MASK;
|
sl@0
|
2268 |
snippetAppendMatch(pSnippet, iColumn, i-j,
|
sl@0
|
2269 |
iRotorBegin[k], iRotorLen[k]);
|
sl@0
|
2270 |
}
|
sl@0
|
2271 |
}
|
sl@0
|
2272 |
}
|
sl@0
|
2273 |
prevMatch = match<<1;
|
sl@0
|
2274 |
iRotor++;
|
sl@0
|
2275 |
}
|
sl@0
|
2276 |
pTModule->xClose(pTCursor);
|
sl@0
|
2277 |
}
|
sl@0
|
2278 |
|
sl@0
|
2279 |
|
sl@0
|
2280 |
/*
|
sl@0
|
2281 |
** Compute all offsets for the current row of the query.
|
sl@0
|
2282 |
** If the offsets have already been computed, this routine is a no-op.
|
sl@0
|
2283 |
*/
|
sl@0
|
2284 |
static void snippetAllOffsets(fulltext_cursor *p){
|
sl@0
|
2285 |
int nColumn;
|
sl@0
|
2286 |
int iColumn, i;
|
sl@0
|
2287 |
int iFirst, iLast;
|
sl@0
|
2288 |
fulltext_vtab *pFts;
|
sl@0
|
2289 |
|
sl@0
|
2290 |
if( p->snippet.nMatch ) return;
|
sl@0
|
2291 |
if( p->q.nTerms==0 ) return;
|
sl@0
|
2292 |
pFts = p->q.pFts;
|
sl@0
|
2293 |
nColumn = pFts->nColumn;
|
sl@0
|
2294 |
iColumn = p->iCursorType - QUERY_FULLTEXT;
|
sl@0
|
2295 |
if( iColumn<0 || iColumn>=nColumn ){
|
sl@0
|
2296 |
iFirst = 0;
|
sl@0
|
2297 |
iLast = nColumn-1;
|
sl@0
|
2298 |
}else{
|
sl@0
|
2299 |
iFirst = iColumn;
|
sl@0
|
2300 |
iLast = iColumn;
|
sl@0
|
2301 |
}
|
sl@0
|
2302 |
for(i=iFirst; i<=iLast; i++){
|
sl@0
|
2303 |
const char *zDoc;
|
sl@0
|
2304 |
int nDoc;
|
sl@0
|
2305 |
zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
|
sl@0
|
2306 |
nDoc = sqlite3_column_bytes(p->pStmt, i+1);
|
sl@0
|
2307 |
snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
|
sl@0
|
2308 |
}
|
sl@0
|
2309 |
}
|
sl@0
|
2310 |
|
sl@0
|
2311 |
/*
|
sl@0
|
2312 |
** Convert the information in the aMatch[] array of the snippet
|
sl@0
|
2313 |
** into the string zOffset[0..nOffset-1].
|
sl@0
|
2314 |
*/
|
sl@0
|
2315 |
static void snippetOffsetText(Snippet *p){
|
sl@0
|
2316 |
int i;
|
sl@0
|
2317 |
int cnt = 0;
|
sl@0
|
2318 |
StringBuffer sb;
|
sl@0
|
2319 |
char zBuf[200];
|
sl@0
|
2320 |
if( p->zOffset ) return;
|
sl@0
|
2321 |
initStringBuffer(&sb);
|
sl@0
|
2322 |
for(i=0; i<p->nMatch; i++){
|
sl@0
|
2323 |
struct snippetMatch *pMatch = &p->aMatch[i];
|
sl@0
|
2324 |
zBuf[0] = ' ';
|
sl@0
|
2325 |
sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
|
sl@0
|
2326 |
pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
|
sl@0
|
2327 |
append(&sb, zBuf);
|
sl@0
|
2328 |
cnt++;
|
sl@0
|
2329 |
}
|
sl@0
|
2330 |
p->zOffset = sb.s;
|
sl@0
|
2331 |
p->nOffset = sb.len;
|
sl@0
|
2332 |
}
|
sl@0
|
2333 |
|
sl@0
|
2334 |
/*
|
sl@0
|
2335 |
** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
|
sl@0
|
2336 |
** of matching words some of which might be in zDoc. zDoc is column
|
sl@0
|
2337 |
** number iCol.
|
sl@0
|
2338 |
**
|
sl@0
|
2339 |
** iBreak is suggested spot in zDoc where we could begin or end an
|
sl@0
|
2340 |
** excerpt. Return a value similar to iBreak but possibly adjusted
|
sl@0
|
2341 |
** to be a little left or right so that the break point is better.
|
sl@0
|
2342 |
*/
|
sl@0
|
2343 |
static int wordBoundary(
|
sl@0
|
2344 |
int iBreak, /* The suggested break point */
|
sl@0
|
2345 |
const char *zDoc, /* Document text */
|
sl@0
|
2346 |
int nDoc, /* Number of bytes in zDoc[] */
|
sl@0
|
2347 |
struct snippetMatch *aMatch, /* Matching words */
|
sl@0
|
2348 |
int nMatch, /* Number of entries in aMatch[] */
|
sl@0
|
2349 |
int iCol /* The column number for zDoc[] */
|
sl@0
|
2350 |
){
|
sl@0
|
2351 |
int i;
|
sl@0
|
2352 |
if( iBreak<=10 ){
|
sl@0
|
2353 |
return 0;
|
sl@0
|
2354 |
}
|
sl@0
|
2355 |
if( iBreak>=nDoc-10 ){
|
sl@0
|
2356 |
return nDoc;
|
sl@0
|
2357 |
}
|
sl@0
|
2358 |
for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
|
sl@0
|
2359 |
while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
|
sl@0
|
2360 |
if( i<nMatch ){
|
sl@0
|
2361 |
if( aMatch[i].iStart<iBreak+10 ){
|
sl@0
|
2362 |
return aMatch[i].iStart;
|
sl@0
|
2363 |
}
|
sl@0
|
2364 |
if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
|
sl@0
|
2365 |
return aMatch[i-1].iStart;
|
sl@0
|
2366 |
}
|
sl@0
|
2367 |
}
|
sl@0
|
2368 |
for(i=1; i<=10; i++){
|
sl@0
|
2369 |
if( safe_isspace(zDoc[iBreak-i]) ){
|
sl@0
|
2370 |
return iBreak - i + 1;
|
sl@0
|
2371 |
}
|
sl@0
|
2372 |
if( safe_isspace(zDoc[iBreak+i]) ){
|
sl@0
|
2373 |
return iBreak + i + 1;
|
sl@0
|
2374 |
}
|
sl@0
|
2375 |
}
|
sl@0
|
2376 |
return iBreak;
|
sl@0
|
2377 |
}
|
sl@0
|
2378 |
|
sl@0
|
2379 |
/*
|
sl@0
|
2380 |
** If the StringBuffer does not end in white space, add a single
|
sl@0
|
2381 |
** space character to the end.
|
sl@0
|
2382 |
*/
|
sl@0
|
2383 |
static void appendWhiteSpace(StringBuffer *p){
|
sl@0
|
2384 |
if( p->len==0 ) return;
|
sl@0
|
2385 |
if( safe_isspace(p->s[p->len-1]) ) return;
|
sl@0
|
2386 |
append(p, " ");
|
sl@0
|
2387 |
}
|
sl@0
|
2388 |
|
sl@0
|
2389 |
/*
|
sl@0
|
2390 |
** Remove white space from teh end of the StringBuffer
|
sl@0
|
2391 |
*/
|
sl@0
|
2392 |
static void trimWhiteSpace(StringBuffer *p){
|
sl@0
|
2393 |
while( p->len>0 && safe_isspace(p->s[p->len-1]) ){
|
sl@0
|
2394 |
p->len--;
|
sl@0
|
2395 |
}
|
sl@0
|
2396 |
}
|
sl@0
|
2397 |
|
sl@0
|
2398 |
|
sl@0
|
2399 |
|
sl@0
|
2400 |
/*
|
sl@0
|
2401 |
** Allowed values for Snippet.aMatch[].snStatus
|
sl@0
|
2402 |
*/
|
sl@0
|
2403 |
#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
|
sl@0
|
2404 |
#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
|
sl@0
|
2405 |
|
sl@0
|
2406 |
/*
|
sl@0
|
2407 |
** Generate the text of a snippet.
|
sl@0
|
2408 |
*/
|
sl@0
|
2409 |
static void snippetText(
|
sl@0
|
2410 |
fulltext_cursor *pCursor, /* The cursor we need the snippet for */
|
sl@0
|
2411 |
const char *zStartMark, /* Markup to appear before each match */
|
sl@0
|
2412 |
const char *zEndMark, /* Markup to appear after each match */
|
sl@0
|
2413 |
const char *zEllipsis /* Ellipsis mark */
|
sl@0
|
2414 |
){
|
sl@0
|
2415 |
int i, j;
|
sl@0
|
2416 |
struct snippetMatch *aMatch;
|
sl@0
|
2417 |
int nMatch;
|
sl@0
|
2418 |
int nDesired;
|
sl@0
|
2419 |
StringBuffer sb;
|
sl@0
|
2420 |
int tailCol;
|
sl@0
|
2421 |
int tailOffset;
|
sl@0
|
2422 |
int iCol;
|
sl@0
|
2423 |
int nDoc;
|
sl@0
|
2424 |
const char *zDoc;
|
sl@0
|
2425 |
int iStart, iEnd;
|
sl@0
|
2426 |
int tailEllipsis = 0;
|
sl@0
|
2427 |
int iMatch;
|
sl@0
|
2428 |
|
sl@0
|
2429 |
|
sl@0
|
2430 |
free(pCursor->snippet.zSnippet);
|
sl@0
|
2431 |
pCursor->snippet.zSnippet = 0;
|
sl@0
|
2432 |
aMatch = pCursor->snippet.aMatch;
|
sl@0
|
2433 |
nMatch = pCursor->snippet.nMatch;
|
sl@0
|
2434 |
initStringBuffer(&sb);
|
sl@0
|
2435 |
|
sl@0
|
2436 |
for(i=0; i<nMatch; i++){
|
sl@0
|
2437 |
aMatch[i].snStatus = SNIPPET_IGNORE;
|
sl@0
|
2438 |
}
|
sl@0
|
2439 |
nDesired = 0;
|
sl@0
|
2440 |
for(i=0; i<pCursor->q.nTerms; i++){
|
sl@0
|
2441 |
for(j=0; j<nMatch; j++){
|
sl@0
|
2442 |
if( aMatch[j].iTerm==i ){
|
sl@0
|
2443 |
aMatch[j].snStatus = SNIPPET_DESIRED;
|
sl@0
|
2444 |
nDesired++;
|
sl@0
|
2445 |
break;
|
sl@0
|
2446 |
}
|
sl@0
|
2447 |
}
|
sl@0
|
2448 |
}
|
sl@0
|
2449 |
|
sl@0
|
2450 |
iMatch = 0;
|
sl@0
|
2451 |
tailCol = -1;
|
sl@0
|
2452 |
tailOffset = 0;
|
sl@0
|
2453 |
for(i=0; i<nMatch && nDesired>0; i++){
|
sl@0
|
2454 |
if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
|
sl@0
|
2455 |
nDesired--;
|
sl@0
|
2456 |
iCol = aMatch[i].iCol;
|
sl@0
|
2457 |
zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
|
sl@0
|
2458 |
nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
|
sl@0
|
2459 |
iStart = aMatch[i].iStart - 40;
|
sl@0
|
2460 |
iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
|
sl@0
|
2461 |
if( iStart<=10 ){
|
sl@0
|
2462 |
iStart = 0;
|
sl@0
|
2463 |
}
|
sl@0
|
2464 |
if( iCol==tailCol && iStart<=tailOffset+20 ){
|
sl@0
|
2465 |
iStart = tailOffset;
|
sl@0
|
2466 |
}
|
sl@0
|
2467 |
if( (iCol!=tailCol && tailCol>=0) || iStart!=tailOffset ){
|
sl@0
|
2468 |
trimWhiteSpace(&sb);
|
sl@0
|
2469 |
appendWhiteSpace(&sb);
|
sl@0
|
2470 |
append(&sb, zEllipsis);
|
sl@0
|
2471 |
appendWhiteSpace(&sb);
|
sl@0
|
2472 |
}
|
sl@0
|
2473 |
iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
|
sl@0
|
2474 |
iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
|
sl@0
|
2475 |
if( iEnd>=nDoc-10 ){
|
sl@0
|
2476 |
iEnd = nDoc;
|
sl@0
|
2477 |
tailEllipsis = 0;
|
sl@0
|
2478 |
}else{
|
sl@0
|
2479 |
tailEllipsis = 1;
|
sl@0
|
2480 |
}
|
sl@0
|
2481 |
while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
|
sl@0
|
2482 |
while( iStart<iEnd ){
|
sl@0
|
2483 |
while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
|
sl@0
|
2484 |
&& aMatch[iMatch].iCol<=iCol ){
|
sl@0
|
2485 |
iMatch++;
|
sl@0
|
2486 |
}
|
sl@0
|
2487 |
if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
|
sl@0
|
2488 |
&& aMatch[iMatch].iCol==iCol ){
|
sl@0
|
2489 |
nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
|
sl@0
|
2490 |
iStart = aMatch[iMatch].iStart;
|
sl@0
|
2491 |
append(&sb, zStartMark);
|
sl@0
|
2492 |
nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
|
sl@0
|
2493 |
append(&sb, zEndMark);
|
sl@0
|
2494 |
iStart += aMatch[iMatch].nByte;
|
sl@0
|
2495 |
for(j=iMatch+1; j<nMatch; j++){
|
sl@0
|
2496 |
if( aMatch[j].iTerm==aMatch[iMatch].iTerm
|
sl@0
|
2497 |
&& aMatch[j].snStatus==SNIPPET_DESIRED ){
|
sl@0
|
2498 |
nDesired--;
|
sl@0
|
2499 |
aMatch[j].snStatus = SNIPPET_IGNORE;
|
sl@0
|
2500 |
}
|
sl@0
|
2501 |
}
|
sl@0
|
2502 |
}else{
|
sl@0
|
2503 |
nappend(&sb, &zDoc[iStart], iEnd - iStart);
|
sl@0
|
2504 |
iStart = iEnd;
|
sl@0
|
2505 |
}
|
sl@0
|
2506 |
}
|
sl@0
|
2507 |
tailCol = iCol;
|
sl@0
|
2508 |
tailOffset = iEnd;
|
sl@0
|
2509 |
}
|
sl@0
|
2510 |
trimWhiteSpace(&sb);
|
sl@0
|
2511 |
if( tailEllipsis ){
|
sl@0
|
2512 |
appendWhiteSpace(&sb);
|
sl@0
|
2513 |
append(&sb, zEllipsis);
|
sl@0
|
2514 |
}
|
sl@0
|
2515 |
pCursor->snippet.zSnippet = sb.s;
|
sl@0
|
2516 |
pCursor->snippet.nSnippet = sb.len;
|
sl@0
|
2517 |
}
|
sl@0
|
2518 |
|
sl@0
|
2519 |
|
sl@0
|
2520 |
/*
|
sl@0
|
2521 |
** Close the cursor. For additional information see the documentation
|
sl@0
|
2522 |
** on the xClose method of the virtual table interface.
|
sl@0
|
2523 |
*/
|
sl@0
|
2524 |
static int fulltextClose(sqlite3_vtab_cursor *pCursor){
|
sl@0
|
2525 |
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
sl@0
|
2526 |
TRACE(("FTS1 Close %p\n", c));
|
sl@0
|
2527 |
sqlite3_finalize(c->pStmt);
|
sl@0
|
2528 |
queryClear(&c->q);
|
sl@0
|
2529 |
snippetClear(&c->snippet);
|
sl@0
|
2530 |
if( c->result.pDoclist!=NULL ){
|
sl@0
|
2531 |
docListDelete(c->result.pDoclist);
|
sl@0
|
2532 |
}
|
sl@0
|
2533 |
free(c);
|
sl@0
|
2534 |
return SQLITE_OK;
|
sl@0
|
2535 |
}
|
sl@0
|
2536 |
|
sl@0
|
2537 |
static int fulltextNext(sqlite3_vtab_cursor *pCursor){
|
sl@0
|
2538 |
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
sl@0
|
2539 |
sqlite_int64 iDocid;
|
sl@0
|
2540 |
int rc;
|
sl@0
|
2541 |
|
sl@0
|
2542 |
TRACE(("FTS1 Next %p\n", pCursor));
|
sl@0
|
2543 |
snippetClear(&c->snippet);
|
sl@0
|
2544 |
if( c->iCursorType < QUERY_FULLTEXT ){
|
sl@0
|
2545 |
/* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
|
sl@0
|
2546 |
rc = sqlite3_step(c->pStmt);
|
sl@0
|
2547 |
switch( rc ){
|
sl@0
|
2548 |
case SQLITE_ROW:
|
sl@0
|
2549 |
c->eof = 0;
|
sl@0
|
2550 |
return SQLITE_OK;
|
sl@0
|
2551 |
case SQLITE_DONE:
|
sl@0
|
2552 |
c->eof = 1;
|
sl@0
|
2553 |
return SQLITE_OK;
|
sl@0
|
2554 |
default:
|
sl@0
|
2555 |
c->eof = 1;
|
sl@0
|
2556 |
return rc;
|
sl@0
|
2557 |
}
|
sl@0
|
2558 |
} else { /* full-text query */
|
sl@0
|
2559 |
rc = sqlite3_reset(c->pStmt);
|
sl@0
|
2560 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2561 |
|
sl@0
|
2562 |
iDocid = nextDocid(&c->result);
|
sl@0
|
2563 |
if( iDocid==0 ){
|
sl@0
|
2564 |
c->eof = 1;
|
sl@0
|
2565 |
return SQLITE_OK;
|
sl@0
|
2566 |
}
|
sl@0
|
2567 |
rc = sqlite3_bind_int64(c->pStmt, 1, iDocid);
|
sl@0
|
2568 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2569 |
/* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
|
sl@0
|
2570 |
rc = sqlite3_step(c->pStmt);
|
sl@0
|
2571 |
if( rc==SQLITE_ROW ){ /* the case we expect */
|
sl@0
|
2572 |
c->eof = 0;
|
sl@0
|
2573 |
return SQLITE_OK;
|
sl@0
|
2574 |
}
|
sl@0
|
2575 |
/* an error occurred; abort */
|
sl@0
|
2576 |
return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
|
sl@0
|
2577 |
}
|
sl@0
|
2578 |
}
|
sl@0
|
2579 |
|
sl@0
|
2580 |
|
sl@0
|
2581 |
/* Return a DocList corresponding to the query term *pTerm. If *pTerm
|
sl@0
|
2582 |
** is the first term of a phrase query, go ahead and evaluate the phrase
|
sl@0
|
2583 |
** query and return the doclist for the entire phrase query.
|
sl@0
|
2584 |
**
|
sl@0
|
2585 |
** The result is stored in pTerm->doclist.
|
sl@0
|
2586 |
*/
|
sl@0
|
2587 |
static int docListOfTerm(
|
sl@0
|
2588 |
fulltext_vtab *v, /* The full text index */
|
sl@0
|
2589 |
int iColumn, /* column to restrict to. No restrition if >=nColumn */
|
sl@0
|
2590 |
QueryTerm *pQTerm, /* Term we are looking for, or 1st term of a phrase */
|
sl@0
|
2591 |
DocList **ppResult /* Write the result here */
|
sl@0
|
2592 |
){
|
sl@0
|
2593 |
DocList *pLeft, *pRight, *pNew;
|
sl@0
|
2594 |
int i, rc;
|
sl@0
|
2595 |
|
sl@0
|
2596 |
pLeft = docListNew(DL_POSITIONS);
|
sl@0
|
2597 |
rc = term_select_all(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pLeft);
|
sl@0
|
2598 |
if( rc ){
|
sl@0
|
2599 |
docListDelete(pLeft);
|
sl@0
|
2600 |
return rc;
|
sl@0
|
2601 |
}
|
sl@0
|
2602 |
for(i=1; i<=pQTerm->nPhrase; i++){
|
sl@0
|
2603 |
pRight = docListNew(DL_POSITIONS);
|
sl@0
|
2604 |
rc = term_select_all(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm, pRight);
|
sl@0
|
2605 |
if( rc ){
|
sl@0
|
2606 |
docListDelete(pLeft);
|
sl@0
|
2607 |
return rc;
|
sl@0
|
2608 |
}
|
sl@0
|
2609 |
pNew = docListNew(i<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS);
|
sl@0
|
2610 |
docListPhraseMerge(pLeft, pRight, pNew);
|
sl@0
|
2611 |
docListDelete(pLeft);
|
sl@0
|
2612 |
docListDelete(pRight);
|
sl@0
|
2613 |
pLeft = pNew;
|
sl@0
|
2614 |
}
|
sl@0
|
2615 |
*ppResult = pLeft;
|
sl@0
|
2616 |
return SQLITE_OK;
|
sl@0
|
2617 |
}
|
sl@0
|
2618 |
|
sl@0
|
2619 |
/* Add a new term pTerm[0..nTerm-1] to the query *q.
|
sl@0
|
2620 |
*/
|
sl@0
|
2621 |
static void queryAdd(Query *q, const char *pTerm, int nTerm){
|
sl@0
|
2622 |
QueryTerm *t;
|
sl@0
|
2623 |
++q->nTerms;
|
sl@0
|
2624 |
q->pTerms = realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
|
sl@0
|
2625 |
if( q->pTerms==0 ){
|
sl@0
|
2626 |
q->nTerms = 0;
|
sl@0
|
2627 |
return;
|
sl@0
|
2628 |
}
|
sl@0
|
2629 |
t = &q->pTerms[q->nTerms - 1];
|
sl@0
|
2630 |
memset(t, 0, sizeof(*t));
|
sl@0
|
2631 |
t->pTerm = malloc(nTerm+1);
|
sl@0
|
2632 |
memcpy(t->pTerm, pTerm, nTerm);
|
sl@0
|
2633 |
t->pTerm[nTerm] = 0;
|
sl@0
|
2634 |
t->nTerm = nTerm;
|
sl@0
|
2635 |
t->isOr = q->nextIsOr;
|
sl@0
|
2636 |
q->nextIsOr = 0;
|
sl@0
|
2637 |
t->iColumn = q->nextColumn;
|
sl@0
|
2638 |
q->nextColumn = q->dfltColumn;
|
sl@0
|
2639 |
}
|
sl@0
|
2640 |
|
sl@0
|
2641 |
/*
|
sl@0
|
2642 |
** Check to see if the string zToken[0...nToken-1] matches any
|
sl@0
|
2643 |
** column name in the virtual table. If it does,
|
sl@0
|
2644 |
** return the zero-indexed column number. If not, return -1.
|
sl@0
|
2645 |
*/
|
sl@0
|
2646 |
static int checkColumnSpecifier(
|
sl@0
|
2647 |
fulltext_vtab *pVtab, /* The virtual table */
|
sl@0
|
2648 |
const char *zToken, /* Text of the token */
|
sl@0
|
2649 |
int nToken /* Number of characters in the token */
|
sl@0
|
2650 |
){
|
sl@0
|
2651 |
int i;
|
sl@0
|
2652 |
for(i=0; i<pVtab->nColumn; i++){
|
sl@0
|
2653 |
if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
|
sl@0
|
2654 |
&& pVtab->azColumn[i][nToken]==0 ){
|
sl@0
|
2655 |
return i;
|
sl@0
|
2656 |
}
|
sl@0
|
2657 |
}
|
sl@0
|
2658 |
return -1;
|
sl@0
|
2659 |
}
|
sl@0
|
2660 |
|
sl@0
|
2661 |
/*
|
sl@0
|
2662 |
** Parse the text at pSegment[0..nSegment-1]. Add additional terms
|
sl@0
|
2663 |
** to the query being assemblied in pQuery.
|
sl@0
|
2664 |
**
|
sl@0
|
2665 |
** inPhrase is true if pSegment[0..nSegement-1] is contained within
|
sl@0
|
2666 |
** double-quotes. If inPhrase is true, then the first term
|
sl@0
|
2667 |
** is marked with the number of terms in the phrase less one and
|
sl@0
|
2668 |
** OR and "-" syntax is ignored. If inPhrase is false, then every
|
sl@0
|
2669 |
** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
|
sl@0
|
2670 |
*/
|
sl@0
|
2671 |
static int tokenizeSegment(
|
sl@0
|
2672 |
sqlite3_tokenizer *pTokenizer, /* The tokenizer to use */
|
sl@0
|
2673 |
const char *pSegment, int nSegment, /* Query expression being parsed */
|
sl@0
|
2674 |
int inPhrase, /* True if within "..." */
|
sl@0
|
2675 |
Query *pQuery /* Append results here */
|
sl@0
|
2676 |
){
|
sl@0
|
2677 |
const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
|
sl@0
|
2678 |
sqlite3_tokenizer_cursor *pCursor;
|
sl@0
|
2679 |
int firstIndex = pQuery->nTerms;
|
sl@0
|
2680 |
int iCol;
|
sl@0
|
2681 |
int nTerm = 1;
|
sl@0
|
2682 |
|
sl@0
|
2683 |
int rc = pModule->xOpen(pTokenizer, pSegment, nSegment, &pCursor);
|
sl@0
|
2684 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2685 |
pCursor->pTokenizer = pTokenizer;
|
sl@0
|
2686 |
|
sl@0
|
2687 |
while( 1 ){
|
sl@0
|
2688 |
const char *pToken;
|
sl@0
|
2689 |
int nToken, iBegin, iEnd, iPos;
|
sl@0
|
2690 |
|
sl@0
|
2691 |
rc = pModule->xNext(pCursor,
|
sl@0
|
2692 |
&pToken, &nToken,
|
sl@0
|
2693 |
&iBegin, &iEnd, &iPos);
|
sl@0
|
2694 |
if( rc!=SQLITE_OK ) break;
|
sl@0
|
2695 |
if( !inPhrase &&
|
sl@0
|
2696 |
pSegment[iEnd]==':' &&
|
sl@0
|
2697 |
(iCol = checkColumnSpecifier(pQuery->pFts, pToken, nToken))>=0 ){
|
sl@0
|
2698 |
pQuery->nextColumn = iCol;
|
sl@0
|
2699 |
continue;
|
sl@0
|
2700 |
}
|
sl@0
|
2701 |
if( !inPhrase && pQuery->nTerms>0 && nToken==2
|
sl@0
|
2702 |
&& pSegment[iBegin]=='O' && pSegment[iBegin+1]=='R' ){
|
sl@0
|
2703 |
pQuery->nextIsOr = 1;
|
sl@0
|
2704 |
continue;
|
sl@0
|
2705 |
}
|
sl@0
|
2706 |
queryAdd(pQuery, pToken, nToken);
|
sl@0
|
2707 |
if( !inPhrase && iBegin>0 && pSegment[iBegin-1]=='-' ){
|
sl@0
|
2708 |
pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
|
sl@0
|
2709 |
}
|
sl@0
|
2710 |
pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
|
sl@0
|
2711 |
if( inPhrase ){
|
sl@0
|
2712 |
nTerm++;
|
sl@0
|
2713 |
}
|
sl@0
|
2714 |
}
|
sl@0
|
2715 |
|
sl@0
|
2716 |
if( inPhrase && pQuery->nTerms>firstIndex ){
|
sl@0
|
2717 |
pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
|
sl@0
|
2718 |
}
|
sl@0
|
2719 |
|
sl@0
|
2720 |
return pModule->xClose(pCursor);
|
sl@0
|
2721 |
}
|
sl@0
|
2722 |
|
sl@0
|
2723 |
/* Parse a query string, yielding a Query object pQuery.
|
sl@0
|
2724 |
**
|
sl@0
|
2725 |
** The calling function will need to queryClear() to clean up
|
sl@0
|
2726 |
** the dynamically allocated memory held by pQuery.
|
sl@0
|
2727 |
*/
|
sl@0
|
2728 |
static int parseQuery(
|
sl@0
|
2729 |
fulltext_vtab *v, /* The fulltext index */
|
sl@0
|
2730 |
const char *zInput, /* Input text of the query string */
|
sl@0
|
2731 |
int nInput, /* Size of the input text */
|
sl@0
|
2732 |
int dfltColumn, /* Default column of the index to match against */
|
sl@0
|
2733 |
Query *pQuery /* Write the parse results here. */
|
sl@0
|
2734 |
){
|
sl@0
|
2735 |
int iInput, inPhrase = 0;
|
sl@0
|
2736 |
|
sl@0
|
2737 |
if( zInput==0 ) nInput = 0;
|
sl@0
|
2738 |
if( nInput<0 ) nInput = strlen(zInput);
|
sl@0
|
2739 |
pQuery->nTerms = 0;
|
sl@0
|
2740 |
pQuery->pTerms = NULL;
|
sl@0
|
2741 |
pQuery->nextIsOr = 0;
|
sl@0
|
2742 |
pQuery->nextColumn = dfltColumn;
|
sl@0
|
2743 |
pQuery->dfltColumn = dfltColumn;
|
sl@0
|
2744 |
pQuery->pFts = v;
|
sl@0
|
2745 |
|
sl@0
|
2746 |
for(iInput=0; iInput<nInput; ++iInput){
|
sl@0
|
2747 |
int i;
|
sl@0
|
2748 |
for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
|
sl@0
|
2749 |
if( i>iInput ){
|
sl@0
|
2750 |
tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
|
sl@0
|
2751 |
pQuery);
|
sl@0
|
2752 |
}
|
sl@0
|
2753 |
iInput = i;
|
sl@0
|
2754 |
if( i<nInput ){
|
sl@0
|
2755 |
assert( zInput[i]=='"' );
|
sl@0
|
2756 |
inPhrase = !inPhrase;
|
sl@0
|
2757 |
}
|
sl@0
|
2758 |
}
|
sl@0
|
2759 |
|
sl@0
|
2760 |
if( inPhrase ){
|
sl@0
|
2761 |
/* unmatched quote */
|
sl@0
|
2762 |
queryClear(pQuery);
|
sl@0
|
2763 |
return SQLITE_ERROR;
|
sl@0
|
2764 |
}
|
sl@0
|
2765 |
return SQLITE_OK;
|
sl@0
|
2766 |
}
|
sl@0
|
2767 |
|
sl@0
|
2768 |
/* Perform a full-text query using the search expression in
|
sl@0
|
2769 |
** zInput[0..nInput-1]. Return a list of matching documents
|
sl@0
|
2770 |
** in pResult.
|
sl@0
|
2771 |
**
|
sl@0
|
2772 |
** Queries must match column iColumn. Or if iColumn>=nColumn
|
sl@0
|
2773 |
** they are allowed to match against any column.
|
sl@0
|
2774 |
*/
|
sl@0
|
2775 |
static int fulltextQuery(
|
sl@0
|
2776 |
fulltext_vtab *v, /* The full text index */
|
sl@0
|
2777 |
int iColumn, /* Match against this column by default */
|
sl@0
|
2778 |
const char *zInput, /* The query string */
|
sl@0
|
2779 |
int nInput, /* Number of bytes in zInput[] */
|
sl@0
|
2780 |
DocList **pResult, /* Write the result doclist here */
|
sl@0
|
2781 |
Query *pQuery /* Put parsed query string here */
|
sl@0
|
2782 |
){
|
sl@0
|
2783 |
int i, iNext, rc;
|
sl@0
|
2784 |
DocList *pLeft = NULL;
|
sl@0
|
2785 |
DocList *pRight, *pNew, *pOr;
|
sl@0
|
2786 |
int nNot = 0;
|
sl@0
|
2787 |
QueryTerm *aTerm;
|
sl@0
|
2788 |
|
sl@0
|
2789 |
rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
|
sl@0
|
2790 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2791 |
|
sl@0
|
2792 |
/* Merge AND terms. */
|
sl@0
|
2793 |
aTerm = pQuery->pTerms;
|
sl@0
|
2794 |
for(i = 0; i<pQuery->nTerms; i=iNext){
|
sl@0
|
2795 |
if( aTerm[i].isNot ){
|
sl@0
|
2796 |
/* Handle all NOT terms in a separate pass */
|
sl@0
|
2797 |
nNot++;
|
sl@0
|
2798 |
iNext = i + aTerm[i].nPhrase+1;
|
sl@0
|
2799 |
continue;
|
sl@0
|
2800 |
}
|
sl@0
|
2801 |
iNext = i + aTerm[i].nPhrase + 1;
|
sl@0
|
2802 |
rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
|
sl@0
|
2803 |
if( rc ){
|
sl@0
|
2804 |
queryClear(pQuery);
|
sl@0
|
2805 |
return rc;
|
sl@0
|
2806 |
}
|
sl@0
|
2807 |
while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
|
sl@0
|
2808 |
rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &pOr);
|
sl@0
|
2809 |
iNext += aTerm[iNext].nPhrase + 1;
|
sl@0
|
2810 |
if( rc ){
|
sl@0
|
2811 |
queryClear(pQuery);
|
sl@0
|
2812 |
return rc;
|
sl@0
|
2813 |
}
|
sl@0
|
2814 |
pNew = docListNew(DL_DOCIDS);
|
sl@0
|
2815 |
docListOrMerge(pRight, pOr, pNew);
|
sl@0
|
2816 |
docListDelete(pRight);
|
sl@0
|
2817 |
docListDelete(pOr);
|
sl@0
|
2818 |
pRight = pNew;
|
sl@0
|
2819 |
}
|
sl@0
|
2820 |
if( pLeft==0 ){
|
sl@0
|
2821 |
pLeft = pRight;
|
sl@0
|
2822 |
}else{
|
sl@0
|
2823 |
pNew = docListNew(DL_DOCIDS);
|
sl@0
|
2824 |
docListAndMerge(pLeft, pRight, pNew);
|
sl@0
|
2825 |
docListDelete(pRight);
|
sl@0
|
2826 |
docListDelete(pLeft);
|
sl@0
|
2827 |
pLeft = pNew;
|
sl@0
|
2828 |
}
|
sl@0
|
2829 |
}
|
sl@0
|
2830 |
|
sl@0
|
2831 |
if( nNot && pLeft==0 ){
|
sl@0
|
2832 |
/* We do not yet know how to handle a query of only NOT terms */
|
sl@0
|
2833 |
return SQLITE_ERROR;
|
sl@0
|
2834 |
}
|
sl@0
|
2835 |
|
sl@0
|
2836 |
/* Do the EXCEPT terms */
|
sl@0
|
2837 |
for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
|
sl@0
|
2838 |
if( !aTerm[i].isNot ) continue;
|
sl@0
|
2839 |
rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &pRight);
|
sl@0
|
2840 |
if( rc ){
|
sl@0
|
2841 |
queryClear(pQuery);
|
sl@0
|
2842 |
docListDelete(pLeft);
|
sl@0
|
2843 |
return rc;
|
sl@0
|
2844 |
}
|
sl@0
|
2845 |
pNew = docListNew(DL_DOCIDS);
|
sl@0
|
2846 |
docListExceptMerge(pLeft, pRight, pNew);
|
sl@0
|
2847 |
docListDelete(pRight);
|
sl@0
|
2848 |
docListDelete(pLeft);
|
sl@0
|
2849 |
pLeft = pNew;
|
sl@0
|
2850 |
}
|
sl@0
|
2851 |
|
sl@0
|
2852 |
*pResult = pLeft;
|
sl@0
|
2853 |
return rc;
|
sl@0
|
2854 |
}
|
sl@0
|
2855 |
|
sl@0
|
2856 |
/*
|
sl@0
|
2857 |
** This is the xFilter interface for the virtual table. See
|
sl@0
|
2858 |
** the virtual table xFilter method documentation for additional
|
sl@0
|
2859 |
** information.
|
sl@0
|
2860 |
**
|
sl@0
|
2861 |
** If idxNum==QUERY_GENERIC then do a full table scan against
|
sl@0
|
2862 |
** the %_content table.
|
sl@0
|
2863 |
**
|
sl@0
|
2864 |
** If idxNum==QUERY_ROWID then do a rowid lookup for a single entry
|
sl@0
|
2865 |
** in the %_content table.
|
sl@0
|
2866 |
**
|
sl@0
|
2867 |
** If idxNum>=QUERY_FULLTEXT then use the full text index. The
|
sl@0
|
2868 |
** column on the left-hand side of the MATCH operator is column
|
sl@0
|
2869 |
** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
|
sl@0
|
2870 |
** side of the MATCH operator.
|
sl@0
|
2871 |
*/
|
sl@0
|
2872 |
/* TODO(shess) Upgrade the cursor initialization and destruction to
|
sl@0
|
2873 |
** account for fulltextFilter() being called multiple times on the
|
sl@0
|
2874 |
** same cursor. The current solution is very fragile. Apply fix to
|
sl@0
|
2875 |
** fts2 as appropriate.
|
sl@0
|
2876 |
*/
|
sl@0
|
2877 |
static int fulltextFilter(
|
sl@0
|
2878 |
sqlite3_vtab_cursor *pCursor, /* The cursor used for this query */
|
sl@0
|
2879 |
int idxNum, const char *idxStr, /* Which indexing scheme to use */
|
sl@0
|
2880 |
int argc, sqlite3_value **argv /* Arguments for the indexing scheme */
|
sl@0
|
2881 |
){
|
sl@0
|
2882 |
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
sl@0
|
2883 |
fulltext_vtab *v = cursor_vtab(c);
|
sl@0
|
2884 |
int rc;
|
sl@0
|
2885 |
char *zSql;
|
sl@0
|
2886 |
|
sl@0
|
2887 |
TRACE(("FTS1 Filter %p\n",pCursor));
|
sl@0
|
2888 |
|
sl@0
|
2889 |
zSql = sqlite3_mprintf("select rowid, * from %%_content %s",
|
sl@0
|
2890 |
idxNum==QUERY_GENERIC ? "" : "where rowid=?");
|
sl@0
|
2891 |
sqlite3_finalize(c->pStmt);
|
sl@0
|
2892 |
rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt, zSql);
|
sl@0
|
2893 |
sqlite3_free(zSql);
|
sl@0
|
2894 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2895 |
|
sl@0
|
2896 |
c->iCursorType = idxNum;
|
sl@0
|
2897 |
switch( idxNum ){
|
sl@0
|
2898 |
case QUERY_GENERIC:
|
sl@0
|
2899 |
break;
|
sl@0
|
2900 |
|
sl@0
|
2901 |
case QUERY_ROWID:
|
sl@0
|
2902 |
rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
|
sl@0
|
2903 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2904 |
break;
|
sl@0
|
2905 |
|
sl@0
|
2906 |
default: /* full-text search */
|
sl@0
|
2907 |
{
|
sl@0
|
2908 |
const char *zQuery = (const char *)sqlite3_value_text(argv[0]);
|
sl@0
|
2909 |
DocList *pResult;
|
sl@0
|
2910 |
assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
|
sl@0
|
2911 |
assert( argc==1 );
|
sl@0
|
2912 |
queryClear(&c->q);
|
sl@0
|
2913 |
rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &pResult, &c->q);
|
sl@0
|
2914 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2915 |
if( c->result.pDoclist!=NULL ) docListDelete(c->result.pDoclist);
|
sl@0
|
2916 |
readerInit(&c->result, pResult);
|
sl@0
|
2917 |
break;
|
sl@0
|
2918 |
}
|
sl@0
|
2919 |
}
|
sl@0
|
2920 |
|
sl@0
|
2921 |
return fulltextNext(pCursor);
|
sl@0
|
2922 |
}
|
sl@0
|
2923 |
|
sl@0
|
2924 |
/* This is the xEof method of the virtual table. The SQLite core
|
sl@0
|
2925 |
** calls this routine to find out if it has reached the end of
|
sl@0
|
2926 |
** a query's results set.
|
sl@0
|
2927 |
*/
|
sl@0
|
2928 |
static int fulltextEof(sqlite3_vtab_cursor *pCursor){
|
sl@0
|
2929 |
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
sl@0
|
2930 |
return c->eof;
|
sl@0
|
2931 |
}
|
sl@0
|
2932 |
|
sl@0
|
2933 |
/* This is the xColumn method of the virtual table. The SQLite
|
sl@0
|
2934 |
** core calls this method during a query when it needs the value
|
sl@0
|
2935 |
** of a column from the virtual table. This method needs to use
|
sl@0
|
2936 |
** one of the sqlite3_result_*() routines to store the requested
|
sl@0
|
2937 |
** value back in the pContext.
|
sl@0
|
2938 |
*/
|
sl@0
|
2939 |
static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
|
sl@0
|
2940 |
sqlite3_context *pContext, int idxCol){
|
sl@0
|
2941 |
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
sl@0
|
2942 |
fulltext_vtab *v = cursor_vtab(c);
|
sl@0
|
2943 |
|
sl@0
|
2944 |
if( idxCol<v->nColumn ){
|
sl@0
|
2945 |
sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
|
sl@0
|
2946 |
sqlite3_result_value(pContext, pVal);
|
sl@0
|
2947 |
}else if( idxCol==v->nColumn ){
|
sl@0
|
2948 |
/* The extra column whose name is the same as the table.
|
sl@0
|
2949 |
** Return a blob which is a pointer to the cursor
|
sl@0
|
2950 |
*/
|
sl@0
|
2951 |
sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
|
sl@0
|
2952 |
}
|
sl@0
|
2953 |
return SQLITE_OK;
|
sl@0
|
2954 |
}
|
sl@0
|
2955 |
|
sl@0
|
2956 |
/* This is the xRowid method. The SQLite core calls this routine to
|
sl@0
|
2957 |
** retrive the rowid for the current row of the result set. The
|
sl@0
|
2958 |
** rowid should be written to *pRowid.
|
sl@0
|
2959 |
*/
|
sl@0
|
2960 |
static int fulltextRowid(sqlite3_vtab_cursor *pCursor, sqlite_int64 *pRowid){
|
sl@0
|
2961 |
fulltext_cursor *c = (fulltext_cursor *) pCursor;
|
sl@0
|
2962 |
|
sl@0
|
2963 |
*pRowid = sqlite3_column_int64(c->pStmt, 0);
|
sl@0
|
2964 |
return SQLITE_OK;
|
sl@0
|
2965 |
}
|
sl@0
|
2966 |
|
sl@0
|
2967 |
/* Add all terms in [zText] to the given hash table. If [iColumn] > 0,
|
sl@0
|
2968 |
* we also store positions and offsets in the hash table using the given
|
sl@0
|
2969 |
* column number. */
|
sl@0
|
2970 |
static int buildTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iDocid,
|
sl@0
|
2971 |
const char *zText, int iColumn){
|
sl@0
|
2972 |
sqlite3_tokenizer *pTokenizer = v->pTokenizer;
|
sl@0
|
2973 |
sqlite3_tokenizer_cursor *pCursor;
|
sl@0
|
2974 |
const char *pToken;
|
sl@0
|
2975 |
int nTokenBytes;
|
sl@0
|
2976 |
int iStartOffset, iEndOffset, iPosition;
|
sl@0
|
2977 |
int rc;
|
sl@0
|
2978 |
|
sl@0
|
2979 |
rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
|
sl@0
|
2980 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
2981 |
|
sl@0
|
2982 |
pCursor->pTokenizer = pTokenizer;
|
sl@0
|
2983 |
while( SQLITE_OK==pTokenizer->pModule->xNext(pCursor,
|
sl@0
|
2984 |
&pToken, &nTokenBytes,
|
sl@0
|
2985 |
&iStartOffset, &iEndOffset,
|
sl@0
|
2986 |
&iPosition) ){
|
sl@0
|
2987 |
DocList *p;
|
sl@0
|
2988 |
|
sl@0
|
2989 |
/* Positions can't be negative; we use -1 as a terminator internally. */
|
sl@0
|
2990 |
if( iPosition<0 ){
|
sl@0
|
2991 |
pTokenizer->pModule->xClose(pCursor);
|
sl@0
|
2992 |
return SQLITE_ERROR;
|
sl@0
|
2993 |
}
|
sl@0
|
2994 |
|
sl@0
|
2995 |
p = fts1HashFind(terms, pToken, nTokenBytes);
|
sl@0
|
2996 |
if( p==NULL ){
|
sl@0
|
2997 |
p = docListNew(DL_DEFAULT);
|
sl@0
|
2998 |
docListAddDocid(p, iDocid);
|
sl@0
|
2999 |
fts1HashInsert(terms, pToken, nTokenBytes, p);
|
sl@0
|
3000 |
}
|
sl@0
|
3001 |
if( iColumn>=0 ){
|
sl@0
|
3002 |
docListAddPosOffset(p, iColumn, iPosition, iStartOffset, iEndOffset);
|
sl@0
|
3003 |
}
|
sl@0
|
3004 |
}
|
sl@0
|
3005 |
|
sl@0
|
3006 |
/* TODO(shess) Check return? Should this be able to cause errors at
|
sl@0
|
3007 |
** this point? Actually, same question about sqlite3_finalize(),
|
sl@0
|
3008 |
** though one could argue that failure there means that the data is
|
sl@0
|
3009 |
** not durable. *ponder*
|
sl@0
|
3010 |
*/
|
sl@0
|
3011 |
pTokenizer->pModule->xClose(pCursor);
|
sl@0
|
3012 |
return rc;
|
sl@0
|
3013 |
}
|
sl@0
|
3014 |
|
sl@0
|
3015 |
/* Update the %_terms table to map the term [pTerm] to the given rowid. */
|
sl@0
|
3016 |
static int index_insert_term(fulltext_vtab *v, const char *pTerm, int nTerm,
|
sl@0
|
3017 |
DocList *d){
|
sl@0
|
3018 |
sqlite_int64 iIndexRow;
|
sl@0
|
3019 |
DocList doclist;
|
sl@0
|
3020 |
int iSegment = 0, rc;
|
sl@0
|
3021 |
|
sl@0
|
3022 |
rc = term_select(v, pTerm, nTerm, iSegment, &iIndexRow, &doclist);
|
sl@0
|
3023 |
if( rc==SQLITE_DONE ){
|
sl@0
|
3024 |
docListInit(&doclist, DL_DEFAULT, 0, 0);
|
sl@0
|
3025 |
docListUpdate(&doclist, d);
|
sl@0
|
3026 |
/* TODO(shess) Consider length(doclist)>CHUNK_MAX? */
|
sl@0
|
3027 |
rc = term_insert(v, NULL, pTerm, nTerm, iSegment, &doclist);
|
sl@0
|
3028 |
goto err;
|
sl@0
|
3029 |
}
|
sl@0
|
3030 |
if( rc!=SQLITE_ROW ) return SQLITE_ERROR;
|
sl@0
|
3031 |
|
sl@0
|
3032 |
docListUpdate(&doclist, d);
|
sl@0
|
3033 |
if( doclist.nData<=CHUNK_MAX ){
|
sl@0
|
3034 |
rc = term_update(v, iIndexRow, &doclist);
|
sl@0
|
3035 |
goto err;
|
sl@0
|
3036 |
}
|
sl@0
|
3037 |
|
sl@0
|
3038 |
/* Doclist doesn't fit, delete what's there, and accumulate
|
sl@0
|
3039 |
** forward.
|
sl@0
|
3040 |
*/
|
sl@0
|
3041 |
rc = term_delete(v, iIndexRow);
|
sl@0
|
3042 |
if( rc!=SQLITE_OK ) goto err;
|
sl@0
|
3043 |
|
sl@0
|
3044 |
/* Try to insert the doclist into a higher segment bucket. On
|
sl@0
|
3045 |
** failure, accumulate existing doclist with the doclist from that
|
sl@0
|
3046 |
** bucket, and put results in the next bucket.
|
sl@0
|
3047 |
*/
|
sl@0
|
3048 |
iSegment++;
|
sl@0
|
3049 |
while( (rc=term_insert(v, &iIndexRow, pTerm, nTerm, iSegment,
|
sl@0
|
3050 |
&doclist))!=SQLITE_OK ){
|
sl@0
|
3051 |
sqlite_int64 iSegmentRow;
|
sl@0
|
3052 |
DocList old;
|
sl@0
|
3053 |
int rc2;
|
sl@0
|
3054 |
|
sl@0
|
3055 |
/* Retain old error in case the term_insert() error was really an
|
sl@0
|
3056 |
** error rather than a bounced insert.
|
sl@0
|
3057 |
*/
|
sl@0
|
3058 |
rc2 = term_select(v, pTerm, nTerm, iSegment, &iSegmentRow, &old);
|
sl@0
|
3059 |
if( rc2!=SQLITE_ROW ) goto err;
|
sl@0
|
3060 |
|
sl@0
|
3061 |
rc = term_delete(v, iSegmentRow);
|
sl@0
|
3062 |
if( rc!=SQLITE_OK ) goto err;
|
sl@0
|
3063 |
|
sl@0
|
3064 |
/* Reusing lowest-number deleted row keeps the index smaller. */
|
sl@0
|
3065 |
if( iSegmentRow<iIndexRow ) iIndexRow = iSegmentRow;
|
sl@0
|
3066 |
|
sl@0
|
3067 |
/* doclist contains the newer data, so accumulate it over old.
|
sl@0
|
3068 |
** Then steal accumulated data for doclist.
|
sl@0
|
3069 |
*/
|
sl@0
|
3070 |
docListAccumulate(&old, &doclist);
|
sl@0
|
3071 |
docListDestroy(&doclist);
|
sl@0
|
3072 |
doclist = old;
|
sl@0
|
3073 |
|
sl@0
|
3074 |
iSegment++;
|
sl@0
|
3075 |
}
|
sl@0
|
3076 |
|
sl@0
|
3077 |
err:
|
sl@0
|
3078 |
docListDestroy(&doclist);
|
sl@0
|
3079 |
return rc;
|
sl@0
|
3080 |
}
|
sl@0
|
3081 |
|
sl@0
|
3082 |
/* Add doclists for all terms in [pValues] to the hash table [terms]. */
|
sl@0
|
3083 |
static int insertTerms(fulltext_vtab *v, fts1Hash *terms, sqlite_int64 iRowid,
|
sl@0
|
3084 |
sqlite3_value **pValues){
|
sl@0
|
3085 |
int i;
|
sl@0
|
3086 |
for(i = 0; i < v->nColumn ; ++i){
|
sl@0
|
3087 |
char *zText = (char*)sqlite3_value_text(pValues[i]);
|
sl@0
|
3088 |
int rc = buildTerms(v, terms, iRowid, zText, i);
|
sl@0
|
3089 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
3090 |
}
|
sl@0
|
3091 |
return SQLITE_OK;
|
sl@0
|
3092 |
}
|
sl@0
|
3093 |
|
sl@0
|
3094 |
/* Add empty doclists for all terms in the given row's content to the hash
|
sl@0
|
3095 |
* table [pTerms]. */
|
sl@0
|
3096 |
static int deleteTerms(fulltext_vtab *v, fts1Hash *pTerms, sqlite_int64 iRowid){
|
sl@0
|
3097 |
const char **pValues;
|
sl@0
|
3098 |
int i;
|
sl@0
|
3099 |
|
sl@0
|
3100 |
int rc = content_select(v, iRowid, &pValues);
|
sl@0
|
3101 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
3102 |
|
sl@0
|
3103 |
for(i = 0 ; i < v->nColumn; ++i) {
|
sl@0
|
3104 |
rc = buildTerms(v, pTerms, iRowid, pValues[i], -1);
|
sl@0
|
3105 |
if( rc!=SQLITE_OK ) break;
|
sl@0
|
3106 |
}
|
sl@0
|
3107 |
|
sl@0
|
3108 |
freeStringArray(v->nColumn, pValues);
|
sl@0
|
3109 |
return SQLITE_OK;
|
sl@0
|
3110 |
}
|
sl@0
|
3111 |
|
sl@0
|
3112 |
/* Insert a row into the %_content table; set *piRowid to be the ID of the
|
sl@0
|
3113 |
* new row. Fill [pTerms] with new doclists for the %_term table. */
|
sl@0
|
3114 |
static int index_insert(fulltext_vtab *v, sqlite3_value *pRequestRowid,
|
sl@0
|
3115 |
sqlite3_value **pValues,
|
sl@0
|
3116 |
sqlite_int64 *piRowid, fts1Hash *pTerms){
|
sl@0
|
3117 |
int rc;
|
sl@0
|
3118 |
|
sl@0
|
3119 |
rc = content_insert(v, pRequestRowid, pValues); /* execute an SQL INSERT */
|
sl@0
|
3120 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
3121 |
*piRowid = sqlite3_last_insert_rowid(v->db);
|
sl@0
|
3122 |
return insertTerms(v, pTerms, *piRowid, pValues);
|
sl@0
|
3123 |
}
|
sl@0
|
3124 |
|
sl@0
|
3125 |
/* Delete a row from the %_content table; fill [pTerms] with empty doclists
|
sl@0
|
3126 |
* to be written to the %_term table. */
|
sl@0
|
3127 |
static int index_delete(fulltext_vtab *v, sqlite_int64 iRow, fts1Hash *pTerms){
|
sl@0
|
3128 |
int rc = deleteTerms(v, pTerms, iRow);
|
sl@0
|
3129 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
3130 |
return content_delete(v, iRow); /* execute an SQL DELETE */
|
sl@0
|
3131 |
}
|
sl@0
|
3132 |
|
sl@0
|
3133 |
/* Update a row in the %_content table; fill [pTerms] with new doclists for the
|
sl@0
|
3134 |
* %_term table. */
|
sl@0
|
3135 |
static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
|
sl@0
|
3136 |
sqlite3_value **pValues, fts1Hash *pTerms){
|
sl@0
|
3137 |
/* Generate an empty doclist for each term that previously appeared in this
|
sl@0
|
3138 |
* row. */
|
sl@0
|
3139 |
int rc = deleteTerms(v, pTerms, iRow);
|
sl@0
|
3140 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
3141 |
|
sl@0
|
3142 |
rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
|
sl@0
|
3143 |
if( rc!=SQLITE_OK ) return rc;
|
sl@0
|
3144 |
|
sl@0
|
3145 |
/* Now add positions for terms which appear in the updated row. */
|
sl@0
|
3146 |
return insertTerms(v, pTerms, iRow, pValues);
|
sl@0
|
3147 |
}
|
sl@0
|
3148 |
|
sl@0
|
3149 |
/* This function implements the xUpdate callback; it is the top-level entry
|
sl@0
|
3150 |
* point for inserting, deleting or updating a row in a full-text table. */
|
sl@0
|
3151 |
static int fulltextUpdate(sqlite3_vtab *pVtab, int nArg, sqlite3_value **ppArg,
|
sl@0
|
3152 |
sqlite_int64 *pRowid){
|
sl@0
|
3153 |
fulltext_vtab *v = (fulltext_vtab *) pVtab;
|
sl@0
|
3154 |
fts1Hash terms; /* maps term string -> PosList */
|
sl@0
|
3155 |
int rc;
|
sl@0
|
3156 |
fts1HashElem *e;
|
sl@0
|
3157 |
|
sl@0
|
3158 |
TRACE(("FTS1 Update %p\n", pVtab));
|
sl@0
|
3159 |
|
sl@0
|
3160 |
fts1HashInit(&terms, FTS1_HASH_STRING, 1);
|
sl@0
|
3161 |
|
sl@0
|
3162 |
if( nArg<2 ){
|
sl@0
|
3163 |
rc = index_delete(v, sqlite3_value_int64(ppArg[0]), &terms);
|
sl@0
|
3164 |
} else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
|
sl@0
|
3165 |
/* An update:
|
sl@0
|
3166 |
* ppArg[0] = old rowid
|
sl@0
|
3167 |
* ppArg[1] = new rowid
|
sl@0
|
3168 |
* ppArg[2..2+v->nColumn-1] = values
|
sl@0
|
3169 |
* ppArg[2+v->nColumn] = value for magic column (we ignore this)
|
sl@0
|
3170 |
*/
|
sl@0
|
3171 |
sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
|
sl@0
|
3172 |
if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER ||
|
sl@0
|
3173 |
sqlite3_value_int64(ppArg[1]) != rowid ){
|
sl@0
|
3174 |
rc = SQLITE_ERROR; /* we don't allow changing the rowid */
|
sl@0
|
3175 |
} else {
|
sl@0
|
3176 |
assert( nArg==2+v->nColumn+1);
|
sl@0
|
3177 |
rc = index_update(v, rowid, &ppArg[2], &terms);
|
sl@0
|
3178 |
}
|
sl@0
|
3179 |
} else {
|
sl@0
|
3180 |
/* An insert:
|
sl@0
|
3181 |
* ppArg[1] = requested rowid
|
sl@0
|
3182 |
* ppArg[2..2+v->nColumn-1] = values
|
sl@0
|
3183 |
* ppArg[2+v->nColumn] = value for magic column (we ignore this)
|
sl@0
|
3184 |
*/
|
sl@0
|
3185 |
assert( nArg==2+v->nColumn+1);
|
sl@0
|
3186 |
rc = index_insert(v, ppArg[1], &ppArg[2], pRowid, &terms);
|
sl@0
|
3187 |
}
|
sl@0
|
3188 |
|
sl@0
|
3189 |
if( rc==SQLITE_OK ){
|
sl@0
|
3190 |
/* Write updated doclists to disk. */
|
sl@0
|
3191 |
for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
|
sl@0
|
3192 |
DocList *p = fts1HashData(e);
|
sl@0
|
3193 |
rc = index_insert_term(v, fts1HashKey(e), fts1HashKeysize(e), p);
|
sl@0
|
3194 |
if( rc!=SQLITE_OK ) break;
|
sl@0
|
3195 |
}
|
sl@0
|
3196 |
}
|
sl@0
|
3197 |
|
sl@0
|
3198 |
/* clean up */
|
sl@0
|
3199 |
for(e=fts1HashFirst(&terms); e; e=fts1HashNext(e)){
|
sl@0
|
3200 |
DocList *p = fts1HashData(e);
|
sl@0
|
3201 |
docListDelete(p);
|
sl@0
|
3202 |
}
|
sl@0
|
3203 |
fts1HashClear(&terms);
|
sl@0
|
3204 |
|
sl@0
|
3205 |
return rc;
|
sl@0
|
3206 |
}
|
sl@0
|
3207 |
|
sl@0
|
3208 |
/*
|
sl@0
|
3209 |
** Implementation of the snippet() function for FTS1
|
sl@0
|
3210 |
*/
|
sl@0
|
3211 |
static void snippetFunc(
|
sl@0
|
3212 |
sqlite3_context *pContext,
|
sl@0
|
3213 |
int argc,
|
sl@0
|
3214 |
sqlite3_value **argv
|
sl@0
|
3215 |
){
|
sl@0
|
3216 |
fulltext_cursor *pCursor;
|
sl@0
|
3217 |
if( argc<1 ) return;
|
sl@0
|
3218 |
if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
|
sl@0
|
3219 |
sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
|
sl@0
|
3220 |
sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
|
sl@0
|
3221 |
}else{
|
sl@0
|
3222 |
const char *zStart = "<b>";
|
sl@0
|
3223 |
const char *zEnd = "</b>";
|
sl@0
|
3224 |
const char *zEllipsis = "<b>...</b>";
|
sl@0
|
3225 |
memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
|
sl@0
|
3226 |
if( argc>=2 ){
|
sl@0
|
3227 |
zStart = (const char*)sqlite3_value_text(argv[1]);
|
sl@0
|
3228 |
if( argc>=3 ){
|
sl@0
|
3229 |
zEnd = (const char*)sqlite3_value_text(argv[2]);
|
sl@0
|
3230 |
if( argc>=4 ){
|
sl@0
|
3231 |
zEllipsis = (const char*)sqlite3_value_text(argv[3]);
|
sl@0
|
3232 |
}
|
sl@0
|
3233 |
}
|
sl@0
|
3234 |
}
|
sl@0
|
3235 |
snippetAllOffsets(pCursor);
|
sl@0
|
3236 |
snippetText(pCursor, zStart, zEnd, zEllipsis);
|
sl@0
|
3237 |
sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
|
sl@0
|
3238 |
pCursor->snippet.nSnippet, SQLITE_STATIC);
|
sl@0
|
3239 |
}
|
sl@0
|
3240 |
}
|
sl@0
|
3241 |
|
sl@0
|
3242 |
/*
|
sl@0
|
3243 |
** Implementation of the offsets() function for FTS1
|
sl@0
|
3244 |
*/
|
sl@0
|
3245 |
static void snippetOffsetsFunc(
|
sl@0
|
3246 |
sqlite3_context *pContext,
|
sl@0
|
3247 |
int argc,
|
sl@0
|
3248 |
sqlite3_value **argv
|
sl@0
|
3249 |
){
|
sl@0
|
3250 |
fulltext_cursor *pCursor;
|
sl@0
|
3251 |
if( argc<1 ) return;
|
sl@0
|
3252 |
if( sqlite3_value_type(argv[0])!=SQLITE_BLOB ||
|
sl@0
|
3253 |
sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
|
sl@0
|
3254 |
sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
|
sl@0
|
3255 |
}else{
|
sl@0
|
3256 |
memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
|
sl@0
|
3257 |
snippetAllOffsets(pCursor);
|
sl@0
|
3258 |
snippetOffsetText(&pCursor->snippet);
|
sl@0
|
3259 |
sqlite3_result_text(pContext,
|
sl@0
|
3260 |
pCursor->snippet.zOffset, pCursor->snippet.nOffset,
|
sl@0
|
3261 |
SQLITE_STATIC);
|
sl@0
|
3262 |
}
|
sl@0
|
3263 |
}
|
sl@0
|
3264 |
|
sl@0
|
3265 |
/*
|
sl@0
|
3266 |
** This routine implements the xFindFunction method for the FTS1
|
sl@0
|
3267 |
** virtual table.
|
sl@0
|
3268 |
*/
|
sl@0
|
3269 |
static int fulltextFindFunction(
|
sl@0
|
3270 |
sqlite3_vtab *pVtab,
|
sl@0
|
3271 |
int nArg,
|
sl@0
|
3272 |
const char *zName,
|
sl@0
|
3273 |
void (**pxFunc)(sqlite3_context*,int,sqlite3_value**),
|
sl@0
|
3274 |
void **ppArg
|
sl@0
|
3275 |
){
|
sl@0
|
3276 |
if( strcmp(zName,"snippet")==0 ){
|
sl@0
|
3277 |
*pxFunc = snippetFunc;
|
sl@0
|
3278 |
return 1;
|
sl@0
|
3279 |
}else if( strcmp(zName,"offsets")==0 ){
|
sl@0
|
3280 |
*pxFunc = snippetOffsetsFunc;
|
sl@0
|
3281 |
return 1;
|
sl@0
|
3282 |
}
|
sl@0
|
3283 |
return 0;
|
sl@0
|
3284 |
}
|
sl@0
|
3285 |
|
sl@0
|
3286 |
/*
|
sl@0
|
3287 |
** Rename an fts1 table.
|
sl@0
|
3288 |
*/
|
sl@0
|
3289 |
static int fulltextRename(
|
sl@0
|
3290 |
sqlite3_vtab *pVtab,
|
sl@0
|
3291 |
const char *zName
|
sl@0
|
3292 |
){
|
sl@0
|
3293 |
fulltext_vtab *p = (fulltext_vtab *)pVtab;
|
sl@0
|
3294 |
int rc = SQLITE_NOMEM;
|
sl@0
|
3295 |
char *zSql = sqlite3_mprintf(
|
sl@0
|
3296 |
"ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
|
sl@0
|
3297 |
"ALTER TABLE %Q.'%q_term' RENAME TO '%q_term';"
|
sl@0
|
3298 |
, p->zDb, p->zName, zName
|
sl@0
|
3299 |
, p->zDb, p->zName, zName
|
sl@0
|
3300 |
);
|
sl@0
|
3301 |
if( zSql ){
|
sl@0
|
3302 |
rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
|
sl@0
|
3303 |
sqlite3_free(zSql);
|
sl@0
|
3304 |
}
|
sl@0
|
3305 |
return rc;
|
sl@0
|
3306 |
}
|
sl@0
|
3307 |
|
sl@0
|
3308 |
static const sqlite3_module fulltextModule = {
|
sl@0
|
3309 |
/* iVersion */ 0,
|
sl@0
|
3310 |
/* xCreate */ fulltextCreate,
|
sl@0
|
3311 |
/* xConnect */ fulltextConnect,
|
sl@0
|
3312 |
/* xBestIndex */ fulltextBestIndex,
|
sl@0
|
3313 |
/* xDisconnect */ fulltextDisconnect,
|
sl@0
|
3314 |
/* xDestroy */ fulltextDestroy,
|
sl@0
|
3315 |
/* xOpen */ fulltextOpen,
|
sl@0
|
3316 |
/* xClose */ fulltextClose,
|
sl@0
|
3317 |
/* xFilter */ fulltextFilter,
|
sl@0
|
3318 |
/* xNext */ fulltextNext,
|
sl@0
|
3319 |
/* xEof */ fulltextEof,
|
sl@0
|
3320 |
/* xColumn */ fulltextColumn,
|
sl@0
|
3321 |
/* xRowid */ fulltextRowid,
|
sl@0
|
3322 |
/* xUpdate */ fulltextUpdate,
|
sl@0
|
3323 |
/* xBegin */ 0,
|
sl@0
|
3324 |
/* xSync */ 0,
|
sl@0
|
3325 |
/* xCommit */ 0,
|
sl@0
|
3326 |
/* xRollback */ 0,
|
sl@0
|
3327 |
/* xFindFunction */ fulltextFindFunction,
|
sl@0
|
3328 |
/* xRename */ fulltextRename,
|
sl@0
|
3329 |
};
|
sl@0
|
3330 |
|
sl@0
|
3331 |
int sqlite3Fts1Init(sqlite3 *db){
|
sl@0
|
3332 |
sqlite3_overload_function(db, "snippet", -1);
|
sl@0
|
3333 |
sqlite3_overload_function(db, "offsets", -1);
|
sl@0
|
3334 |
return sqlite3_create_module(db, "fts1", &fulltextModule, 0);
|
sl@0
|
3335 |
}
|
sl@0
|
3336 |
|
sl@0
|
3337 |
#if !SQLITE_CORE
|
sl@0
|
3338 |
int sqlite3_extension_init(sqlite3 *db, char **pzErrMsg,
|
sl@0
|
3339 |
const sqlite3_api_routines *pApi){
|
sl@0
|
3340 |
SQLITE_EXTENSION_INIT2(pApi)
|
sl@0
|
3341 |
return sqlite3Fts1Init(db);
|
sl@0
|
3342 |
}
|
sl@0
|
3343 |
#endif
|
sl@0
|
3344 |
|
sl@0
|
3345 |
#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
|