Symaptic: os/persistentdata/persistentstorage/sqlite3api/SQLite/fts3.c@bde4ae8d615e (annotated)

sl@0	1	/*
sl@0	2	** 2006 Oct 10
sl@0	3	**
sl@0	4	** The author disclaims copyright to this source code. In place of
sl@0	5	** a legal notice, here is a blessing:
sl@0	6	**
sl@0	7	** May you do good and not evil.
sl@0	8	** May you find forgiveness for yourself and forgive others.
sl@0	9	** May you share freely, never taking more than you give.
sl@0	10	**
sl@0	11	******************************************************************************
sl@0	12	**
sl@0	13	** This is an SQLite module implementing full-text search.
sl@0	14	*/
sl@0	15
sl@0	16	/*
sl@0	17	** The code in this file is only compiled if:
sl@0	18	**
sl@0	19	** * The FTS3 module is being built as an extension
sl@0	20	** (in which case SQLITE_CORE is not defined), or
sl@0	21	**
sl@0	22	** * The FTS3 module is being built into the core of
sl@0	23	** SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
sl@0	24	*/
sl@0	25
sl@0	26	/* TODO(shess) Consider exporting this comment to an HTML file or the
sl@0	27	** wiki.
sl@0	28	*/
sl@0	29	/* The full-text index is stored in a series of b+tree (-like)
sl@0	30	** structures called segments which map terms to doclists. The
sl@0	31	** structures are like b+trees in layout, but are constructed from the
sl@0	32	** bottom up in optimal fashion and are not updatable. Since trees
sl@0	33	** are built from the bottom up, things will be described from the
sl@0	34	** bottom up.
sl@0	35	**
sl@0	36	**
sl@0	37	** Varints **
sl@0	38	** The basic unit of encoding is a variable-length integer called a
sl@0	39	** varint. We encode variable-length integers in little-endian order
sl@0	40	** using seven bits per byte as follows:
sl@0	41	**
sl@0	42	** KEY:
sl@0	43	** A = 0xxxxxxx 7 bits of data and one flag bit
sl@0	44	** B = 1xxxxxxx 7 bits of data and one flag bit
sl@0	45	**
sl@0	46	** 7 bits - A
sl@0	47	** 14 bits - BA
sl@0	48	** 21 bits - BBA
sl@0	49	** and so on.
sl@0	50	**
sl@0	51	** This is identical to how sqlite encodes varints (see util.c).
sl@0	52	**
sl@0	53	**
sl@0	54	** Document lists **
sl@0	55	** A doclist (document list) holds a docid-sorted list of hits for a
sl@0	56	** given term. Doclists hold docids, and can optionally associate
sl@0	57	** token positions and offsets with docids.
sl@0	58	**
sl@0	59	** A DL_POSITIONS_OFFSETS doclist is stored like this:
sl@0	60	**
sl@0	61	** array {
sl@0	62	** varint docid;
sl@0	63	** array { (position list for column 0)
sl@0	64	** varint position; (delta from previous position plus POS_BASE)
sl@0	65	** varint startOffset; (delta from previous startOffset)
sl@0	66	** varint endOffset; (delta from startOffset)
sl@0	67	** }
sl@0	68	** array {
sl@0	69	** varint POS_COLUMN; (marks start of position list for new column)
sl@0	70	** varint column; (index of new column)
sl@0	71	** array {
sl@0	72	** varint position; (delta from previous position plus POS_BASE)
sl@0	73	** varint startOffset;(delta from previous startOffset)
sl@0	74	** varint endOffset; (delta from startOffset)
sl@0	75	** }
sl@0	76	** }
sl@0	77	** varint POS_END; (marks end of positions for this document.)
sl@0	78	** }
sl@0	79	**
sl@0	80	** Here, array { X } means zero or more occurrences of X, adjacent in
sl@0	81	** memory. A "position" is an index of a token in the token stream
sl@0	82	** generated by the tokenizer, while an "offset" is a byte offset,
sl@0	83	** both based at 0. Note that POS_END and POS_COLUMN occur in the
sl@0	84	** same logical place as the position element, and act as sentinels
sl@0	85	** ending a position list array.
sl@0	86	**
sl@0	87	** A DL_POSITIONS doclist omits the startOffset and endOffset
sl@0	88	** information. A DL_DOCIDS doclist omits both the position and
sl@0	89	** offset information, becoming an array of varint-encoded docids.
sl@0	90	**
sl@0	91	** On-disk data is stored as type DL_DEFAULT, so we don't serialize
sl@0	92	** the type. Due to how deletion is implemented in the segmentation
sl@0	93	** system, on-disk doclists MUST store at least positions.
sl@0	94	**
sl@0	95	**
sl@0	96	** Segment leaf nodes **
sl@0	97	** Segment leaf nodes store terms and doclists, ordered by term. Leaf
sl@0	98	** nodes are written using LeafWriter, and read using LeafReader (to
sl@0	99	** iterate through a single leaf node's data) and LeavesReader (to
sl@0	100	** iterate through a segment's entire leaf layer). Leaf nodes have
sl@0	101	** the format:
sl@0	102	**
sl@0	103	** varint iHeight; (height from leaf level, always 0)
sl@0	104	** varint nTerm; (length of first term)
sl@0	105	** char pTerm[nTerm]; (content of first term)
sl@0	106	** varint nDoclist; (length of term's associated doclist)
sl@0	107	** char pDoclist[nDoclist]; (content of doclist)
sl@0	108	** array {
sl@0	109	** (further terms are delta-encoded)
sl@0	110	** varint nPrefix; (length of prefix shared with previous term)
sl@0	111	** varint nSuffix; (length of unshared suffix)
sl@0	112	** char pTermSuffix[nSuffix];(unshared suffix of next term)
sl@0	113	** varint nDoclist; (length of term's associated doclist)
sl@0	114	** char pDoclist[nDoclist]; (content of doclist)
sl@0	115	** }
sl@0	116	**
sl@0	117	** Here, array { X } means zero or more occurrences of X, adjacent in
sl@0	118	** memory.
sl@0	119	**
sl@0	120	** Leaf nodes are broken into blocks which are stored contiguously in
sl@0	121	** the %_segments table in sorted order. This means that when the end
sl@0	122	** of a node is reached, the next term is in the node with the next
sl@0	123	** greater node id.
sl@0	124	**
sl@0	125	** New data is spilled to a new leaf node when the current node
sl@0	126	** exceeds LEAF_MAX bytes (default 2048). New data which itself is
sl@0	127	** larger than STANDALONE_MIN (default 1024) is placed in a standalone
sl@0	128	** node (a leaf node with a single term and doclist). The goal of
sl@0	129	** these settings is to pack together groups of small doclists while
sl@0	130	** making it efficient to directly access large doclists. The
sl@0	131	** assumption is that large doclists represent terms which are more
sl@0	132	** likely to be query targets.
sl@0	133	**
sl@0	134	** TODO(shess) It may be useful for blocking decisions to be more
sl@0	135	** dynamic. For instance, it may make more sense to have a 2.5k leaf
sl@0	136	** node rather than splitting into 2k and .5k nodes. My intuition is
sl@0	137	** that this might extend through 2x or 4x the pagesize.
sl@0	138	**
sl@0	139	**
sl@0	140	** Segment interior nodes **
sl@0	141	** Segment interior nodes store blockids for subtree nodes and terms
sl@0	142	** to describe what data is stored by each subtree. Interior
sl@0	143	** nodes are written using InteriorWriter, and read using
sl@0	144	** InteriorReader. InteriorWriters are created as needed when
sl@0	145	** SegmentWriter creates new leaf nodes, or when an interior node
sl@0	146	** itself grows too big and must be split. The format of interior
sl@0	147	** nodes:
sl@0	148	**
sl@0	149	** varint iHeight; (height from leaf level, always >0)
sl@0	150	** varint iBlockid; (block id of node's leftmost subtree)
sl@0	151	** optional {
sl@0	152	** varint nTerm; (length of first term)
sl@0	153	** char pTerm[nTerm]; (content of first term)
sl@0	154	** array {
sl@0	155	** (further terms are delta-encoded)
sl@0	156	** varint nPrefix; (length of shared prefix with previous term)
sl@0	157	** varint nSuffix; (length of unshared suffix)
sl@0	158	** char pTermSuffix[nSuffix]; (unshared suffix of next term)
sl@0	159	** }
sl@0	160	** }
sl@0	161	**
sl@0	162	** Here, optional { X } means an optional element, while array { X }
sl@0	163	** means zero or more occurrences of X, adjacent in memory.
sl@0	164	**
sl@0	165	** An interior node encodes n terms separating n+1 subtrees. The
sl@0	166	** subtree blocks are contiguous, so only the first subtree's blockid
sl@0	167	** is encoded. The subtree at iBlockid will contain all terms less
sl@0	168	** than the first term encoded (or all terms if no term is encoded).
sl@0	169	** Otherwise, for terms greater than or equal to pTerm[i] but less
sl@0	170	** than pTerm[i+1], the subtree for that term will be rooted at
sl@0	171	** iBlockid+i. Interior nodes only store enough term data to
sl@0	172	** distinguish adjacent children (if the rightmost term of the left
sl@0	173	** child is "something", and the leftmost term of the right child is
sl@0	174	** "wicked", only "w" is stored).
sl@0	175	**
sl@0	176	** New data is spilled to a new interior node at the same height when
sl@0	177	** the current node exceeds INTERIOR_MAX bytes (default 2048).
sl@0	178	** INTERIOR_MIN_TERMS (default 7) keeps large terms from monopolizing
sl@0	179	** interior nodes and making the tree too skinny. The interior nodes
sl@0	180	** at a given height are naturally tracked by interior nodes at
sl@0	181	** height+1, and so on.
sl@0	182	**
sl@0	183	**
sl@0	184	** Segment directory **
sl@0	185	** The segment directory in table %_segdir stores meta-information for
sl@0	186	** merging and deleting segments, and also the root node of the
sl@0	187	** segment's tree.
sl@0	188	**
sl@0	189	** The root node is the top node of the segment's tree after encoding
sl@0	190	** the entire segment, restricted to ROOT_MAX bytes (default 1024).
sl@0	191	** This could be either a leaf node or an interior node. If the top
sl@0	192	** node requires more than ROOT_MAX bytes, it is flushed to %_segments
sl@0	193	** and a new root interior node is generated (which should always fit
sl@0	194	** within ROOT_MAX because it only needs space for 2 varints, the
sl@0	195	** height and the blockid of the previous root).
sl@0	196	**
sl@0	197	** The meta-information in the segment directory is:
sl@0	198	** level - segment level (see below)
sl@0	199	** idx - index within level
sl@0	200	** - (level,idx uniquely identify a segment)
sl@0	201	** start_block - first leaf node
sl@0	202	** leaves_end_block - last leaf node
sl@0	203	** end_block - last block (including interior nodes)
sl@0	204	** root - contents of root node
sl@0	205	**
sl@0	206	** If the root node is a leaf node, then start_block,
sl@0	207	** leaves_end_block, and end_block are all 0.
sl@0	208	**
sl@0	209	**
sl@0	210	** Segment merging **
sl@0	211	** To amortize update costs, segments are grouped into levels and
sl@0	212	** merged in batches. Each increase in level represents exponentially
sl@0	213	** more documents.
sl@0	214	**
sl@0	215	** New documents (actually, document updates) are tokenized and
sl@0	216	** written individually (using LeafWriter) to a level 0 segment, with
sl@0	217	** incrementing idx. When idx reaches MERGE_COUNT (default 16), all
sl@0	218	** level 0 segments are merged into a single level 1 segment. Level 1
sl@0	219	** is populated like level 0, and eventually MERGE_COUNT level 1
sl@0	220	** segments are merged to a single level 2 segment (representing
sl@0	221	** MERGE_COUNT^2 updates), and so on.
sl@0	222	**
sl@0	223	** A segment merge traverses all segments at a given level in
sl@0	224	** parallel, performing a straightforward sorted merge. Since segment
sl@0	225	** leaf nodes are written in to the %_segments table in order, this
sl@0	226	** merge traverses the underlying sqlite disk structures efficiently.
sl@0	227	** After the merge, all segment blocks from the merged level are
sl@0	228	** deleted.
sl@0	229	**
sl@0	230	** MERGE_COUNT controls how often we merge segments. 16 seems to be
sl@0	231	** somewhat of a sweet spot for insertion performance. 32 and 64 show
sl@0	232	** very similar performance numbers to 16 on insertion, though they're
sl@0	233	** a tiny bit slower (perhaps due to more overhead in merge-time
sl@0	234	** sorting). 8 is about 20% slower than 16, 4 about 50% slower than
sl@0	235	** 16, 2 about 66% slower than 16.
sl@0	236	**
sl@0	237	** At query time, high MERGE_COUNT increases the number of segments
sl@0	238	** which need to be scanned and merged. For instance, with 100k docs
sl@0	239	** inserted:
sl@0	240	**
sl@0	241	** MERGE_COUNT segments
sl@0	242	** 16 25
sl@0	243	** 8 12
sl@0	244	** 4 10
sl@0	245	** 2 6
sl@0	246	**
sl@0	247	** This appears to have only a moderate impact on queries for very
sl@0	248	** frequent terms (which are somewhat dominated by segment merge
sl@0	249	** costs), and infrequent and non-existent terms still seem to be fast
sl@0	250	** even with many segments.
sl@0	251	**
sl@0	252	** TODO(shess) That said, it would be nice to have a better query-side
sl@0	253	** argument for MERGE_COUNT of 16. Also, it is possible/likely that
sl@0	254	** optimizations to things like doclist merging will swing the sweet
sl@0	255	** spot around.
sl@0	256	**
sl@0	257	**
sl@0	258	**
sl@0	259	** Handling of deletions and updates **
sl@0	260	** Since we're using a segmented structure, with no docid-oriented
sl@0	261	** index into the term index, we clearly cannot simply update the term
sl@0	262	** index when a document is deleted or updated. For deletions, we
sl@0	263	** write an empty doclist (varint(docid) varint(POS_END)), for updates
sl@0	264	** we simply write the new doclist. Segment merges overwrite older
sl@0	265	** data for a particular docid with newer data, so deletes or updates
sl@0	266	** will eventually overtake the earlier data and knock it out. The
sl@0	267	** query logic likewise merges doclists so that newer data knocks out
sl@0	268	** older data.
sl@0	269	**
sl@0	270	** TODO(shess) Provide a VACUUM type operation to clear out all
sl@0	271	** deletions and duplications. This would basically be a forced merge
sl@0	272	** into a single segment.
sl@0	273	*/
sl@0	274
sl@0	275	#if !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3)
sl@0	276
sl@0	277	#if defined(SQLITE_ENABLE_FTS3) && !defined(SQLITE_CORE)
sl@0	278	# define SQLITE_CORE 1
sl@0	279	#endif
sl@0	280
sl@0	281	#include <assert.h>
sl@0	282	#include <stdlib.h>
sl@0	283	#include <stdio.h>
sl@0	284	#include <string.h>
sl@0	285	#include <ctype.h>
sl@0	286
sl@0	287	#include "fts3.h"
sl@0	288	#include "fts3_hash.h"
sl@0	289	#include "fts3_tokenizer.h"
sl@0	290	#ifndef SQLITE_CORE
sl@0	291	# include "sqlite3ext.h"
sl@0	292	SQLITE_EXTENSION_INIT1
sl@0	293	#endif
sl@0	294
sl@0	295
sl@0	296	/* TODO(shess) MAN, this thing needs some refactoring. At minimum, it
sl@0	297	** would be nice to order the file better, perhaps something along the
sl@0	298	** lines of:
sl@0	299	**
sl@0	300	** - utility functions
sl@0	301	** - table setup functions
sl@0	302	** - table update functions
sl@0	303	** - table query functions
sl@0	304	**
sl@0	305	** Put the query functions last because they're likely to reference
sl@0	306	** typedefs or functions from the table update section.
sl@0	307	*/
sl@0	308
/* Debug tracing hook.  Flip the "#if 0" to "#if 1" to make
** FTSTRACE((fmt, ...)) print via printf() and flush stdout.  The
** argument must be a parenthesized printf() argument list.  When
** disabled the macro expands to nothing, so its argument is never
** evaluated.
*/
#if 0
# define FTSTRACE(A)  printf A; fflush(stdout)
#else
# define FTSTRACE(A)
#endif
sl@0	314
/*
** Default span for NEAR operators.
*/
#define SQLITE_FTS3_DEFAULT_NEAR_PARAM 10
sl@0	319
sl@0	320	/* It is not safe to call isspace(), tolower(), or isalnum() on
sl@0	321	** hi-bit-set characters. This is the same solution used in the
sl@0	322	** tokenizer.
sl@0	323	*/
sl@0	324	/* TODO(shess) The snippet-generation code should be using the
sl@0	325	** tokenizer-generated tokens rather than doing its own local
sl@0	326	** tokenization.
sl@0	327	*/
sl@0	328	/* TODO(shess) Is __isascii() a portable version of (c&0x80)==0? */
/* isspace() wrapper that is safe for bytes with the high bit set:
** such bytes are reported as non-space without calling isspace().
** Returns non-zero if c is 7-bit ASCII white space, 0 otherwise.
*/
static int safe_isspace(char c){
  return (c&0x80)==0 ? isspace((unsigned char)c) : 0;
}
/* tolower() wrapper that is safe for bytes with the high bit set:
** such bytes are returned unchanged without calling tolower().
*/
static int safe_tolower(char c){
  return (c&0x80)==0 ? tolower((unsigned char)c) : c;
}
/* isalnum() wrapper that is safe for bytes with the high bit set:
** such bytes are reported as non-alphanumeric without calling
** isalnum().  Returns non-zero for 7-bit ASCII letters and digits.
*/
static int safe_isalnum(char c){
  return (c&0x80)==0 ? isalnum((unsigned char)c) : 0;
}
sl@0	338
/* Kinds of doclist, in increasing order of stored detail.  Code in
** this file compares these values with relational operators (for
** example iType>=DL_POSITIONS), so the order of the enumerators
** matters.
*/
typedef enum DocListType {
  DL_DOCIDS,              /* docids only */
  DL_POSITIONS,           /* docids + positions */
  DL_POSITIONS_OFFSETS    /* docids + positions + offsets */
} DocListType;
sl@0	344
sl@0	345	/*
sl@0	346	** By default, only positions and not offsets are stored in the doclists.
sl@0	347	** To change this so that offsets are stored too, compile with
sl@0	348	**
sl@0	349	** -DDL_DEFAULT=DL_POSITIONS_OFFSETS
sl@0	350	**
sl@0	351	** If DL_DEFAULT is set to DL_DOCIDS, your table can only be inserted
sl@0	352	** into (no deletes or updates).
sl@0	353	*/
/* Doclist type used when none was supplied on the compiler command
** line (-DDL_DEFAULT=...).
*/
#ifndef DL_DEFAULT
# define DL_DEFAULT DL_POSITIONS
#endif
sl@0	357
/* Reserved values in a position list.  Real token positions are
** stored offset by POS_BASE so that they cannot collide with the
** POS_END and POS_COLUMN markers.
*/
enum {
  POS_END = 0,        /* end of this position list */
  POS_COLUMN,         /* followed by new column number */
  POS_BASE            /* first value available for actual positions */
};
sl@0	363
/* MERGE_COUNT controls how often we merge segments (see comment at
** top of file).
*/
#define MERGE_COUNT 16
sl@0	368
sl@0	369	/* utility functions */
sl@0	370
sl@0	371	/* CLEAR() and SCRAMBLE() abstract memset() on a pointer to a single
sl@0	372	** record to prevent errors of the form:
sl@0	373	**
sl@0	374	** my_function(SomeType *b){
sl@0	375	** memset(b, '\0', sizeof(b)); // sizeof(b)!=sizeof(*b)
sl@0	376	** }
sl@0	377	*/
sl@0	378	/* TODO(shess) Obvious candidates for a header file. */
/* Zero the single record that b points to (sizeof(*(b)) bytes). */
#define CLEAR(b) memset(b, '\0', sizeof(*(b)))
sl@0	380
/* In debug builds, overwrite the record that b points to with the
** byte 0x55.  In NDEBUG builds SCRAMBLE() expands to nothing.
*/
#ifndef NDEBUG
# define SCRAMBLE(b) memset(b, 0x55, sizeof(*(b)))
#else
# define SCRAMBLE(b)
#endif
sl@0	386
/* We may need up to VARINT_MAX bytes to store an encoded 64-bit integer
** (64 bits at 7 bits per byte rounds up to 10 bytes).
*/
#define VARINT_MAX 10
sl@0	389
sl@0	390	/* Write a 64-bit variable-length integer to memory starting at p[0].
sl@0	391	* The length of data written will be between 1 and VARINT_MAX bytes.
sl@0	392	* The number of bytes written is returned. */
sl@0	393	static int fts3PutVarint(char *p, sqlite_int64 v){
sl@0	394	unsigned char q = (unsigned char ) p;
sl@0	395	sqlite_uint64 vu = v;
sl@0	396	do{
sl@0	397	*q++ = (unsigned char) ((vu & 0x7f) \| 0x80);
sl@0	398	vu >>= 7;
sl@0	399	}while( vu!=0 );
sl@0	400	q[-1] &= 0x7f; /* turn off high bit in final byte */
sl@0	401	assert( q - (unsigned char *)p <= VARINT_MAX );
sl@0	402	return (int) (q - (unsigned char *)p);
sl@0	403	}
sl@0	404
sl@0	405	/* Read a 64-bit variable-length integer from memory starting at p[0].
sl@0	406	* Return the number of bytes read, or 0 on error.
sl@0	407	* The value is stored in v. /
sl@0	408	static int fts3GetVarint(const char p, sqlite_int64 v){
sl@0	409	const unsigned char q = (const unsigned char ) p;
sl@0	410	sqlite_uint64 x = 0, y = 1;
sl@0	411	while( (*q & 0x80) == 0x80 ){
sl@0	412	x += y * (*q++ & 0x7f);
sl@0	413	y <<= 7;
sl@0	414	if( q - (unsigned char )p >= VARINT_MAX ){ / bad data */
sl@0	415	assert( 0 );
sl@0	416	return 0;
sl@0	417	}
sl@0	418	}
sl@0	419	x += y * (*q++);
sl@0	420	*v = (sqlite_int64) x;
sl@0	421	return (int) (q - (unsigned char *)p);
sl@0	422	}
sl@0	423
sl@0	424	static int fts3GetVarint32(const char p, int pi){
sl@0	425	sqlite_int64 i;
sl@0	426	int ret = fts3GetVarint(p, &i);
sl@0	427	*pi = (int) i;
sl@0	428	assert( *pi==i );
sl@0	429	return ret;
sl@0	430	}
sl@0	431
sl@0	432	/*******************************************************************/
sl@0	433	/* DataBuffer is used to collect data into a buffer in piecemeal
sl@0	434	** fashion. It implements the usual distinction between amount of
sl@0	435	** data currently stored (nData) and buffer capacity (nCapacity).
sl@0	436	**
sl@0	437	** dataBufferInit - create a buffer with given initial capacity.
sl@0	438	** dataBufferReset - forget buffer's data, retaining capacity.
sl@0	439	** dataBufferDestroy - free buffer's data.
sl@0	440	** dataBufferSwap - swap contents of two buffers.
sl@0	441	** dataBufferExpand - expand capacity without adding data.
sl@0	442	** dataBufferAppend - append data.
sl@0	443	** dataBufferAppend2 - append two pieces of data at once.
sl@0	444	** dataBufferReplace - replace buffer's data.
sl@0	445	*/
/* Growable byte buffer.  nData bytes of pData are in use out of
** nCapacity allocated.
*/
typedef struct DataBuffer {
  char *pData;          /* Pointer to malloc'ed buffer. */
  int nCapacity;        /* Size of pData buffer. */
  int nData;            /* End of data loaded into pData. */
} DataBuffer;
sl@0	451
sl@0	452	static void dataBufferInit(DataBuffer *pBuffer, int nCapacity){
sl@0	453	assert( nCapacity>=0 );
sl@0	454	pBuffer->nData = 0;
sl@0	455	pBuffer->nCapacity = nCapacity;
sl@0	456	pBuffer->pData = nCapacity==0 ? NULL : sqlite3_malloc(nCapacity);
sl@0	457	}
sl@0	458	static void dataBufferReset(DataBuffer *pBuffer){
sl@0	459	pBuffer->nData = 0;
sl@0	460	}
sl@0	461	static void dataBufferDestroy(DataBuffer *pBuffer){
sl@0	462	if( pBuffer->pData!=NULL ) sqlite3_free(pBuffer->pData);
sl@0	463	SCRAMBLE(pBuffer);
sl@0	464	}
sl@0	465	static void dataBufferSwap(DataBuffer pBuffer1, DataBuffer pBuffer2){
sl@0	466	DataBuffer tmp = *pBuffer1;
sl@0	467	pBuffer1 = pBuffer2;
sl@0	468	*pBuffer2 = tmp;
sl@0	469	}
sl@0	470	static void dataBufferExpand(DataBuffer *pBuffer, int nAddCapacity){
sl@0	471	assert( nAddCapacity>0 );
sl@0	472	/* TODO(shess) Consider expanding more aggressively. Note that the
sl@0	473	** underlying malloc implementation may take care of such things for
sl@0	474	** us already.
sl@0	475	*/
sl@0	476	if( pBuffer->nData+nAddCapacity>pBuffer->nCapacity ){
sl@0	477	pBuffer->nCapacity = pBuffer->nData+nAddCapacity;
sl@0	478	pBuffer->pData = sqlite3_realloc(pBuffer->pData, pBuffer->nCapacity);
sl@0	479	}
sl@0	480	}
sl@0	481	static void dataBufferAppend(DataBuffer *pBuffer,
sl@0	482	const char *pSource, int nSource){
sl@0	483	assert( nSource>0 && pSource!=NULL );
sl@0	484	dataBufferExpand(pBuffer, nSource);
sl@0	485	memcpy(pBuffer->pData+pBuffer->nData, pSource, nSource);
sl@0	486	pBuffer->nData += nSource;
sl@0	487	}
sl@0	488	static void dataBufferAppend2(DataBuffer *pBuffer,
sl@0	489	const char *pSource1, int nSource1,
sl@0	490	const char *pSource2, int nSource2){
sl@0	491	assert( nSource1>0 && pSource1!=NULL );
sl@0	492	assert( nSource2>0 && pSource2!=NULL );
sl@0	493	dataBufferExpand(pBuffer, nSource1+nSource2);
sl@0	494	memcpy(pBuffer->pData+pBuffer->nData, pSource1, nSource1);
sl@0	495	memcpy(pBuffer->pData+pBuffer->nData+nSource1, pSource2, nSource2);
sl@0	496	pBuffer->nData += nSource1+nSource2;
sl@0	497	}
sl@0	498	static void dataBufferReplace(DataBuffer *pBuffer,
sl@0	499	const char *pSource, int nSource){
sl@0	500	dataBufferReset(pBuffer);
sl@0	501	dataBufferAppend(pBuffer, pSource, nSource);
sl@0	502	}
sl@0	503
sl@0	504	/* StringBuffer is a null-terminated version of DataBuffer. */
sl@0	505	typedef struct StringBuffer {
sl@0	506	DataBuffer b; /* Includes null terminator. */
sl@0	507	} StringBuffer;
sl@0	508
sl@0	509	static void initStringBuffer(StringBuffer *sb){
sl@0	510	dataBufferInit(&sb->b, 100);
sl@0	511	dataBufferReplace(&sb->b, "", 1);
sl@0	512	}
sl@0	513	static int stringBufferLength(StringBuffer *sb){
sl@0	514	return sb->b.nData-1;
sl@0	515	}
sl@0	516	static char stringBufferData(StringBuffer sb){
sl@0	517	return sb->b.pData;
sl@0	518	}
sl@0	519	static void stringBufferDestroy(StringBuffer *sb){
sl@0	520	dataBufferDestroy(&sb->b);
sl@0	521	}
sl@0	522
sl@0	523	static void nappend(StringBuffer sb, const char zFrom, int nFrom){
sl@0	524	assert( sb->b.nData>0 );
sl@0	525	if( nFrom>0 ){
sl@0	526	sb->b.nData--;
sl@0	527	dataBufferAppend2(&sb->b, zFrom, nFrom, "", 1);
sl@0	528	}
sl@0	529	}
sl@0	530	static void append(StringBuffer sb, const char zFrom){
sl@0	531	nappend(sb, zFrom, strlen(zFrom));
sl@0	532	}
sl@0	533
sl@0	534	/* Append a list of strings separated by commas. */
sl@0	535	static void appendList(StringBuffer sb, int nString, char *azString){
sl@0	536	int i;
sl@0	537	for(i=0; i<nString; ++i){
sl@0	538	if( i>0 ) append(sb, ", ");
sl@0	539	append(sb, azString[i]);
sl@0	540	}
sl@0	541	}
sl@0	542
sl@0	543	static int endsInWhiteSpace(StringBuffer *p){
sl@0	544	return stringBufferLength(p)>0 &&
sl@0	545	safe_isspace(stringBufferData(p)[stringBufferLength(p)-1]);
sl@0	546	}
sl@0	547
sl@0	548	/* If the StringBuffer ends in something other than white space, add a
sl@0	549	** single space character to the end.
sl@0	550	*/
sl@0	551	static void appendWhiteSpace(StringBuffer *p){
sl@0	552	if( stringBufferLength(p)==0 ) return;
sl@0	553	if( !endsInWhiteSpace(p) ) append(p, " ");
sl@0	554	}
sl@0	555
sl@0	556	/* Remove white space from the end of the StringBuffer */
sl@0	557	static void trimWhiteSpace(StringBuffer *p){
sl@0	558	while( endsInWhiteSpace(p) ){
sl@0	559	p->b.pData[--p->b.nData-1] = '\0';
sl@0	560	}
sl@0	561	}
sl@0	562
sl@0	563	/*******************************************************************/
sl@0	564	/* DLReader is used to read document elements from a doclist. The
sl@0	565	** current docid is cached, so dlrDocid() is fast. DLReader does not
sl@0	566	** own the doclist buffer.
sl@0	567	**
sl@0	568	** dlrAtEnd - true if there's no more data to read.
sl@0	569	** dlrDocid - docid of current document.
sl@0	570	** dlrDocData - doclist data for current document (including docid).
sl@0	571	** dlrDocDataBytes - length of same.
sl@0	572	** dlrAllDataBytes - length of all remaining data.
sl@0	573	** dlrPosData - position data for current document.
sl@0	574	** dlrPosDataLen - length of pos data for current document (incl POS_END).
sl@0	575	** dlrStep - step to next document.
sl@0	576	** dlrInit - initialize for doclist of given type against given data.
sl@0	577	** dlrDestroy - clean up.
sl@0	578	**
sl@0	579	** Expected usage is something like:
sl@0	580	**
sl@0	581	** DLReader reader;
sl@0	582	** dlrInit(&reader, iType, pData, nData);
sl@0	583	** while( !dlrAtEnd(&reader) ){
sl@0	584	** // calls to dlrDocid() and kin.
sl@0	585	** dlrStep(&reader);
sl@0	586	** }
sl@0	587	** dlrDestroy(&reader);
sl@0	588	*/
sl@0	589	typedef struct DLReader {
sl@0	590	DocListType iType;
sl@0	591	const char *pData;
sl@0	592	int nData;
sl@0	593
sl@0	594	sqlite_int64 iDocid;
sl@0	595	int nElement;
sl@0	596	} DLReader;
sl@0	597
sl@0	598	static int dlrAtEnd(DLReader *pReader){
sl@0	599	assert( pReader->nData>=0 );
sl@0	600	return pReader->nData==0;
sl@0	601	}
sl@0	602	static sqlite_int64 dlrDocid(DLReader *pReader){
sl@0	603	assert( !dlrAtEnd(pReader) );
sl@0	604	return pReader->iDocid;
sl@0	605	}
sl@0	606	static const char dlrDocData(DLReader pReader){
sl@0	607	assert( !dlrAtEnd(pReader) );
sl@0	608	return pReader->pData;
sl@0	609	}
sl@0	610	static int dlrDocDataBytes(DLReader *pReader){
sl@0	611	assert( !dlrAtEnd(pReader) );
sl@0	612	return pReader->nElement;
sl@0	613	}
sl@0	614	static int dlrAllDataBytes(DLReader *pReader){
sl@0	615	assert( !dlrAtEnd(pReader) );
sl@0	616	return pReader->nData;
sl@0	617	}
sl@0	618	/* TODO(shess) Consider adding a field to track iDocid varint length
sl@0	619	** to make these two functions faster. This might matter (a tiny bit)
sl@0	620	** for queries.
sl@0	621	*/
sl@0	622	static const char dlrPosData(DLReader pReader){
sl@0	623	sqlite_int64 iDummy;
sl@0	624	int n = fts3GetVarint(pReader->pData, &iDummy);
sl@0	625	assert( !dlrAtEnd(pReader) );
sl@0	626	return pReader->pData+n;
sl@0	627	}
sl@0	628	static int dlrPosDataLen(DLReader *pReader){
sl@0	629	sqlite_int64 iDummy;
sl@0	630	int n = fts3GetVarint(pReader->pData, &iDummy);
sl@0	631	assert( !dlrAtEnd(pReader) );
sl@0	632	return pReader->nElement-n;
sl@0	633	}
sl@0	634	static void dlrStep(DLReader *pReader){
sl@0	635	assert( !dlrAtEnd(pReader) );
sl@0	636
sl@0	637	/* Skip past current doclist element. */
sl@0	638	assert( pReader->nElement<=pReader->nData );
sl@0	639	pReader->pData += pReader->nElement;
sl@0	640	pReader->nData -= pReader->nElement;
sl@0	641
sl@0	642	/* If there is more data, read the next doclist element. */
sl@0	643	if( pReader->nData!=0 ){
sl@0	644	sqlite_int64 iDocidDelta;
sl@0	645	int iDummy, n = fts3GetVarint(pReader->pData, &iDocidDelta);
sl@0	646	pReader->iDocid += iDocidDelta;
sl@0	647	if( pReader->iType>=DL_POSITIONS ){
sl@0	648	assert( n<pReader->nData );
sl@0	649	while( 1 ){
sl@0	650	n += fts3GetVarint32(pReader->pData+n, &iDummy);
sl@0	651	assert( n<=pReader->nData );
sl@0	652	if( iDummy==POS_END ) break;
sl@0	653	if( iDummy==POS_COLUMN ){
sl@0	654	n += fts3GetVarint32(pReader->pData+n, &iDummy);
sl@0	655	assert( n<pReader->nData );
sl@0	656	}else if( pReader->iType==DL_POSITIONS_OFFSETS ){
sl@0	657	n += fts3GetVarint32(pReader->pData+n, &iDummy);
sl@0	658	n += fts3GetVarint32(pReader->pData+n, &iDummy);
sl@0	659	assert( n<pReader->nData );
sl@0	660	}
sl@0	661	}
sl@0	662	}
sl@0	663	pReader->nElement = n;
sl@0	664	assert( pReader->nElement<=pReader->nData );
sl@0	665	}
sl@0	666	}
sl@0	667	static void dlrInit(DLReader *pReader, DocListType iType,
sl@0	668	const char *pData, int nData){
sl@0	669	assert( pData!=NULL && nData!=0 );
sl@0	670	pReader->iType = iType;
sl@0	671	pReader->pData = pData;
sl@0	672	pReader->nData = nData;
sl@0	673	pReader->nElement = 0;
sl@0	674	pReader->iDocid = 0;
sl@0	675
sl@0	676	/* Load the first element's data. There must be a first element. */
sl@0	677	dlrStep(pReader);
sl@0	678	}
sl@0	679	static void dlrDestroy(DLReader *pReader){
sl@0	680	SCRAMBLE(pReader);
sl@0	681	}
sl@0	682
sl@0	683	#ifndef NDEBUG
sl@0	684	/* Verify that the doclist can be validly decoded. Also returns the
sl@0	685	** last docid found because it is convenient in other assertions for
sl@0	686	** DLWriter.
sl@0	687	*/
sl@0	688	static void docListValidate(DocListType iType, const char *pData, int nData,
sl@0	689	sqlite_int64 *pLastDocid){
sl@0	690	sqlite_int64 iPrevDocid = 0;
sl@0	691	assert( nData>0 );
sl@0	692	assert( pData!=0 );
sl@0	693	assert( pData+nData>pData );
sl@0	694	while( nData!=0 ){
sl@0	695	sqlite_int64 iDocidDelta;
sl@0	696	int n = fts3GetVarint(pData, &iDocidDelta);
sl@0	697	iPrevDocid += iDocidDelta;
sl@0	698	if( iType>DL_DOCIDS ){
sl@0	699	int iDummy;
sl@0	700	while( 1 ){
sl@0	701	n += fts3GetVarint32(pData+n, &iDummy);
sl@0	702	if( iDummy==POS_END ) break;
sl@0	703	if( iDummy==POS_COLUMN ){
sl@0	704	n += fts3GetVarint32(pData+n, &iDummy);
sl@0	705	}else if( iType>DL_POSITIONS ){
sl@0	706	n += fts3GetVarint32(pData+n, &iDummy);
sl@0	707	n += fts3GetVarint32(pData+n, &iDummy);
sl@0	708	}
sl@0	709	assert( n<=nData );
sl@0	710	}
sl@0	711	}
sl@0	712	assert( n<=nData );
sl@0	713	pData += n;
sl@0	714	nData -= n;
sl@0	715	}
sl@0	716	if( pLastDocid ) *pLastDocid = iPrevDocid;
sl@0	717	}
sl@0	718	#define ASSERT_VALID_DOCLIST(i, p, n, o) docListValidate(i, p, n, o)
sl@0	719	#else
sl@0	720	#define ASSERT_VALID_DOCLIST(i, p, n, o) assert( 1 )
sl@0	721	#endif
sl@0	722
sl@0	723	/*******************************************************************/
sl@0	724	/* DLWriter is used to write doclist data to a DataBuffer. DLWriter
sl@0	725	** always appends to the buffer and does not own it.
sl@0	726	**
sl@0	727	** dlwInit - initialize to write a given type doclist to a buffer.
sl@0	728	** dlwDestroy - clear the writer's memory. Does not free buffer.
sl@0	729	** dlwAppend - append raw doclist data to buffer.
sl@0	730	** dlwCopy - copy next doclist from reader to writer.
sl@0	731	** dlwAdd - construct doclist element and append to buffer.
sl@0	732	** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter).
sl@0	733	*/
sl@0	734	typedef struct DLWriter {
sl@0	735	DocListType iType;
sl@0	736	DataBuffer *b;
sl@0	737	sqlite_int64 iPrevDocid;
sl@0	738	#ifndef NDEBUG
sl@0	739	int has_iPrevDocid;
sl@0	740	#endif
sl@0	741	} DLWriter;
sl@0	742
sl@0	743	static void dlwInit(DLWriter pWriter, DocListType iType, DataBuffer b){
sl@0	744	pWriter->b = b;
sl@0	745	pWriter->iType = iType;
sl@0	746	pWriter->iPrevDocid = 0;
sl@0	747	#ifndef NDEBUG
sl@0	748	pWriter->has_iPrevDocid = 0;
sl@0	749	#endif
sl@0	750	}
sl@0	751	static void dlwDestroy(DLWriter *pWriter){
sl@0	752	SCRAMBLE(pWriter);
sl@0	753	}
sl@0	754	/* iFirstDocid is the first docid in the doclist in pData. It is
sl@0	755	** needed because pData may point within a larger doclist, in which
sl@0	756	** case the first item would be delta-encoded.
sl@0	757	**
sl@0	758	** iLastDocid is the final docid in the doclist in pData. It is
sl@0	759	** needed to create the new iPrevDocid for future delta-encoding. The
sl@0	760	** code could decode the passed doclist to recreate iLastDocid, but
sl@0	761	** the only current user (docListMerge) already has decoded this
sl@0	762	** information.
sl@0	763	*/
sl@0	764	/* TODO(shess) This has become just a helper for docListMerge.
sl@0	765	** Consider a refactor to make this cleaner.
sl@0	766	*/
sl@0	767	static void dlwAppend(DLWriter *pWriter,
sl@0	768	const char *pData, int nData,
sl@0	769	sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){
sl@0	770	sqlite_int64 iDocid = 0;
sl@0	771	char c[VARINT_MAX];
sl@0	772	int nFirstOld, nFirstNew; /* Old and new varint len of first docid. */
sl@0	773	#ifndef NDEBUG
sl@0	774	sqlite_int64 iLastDocidDelta;
sl@0	775	#endif
sl@0	776
sl@0	777	/* Recode the initial docid as delta from iPrevDocid. */
sl@0	778	nFirstOld = fts3GetVarint(pData, &iDocid);
sl@0	779	assert( nFirstOld<nData \|\| (nFirstOld==nData && pWriter->iType==DL_DOCIDS) );
sl@0	780	nFirstNew = fts3PutVarint(c, iFirstDocid-pWriter->iPrevDocid);
sl@0	781
sl@0	782	/* Verify that the incoming doclist is valid AND that it ends with
sl@0	783	** the expected docid. This is essential because we'll trust this
sl@0	784	** docid in future delta-encoding.
sl@0	785	*/
sl@0	786	ASSERT_VALID_DOCLIST(pWriter->iType, pData, nData, &iLastDocidDelta);
sl@0	787	assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta );
sl@0	788
sl@0	789	/* Append recoded initial docid and everything else. Rest of docids
sl@0	790	** should have been delta-encoded from previous initial docid.
sl@0	791	*/
sl@0	792	if( nFirstOld<nData ){
sl@0	793	dataBufferAppend2(pWriter->b, c, nFirstNew,
sl@0	794	pData+nFirstOld, nData-nFirstOld);
sl@0	795	}else{
sl@0	796	dataBufferAppend(pWriter->b, c, nFirstNew);
sl@0	797	}
sl@0	798	pWriter->iPrevDocid = iLastDocid;
sl@0	799	}
sl@0	800	static void dlwCopy(DLWriter pWriter, DLReader pReader){
sl@0	801	dlwAppend(pWriter, dlrDocData(pReader), dlrDocDataBytes(pReader),
sl@0	802	dlrDocid(pReader), dlrDocid(pReader));
sl@0	803	}
sl@0	804	static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){
sl@0	805	char c[VARINT_MAX];
sl@0	806	int n = fts3PutVarint(c, iDocid-pWriter->iPrevDocid);
sl@0	807
sl@0	808	/* Docids must ascend. */
sl@0	809	assert( !pWriter->has_iPrevDocid \|\| iDocid>pWriter->iPrevDocid );
sl@0	810	assert( pWriter->iType==DL_DOCIDS );
sl@0	811
sl@0	812	dataBufferAppend(pWriter->b, c, n);
sl@0	813	pWriter->iPrevDocid = iDocid;
sl@0	814	#ifndef NDEBUG
sl@0	815	pWriter->has_iPrevDocid = 1;
sl@0	816	#endif
sl@0	817	}
sl@0	818
sl@0	819	/*******************************************************************/
sl@0	820	/* PLReader is used to read data from a document's position list. As
sl@0	821	** the caller steps through the list, data is cached so that varints
sl@0	822	** only need to be decoded once.
sl@0	823	**
sl@0	824	** plrInit, plrDestroy - create/destroy a reader.
sl@0	825	** plrColumn, plrPosition, plrStartOffset, plrEndOffset - accessors
sl@0	826	** plrAtEnd - at end of stream, only call plrDestroy once true.
sl@0	827	** plrStep - step to the next element.
sl@0	828	*/
sl@0	829	typedef struct PLReader {
sl@0	830	/* These refer to the next position's data. nData will reach 0 when
sl@0	831	** reading the last position, so plrStep() signals EOF by setting
sl@0	832	** pData to NULL.
sl@0	833	*/
sl@0	834	const char *pData;
sl@0	835	int nData;
sl@0	836
sl@0	837	DocListType iType;
sl@0	838	int iColumn; /* the last column read */
sl@0	839	int iPosition; /* the last position read */
sl@0	840	int iStartOffset; /* the last start offset read */
sl@0	841	int iEndOffset; /* the last end offset read */
sl@0	842	} PLReader;
sl@0	843
sl@0	844	static int plrAtEnd(PLReader *pReader){
sl@0	845	return pReader->pData==NULL;
sl@0	846	}
sl@0	847	static int plrColumn(PLReader *pReader){
sl@0	848	assert( !plrAtEnd(pReader) );
sl@0	849	return pReader->iColumn;
sl@0	850	}
sl@0	851	static int plrPosition(PLReader *pReader){
sl@0	852	assert( !plrAtEnd(pReader) );
sl@0	853	return pReader->iPosition;
sl@0	854	}
sl@0	855	static int plrStartOffset(PLReader *pReader){
sl@0	856	assert( !plrAtEnd(pReader) );
sl@0	857	return pReader->iStartOffset;
sl@0	858	}
sl@0	859	static int plrEndOffset(PLReader *pReader){
sl@0	860	assert( !plrAtEnd(pReader) );
sl@0	861	return pReader->iEndOffset;
sl@0	862	}
sl@0	863	static void plrStep(PLReader *pReader){
sl@0	864	int i, n;
sl@0	865
sl@0	866	assert( !plrAtEnd(pReader) );
sl@0	867
sl@0	868	if( pReader->nData==0 ){
sl@0	869	pReader->pData = NULL;
sl@0	870	return;
sl@0	871	}
sl@0	872
sl@0	873	n = fts3GetVarint32(pReader->pData, &i);
sl@0	874	if( i==POS_COLUMN ){
sl@0	875	n += fts3GetVarint32(pReader->pData+n, &pReader->iColumn);
sl@0	876	pReader->iPosition = 0;
sl@0	877	pReader->iStartOffset = 0;
sl@0	878	n += fts3GetVarint32(pReader->pData+n, &i);
sl@0	879	}
sl@0	880	/* Should never see adjacent column changes. */
sl@0	881	assert( i!=POS_COLUMN );
sl@0	882
sl@0	883	if( i==POS_END ){
sl@0	884	pReader->nData = 0;
sl@0	885	pReader->pData = NULL;
sl@0	886	return;
sl@0	887	}
sl@0	888
sl@0	889	pReader->iPosition += i-POS_BASE;
sl@0	890	if( pReader->iType==DL_POSITIONS_OFFSETS ){
sl@0	891	n += fts3GetVarint32(pReader->pData+n, &i);
sl@0	892	pReader->iStartOffset += i;
sl@0	893	n += fts3GetVarint32(pReader->pData+n, &i);
sl@0	894	pReader->iEndOffset = pReader->iStartOffset+i;
sl@0	895	}
sl@0	896	assert( n<=pReader->nData );
sl@0	897	pReader->pData += n;
sl@0	898	pReader->nData -= n;
sl@0	899	}
sl@0	900
sl@0	901	static void plrInit(PLReader pReader, DLReader pDLReader){
sl@0	902	pReader->pData = dlrPosData(pDLReader);
sl@0	903	pReader->nData = dlrPosDataLen(pDLReader);
sl@0	904	pReader->iType = pDLReader->iType;
sl@0	905	pReader->iColumn = 0;
sl@0	906	pReader->iPosition = 0;
sl@0	907	pReader->iStartOffset = 0;
sl@0	908	pReader->iEndOffset = 0;
sl@0	909	plrStep(pReader);
sl@0	910	}
sl@0	911	static void plrDestroy(PLReader *pReader){
sl@0	912	SCRAMBLE(pReader);
sl@0	913	}
sl@0	914
sl@0	915	/*******************************************************************/
sl@0	916	/* PLWriter is used in constructing a document's position list. As a
sl@0	917	** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op.
sl@0	918	** PLWriter writes to the associated DLWriter's buffer.
sl@0	919	**
sl@0	920	** plwInit - init for writing a document's poslist.
sl@0	921	** plwDestroy - clear a writer.
sl@0	922	** plwAdd - append position and offset information.
sl@0	923	** plwCopy - copy next position's data from reader to writer.
sl@0	924	** plwTerminate - add any necessary doclist terminator.
sl@0	925	**
sl@0	926	** Calling plwAdd() after plwTerminate() may result in a corrupt
sl@0	927	** doclist.
sl@0	928	*/
sl@0	929	/* TODO(shess) Until we've written the second item, we can cache the
sl@0	930	** first item's information. Then we'd have three states:
sl@0	931	**
sl@0	932	** - initialized with docid, no positions.
sl@0	933	** - docid and one position.
sl@0	934	** - docid and multiple positions.
sl@0	935	**
sl@0	936	** Only the last state needs to actually write to dlw->b, which would
sl@0	937	** be an improvement in the DLCollector case.
sl@0	938	*/
sl@0	939	typedef struct PLWriter {
sl@0	940	DLWriter *dlw;
sl@0	941
sl@0	942	int iColumn; /* the last column written */
sl@0	943	int iPos; /* the last position written */
sl@0	944	int iOffset; /* the last start offset written */
sl@0	945	} PLWriter;
sl@0	946
sl@0	947	/* TODO(shess) In the case where the parent is reading these values
sl@0	948	** from a PLReader, we could optimize to a copy if that PLReader has
sl@0	949	** the same type as pWriter.
sl@0	950	*/
sl@0	951	static void plwAdd(PLWriter *pWriter, int iColumn, int iPos,
sl@0	952	int iStartOffset, int iEndOffset){
sl@0	953	/* Worst-case space for POS_COLUMN, iColumn, iPosDelta,
sl@0	954	** iStartOffsetDelta, and iEndOffsetDelta.
sl@0	955	*/
sl@0	956	char c[5*VARINT_MAX];
sl@0	957	int n = 0;
sl@0	958
sl@0	959	/* Ban plwAdd() after plwTerminate(). */
sl@0	960	assert( pWriter->iPos!=-1 );
sl@0	961
sl@0	962	if( pWriter->dlw->iType==DL_DOCIDS ) return;
sl@0	963
sl@0	964	if( iColumn!=pWriter->iColumn ){
sl@0	965	n += fts3PutVarint(c+n, POS_COLUMN);
sl@0	966	n += fts3PutVarint(c+n, iColumn);
sl@0	967	pWriter->iColumn = iColumn;
sl@0	968	pWriter->iPos = 0;
sl@0	969	pWriter->iOffset = 0;
sl@0	970	}
sl@0	971	assert( iPos>=pWriter->iPos );
sl@0	972	n += fts3PutVarint(c+n, POS_BASE+(iPos-pWriter->iPos));
sl@0	973	pWriter->iPos = iPos;
sl@0	974	if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){
sl@0	975	assert( iStartOffset>=pWriter->iOffset );
sl@0	976	n += fts3PutVarint(c+n, iStartOffset-pWriter->iOffset);
sl@0	977	pWriter->iOffset = iStartOffset;
sl@0	978	assert( iEndOffset>=iStartOffset );
sl@0	979	n += fts3PutVarint(c+n, iEndOffset-iStartOffset);
sl@0	980	}
sl@0	981	dataBufferAppend(pWriter->dlw->b, c, n);
sl@0	982	}
sl@0	983	static void plwCopy(PLWriter pWriter, PLReader pReader){
sl@0	984	plwAdd(pWriter, plrColumn(pReader), plrPosition(pReader),
sl@0	985	plrStartOffset(pReader), plrEndOffset(pReader));
sl@0	986	}
sl@0	987	static void plwInit(PLWriter pWriter, DLWriter dlw, sqlite_int64 iDocid){
sl@0	988	char c[VARINT_MAX];
sl@0	989	int n;
sl@0	990
sl@0	991	pWriter->dlw = dlw;
sl@0	992
sl@0	993	/* Docids must ascend. */
sl@0	994	assert( !pWriter->dlw->has_iPrevDocid \|\| iDocid>pWriter->dlw->iPrevDocid );
sl@0	995	n = fts3PutVarint(c, iDocid-pWriter->dlw->iPrevDocid);
sl@0	996	dataBufferAppend(pWriter->dlw->b, c, n);
sl@0	997	pWriter->dlw->iPrevDocid = iDocid;
sl@0	998	#ifndef NDEBUG
sl@0	999	pWriter->dlw->has_iPrevDocid = 1;
sl@0	1000	#endif
sl@0	1001
sl@0	1002	pWriter->iColumn = 0;
sl@0	1003	pWriter->iPos = 0;
sl@0	1004	pWriter->iOffset = 0;
sl@0	1005	}
sl@0	1006	/* TODO(shess) Should plwDestroy() also terminate the doclist? But
sl@0	1007	** then plwDestroy() would no longer be just a destructor, it would
sl@0	1008	** also be doing work, which isn't consistent with the overall idiom.
sl@0	1009	** Another option would be for plwAdd() to always append any necessary
sl@0	1010	** terminator, so that the output is always correct. But that would
sl@0	1011	** add incremental work to the common case with the only benefit being
sl@0	1012	** API elegance. Punt for now.
sl@0	1013	*/
sl@0	1014	static void plwTerminate(PLWriter *pWriter){
sl@0	1015	if( pWriter->dlw->iType>DL_DOCIDS ){
sl@0	1016	char c[VARINT_MAX];
sl@0	1017	int n = fts3PutVarint(c, POS_END);
sl@0	1018	dataBufferAppend(pWriter->dlw->b, c, n);
sl@0	1019	}
sl@0	1020	#ifndef NDEBUG
sl@0	1021	/* Mark as terminated for assert in plwAdd(). */
sl@0	1022	pWriter->iPos = -1;
sl@0	1023	#endif
sl@0	1024	}
sl@0	1025	static void plwDestroy(PLWriter *pWriter){
sl@0	1026	SCRAMBLE(pWriter);
sl@0	1027	}
sl@0	1028
sl@0	1029	/*******************************************************************/
sl@0	1030	/* DLCollector wraps PLWriter and DLWriter to provide a
sl@0	1031	** dynamically-allocated doclist area to use during tokenization.
sl@0	1032	**
sl@0	1033	** dlcNew - malloc up and initialize a collector.
sl@0	1034	** dlcDelete - destroy a collector and all contained items.
sl@0	1035	** dlcAddPos - append position and offset information.
sl@0	1036	** dlcAddDoclist - add the collected doclist to the given buffer.
sl@0	1037	** dlcNext - terminate the current document and open another.
sl@0	1038	*/
sl@0	1039	typedef struct DLCollector {
sl@0	1040	DataBuffer b;
sl@0	1041	DLWriter dlw;
sl@0	1042	PLWriter plw;
sl@0	1043	} DLCollector;
sl@0	1044
sl@0	1045	/* TODO(shess) This could also be done by calling plwTerminate() and
sl@0	1046	** dataBufferAppend(). I tried that, expecting nominal performance
sl@0	1047	** differences, but it seemed to pretty reliably be worth 1% to code
sl@0	1048	** it this way. I suspect it is the incremental malloc overhead (some
sl@0	1049	** percentage of the plwTerminate() calls will cause a realloc), so
sl@0	1050	** this might be worth revisiting if the DataBuffer implementation
sl@0	1051	** changes.
sl@0	1052	*/
sl@0	1053	static void dlcAddDoclist(DLCollector pCollector, DataBuffer b){
sl@0	1054	if( pCollector->dlw.iType>DL_DOCIDS ){
sl@0	1055	char c[VARINT_MAX];
sl@0	1056	int n = fts3PutVarint(c, POS_END);
sl@0	1057	dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n);
sl@0	1058	}else{
sl@0	1059	dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData);
sl@0	1060	}
sl@0	1061	}
sl@0	1062	static void dlcNext(DLCollector *pCollector, sqlite_int64 iDocid){
sl@0	1063	plwTerminate(&pCollector->plw);
sl@0	1064	plwDestroy(&pCollector->plw);
sl@0	1065	plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
sl@0	1066	}
sl@0	1067	static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos,
sl@0	1068	int iStartOffset, int iEndOffset){
sl@0	1069	plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset);
sl@0	1070	}
sl@0	1071
sl@0	1072	static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){
sl@0	1073	DLCollector *pCollector = sqlite3_malloc(sizeof(DLCollector));
sl@0	1074	dataBufferInit(&pCollector->b, 0);
sl@0	1075	dlwInit(&pCollector->dlw, iType, &pCollector->b);
sl@0	1076	plwInit(&pCollector->plw, &pCollector->dlw, iDocid);
sl@0	1077	return pCollector;
sl@0	1078	}
sl@0	1079	static void dlcDelete(DLCollector *pCollector){
sl@0	1080	plwDestroy(&pCollector->plw);
sl@0	1081	dlwDestroy(&pCollector->dlw);
sl@0	1082	dataBufferDestroy(&pCollector->b);
sl@0	1083	SCRAMBLE(pCollector);
sl@0	1084	sqlite3_free(pCollector);
sl@0	1085	}
sl@0	1086
sl@0	1087
sl@0	1088	/* Copy the doclist data of iType in pData/nData into *out, trimming
sl@0	1089	** unnecessary data as we go. Only columns matching iColumn are
sl@0	1090	** copied, all columns copied if iColumn is -1. Elements with no
sl@0	1091	** matching columns are dropped. The output is an iOutType doclist.
sl@0	1092	*/
sl@0	1093	/* NOTE(shess) This code is only valid after all doclists are merged.
sl@0	1094	** If this is run before merges, then doclist items which represent
sl@0	1095	** deletion will be trimmed, and will thus not effect a deletion
sl@0	1096	** during the merge.
sl@0	1097	*/
sl@0	1098	static void docListTrim(DocListType iType, const char *pData, int nData,
sl@0	1099	int iColumn, DocListType iOutType, DataBuffer *out){
sl@0	1100	DLReader dlReader;
sl@0	1101	DLWriter dlWriter;
sl@0	1102
sl@0	1103	assert( iOutType<=iType );
sl@0	1104
sl@0	1105	dlrInit(&dlReader, iType, pData, nData);
sl@0	1106	dlwInit(&dlWriter, iOutType, out);
sl@0	1107
sl@0	1108	while( !dlrAtEnd(&dlReader) ){
sl@0	1109	PLReader plReader;
sl@0	1110	PLWriter plWriter;
sl@0	1111	int match = 0;
sl@0	1112
sl@0	1113	plrInit(&plReader, &dlReader);
sl@0	1114
sl@0	1115	while( !plrAtEnd(&plReader) ){
sl@0	1116	if( iColumn==-1 \|\| plrColumn(&plReader)==iColumn ){
sl@0	1117	if( !match ){
sl@0	1118	plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader));
sl@0	1119	match = 1;
sl@0	1120	}
sl@0	1121	plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader),
sl@0	1122	plrStartOffset(&plReader), plrEndOffset(&plReader));
sl@0	1123	}
sl@0	1124	plrStep(&plReader);
sl@0	1125	}
sl@0	1126	if( match ){
sl@0	1127	plwTerminate(&plWriter);
sl@0	1128	plwDestroy(&plWriter);
sl@0	1129	}
sl@0	1130
sl@0	1131	plrDestroy(&plReader);
sl@0	1132	dlrStep(&dlReader);
sl@0	1133	}
sl@0	1134	dlwDestroy(&dlWriter);
sl@0	1135	dlrDestroy(&dlReader);
sl@0	1136	}
sl@0	1137
sl@0	1138	/* Used by docListMerge() to keep doclists in the ascending order by
sl@0	1139	** docid, then ascending order by age (so the newest comes first).
sl@0	1140	*/
sl@0	1141	typedef struct OrderedDLReader {
sl@0	1142	DLReader *pReader;
sl@0	1143
sl@0	1144	/* TODO(shess) If we assume that docListMerge pReaders is ordered by
sl@0	1145	** age (which we do), then we could use pReader comparisons to break
sl@0	1146	** ties.
sl@0	1147	*/
sl@0	1148	int idx;
sl@0	1149	} OrderedDLReader;
sl@0	1150
sl@0	1151	/* Order eof to end, then by docid asc, idx desc. */
sl@0	1152	static int orderedDLReaderCmp(OrderedDLReader r1, OrderedDLReader r2){
sl@0	1153	if( dlrAtEnd(r1->pReader) ){
sl@0	1154	if( dlrAtEnd(r2->pReader) ) return 0; /* Both atEnd(). */
sl@0	1155	return 1; /* Only r1 atEnd(). */
sl@0	1156	}
sl@0	1157	if( dlrAtEnd(r2->pReader) ) return -1; /* Only r2 atEnd(). */
sl@0	1158
sl@0	1159	if( dlrDocid(r1->pReader)<dlrDocid(r2->pReader) ) return -1;
sl@0	1160	if( dlrDocid(r1->pReader)>dlrDocid(r2->pReader) ) return 1;
sl@0	1161
sl@0	1162	/* Descending on idx. */
sl@0	1163	return r2->idx-r1->idx;
sl@0	1164	}
sl@0	1165
sl@0	1166	/* Bubble p[0] to appropriate place in p[1..n-1]. Assumes that
sl@0	1167	** p[1..n-1] is already sorted.
sl@0	1168	*/
sl@0	1169	/* TODO(shess) Is this frequent enough to warrant a binary search?
sl@0	1170	** Before implementing that, instrument the code to check. In most
sl@0	1171	** current usage, I expect that p[0] will be less than p[1] a very
sl@0	1172	** high proportion of the time.
sl@0	1173	*/
sl@0	1174	static void orderedDLReaderReorder(OrderedDLReader *p, int n){
sl@0	1175	while( n>1 && orderedDLReaderCmp(p, p+1)>0 ){
sl@0	1176	OrderedDLReader tmp = p[0];
sl@0	1177	p[0] = p[1];
sl@0	1178	p[1] = tmp;
sl@0	1179	n--;
sl@0	1180	p++;
sl@0	1181	}
sl@0	1182	}
sl@0	1183
sl@0	1184	/* Given an array of doclist readers, merge their doclist elements
sl@0	1185	** into out in sorted order (by docid), dropping elements from older
sl@0	1186	** readers when there is a duplicate docid. pReaders is assumed to be
sl@0	1187	** ordered by age, oldest first.
sl@0	1188	*/
sl@0	1189	/* TODO(shess) nReaders must be <= MERGE_COUNT. This should probably
sl@0	1190	** be fixed.
sl@0	1191	*/
sl@0	1192	static void docListMerge(DataBuffer *out,
sl@0	1193	DLReader *pReaders, int nReaders){
sl@0	1194	OrderedDLReader readers[MERGE_COUNT];
sl@0	1195	DLWriter writer;
sl@0	1196	int i, n;
sl@0	1197	const char *pStart = 0;
sl@0	1198	int nStart = 0;
sl@0	1199	sqlite_int64 iFirstDocid = 0, iLastDocid = 0;
sl@0	1200
sl@0	1201	assert( nReaders>0 );
sl@0	1202	if( nReaders==1 ){
sl@0	1203	dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders));
sl@0	1204	return;
sl@0	1205	}
sl@0	1206
sl@0	1207	assert( nReaders<=MERGE_COUNT );
sl@0	1208	n = 0;
sl@0	1209	for(i=0; i<nReaders; i++){
sl@0	1210	assert( pReaders[i].iType==pReaders[0].iType );
sl@0	1211	readers[i].pReader = pReaders+i;
sl@0	1212	readers[i].idx = i;
sl@0	1213	n += dlrAllDataBytes(&pReaders[i]);
sl@0	1214	}
sl@0	1215	/* Conservatively size output to sum of inputs. Output should end
sl@0	1216	** up strictly smaller than input.
sl@0	1217	*/
sl@0	1218	dataBufferExpand(out, n);
sl@0	1219
sl@0	1220	/* Get the readers into sorted order. */
sl@0	1221	while( i-->0 ){
sl@0	1222	orderedDLReaderReorder(readers+i, nReaders-i);
sl@0	1223	}
sl@0	1224
sl@0	1225	dlwInit(&writer, pReaders[0].iType, out);
sl@0	1226	while( !dlrAtEnd(readers[0].pReader) ){
sl@0	1227	sqlite_int64 iDocid = dlrDocid(readers[0].pReader);
sl@0	1228
sl@0	1229	/* If this is a continuation of the current buffer to copy, extend
sl@0	1230	** that buffer. memcpy() seems to be more efficient if it has a
sl@0	1231	** lots of data to copy.
sl@0	1232	*/
sl@0	1233	if( dlrDocData(readers[0].pReader)==pStart+nStart ){
sl@0	1234	nStart += dlrDocDataBytes(readers[0].pReader);
sl@0	1235	}else{
sl@0	1236	if( pStart!=0 ){
sl@0	1237	dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
sl@0	1238	}
sl@0	1239	pStart = dlrDocData(readers[0].pReader);
sl@0	1240	nStart = dlrDocDataBytes(readers[0].pReader);
sl@0	1241	iFirstDocid = iDocid;
sl@0	1242	}
sl@0	1243	iLastDocid = iDocid;
sl@0	1244	dlrStep(readers[0].pReader);
sl@0	1245
sl@0	1246	/* Drop all of the older elements with the same docid. */
sl@0	1247	for(i=1; i<nReaders &&
sl@0	1248	!dlrAtEnd(readers[i].pReader) &&
sl@0	1249	dlrDocid(readers[i].pReader)==iDocid; i++){
sl@0	1250	dlrStep(readers[i].pReader);
sl@0	1251	}
sl@0	1252
sl@0	1253	/* Get the readers back into order. */
sl@0	1254	while( i-->0 ){
sl@0	1255	orderedDLReaderReorder(readers+i, nReaders-i);
sl@0	1256	}
sl@0	1257	}
sl@0	1258
sl@0	1259	/* Copy over any remaining elements. */
sl@0	1260	if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid);
sl@0	1261	dlwDestroy(&writer);
sl@0	1262	}
sl@0	1263
sl@0	1264	/* Helper function for posListUnion(). Compares the current position
sl@0	1265	** between left and right, returning as standard C idiom of <0 if
sl@0	1266	** left<right, >0 if left>right, and 0 if left==right. "End" always
sl@0	1267	** compares greater.
sl@0	1268	*/
sl@0	1269	static int posListCmp(PLReader pLeft, PLReader pRight){
sl@0	1270	assert( pLeft->iType==pRight->iType );
sl@0	1271	if( pLeft->iType==DL_DOCIDS ) return 0;
sl@0	1272
sl@0	1273	if( plrAtEnd(pLeft) ) return plrAtEnd(pRight) ? 0 : 1;
sl@0	1274	if( plrAtEnd(pRight) ) return -1;
sl@0	1275
sl@0	1276	if( plrColumn(pLeft)<plrColumn(pRight) ) return -1;
sl@0	1277	if( plrColumn(pLeft)>plrColumn(pRight) ) return 1;
sl@0	1278
sl@0	1279	if( plrPosition(pLeft)<plrPosition(pRight) ) return -1;
sl@0	1280	if( plrPosition(pLeft)>plrPosition(pRight) ) return 1;
sl@0	1281	if( pLeft->iType==DL_POSITIONS ) return 0;
sl@0	1282
sl@0	1283	if( plrStartOffset(pLeft)<plrStartOffset(pRight) ) return -1;
sl@0	1284	if( plrStartOffset(pLeft)>plrStartOffset(pRight) ) return 1;
sl@0	1285
sl@0	1286	if( plrEndOffset(pLeft)<plrEndOffset(pRight) ) return -1;
sl@0	1287	if( plrEndOffset(pLeft)>plrEndOffset(pRight) ) return 1;
sl@0	1288
sl@0	1289	return 0;
sl@0	1290	}
sl@0	1291
sl@0	1292	/* Write the union of position lists in pLeft and pRight to pOut.
sl@0	1293	** "Union" in this case meaning "All unique position tuples". Should
sl@0	1294	** work with any doclist type, though both inputs and the output
sl@0	1295	** should be the same type.
sl@0	1296	*/
sl@0	1297	static void posListUnion(DLReader pLeft, DLReader pRight, DLWriter *pOut){
sl@0	1298	PLReader left, right;
sl@0	1299	PLWriter writer;
sl@0	1300
sl@0	1301	assert( dlrDocid(pLeft)==dlrDocid(pRight) );
sl@0	1302	assert( pLeft->iType==pRight->iType );
sl@0	1303	assert( pLeft->iType==pOut->iType );
sl@0	1304
sl@0	1305	plrInit(&left, pLeft);
sl@0	1306	plrInit(&right, pRight);
sl@0	1307	plwInit(&writer, pOut, dlrDocid(pLeft));
sl@0	1308
sl@0	1309	while( !plrAtEnd(&left) \|\| !plrAtEnd(&right) ){
sl@0	1310	int c = posListCmp(&left, &right);
sl@0	1311	if( c<0 ){
sl@0	1312	plwCopy(&writer, &left);
sl@0	1313	plrStep(&left);
sl@0	1314	}else if( c>0 ){
sl@0	1315	plwCopy(&writer, &right);
sl@0	1316	plrStep(&right);
sl@0	1317	}else{
sl@0	1318	plwCopy(&writer, &left);
sl@0	1319	plrStep(&left);
sl@0	1320	plrStep(&right);
sl@0	1321	}
sl@0	1322	}
sl@0	1323
sl@0	1324	plwTerminate(&writer);
sl@0	1325	plwDestroy(&writer);
sl@0	1326	plrDestroy(&left);
sl@0	1327	plrDestroy(&right);
sl@0	1328	}
sl@0	1329
sl@0	1330	/* Write the union of doclists in pLeft and pRight to pOut. For
sl@0	1331	** docids in common between the inputs, the union of the position
sl@0	1332	** lists is written. Inputs and outputs are always type DL_DEFAULT.
sl@0	1333	*/
sl@0	1334	static void docListUnion(
sl@0	1335	const char *pLeft, int nLeft,
sl@0	1336	const char *pRight, int nRight,
sl@0	1337	DataBuffer pOut / Write the combined doclist here */
sl@0	1338	){
sl@0	1339	DLReader left, right;
sl@0	1340	DLWriter writer;
sl@0	1341
sl@0	1342	if( nLeft==0 ){
sl@0	1343	if( nRight!=0) dataBufferAppend(pOut, pRight, nRight);
sl@0	1344	return;
sl@0	1345	}
sl@0	1346	if( nRight==0 ){
sl@0	1347	dataBufferAppend(pOut, pLeft, nLeft);
sl@0	1348	return;
sl@0	1349	}
sl@0	1350
sl@0	1351	dlrInit(&left, DL_DEFAULT, pLeft, nLeft);
sl@0	1352	dlrInit(&right, DL_DEFAULT, pRight, nRight);
sl@0	1353	dlwInit(&writer, DL_DEFAULT, pOut);
sl@0	1354
sl@0	1355	while( !dlrAtEnd(&left) \|\| !dlrAtEnd(&right) ){
sl@0	1356	if( dlrAtEnd(&right) ){
sl@0	1357	dlwCopy(&writer, &left);
sl@0	1358	dlrStep(&left);
sl@0	1359	}else if( dlrAtEnd(&left) ){
sl@0	1360	dlwCopy(&writer, &right);
sl@0	1361	dlrStep(&right);
sl@0	1362	}else if( dlrDocid(&left)<dlrDocid(&right) ){
sl@0	1363	dlwCopy(&writer, &left);
sl@0	1364	dlrStep(&left);
sl@0	1365	}else if( dlrDocid(&left)>dlrDocid(&right) ){
sl@0	1366	dlwCopy(&writer, &right);
sl@0	1367	dlrStep(&right);
sl@0	1368	}else{
sl@0	1369	posListUnion(&left, &right, &writer);
sl@0	1370	dlrStep(&left);
sl@0	1371	dlrStep(&right);
sl@0	1372	}
sl@0	1373	}
sl@0	1374
sl@0	1375	dlrDestroy(&left);
sl@0	1376	dlrDestroy(&right);
sl@0	1377	dlwDestroy(&writer);
sl@0	1378	}
sl@0	1379
sl@0	1380	/*
sl@0	1381	** This function is used as part of the implementation of phrase and
sl@0	1382	** NEAR matching.
sl@0	1383	**
sl@0	1384	** pLeft and pRight are DLReaders positioned to the same docid in
sl@0	1385	** lists of type DL_POSITION. This function writes an entry to the
sl@0	1386	** DLWriter pOut for each position in pRight that is less than
sl@0	1387	** (nNear+1) greater (but not equal to or smaller) than a position
sl@0	1388	** in pLeft. For example, if nNear is 0, and the positions contained
sl@0	1389	** by pLeft and pRight are:
sl@0	1390	**
sl@0	1391	** pLeft: 5 10 15 20
sl@0	1392	** pRight: 6 9 17 21
sl@0	1393	**
sl@0	1394	** then the docid is added to pOut. If pOut is of type DL_POSITIONS,
sl@0	1395	** then a positionids "6" and "21" are also added to pOut.
sl@0	1396	**
sl@0	1397	** If boolean argument isSaveLeft is true, then positionids are copied
sl@0	1398	** from pLeft instead of pRight. In the example above, the positions "5"
sl@0	1399	** and "20" would be added instead of "6" and "21".
sl@0	1400	*/
sl@0	1401	static void posListPhraseMerge(
sl@0	1402	DLReader *pLeft,
sl@0	1403	DLReader *pRight,
sl@0	1404	int nNear,
sl@0	1405	int isSaveLeft,
sl@0	1406	DLWriter *pOut
sl@0	1407	){
sl@0	1408	PLReader left, right;
sl@0	1409	PLWriter writer;
sl@0	1410	int match = 0;
sl@0	1411
sl@0	1412	assert( dlrDocid(pLeft)==dlrDocid(pRight) );
sl@0	1413	assert( pOut->iType!=DL_POSITIONS_OFFSETS );
sl@0	1414
sl@0	1415	plrInit(&left, pLeft);
sl@0	1416	plrInit(&right, pRight);
sl@0	1417
sl@0	1418	while( !plrAtEnd(&left) && !plrAtEnd(&right) ){
sl@0	1419	if( plrColumn(&left)<plrColumn(&right) ){
sl@0	1420	plrStep(&left);
sl@0	1421	}else if( plrColumn(&left)>plrColumn(&right) ){
sl@0	1422	plrStep(&right);
sl@0	1423	}else if( plrPosition(&left)>=plrPosition(&right) ){
sl@0	1424	plrStep(&right);
sl@0	1425	}else{
sl@0	1426	if( (plrPosition(&right)-plrPosition(&left))<=(nNear+1) ){
sl@0	1427	if( !match ){
sl@0	1428	plwInit(&writer, pOut, dlrDocid(pLeft));
sl@0	1429	match = 1;
sl@0	1430	}
sl@0	1431	if( !isSaveLeft ){
sl@0	1432	plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0);
sl@0	1433	}else{
sl@0	1434	plwAdd(&writer, plrColumn(&left), plrPosition(&left), 0, 0);
sl@0	1435	}
sl@0	1436	plrStep(&right);
sl@0	1437	}else{
sl@0	1438	plrStep(&left);
sl@0	1439	}
sl@0	1440	}
sl@0	1441	}
sl@0	1442
sl@0	1443	if( match ){
sl@0	1444	plwTerminate(&writer);
sl@0	1445	plwDestroy(&writer);
sl@0	1446	}
sl@0	1447
sl@0	1448	plrDestroy(&left);
sl@0	1449	plrDestroy(&right);
sl@0	1450	}
sl@0	1451
sl@0	1452	/*
sl@0	1453	** Compare the values pointed to by the PLReaders passed as arguments.
sl@0	1454	** Return -1 if the value pointed to by pLeft is considered less than
sl@0	1455	** the value pointed to by pRight, +1 if it is considered greater
sl@0	1456	** than it, or 0 if it is equal. i.e.
sl@0	1457	**
sl@0	1458	** (pLeft - pRight)
sl@0	1459	**
sl@0	1460	** A PLReader that is in the EOF condition is considered greater than
sl@0	1461	** any other. If neither argument is in EOF state, the return value of
sl@0	1462	** plrColumn() is used. If the plrColumn() values are equal, the
sl@0	1463	** comparison is on the basis of plrPosition().
sl@0	1464	*/
sl@0	1465	static int plrCompare(PLReader pLeft, PLReader pRight){
sl@0	1466	assert(!plrAtEnd(pLeft) \|\| !plrAtEnd(pRight));
sl@0	1467
sl@0	1468	if( plrAtEnd(pRight) \|\| plrAtEnd(pLeft) ){
sl@0	1469	return (plrAtEnd(pRight) ? -1 : 1);
sl@0	1470	}
sl@0	1471	if( plrColumn(pLeft)!=plrColumn(pRight) ){
sl@0	1472	return ((plrColumn(pLeft)<plrColumn(pRight)) ? -1 : 1);
sl@0	1473	}
sl@0	1474	if( plrPosition(pLeft)!=plrPosition(pRight) ){
sl@0	1475	return ((plrPosition(pLeft)<plrPosition(pRight)) ? -1 : 1);
sl@0	1476	}
sl@0	1477	return 0;
sl@0	1478	}
sl@0	1479
sl@0	1480	/* We have two doclists with positions: pLeft and pRight. Depending
sl@0	1481	** on the value of the nNear parameter, perform either a phrase
sl@0	1482	** intersection (if nNear==0) or a NEAR intersection (if nNear>0)
sl@0	1483	** and write the results into pOut.
sl@0	1484	**
sl@0	1485	** A phrase intersection means that two documents only match
sl@0	1486	** if pLeft.iPos+1==pRight.iPos.
sl@0	1487	**
sl@0	1488	** A NEAR intersection means that two documents only match if
sl@0	1489	** (abs(pLeft.iPos-pRight.iPos)<nNear).
sl@0	1490	**
sl@0	1491	** If a NEAR intersection is requested, then the nPhrase argument should
sl@0	1492	** be passed the number of tokens in the two operands to the NEAR operator
sl@0	1493	** combined. For example:
sl@0	1494	**
sl@0	1495	** Query syntax nPhrase
sl@0	1496	** ------------------------------------
sl@0	1497	** "A B C" NEAR "D E" 5
sl@0	1498	** A NEAR B 2
sl@0	1499	**
sl@0	1500	** iType controls the type of data written to pOut. If iType is
sl@0	1501	** DL_POSITIONS, the positions are those from pRight.
sl@0	1502	*/
sl@0	1503	static void docListPhraseMerge(
sl@0	1504	const char *pLeft, int nLeft,
sl@0	1505	const char *pRight, int nRight,
sl@0	1506	int nNear, /* 0 for a phrase merge, non-zero for a NEAR merge */
sl@0	1507	int nPhrase, /* Number of tokens in left+right operands to NEAR */
sl@0	1508	DocListType iType, /* Type of doclist to write to pOut */
sl@0	1509	DataBuffer pOut / Write the combined doclist here */
sl@0	1510	){
sl@0	1511	DLReader left, right;
sl@0	1512	DLWriter writer;
sl@0	1513
sl@0	1514	if( nLeft==0 \|\| nRight==0 ) return;
sl@0	1515
sl@0	1516	assert( iType!=DL_POSITIONS_OFFSETS );
sl@0	1517
sl@0	1518	dlrInit(&left, DL_POSITIONS, pLeft, nLeft);
sl@0	1519	dlrInit(&right, DL_POSITIONS, pRight, nRight);
sl@0	1520	dlwInit(&writer, iType, pOut);
sl@0	1521
sl@0	1522	while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
sl@0	1523	if( dlrDocid(&left)<dlrDocid(&right) ){
sl@0	1524	dlrStep(&left);
sl@0	1525	}else if( dlrDocid(&right)<dlrDocid(&left) ){
sl@0	1526	dlrStep(&right);
sl@0	1527	}else{
sl@0	1528	if( nNear==0 ){
sl@0	1529	posListPhraseMerge(&left, &right, 0, 0, &writer);
sl@0	1530	}else{
sl@0	1531	/* This case occurs when two terms (simple terms or phrases) are
sl@0	1532	* connected by a NEAR operator, span (nNear+1). i.e.
sl@0	1533	*
sl@0	1534	* '"terrible company" NEAR widget'
sl@0	1535	*/
sl@0	1536	DataBuffer one = {0, 0, 0};
sl@0	1537	DataBuffer two = {0, 0, 0};
sl@0	1538
sl@0	1539	DLWriter dlwriter2;
sl@0	1540	DLReader dr1 = {0, 0, 0, 0, 0};
sl@0	1541	DLReader dr2 = {0, 0, 0, 0, 0};
sl@0	1542
sl@0	1543	dlwInit(&dlwriter2, iType, &one);
sl@0	1544	posListPhraseMerge(&right, &left, nNear-3+nPhrase, 1, &dlwriter2);
sl@0	1545	dlwInit(&dlwriter2, iType, &two);
sl@0	1546	posListPhraseMerge(&left, &right, nNear-1, 0, &dlwriter2);
sl@0	1547
sl@0	1548	if( one.nData) dlrInit(&dr1, iType, one.pData, one.nData);
sl@0	1549	if( two.nData) dlrInit(&dr2, iType, two.pData, two.nData);
sl@0	1550
sl@0	1551	if( !dlrAtEnd(&dr1) \|\| !dlrAtEnd(&dr2) ){
sl@0	1552	PLReader pr1 = {0};
sl@0	1553	PLReader pr2 = {0};
sl@0	1554
sl@0	1555	PLWriter plwriter;
sl@0	1556	plwInit(&plwriter, &writer, dlrDocid(dlrAtEnd(&dr1)?&dr2:&dr1));
sl@0	1557
sl@0	1558	if( one.nData ) plrInit(&pr1, &dr1);
sl@0	1559	if( two.nData ) plrInit(&pr2, &dr2);
sl@0	1560	while( !plrAtEnd(&pr1) \|\| !plrAtEnd(&pr2) ){
sl@0	1561	int iCompare = plrCompare(&pr1, &pr2);
sl@0	1562	switch( iCompare ){
sl@0	1563	case -1:
sl@0	1564	plwCopy(&plwriter, &pr1);
sl@0	1565	plrStep(&pr1);
sl@0	1566	break;
sl@0	1567	case 1:
sl@0	1568	plwCopy(&plwriter, &pr2);
sl@0	1569	plrStep(&pr2);
sl@0	1570	break;
sl@0	1571	case 0:
sl@0	1572	plwCopy(&plwriter, &pr1);
sl@0	1573	plrStep(&pr1);
sl@0	1574	plrStep(&pr2);
sl@0	1575	break;
sl@0	1576	}
sl@0	1577	}
sl@0	1578	plwTerminate(&plwriter);
sl@0	1579	}
sl@0	1580	dataBufferDestroy(&one);
sl@0	1581	dataBufferDestroy(&two);
sl@0	1582	}
sl@0	1583	dlrStep(&left);
sl@0	1584	dlrStep(&right);
sl@0	1585	}
sl@0	1586	}
sl@0	1587
sl@0	1588	dlrDestroy(&left);
sl@0	1589	dlrDestroy(&right);
sl@0	1590	dlwDestroy(&writer);
sl@0	1591	}
sl@0	1592
sl@0	1593	/* We have two DL_DOCIDS doclists: pLeft and pRight.
sl@0	1594	** Write the intersection of these two doclists into pOut as a
sl@0	1595	** DL_DOCIDS doclist.
sl@0	1596	*/
sl@0	1597	static void docListAndMerge(
sl@0	1598	const char *pLeft, int nLeft,
sl@0	1599	const char *pRight, int nRight,
sl@0	1600	DataBuffer pOut / Write the combined doclist here */
sl@0	1601	){
sl@0	1602	DLReader left, right;
sl@0	1603	DLWriter writer;
sl@0	1604
sl@0	1605	if( nLeft==0 \|\| nRight==0 ) return;
sl@0	1606
sl@0	1607	dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
sl@0	1608	dlrInit(&right, DL_DOCIDS, pRight, nRight);
sl@0	1609	dlwInit(&writer, DL_DOCIDS, pOut);
sl@0	1610
sl@0	1611	while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){
sl@0	1612	if( dlrDocid(&left)<dlrDocid(&right) ){
sl@0	1613	dlrStep(&left);
sl@0	1614	}else if( dlrDocid(&right)<dlrDocid(&left) ){
sl@0	1615	dlrStep(&right);
sl@0	1616	}else{
sl@0	1617	dlwAdd(&writer, dlrDocid(&left));
sl@0	1618	dlrStep(&left);
sl@0	1619	dlrStep(&right);
sl@0	1620	}
sl@0	1621	}
sl@0	1622
sl@0	1623	dlrDestroy(&left);
sl@0	1624	dlrDestroy(&right);
sl@0	1625	dlwDestroy(&writer);
sl@0	1626	}
sl@0	1627
sl@0	1628	/* We have two DL_DOCIDS doclists: pLeft and pRight.
sl@0	1629	** Write the union of these two doclists into pOut as a
sl@0	1630	** DL_DOCIDS doclist.
sl@0	1631	*/
sl@0	1632	static void docListOrMerge(
sl@0	1633	const char *pLeft, int nLeft,
sl@0	1634	const char *pRight, int nRight,
sl@0	1635	DataBuffer pOut / Write the combined doclist here */
sl@0	1636	){
sl@0	1637	DLReader left, right;
sl@0	1638	DLWriter writer;
sl@0	1639
sl@0	1640	if( nLeft==0 ){
sl@0	1641	if( nRight!=0 ) dataBufferAppend(pOut, pRight, nRight);
sl@0	1642	return;
sl@0	1643	}
sl@0	1644	if( nRight==0 ){
sl@0	1645	dataBufferAppend(pOut, pLeft, nLeft);
sl@0	1646	return;
sl@0	1647	}
sl@0	1648
sl@0	1649	dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
sl@0	1650	dlrInit(&right, DL_DOCIDS, pRight, nRight);
sl@0	1651	dlwInit(&writer, DL_DOCIDS, pOut);
sl@0	1652
sl@0	1653	while( !dlrAtEnd(&left) \|\| !dlrAtEnd(&right) ){
sl@0	1654	if( dlrAtEnd(&right) ){
sl@0	1655	dlwAdd(&writer, dlrDocid(&left));
sl@0	1656	dlrStep(&left);
sl@0	1657	}else if( dlrAtEnd(&left) ){
sl@0	1658	dlwAdd(&writer, dlrDocid(&right));
sl@0	1659	dlrStep(&right);
sl@0	1660	}else if( dlrDocid(&left)<dlrDocid(&right) ){
sl@0	1661	dlwAdd(&writer, dlrDocid(&left));
sl@0	1662	dlrStep(&left);
sl@0	1663	}else if( dlrDocid(&right)<dlrDocid(&left) ){
sl@0	1664	dlwAdd(&writer, dlrDocid(&right));
sl@0	1665	dlrStep(&right);
sl@0	1666	}else{
sl@0	1667	dlwAdd(&writer, dlrDocid(&left));
sl@0	1668	dlrStep(&left);
sl@0	1669	dlrStep(&right);
sl@0	1670	}
sl@0	1671	}
sl@0	1672
sl@0	1673	dlrDestroy(&left);
sl@0	1674	dlrDestroy(&right);
sl@0	1675	dlwDestroy(&writer);
sl@0	1676	}
sl@0	1677
sl@0	1678	/* We have two DL_DOCIDS doclists: pLeft and pRight.
sl@0	1679	** Write into pOut as DL_DOCIDS doclist containing all documents that
sl@0	1680	** occur in pLeft but not in pRight.
sl@0	1681	*/
sl@0	1682	static void docListExceptMerge(
sl@0	1683	const char *pLeft, int nLeft,
sl@0	1684	const char *pRight, int nRight,
sl@0	1685	DataBuffer pOut / Write the combined doclist here */
sl@0	1686	){
sl@0	1687	DLReader left, right;
sl@0	1688	DLWriter writer;
sl@0	1689
sl@0	1690	if( nLeft==0 ) return;
sl@0	1691	if( nRight==0 ){
sl@0	1692	dataBufferAppend(pOut, pLeft, nLeft);
sl@0	1693	return;
sl@0	1694	}
sl@0	1695
sl@0	1696	dlrInit(&left, DL_DOCIDS, pLeft, nLeft);
sl@0	1697	dlrInit(&right, DL_DOCIDS, pRight, nRight);
sl@0	1698	dlwInit(&writer, DL_DOCIDS, pOut);
sl@0	1699
sl@0	1700	while( !dlrAtEnd(&left) ){
sl@0	1701	while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){
sl@0	1702	dlrStep(&right);
sl@0	1703	}
sl@0	1704	if( dlrAtEnd(&right) \|\| dlrDocid(&left)<dlrDocid(&right) ){
sl@0	1705	dlwAdd(&writer, dlrDocid(&left));
sl@0	1706	}
sl@0	1707	dlrStep(&left);
sl@0	1708	}
sl@0	1709
sl@0	1710	dlrDestroy(&left);
sl@0	1711	dlrDestroy(&right);
sl@0	1712	dlwDestroy(&writer);
sl@0	1713	}
sl@0	1714
/* Duplicate the first n bytes of s into a freshly allocated,
** NUL-terminated buffer of n+1 bytes.
**
** The buffer comes from sqlite3_malloc(), so the caller releases it
** with sqlite3_free().  Returns NULL if the allocation fails.
*/
static char *string_dup_n(const char *s, int n){
  char *str = sqlite3_malloc(n + 1);
  if( str==NULL ) return NULL;
  memcpy(str, s, n);
  str[n] = '\0';
  return str;
}
sl@0	1721
/* Duplicate the NUL-terminated string s.
**
** The copy is produced by string_dup_n(), which allocates with
** sqlite3_malloc(); the caller must release it with sqlite3_free().
** (We don't use strdup() since it is not part of the standard C
** library and may not be available everywhere.)
*/
static char *string_dup(const char *s){
  return string_dup_n(s, (int)strlen(s));
}
sl@0	1728
/* Format a string, replacing each occurrence of the % character in
** zFormat with "zDb.zName"; all other characters are copied as-is.
** For example "delete from %_content" with zDb="main" and zName="t"
** yields "delete from main.t_content".  This may be more convenient
** than sqlite3_mprintf() when one string is used repeatedly in a
** format string.
**
** The result is allocated with sqlite3_malloc(); the caller must
** release it with sqlite3_free().  Returns NULL if the allocation
** fails.
*/
static char *string_format(const char *zFormat,
                           const char *zDb, const char *zName){
  const char *p;
  size_t len = 0;
  size_t nDb = strlen(zDb);
  size_t nName = strlen(zName);
  size_t nFullTableName = nDb+1+nName;   /* "db" + "." + "name" */
  char *result;
  char *r;

  /* first compute length needed */
  for(p = zFormat ; *p ; ++p){
    len += (*p=='%' ? nFullTableName : 1);
  }
  len += 1;  /* for null terminator */

  r = result = sqlite3_malloc((int)len);
  if( result==NULL ) return NULL;

  /* second pass: emit the expanded text */
  for(p = zFormat; *p; ++p){
    if( *p=='%' ){
      memcpy(r, zDb, nDb);
      r += nDb;
      *r++ = '.';
      memcpy(r, zName, nName);
      r += nName;
    } else {
      *r++ = *p;
    }
  }
  *r++ = '\0';
  assert( r == result + len );
  return result;
}
sl@0	1765
sl@0	1766	static int sql_exec(sqlite3 db, const char zDb, const char *zName,
sl@0	1767	const char *zFormat){
sl@0	1768	char *zCommand = string_format(zFormat, zDb, zName);
sl@0	1769	int rc;
sl@0	1770	FTSTRACE(("FTS3 sql: %s\n", zCommand));
sl@0	1771	rc = sqlite3_exec(db, zCommand, NULL, 0, NULL);
sl@0	1772	sqlite3_free(zCommand);
sl@0	1773	return rc;
sl@0	1774	}
sl@0	1775
sl@0	1776	static int sql_prepare(sqlite3 db, const char zDb, const char *zName,
sl@0	1777	sqlite3_stmt *ppStmt, const char zFormat){
sl@0	1778	char *zCommand = string_format(zFormat, zDb, zName);
sl@0	1779	int rc;
sl@0	1780	FTSTRACE(("FTS3 prepare: %s\n", zCommand));
sl@0	1781	rc = sqlite3_prepare_v2(db, zCommand, -1, ppStmt, NULL);
sl@0	1782	sqlite3_free(zCommand);
sl@0	1783	return rc;
sl@0	1784	}
sl@0	1785
sl@0	1786	/* end utility functions */
sl@0	1787
/* Forward reference: lets declarations that precede the definition
** of struct fulltext_vtab hold pointers to it.
*/
typedef struct fulltext_vtab fulltext_vtab;
sl@0	1790
/* A single term in a query is represented by an instance of
** the following structure.  Each word which may match against
** document content is a term.  Operators, like NEAR or OR, are
** not terms.  Query terms are organized as a flat list stored
** in the Query.pTerms array.
**
** If the QueryTerm.nPhrase variable is non-zero, then the QueryTerm
** is the first in a contiguous string of terms that are either part
** of the same phrase, or connected by the NEAR operator.
**
** If the QueryTerm.nNear variable is non-zero, then the token is followed
** by a NEAR operator with span set to (nNear-1).
**
** The QueryTerm.iPhrase variable stores the index of the token within
** its phrase, indexed starting at 1.  A token that is not part of any
** phrase also has iPhrase set to 1.
**
** For example, the data structure used to represent the following query:
**
**     ... MATCH 'sqlite NEAR/5 google NEAR/2 "search engine"'
**
** is:
**
**     {nPhrase=4, iPhrase=1, nNear=6, pTerm="sqlite"},
**     {nPhrase=0, iPhrase=1, nNear=3, pTerm="google"},
**     {nPhrase=0, iPhrase=1, nNear=0, pTerm="search"},
**     {nPhrase=0, iPhrase=2, nNear=0, pTerm="engine"},
**
** Compiling the FTS3 syntax to Query structures is done by the
** parseQuery() function.
*/
typedef struct QueryTerm {
  short int nPhrase; /* How many following terms are part of the same phrase */
  short int iPhrase; /* This is the i-th term of a phrase. */
  short int iColumn; /* Column of the index that must match this term */
  short int nNear;   /* term followed by a NEAR operator with span=(nNear-1) */
  signed char isOr;  /* this term is preceded by "OR" */
  signed char isNot; /* this term is preceded by "-" */
  signed char isPrefix; /* this term is followed by "*" */
  char *pTerm;       /* text of the term.  '\000' terminated.  malloced */
  int nTerm;         /* Number of bytes in pTerm[] */
} QueryTerm;
sl@0	1834
sl@0	1835
sl@0	1836	/* A query string is parsed into a Query structure.
sl@0	1837	*
sl@0	1838	* We could, in theory, allow query strings to be complicated
sl@0	1839	* nested expressions with precedence determined by parentheses.
sl@0	1840	* But none of the major search engines do this. (Perhaps the
sl@0	1841	* feeling is that an parenthesized expression is two complex of
sl@0	1842	* an idea for the average user to grasp.) Taking our lead from
sl@0	1843	* the major search engines, we will allow queries to be a list
sl@0	1844	* of terms (with an implied AND operator) or phrases in double-quotes,
sl@0	1845	* with a single optional "-" before each non-phrase term to designate
sl@0	1846	* negation and an optional OR connector.
sl@0	1847	*
sl@0	1848	* OR binds more tightly than the implied AND, which is what the
sl@0	1849	* major search engines seem to do. So, for example:
sl@0	1850	*
sl@0	1851	* [one two OR three] ==> one AND (two OR three)
sl@0	1852	* [one OR two three] ==> (one OR two) AND three
sl@0	1853	*
sl@0	1854	* A "-" before a term matches all entries that lack that term.
sl@0	1855	* The "-" must occur immediately before the term with in intervening
sl@0	1856	* space. This is how the search engines do it.
sl@0	1857	*
sl@0	1858	* A NOT term cannot be the right-hand operand of an OR. If this
sl@0	1859	* occurs in the query string, the NOT is ignored:
sl@0	1860	*
sl@0	1861	* [one OR -two] ==> one OR two
sl@0	1862	*
sl@0	1863	*/
sl@0	1864	typedef struct Query {
sl@0	1865	fulltext_vtab pFts; / The full text index */
sl@0	1866	int nTerms; /* Number of terms in the query */
sl@0	1867	QueryTerm pTerms; / Array of terms. Space obtained from malloc() */
sl@0	1868	int nextIsOr; /* Set the isOr flag on the next inserted term */
sl@0	1869	int nextIsNear; /* Set the isOr flag on the next inserted term */
sl@0	1870	int nextColumn; /* Next word parsed must be in this column */
sl@0	1871	int dfltColumn; /* The default column */
sl@0	1872	} Query;
sl@0	1873
sl@0	1874
/*
** An instance of the following structure keeps track of generated
** matching-word offset information and snippets.
**
** aMatch points to an array of snippetMatch records (nAlloc slots
** allocated, nMatch in use).  zOffset and zSnippet are text buffers
** whose lengths are cached in nOffset and nSnippet.
*/
typedef struct Snippet {
  int nMatch;     /* Total number of matches */
  int nAlloc;     /* Space allocated for aMatch[] */
  struct snippetMatch { /* One entry for each matching term */
    char snStatus;       /* Status flag for use while constructing snippets */
    short int iCol;      /* The column that contains the match */
    short int iTerm;     /* The index in Query.pTerms[] of the matching term */
    int iToken;          /* The index of the matching document token */
    short int nByte;     /* Number of bytes in the term */
    int iStart;          /* The offset to the first character of the term */
  } *aMatch;      /* Points to space obtained from malloc */
  char *zOffset;  /* Text rendering of aMatch[] */
  int nOffset;    /* strlen(zOffset) */
  char *zSnippet; /* Snippet text */
  int nSnippet;   /* strlen(zSnippet) */
} Snippet;
sl@0	1895
sl@0	1896
/* Strategy used by a cursor to produce rows.  Values at or above
** QUERY_FULLTEXT encode the searched column: QUERY_FULLTEXT + i means
** a full-text search restricted to column i.
*/
typedef enum QueryType {
  QUERY_GENERIC,   /* table scan */
  QUERY_DOCID,     /* lookup by docid */
  QUERY_FULLTEXT   /* QUERY_FULLTEXT + [i] is a full-text search for column i*/
} QueryType;
sl@0	1902
/* Identifiers for the SQL statements a full-text table keeps
** prepared.  The enumerators are used as array indexes, so their
** order matters and MAX_STMT (the count) must stay last.
*/
typedef enum fulltext_statement {
  /* Statements against the %_content table. */
  CONTENT_INSERT_STMT,
  CONTENT_SELECT_STMT,
  CONTENT_UPDATE_STMT,
  CONTENT_DELETE_STMT,
  CONTENT_EXISTS_STMT,

  /* Statements against the %_segments table. */
  BLOCK_INSERT_STMT,
  BLOCK_SELECT_STMT,
  BLOCK_DELETE_STMT,
  BLOCK_DELETE_ALL_STMT,

  /* Statements against the %_segdir table. */
  SEGDIR_MAX_INDEX_STMT,
  SEGDIR_SET_STMT,
  SEGDIR_SELECT_LEVEL_STMT,
  SEGDIR_SPAN_STMT,
  SEGDIR_DELETE_STMT,
  SEGDIR_SELECT_SEGMENT_STMT,
  SEGDIR_SELECT_ALL_STMT,
  SEGDIR_DELETE_ALL_STMT,
  SEGDIR_COUNT_STMT,

  MAX_STMT                     /* Always at end! */
} fulltext_statement;
sl@0	1927
sl@0	1928	/* These must exactly match the enum above. */
sl@0	1929	/* TODO(shess): Is there some risk that a statement will be used in two
sl@0	1930	** cursors at once, e.g. if a query joins a virtual table to itself?
sl@0	1931	** If so perhaps we should move some of these to the cursor object.
sl@0	1932	*/
sl@0	1933	static const char *const fulltext_zStatement[MAX_STMT] = {
sl@0	1934	/* CONTENT_INSERT / NULL, / generated in contentInsertStatement() */
sl@0	1935	/* CONTENT_SELECT / NULL, / generated in contentSelectStatement() */
sl@0	1936	/* CONTENT_UPDATE / NULL, / generated in contentUpdateStatement() */
sl@0	1937	/* CONTENT_DELETE */ "delete from %_content where docid = ?",
sl@0	1938	/* CONTENT_EXISTS */ "select docid from %_content limit 1",
sl@0	1939
sl@0	1940	/* BLOCK_INSERT */
sl@0	1941	"insert into %_segments (blockid, block) values (null, ?)",
sl@0	1942	/* BLOCK_SELECT */ "select block from %_segments where blockid = ?",
sl@0	1943	/* BLOCK_DELETE */ "delete from %_segments where blockid between ? and ?",
sl@0	1944	/* BLOCK_DELETE_ALL */ "delete from %_segments",
sl@0	1945
sl@0	1946	/* SEGDIR_MAX_INDEX */ "select max(idx) from %_segdir where level = ?",
sl@0	1947	/* SEGDIR_SET */ "insert into %_segdir values (?, ?, ?, ?, ?, ?)",
sl@0	1948	/* SEGDIR_SELECT_LEVEL */
sl@0	1949	"select start_block, leaves_end_block, root from %_segdir "
sl@0	1950	" where level = ? order by idx",
sl@0	1951	/* SEGDIR_SPAN */
sl@0	1952	"select min(start_block), max(end_block) from %_segdir "
sl@0	1953	" where level = ? and start_block <> 0",
sl@0	1954	/* SEGDIR_DELETE */ "delete from %_segdir where level = ?",
sl@0	1955
sl@0	1956	/* NOTE(shess): The first three results of the following two
sl@0	1957	** statements must match.
sl@0	1958	*/
sl@0	1959	/* SEGDIR_SELECT_SEGMENT */
sl@0	1960	"select start_block, leaves_end_block, root from %_segdir "
sl@0	1961	" where level = ? and idx = ?",
sl@0	1962	/* SEGDIR_SELECT_ALL */
sl@0	1963	"select start_block, leaves_end_block, root from %_segdir "
sl@0	1964	" order by level desc, idx asc",
sl@0	1965	/* SEGDIR_DELETE_ALL */ "delete from %_segdir",
sl@0	1966	/* SEGDIR_COUNT / "select count(), ifnull(max(level),0) from %_segdir",
sl@0	1967	};
sl@0	1968
sl@0	1969	/*
sl@0	1970	** A connection to a fulltext index is an instance of the following
sl@0	1971	** structure. The xCreate and xConnect methods create an instance
sl@0	1972	** of this structure and xDestroy and xDisconnect free that instance.
sl@0	1973	** All other methods receive a pointer to the structure as one of their
sl@0	1974	** arguments.
sl@0	1975	*/
sl@0	1976	struct fulltext_vtab {
sl@0	1977	sqlite3_vtab base; /* Base class used by SQLite core */
sl@0	1978	sqlite3 db; / The database connection */
sl@0	1979	const char zDb; / logical database name */
sl@0	1980	const char zName; / virtual table name */
sl@0	1981	int nColumn; /* number of columns in virtual table */
sl@0	1982	char *azColumn; / column names. malloced */
sl@0	1983	char *azContentColumn; / column names in content table; malloced */
sl@0	1984	sqlite3_tokenizer pTokenizer; / tokenizer for inserts and queries */
sl@0	1985
sl@0	1986	/* Precompiled statements which we keep as long as the table is
sl@0	1987	** open.
sl@0	1988	*/
sl@0	1989	sqlite3_stmt *pFulltextStatements[MAX_STMT];
sl@0	1990
sl@0	1991	/* Precompiled statements used for segment merges. We run a
sl@0	1992	** separate select across the leaf level of each tree being merged.
sl@0	1993	*/
sl@0	1994	sqlite3_stmt *pLeafSelectStmts[MERGE_COUNT];
sl@0	1995	/* The statement used to prepare pLeafSelectStmts. */
sl@0	1996	#define LEAF_SELECT \
sl@0	1997	"select block from %_segments where blockid between ? and ? order by blockid"
sl@0	1998
sl@0	1999	/* These buffer pending index updates during transactions.
sl@0	2000	** nPendingData estimates the memory size of the pending data. It
sl@0	2001	** doesn't include the hash-bucket overhead, nor any malloc
sl@0	2002	** overhead. When nPendingData exceeds kPendingThreshold, the
sl@0	2003	** buffer is flushed even before the transaction closes.
sl@0	2004	** pendingTerms stores the data, and is only valid when nPendingData
sl@0	2005	** is >=0 (nPendingData<0 means pendingTerms has not been
sl@0	2006	** initialized). iPrevDocid is the last docid written, used to make
sl@0	2007	** certain we're inserting in sorted order.
sl@0	2008	*/
sl@0	2009	int nPendingData;
sl@0	2010	#define kPendingThreshold (110241024)
sl@0	2011	sqlite_int64 iPrevDocid;
sl@0	2012	fts3Hash pendingTerms;
sl@0	2013	};
sl@0	2014
sl@0	2015	/*
sl@0	2016	** When the core wants to do a query, it create a cursor using a
sl@0	2017	** call to xOpen. This structure is an instance of a cursor. It
sl@0	2018	** is destroyed by xClose.
sl@0	2019	*/
sl@0	2020	typedef struct fulltext_cursor {
sl@0	2021	sqlite3_vtab_cursor base; /* Base class used by SQLite core */
sl@0	2022	QueryType iCursorType; /* Copy of sqlite3_index_info.idxNum */
sl@0	2023	sqlite3_stmt pStmt; / Prepared statement in use by the cursor */
sl@0	2024	int eof; /* True if at End Of Results */
sl@0	2025	Query q; /* Parsed query string */
sl@0	2026	Snippet snippet; /* Cached snippet for the current row */
sl@0	2027	int iColumn; /* Column being searched */
sl@0	2028	DataBuffer result; /* Doclist results from fulltextQuery */
sl@0	2029	DLReader reader; /* Result reader if result not empty */
sl@0	2030	} fulltext_cursor;
sl@0	2031
sl@0	2032	static struct fulltext_vtab cursor_vtab(fulltext_cursor c){
sl@0	2033	return (fulltext_vtab *) c->base.pVtab;
sl@0	2034	}
sl@0	2035
sl@0	2036	static const sqlite3_module fts3Module; /* forward declaration */
sl@0	2037
sl@0	2038	/* Return a dynamically generated statement of the form
sl@0	2039	* insert into %_content (docid, ...) values (?, ...)
sl@0	2040	*/
sl@0	2041	static const char contentInsertStatement(fulltext_vtab v){
sl@0	2042	StringBuffer sb;
sl@0	2043	int i;
sl@0	2044
sl@0	2045	initStringBuffer(&sb);
sl@0	2046	append(&sb, "insert into %_content (docid, ");
sl@0	2047	appendList(&sb, v->nColumn, v->azContentColumn);
sl@0	2048	append(&sb, ") values (?");
sl@0	2049	for(i=0; i<v->nColumn; ++i)
sl@0	2050	append(&sb, ", ?");
sl@0	2051	append(&sb, ")");
sl@0	2052	return stringBufferData(&sb);
sl@0	2053	}
sl@0	2054
sl@0	2055	/* Return a dynamically generated statement of the form
sl@0	2056	* select <content columns> from %_content where docid = ?
sl@0	2057	*/
sl@0	2058	static const char contentSelectStatement(fulltext_vtab v){
sl@0	2059	StringBuffer sb;
sl@0	2060	initStringBuffer(&sb);
sl@0	2061	append(&sb, "SELECT ");
sl@0	2062	appendList(&sb, v->nColumn, v->azContentColumn);
sl@0	2063	append(&sb, " FROM %_content WHERE docid = ?");
sl@0	2064	return stringBufferData(&sb);
sl@0	2065	}
sl@0	2066
sl@0	2067	/* Return a dynamically generated statement of the form
sl@0	2068	* update %_content set [col_0] = ?, [col_1] = ?, ...
sl@0	2069	* where docid = ?
sl@0	2070	*/
sl@0	2071	static const char contentUpdateStatement(fulltext_vtab v){
sl@0	2072	StringBuffer sb;
sl@0	2073	int i;
sl@0	2074
sl@0	2075	initStringBuffer(&sb);
sl@0	2076	append(&sb, "update %_content set ");
sl@0	2077	for(i=0; i<v->nColumn; ++i) {
sl@0	2078	if( i>0 ){
sl@0	2079	append(&sb, ", ");
sl@0	2080	}
sl@0	2081	append(&sb, v->azContentColumn[i]);
sl@0	2082	append(&sb, " = ?");
sl@0	2083	}
sl@0	2084	append(&sb, " where docid = ?");
sl@0	2085	return stringBufferData(&sb);
sl@0	2086	}
sl@0	2087
sl@0	2088	/* Puts a freshly-prepared statement determined by iStmt in *ppStmt.
sl@0	2089	** If the indicated statement has never been prepared, it is prepared
sl@0	2090	** and cached, otherwise the cached version is reset.
sl@0	2091	*/
sl@0	2092	static int sql_get_statement(fulltext_vtab *v, fulltext_statement iStmt,
sl@0	2093	sqlite3_stmt **ppStmt){
sl@0	2094	assert( iStmt<MAX_STMT );
sl@0	2095	if( v->pFulltextStatements[iStmt]==NULL ){
sl@0	2096	const char *zStmt;
sl@0	2097	int rc;
sl@0	2098	switch( iStmt ){
sl@0	2099	case CONTENT_INSERT_STMT:
sl@0	2100	zStmt = contentInsertStatement(v); break;
sl@0	2101	case CONTENT_SELECT_STMT:
sl@0	2102	zStmt = contentSelectStatement(v); break;
sl@0	2103	case CONTENT_UPDATE_STMT:
sl@0	2104	zStmt = contentUpdateStatement(v); break;
sl@0	2105	default:
sl@0	2106	zStmt = fulltext_zStatement[iStmt];
sl@0	2107	}
sl@0	2108	rc = sql_prepare(v->db, v->zDb, v->zName, &v->pFulltextStatements[iStmt],
sl@0	2109	zStmt);
sl@0	2110	if( zStmt != fulltext_zStatement[iStmt]) sqlite3_free((void *) zStmt);
sl@0	2111	if( rc!=SQLITE_OK ) return rc;
sl@0	2112	} else {
sl@0	2113	int rc = sqlite3_reset(v->pFulltextStatements[iStmt]);
sl@0	2114	if( rc!=SQLITE_OK ) return rc;
sl@0	2115	}
sl@0	2116
sl@0	2117	*ppStmt = v->pFulltextStatements[iStmt];
sl@0	2118	return SQLITE_OK;
sl@0	2119	}
sl@0	2120
sl@0	2121	/* Like sqlite3_step(), but convert SQLITE_DONE to SQLITE_OK and
sl@0	2122	** SQLITE_ROW to SQLITE_ERROR. Useful for statements like UPDATE,
sl@0	2123	** where we expect no results.
sl@0	2124	*/
sl@0	2125	static int sql_single_step(sqlite3_stmt *s){
sl@0	2126	int rc = sqlite3_step(s);
sl@0	2127	return (rc==SQLITE_DONE) ? SQLITE_OK : rc;
sl@0	2128	}
sl@0	2129
sl@0	2130	/* Like sql_get_statement(), but for special replicated LEAF_SELECT
sl@0	2131	** statements. idx -1 is a special case for an uncached version of
sl@0	2132	** the statement (used in the optimize implementation).
sl@0	2133	*/
sl@0	2134	/* TODO(shess) Write version for generic statements and then share
sl@0	2135	** that between the cached-statement functions.
sl@0	2136	*/
sl@0	2137	static int sql_get_leaf_statement(fulltext_vtab *v, int idx,
sl@0	2138	sqlite3_stmt **ppStmt){
sl@0	2139	assert( idx>=-1 && idx<MERGE_COUNT );
sl@0	2140	if( idx==-1 ){
sl@0	2141	return sql_prepare(v->db, v->zDb, v->zName, ppStmt, LEAF_SELECT);
sl@0	2142	}else if( v->pLeafSelectStmts[idx]==NULL ){
sl@0	2143	int rc = sql_prepare(v->db, v->zDb, v->zName, &v->pLeafSelectStmts[idx],
sl@0	2144	LEAF_SELECT);
sl@0	2145	if( rc!=SQLITE_OK ) return rc;
sl@0	2146	}else{
sl@0	2147	int rc = sqlite3_reset(v->pLeafSelectStmts[idx]);
sl@0	2148	if( rc!=SQLITE_OK ) return rc;
sl@0	2149	}
sl@0	2150
sl@0	2151	*ppStmt = v->pLeafSelectStmts[idx];
sl@0	2152	return SQLITE_OK;
sl@0	2153	}
sl@0	2154
sl@0	2155	/* insert into %_content (docid, ...) values ([docid], [pValues])
sl@0	2156	** If the docid contains SQL NULL, then a unique docid will be
sl@0	2157	** generated.
sl@0	2158	*/
sl@0	2159	static int content_insert(fulltext_vtab v, sqlite3_value docid,
sl@0	2160	sqlite3_value **pValues){
sl@0	2161	sqlite3_stmt *s;
sl@0	2162	int i;
sl@0	2163	int rc = sql_get_statement(v, CONTENT_INSERT_STMT, &s);
sl@0	2164	if( rc!=SQLITE_OK ) return rc;
sl@0	2165
sl@0	2166	rc = sqlite3_bind_value(s, 1, docid);
sl@0	2167	if( rc!=SQLITE_OK ) return rc;
sl@0	2168
sl@0	2169	for(i=0; i<v->nColumn; ++i){
sl@0	2170	rc = sqlite3_bind_value(s, 2+i, pValues[i]);
sl@0	2171	if( rc!=SQLITE_OK ) return rc;
sl@0	2172	}
sl@0	2173
sl@0	2174	return sql_single_step(s);
sl@0	2175	}
sl@0	2176
sl@0	2177	/* update %_content set col0 = pValues[0], col1 = pValues[1], ...
sl@0	2178	* where docid = [iDocid] */
sl@0	2179	static int content_update(fulltext_vtab v, sqlite3_value *pValues,
sl@0	2180	sqlite_int64 iDocid){
sl@0	2181	sqlite3_stmt *s;
sl@0	2182	int i;
sl@0	2183	int rc = sql_get_statement(v, CONTENT_UPDATE_STMT, &s);
sl@0	2184	if( rc!=SQLITE_OK ) return rc;
sl@0	2185
sl@0	2186	for(i=0; i<v->nColumn; ++i){
sl@0	2187	rc = sqlite3_bind_value(s, 1+i, pValues[i]);
sl@0	2188	if( rc!=SQLITE_OK ) return rc;
sl@0	2189	}
sl@0	2190
sl@0	2191	rc = sqlite3_bind_int64(s, 1+v->nColumn, iDocid);
sl@0	2192	if( rc!=SQLITE_OK ) return rc;
sl@0	2193
sl@0	2194	return sql_single_step(s);
sl@0	2195	}
sl@0	2196
/* Release an array of nString strings together with the array
** itself, all via sqlite3_free().  Entries may be NULL, and a NULL
** pString is accepted as a no-op.
*/
static void freeStringArray(int nString, const char **pString){
  int i;

  if( pString==NULL ) return;
  for(i=0; i<nString; ++i){
    /* sqlite3_free() ignores NULL, so entries need no guard. */
    sqlite3_free((void *) pString[i]);
  }
  sqlite3_free((void *) pString);
}
sl@0	2205
sl@0	2206	/* select * from %_content where docid = [iDocid]
sl@0	2207	* The caller must delete the returned array and all strings in it.
sl@0	2208	* null fields will be NULL in the returned array.
sl@0	2209	*
sl@0	2210	* TODO: Perhaps we should return pointer/length strings here for consistency
sl@0	2211	* with other code which uses pointer/length. */
sl@0	2212	static int content_select(fulltext_vtab *v, sqlite_int64 iDocid,
sl@0	2213	const char ***pValues){
sl@0	2214	sqlite3_stmt *s;
sl@0	2215	const char **values;
sl@0	2216	int i;
sl@0	2217	int rc;
sl@0	2218
sl@0	2219	*pValues = NULL;
sl@0	2220
sl@0	2221	rc = sql_get_statement(v, CONTENT_SELECT_STMT, &s);
sl@0	2222	if( rc!=SQLITE_OK ) return rc;
sl@0	2223
sl@0	2224	rc = sqlite3_bind_int64(s, 1, iDocid);
sl@0	2225	if( rc!=SQLITE_OK ) return rc;
sl@0	2226
sl@0	2227	rc = sqlite3_step(s);
sl@0	2228	if( rc!=SQLITE_ROW ) return rc;
sl@0	2229
sl@0	2230	values = (const char *) sqlite3_malloc(v->nColumn sizeof(const char *));
sl@0	2231	for(i=0; i<v->nColumn; ++i){
sl@0	2232	if( sqlite3_column_type(s, i)==SQLITE_NULL ){
sl@0	2233	values[i] = NULL;
sl@0	2234	}else{
sl@0	2235	values[i] = string_dup((char*)sqlite3_column_text(s, i));
sl@0	2236	}
sl@0	2237	}
sl@0	2238
sl@0	2239	/* We expect only one row. We must execute another sqlite3_step()
sl@0	2240	* to complete the iteration; otherwise the table will remain locked. */
sl@0	2241	rc = sqlite3_step(s);
sl@0	2242	if( rc==SQLITE_DONE ){
sl@0	2243	*pValues = values;
sl@0	2244	return SQLITE_OK;
sl@0	2245	}
sl@0	2246
sl@0	2247	freeStringArray(v->nColumn, values);
sl@0	2248	return rc;
sl@0	2249	}
sl@0	2250
sl@0	2251	/* delete from %_content where docid = [iDocid ] */
sl@0	2252	static int content_delete(fulltext_vtab *v, sqlite_int64 iDocid){
sl@0	2253	sqlite3_stmt *s;
sl@0	2254	int rc = sql_get_statement(v, CONTENT_DELETE_STMT, &s);
sl@0	2255	if( rc!=SQLITE_OK ) return rc;
sl@0	2256
sl@0	2257	rc = sqlite3_bind_int64(s, 1, iDocid);
sl@0	2258	if( rc!=SQLITE_OK ) return rc;
sl@0	2259
sl@0	2260	return sql_single_step(s);
sl@0	2261	}
sl@0	2262
sl@0	2263	/* Returns SQLITE_ROW if any rows exist in %_content, SQLITE_DONE if
sl@0	2264	** no rows exist, and any error in case of failure.
sl@0	2265	*/
sl@0	2266	static int content_exists(fulltext_vtab *v){
sl@0	2267	sqlite3_stmt *s;
sl@0	2268	int rc = sql_get_statement(v, CONTENT_EXISTS_STMT, &s);
sl@0	2269	if( rc!=SQLITE_OK ) return rc;
sl@0	2270
sl@0	2271	rc = sqlite3_step(s);
sl@0	2272	if( rc!=SQLITE_ROW ) return rc;
sl@0	2273
sl@0	2274	/* We expect only one row. We must execute another sqlite3_step()
sl@0	2275	* to complete the iteration; otherwise the table will remain locked. */
sl@0	2276	rc = sqlite3_step(s);
sl@0	2277	if( rc==SQLITE_DONE ) return SQLITE_ROW;
sl@0	2278	if( rc==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	2279	return rc;
sl@0	2280	}
sl@0	2281
sl@0	2282	/* insert into %_segments values ([pData])
sl@0	2283	** returns assigned blockid in *piBlockid
sl@0	2284	*/
sl@0	2285	static int block_insert(fulltext_vtab v, const char pData, int nData,
sl@0	2286	sqlite_int64 *piBlockid){
sl@0	2287	sqlite3_stmt *s;
sl@0	2288	int rc = sql_get_statement(v, BLOCK_INSERT_STMT, &s);
sl@0	2289	if( rc!=SQLITE_OK ) return rc;
sl@0	2290
sl@0	2291	rc = sqlite3_bind_blob(s, 1, pData, nData, SQLITE_STATIC);
sl@0	2292	if( rc!=SQLITE_OK ) return rc;
sl@0	2293
sl@0	2294	rc = sqlite3_step(s);
sl@0	2295	if( rc==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	2296	if( rc!=SQLITE_DONE ) return rc;
sl@0	2297
sl@0	2298	/* blockid column is an alias for rowid. */
sl@0	2299	*piBlockid = sqlite3_last_insert_rowid(v->db);
sl@0	2300	return SQLITE_OK;
sl@0	2301	}
sl@0	2302
sl@0	2303	/* delete from %_segments
sl@0	2304	** where blockid between [iStartBlockid] and [iEndBlockid]
sl@0	2305	**
sl@0	2306	** Deletes the range of blocks, inclusive, used to delete the blocks
sl@0	2307	** which form a segment.
sl@0	2308	*/
sl@0	2309	static int block_delete(fulltext_vtab *v,
sl@0	2310	sqlite_int64 iStartBlockid, sqlite_int64 iEndBlockid){
sl@0	2311	sqlite3_stmt *s;
sl@0	2312	int rc = sql_get_statement(v, BLOCK_DELETE_STMT, &s);
sl@0	2313	if( rc!=SQLITE_OK ) return rc;
sl@0	2314
sl@0	2315	rc = sqlite3_bind_int64(s, 1, iStartBlockid);
sl@0	2316	if( rc!=SQLITE_OK ) return rc;
sl@0	2317
sl@0	2318	rc = sqlite3_bind_int64(s, 2, iEndBlockid);
sl@0	2319	if( rc!=SQLITE_OK ) return rc;
sl@0	2320
sl@0	2321	return sql_single_step(s);
sl@0	2322	}
sl@0	2323
sl@0	2324	/* Returns SQLITE_ROW with *pidx set to the maximum segment idx found
sl@0	2325	** at iLevel. Returns SQLITE_DONE if there are no segments at
sl@0	2326	** iLevel. Otherwise returns an error.
sl@0	2327	*/
sl@0	2328	static int segdir_max_index(fulltext_vtab v, int iLevel, int pidx){
sl@0	2329	sqlite3_stmt *s;
sl@0	2330	int rc = sql_get_statement(v, SEGDIR_MAX_INDEX_STMT, &s);
sl@0	2331	if( rc!=SQLITE_OK ) return rc;
sl@0	2332
sl@0	2333	rc = sqlite3_bind_int(s, 1, iLevel);
sl@0	2334	if( rc!=SQLITE_OK ) return rc;
sl@0	2335
sl@0	2336	rc = sqlite3_step(s);
sl@0	2337	/* Should always get at least one row due to how max() works. */
sl@0	2338	if( rc==SQLITE_DONE ) return SQLITE_DONE;
sl@0	2339	if( rc!=SQLITE_ROW ) return rc;
sl@0	2340
sl@0	2341	/* NULL means that there were no inputs to max(). */
sl@0	2342	if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
sl@0	2343	rc = sqlite3_step(s);
sl@0	2344	if( rc==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	2345	return rc;
sl@0	2346	}
sl@0	2347
sl@0	2348	*pidx = sqlite3_column_int(s, 0);
sl@0	2349
sl@0	2350	/* We expect only one row. We must execute another sqlite3_step()
sl@0	2351	* to complete the iteration; otherwise the table will remain locked. */
sl@0	2352	rc = sqlite3_step(s);
sl@0	2353	if( rc==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	2354	if( rc!=SQLITE_DONE ) return rc;
sl@0	2355	return SQLITE_ROW;
sl@0	2356	}
sl@0	2357
sl@0	2358	/* insert into %_segdir values (
sl@0	2359	** [iLevel], [idx],
sl@0	2360	** [iStartBlockid], [iLeavesEndBlockid], [iEndBlockid],
sl@0	2361	** [pRootData]
sl@0	2362	** )
sl@0	2363	*/
sl@0	2364	static int segdir_set(fulltext_vtab *v, int iLevel, int idx,
sl@0	2365	sqlite_int64 iStartBlockid,
sl@0	2366	sqlite_int64 iLeavesEndBlockid,
sl@0	2367	sqlite_int64 iEndBlockid,
sl@0	2368	const char *pRootData, int nRootData){
sl@0	2369	sqlite3_stmt *s;
sl@0	2370	int rc = sql_get_statement(v, SEGDIR_SET_STMT, &s);
sl@0	2371	if( rc!=SQLITE_OK ) return rc;
sl@0	2372
sl@0	2373	rc = sqlite3_bind_int(s, 1, iLevel);
sl@0	2374	if( rc!=SQLITE_OK ) return rc;
sl@0	2375
sl@0	2376	rc = sqlite3_bind_int(s, 2, idx);
sl@0	2377	if( rc!=SQLITE_OK ) return rc;
sl@0	2378
sl@0	2379	rc = sqlite3_bind_int64(s, 3, iStartBlockid);
sl@0	2380	if( rc!=SQLITE_OK ) return rc;
sl@0	2381
sl@0	2382	rc = sqlite3_bind_int64(s, 4, iLeavesEndBlockid);
sl@0	2383	if( rc!=SQLITE_OK ) return rc;
sl@0	2384
sl@0	2385	rc = sqlite3_bind_int64(s, 5, iEndBlockid);
sl@0	2386	if( rc!=SQLITE_OK ) return rc;
sl@0	2387
sl@0	2388	rc = sqlite3_bind_blob(s, 6, pRootData, nRootData, SQLITE_STATIC);
sl@0	2389	if( rc!=SQLITE_OK ) return rc;
sl@0	2390
sl@0	2391	return sql_single_step(s);
sl@0	2392	}
sl@0	2393
sl@0	2394	/* Queries %_segdir for the block span of the segments in level
sl@0	2395	** iLevel. Returns SQLITE_DONE if there are no blocks for iLevel,
sl@0	2396	** SQLITE_ROW if there are blocks, else an error.
sl@0	2397	*/
sl@0	2398	static int segdir_span(fulltext_vtab *v, int iLevel,
sl@0	2399	sqlite_int64 *piStartBlockid,
sl@0	2400	sqlite_int64 *piEndBlockid){
sl@0	2401	sqlite3_stmt *s;
sl@0	2402	int rc = sql_get_statement(v, SEGDIR_SPAN_STMT, &s);
sl@0	2403	if( rc!=SQLITE_OK ) return rc;
sl@0	2404
sl@0	2405	rc = sqlite3_bind_int(s, 1, iLevel);
sl@0	2406	if( rc!=SQLITE_OK ) return rc;
sl@0	2407
sl@0	2408	rc = sqlite3_step(s);
sl@0	2409	if( rc==SQLITE_DONE ) return SQLITE_DONE; /* Should never happen */
sl@0	2410	if( rc!=SQLITE_ROW ) return rc;
sl@0	2411
sl@0	2412	/* This happens if all segments at this level are entirely inline. */
sl@0	2413	if( SQLITE_NULL==sqlite3_column_type(s, 0) ){
sl@0	2414	/* We expect only one row. We must execute another sqlite3_step()
sl@0	2415	* to complete the iteration; otherwise the table will remain locked. */
sl@0	2416	int rc2 = sqlite3_step(s);
sl@0	2417	if( rc2==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	2418	return rc2;
sl@0	2419	}
sl@0	2420
sl@0	2421	*piStartBlockid = sqlite3_column_int64(s, 0);
sl@0	2422	*piEndBlockid = sqlite3_column_int64(s, 1);
sl@0	2423
sl@0	2424	/* We expect only one row. We must execute another sqlite3_step()
sl@0	2425	* to complete the iteration; otherwise the table will remain locked. */
sl@0	2426	rc = sqlite3_step(s);
sl@0	2427	if( rc==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	2428	if( rc!=SQLITE_DONE ) return rc;
sl@0	2429	return SQLITE_ROW;
sl@0	2430	}
sl@0	2431
sl@0	2432	/* Delete the segment blocks and segment directory records for all
sl@0	2433	** segments at iLevel.
sl@0	2434	*/
sl@0	2435	static int segdir_delete(fulltext_vtab *v, int iLevel){
sl@0	2436	sqlite3_stmt *s;
sl@0	2437	sqlite_int64 iStartBlockid, iEndBlockid;
sl@0	2438	int rc = segdir_span(v, iLevel, &iStartBlockid, &iEndBlockid);
sl@0	2439	if( rc!=SQLITE_ROW && rc!=SQLITE_DONE ) return rc;
sl@0	2440
sl@0	2441	if( rc==SQLITE_ROW ){
sl@0	2442	rc = block_delete(v, iStartBlockid, iEndBlockid);
sl@0	2443	if( rc!=SQLITE_OK ) return rc;
sl@0	2444	}
sl@0	2445
sl@0	2446	/* Delete the segment directory itself. */
sl@0	2447	rc = sql_get_statement(v, SEGDIR_DELETE_STMT, &s);
sl@0	2448	if( rc!=SQLITE_OK ) return rc;
sl@0	2449
sl@0	2450	rc = sqlite3_bind_int64(s, 1, iLevel);
sl@0	2451	if( rc!=SQLITE_OK ) return rc;
sl@0	2452
sl@0	2453	return sql_single_step(s);
sl@0	2454	}
sl@0	2455
sl@0	2456	/* Delete entire fts index, SQLITE_OK on success, relevant error on
sl@0	2457	** failure.
sl@0	2458	*/
sl@0	2459	static int segdir_delete_all(fulltext_vtab *v){
sl@0	2460	sqlite3_stmt *s;
sl@0	2461	int rc = sql_get_statement(v, SEGDIR_DELETE_ALL_STMT, &s);
sl@0	2462	if( rc!=SQLITE_OK ) return rc;
sl@0	2463
sl@0	2464	rc = sql_single_step(s);
sl@0	2465	if( rc!=SQLITE_OK ) return rc;
sl@0	2466
sl@0	2467	rc = sql_get_statement(v, BLOCK_DELETE_ALL_STMT, &s);
sl@0	2468	if( rc!=SQLITE_OK ) return rc;
sl@0	2469
sl@0	2470	return sql_single_step(s);
sl@0	2471	}
sl@0	2472
sl@0	2473	/* Returns SQLITE_OK with *pnSegments set to the number of entries in
sl@0	2474	** %_segdir and *piMaxLevel set to the highest level which has a
sl@0	2475	** segment. Otherwise returns the SQLite error which caused failure.
sl@0	2476	*/
sl@0	2477	static int segdir_count(fulltext_vtab v, int pnSegments, int *piMaxLevel){
sl@0	2478	sqlite3_stmt *s;
sl@0	2479	int rc = sql_get_statement(v, SEGDIR_COUNT_STMT, &s);
sl@0	2480	if( rc!=SQLITE_OK ) return rc;
sl@0	2481
sl@0	2482	rc = sqlite3_step(s);
sl@0	2483	/* TODO(shess): This case should not be possible? Should stronger
sl@0	2484	** measures be taken if it happens?
sl@0	2485	*/
sl@0	2486	if( rc==SQLITE_DONE ){
sl@0	2487	*pnSegments = 0;
sl@0	2488	*piMaxLevel = 0;
sl@0	2489	return SQLITE_OK;
sl@0	2490	}
sl@0	2491	if( rc!=SQLITE_ROW ) return rc;
sl@0	2492
sl@0	2493	*pnSegments = sqlite3_column_int(s, 0);
sl@0	2494	*piMaxLevel = sqlite3_column_int(s, 1);
sl@0	2495
sl@0	2496	/* We expect only one row. We must execute another sqlite3_step()
sl@0	2497	* to complete the iteration; otherwise the table will remain locked. */
sl@0	2498	rc = sqlite3_step(s);
sl@0	2499	if( rc==SQLITE_DONE ) return SQLITE_OK;
sl@0	2500	if( rc==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	2501	return rc;
sl@0	2502	}
sl@0	2503
sl@0	2504	/* TODO(shess) clearPendingTerms() is far down the file because
sl@0	2505	** writeZeroSegment() is far down the file because LeafWriter is far
sl@0	2506	** down the file. Consider refactoring the code to move the non-vtab
sl@0	2507	** code above the vtab code so that we don't need this forward
sl@0	2508	** reference.
sl@0	2509	*/
sl@0	2510	static int clearPendingTerms(fulltext_vtab *v);
sl@0	2511
sl@0	2512	/*
sl@0	2513	** Free the memory used to contain a fulltext_vtab structure.
sl@0	2514	*/
sl@0	2515	static void fulltext_vtab_destroy(fulltext_vtab *v){
sl@0	2516	int iStmt, i;
sl@0	2517
sl@0	2518	FTSTRACE(("FTS3 Destroy %p\n", v));
sl@0	2519	for( iStmt=0; iStmt<MAX_STMT; iStmt++ ){
sl@0	2520	if( v->pFulltextStatements[iStmt]!=NULL ){
sl@0	2521	sqlite3_finalize(v->pFulltextStatements[iStmt]);
sl@0	2522	v->pFulltextStatements[iStmt] = NULL;
sl@0	2523	}
sl@0	2524	}
sl@0	2525
sl@0	2526	for( i=0; i<MERGE_COUNT; i++ ){
sl@0	2527	if( v->pLeafSelectStmts[i]!=NULL ){
sl@0	2528	sqlite3_finalize(v->pLeafSelectStmts[i]);
sl@0	2529	v->pLeafSelectStmts[i] = NULL;
sl@0	2530	}
sl@0	2531	}
sl@0	2532
sl@0	2533	if( v->pTokenizer!=NULL ){
sl@0	2534	v->pTokenizer->pModule->xDestroy(v->pTokenizer);
sl@0	2535	v->pTokenizer = NULL;
sl@0	2536	}
sl@0	2537
sl@0	2538	clearPendingTerms(v);
sl@0	2539
sl@0	2540	sqlite3_free(v->azColumn);
sl@0	2541	for(i = 0; i < v->nColumn; ++i) {
sl@0	2542	sqlite3_free(v->azContentColumn[i]);
sl@0	2543	}
sl@0	2544	sqlite3_free(v->azContentColumn);
sl@0	2545	sqlite3_free(v);
sl@0	2546	}
sl@0	2547
/*
** Token types for parsing the arguments to xConnect or xCreate.
*/
#define TOKEN_EOF         0    /* End of file */
#define TOKEN_SPACE       1    /* Any kind of whitespace */
#define TOKEN_ID          2    /* An identifier */
#define TOKEN_STRING      3    /* A string literal */
#define TOKEN_PUNCT       4    /* A single punctuation character */

/*
** If X is a character that can be used in an identifier then
** ftsIdChar(X) will be true.  Otherwise it is false.
**
** For ASCII, any character with the high-order bit set is
** allowed in an identifier.  For 7-bit characters,
** isFtsIdChar[X] must be 1.
**
** Ticket #1066.  The SQL standard does not allow '$' in the
** middle of identifiers.  But many SQL implementations do.
** SQLite will allow '$' in identifiers for compatibility.
** But the feature is undocumented.
**
** The table covers characters 0x20..0x7f; index it with (c - 0x20).
*/
static const char isFtsIdChar[] = {
/* x0 x1 x2 x3 x4 x5 x6 x7 x8 x9 xA xB xC xD xE xF */
    0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,  /* 2x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0,  /* 3x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 4x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1,  /* 5x */
    0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,  /* 6x */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,  /* 7x */
};

/* NOTE: this macro assigns to a variable named "c", which must be
** declared (as an int) in the scope where the macro is used. */
#define ftsIdChar(C)  (((c=(C))&0x80)!=0 || (c>0x1f && isFtsIdChar[c-0x20]))
sl@0	2580
sl@0	2581
sl@0	2582	/*
sl@0	2583	** Return the length of the token that begins at z[0].
sl@0	2584	** Store the token type in *tokenType before returning.
sl@0	2585	*/
sl@0	2586	static int ftsGetToken(const char z, int tokenType){
sl@0	2587	int i, c;
sl@0	2588	switch( *z ){
sl@0	2589	case 0: {
sl@0	2590	*tokenType = TOKEN_EOF;
sl@0	2591	return 0;
sl@0	2592	}
sl@0	2593	case ' ': case '\t': case '\n': case '\f': case '\r': {
sl@0	2594	for(i=1; safe_isspace(z[i]); i++){}
sl@0	2595	*tokenType = TOKEN_SPACE;
sl@0	2596	return i;
sl@0	2597	}
sl@0	2598	case '`':
sl@0	2599	case '\'':
sl@0	2600	case '"': {
sl@0	2601	int delim = z[0];
sl@0	2602	for(i=1; (c=z[i])!=0; i++){
sl@0	2603	if( c==delim ){
sl@0	2604	if( z[i+1]==delim ){
sl@0	2605	i++;
sl@0	2606	}else{
sl@0	2607	break;
sl@0	2608	}
sl@0	2609	}
sl@0	2610	}
sl@0	2611	*tokenType = TOKEN_STRING;
sl@0	2612	return i + (c!=0);
sl@0	2613	}
sl@0	2614	case '[': {
sl@0	2615	for(i=1, c=z[0]; c!=']' && (c=z[i])!=0; i++){}
sl@0	2616	*tokenType = TOKEN_ID;
sl@0	2617	return i;
sl@0	2618	}
sl@0	2619	default: {
sl@0	2620	if( !ftsIdChar(*z) ){
sl@0	2621	break;
sl@0	2622	}
sl@0	2623	for(i=1; ftsIdChar(z[i]); i++){}
sl@0	2624	*tokenType = TOKEN_ID;
sl@0	2625	return i;
sl@0	2626	}
sl@0	2627	}
sl@0	2628	*tokenType = TOKEN_PUNCT;
sl@0	2629	return 1;
sl@0	2630	}
sl@0	2631
/*
** A token extracted from a string is an instance of the following
** structure.
*/
typedef struct FtsToken {
  const char *z;       /* Pointer to token text.  Not '\000' terminated */
  short int n;         /* Length of the token text in bytes. */
} FtsToken;
sl@0	2640
sl@0	2641	/*
sl@0	2642	** Given a input string (which is really one of the argv[] parameters
sl@0	2643	** passed into xConnect or xCreate) split the string up into tokens.
sl@0	2644	** Return an array of pointers to '\000' terminated strings, one string
sl@0	2645	** for each non-whitespace token.
sl@0	2646	**
sl@0	2647	** The returned array is terminated by a single NULL pointer.
sl@0	2648	**
sl@0	2649	** Space to hold the returned array is obtained from a single
sl@0	2650	** malloc and should be freed by passing the return value to free().
sl@0	2651	** The individual strings within the token list are all a part of
sl@0	2652	** the single memory allocation and will all be freed at once.
sl@0	2653	*/
sl@0	2654	static char *tokenizeString(const char z, int *pnToken){
sl@0	2655	int nToken = 0;
sl@0	2656	FtsToken aToken = sqlite3_malloc( strlen(z) sizeof(aToken[0]) );
sl@0	2657	int n = 1;
sl@0	2658	int e, i;
sl@0	2659	int totalSize = 0;
sl@0	2660	char **azToken;
sl@0	2661	char *zCopy;
sl@0	2662	while( n>0 ){
sl@0	2663	n = ftsGetToken(z, &e);
sl@0	2664	if( e!=TOKEN_SPACE ){
sl@0	2665	aToken[nToken].z = z;
sl@0	2666	aToken[nToken].n = n;
sl@0	2667	nToken++;
sl@0	2668	totalSize += n+1;
sl@0	2669	}
sl@0	2670	z += n;
sl@0	2671	}
sl@0	2672	azToken = (char*)sqlite3_malloc( nTokensizeof(char*) + totalSize );
sl@0	2673	zCopy = (char*)&azToken[nToken];
sl@0	2674	nToken--;
sl@0	2675	for(i=0; i<nToken; i++){
sl@0	2676	azToken[i] = zCopy;
sl@0	2677	n = aToken[i].n;
sl@0	2678	memcpy(zCopy, aToken[i].z, n);
sl@0	2679	zCopy[n] = 0;
sl@0	2680	zCopy += n+1;
sl@0	2681	}
sl@0	2682	azToken[nToken] = 0;
sl@0	2683	sqlite3_free(aToken);
sl@0	2684	*pnToken = nToken;
sl@0	2685	return azToken;
sl@0	2686	}
sl@0	2687
/*
** Convert an SQL-style quoted string into a normal string by removing
** the quote characters.  The conversion is done in-place.  If the
** input does not begin with a quote character, then this routine
** is a no-op.  A NULL input is also a no-op.
**
** A doubled quote character inside the string stands for one literal
** quote.  Conversion stops at the closing quote; if the closing quote
** is missing, everything after the opening quote is kept and the
** result is still NUL-terminated.
**
** Examples:
**
**     "abc"   becomes   abc
**     'xyz'   becomes   xyz
**     [pqr]   becomes   pqr
**     `mno`   becomes   mno
*/
static void dequoteString(char *z){
  int quote;
  int i, j;
  if( z==0 ) return;
  quote = z[0];
  switch( quote ){
    case '\'':  break;
    case '"':   break;
    case '`':   break;                /* For MySQL compatibility */
    case '[':   quote = ']';  break;  /* For MS SqlServer compatibility */
    default:    return;
  }
  for(i=1, j=0; z[i]; i++){
    if( z[i]==quote ){
      if( z[i+1]!=quote ) break;      /* Closing quote */
      i++;                            /* Doubled quote: keep one copy */
    }
    z[j++] = z[i];
  }
  /* Terminate the shifted text.  j is always less than i, so this
  ** stays inside the original string, and it also covers input that
  ** has no closing quote. */
  z[j] = 0;
}
sl@0	2727
/*
** The input azIn is a NULL-terminated list of tokens.  Remove the first
** token and all punctuation tokens.  Remove the quotes from
** around string literal tokens.
**
** A token is kept when its first character is alphanumeric or when it
** is more than one character long; single non-alphanumeric characters
** are treated as punctuation.  The list is compacted in place and
** re-terminated with NULL.  A NULL azIn is a no-op.
**
** Example:
**
**     input:      tokenize chinese ( 'simplifed' , 'mixed' )
**     output:     chinese simplifed mixed
**
** Another example:
**
**     input:      delimiters ( '[' , ']' , '...' )
**     output:     [ ] ...
*/
static void tokenListToIdList(char **azIn){
  int i, j;
  if( azIn ){
    /* j starts at -1 so that the first kept token is counted but not
    ** stored, which is what drops it from the output. */
    for(i=0, j=-1; azIn[i]; i++){
      if( safe_isalnum(azIn[i][0]) || azIn[i][1] ){
        dequoteString(azIn[i]);
        if( j>=0 ){
          azIn[j] = azIn[i];
        }
        j++;
      }
    }
    /* If nothing was kept, j is still -1; terminate at slot 0 rather
    ** than writing in front of the array. */
    if( j<0 ) j = 0;
    azIn[j] = 0;
  }
}
sl@0	2758
sl@0	2759
sl@0	2760	/*
sl@0	2761	** Find the first alphanumeric token in the string zIn. Null-terminate
sl@0	2762	** this token. Remove any quotation marks. And return a pointer to
sl@0	2763	** the result.
sl@0	2764	*/
sl@0	2765	static char firstToken(char zIn, char **pzTail){
sl@0	2766	int n, ttype;
sl@0	2767	while(1){
sl@0	2768	n = ftsGetToken(zIn, &ttype);
sl@0	2769	if( ttype==TOKEN_SPACE ){
sl@0	2770	zIn += n;
sl@0	2771	}else if( ttype==TOKEN_EOF ){
sl@0	2772	*pzTail = zIn;
sl@0	2773	return 0;
sl@0	2774	}else{
sl@0	2775	zIn[n] = 0;
sl@0	2776	*pzTail = &zIn[1];
sl@0	2777	dequoteString(zIn);
sl@0	2778	return zIn;
sl@0	2779	}
sl@0	2780	}
sl@0	2781	/NOTREACHED/
sl@0	2782	}
sl@0	2783
/* Return true if...
**
**   *  s begins with the string t, ignoring case
**   *  The first character of s beyond t is neither an alphanumeric
**      nor an underscore (the end of s qualifies, provided
**      safe_isalnum() rejects the NUL terminator)
**
** Ignore leading space in *s.
**
** To put it another way, return true if the first token of
** s[] is t[].
*/
static int startsWith(const char *s, const char *t){
  while( safe_isspace(*s) ){ s++; }
  while( *t ){
    /* A NUL in s can never equal a non-NUL *t, so this cannot run past
    ** the end of s. */
    if( safe_tolower(*s++)!=safe_tolower(*t++) ) return 0;
  }
  return *s!='_' && !safe_isalnum(*s);
}
sl@0	2802
/*
** An instance of this structure defines the "spec" of a
** full text index.  This structure is populated by parseSpec
** and used by fulltextConnect and fulltextCreate.
*/
typedef struct TableSpec {
  const char *zDb;         /* Logical database name */
  const char *zName;       /* Name of the full-text index */
  int nColumn;             /* Number of columns to be indexed */
  char **azColumn;         /* Original names of columns to be indexed */
  char **azContentColumn;  /* Column names for %_content */
  char **azTokenizer;      /* Name of tokenizer and its arguments */
} TableSpec;
sl@0	2816
sl@0	2817	/*
sl@0	2818	** Reclaim all of the memory used by a TableSpec
sl@0	2819	*/
sl@0	2820	static void clearTableSpec(TableSpec *p) {
sl@0	2821	sqlite3_free(p->azColumn);
sl@0	2822	sqlite3_free(p->azContentColumn);
sl@0	2823	sqlite3_free(p->azTokenizer);
sl@0	2824	}
sl@0	2825
sl@0	2826	/* Parse a CREATE VIRTUAL TABLE statement, which looks like this:
sl@0	2827	*
sl@0	2828	* CREATE VIRTUAL TABLE email
sl@0	2829	* USING fts3(subject, body, tokenize mytokenizer(myarg))
sl@0	2830	*
sl@0	2831	* We return parsed information in a TableSpec structure.
sl@0	2832	*
sl@0	2833	*/
sl@0	2834	static int parseSpec(TableSpec pSpec, int argc, const char const*argv,
sl@0	2835	char**pzErr){
sl@0	2836	int i, n;
sl@0	2837	char z, zDummy;
sl@0	2838	char **azArg;
sl@0	2839	const char zTokenizer = 0; / argv[] entry describing the tokenizer */
sl@0	2840
sl@0	2841	assert( argc>=3 );
sl@0	2842	/* Current interface:
sl@0	2843	** argv[0] - module name
sl@0	2844	** argv[1] - database name
sl@0	2845	** argv[2] - table name
sl@0	2846	** argv[3..] - columns, optionally followed by tokenizer specification
sl@0	2847	** and snippet delimiters specification.
sl@0	2848	*/
sl@0	2849
sl@0	2850	/* Make a copy of the complete argv[][] array in a single allocation.
sl@0	2851	** The argv[][] array is read-only and transient. We can write to the
sl@0	2852	** copy in order to modify things and the copy is persistent.
sl@0	2853	*/
sl@0	2854	CLEAR(pSpec);
sl@0	2855	for(i=n=0; i<argc; i++){
sl@0	2856	n += strlen(argv[i]) + 1;
sl@0	2857	}
sl@0	2858	azArg = sqlite3_malloc( sizeof(char)argc + n );
sl@0	2859	if( azArg==0 ){
sl@0	2860	return SQLITE_NOMEM;
sl@0	2861	}
sl@0	2862	z = (char*)&azArg[argc];
sl@0	2863	for(i=0; i<argc; i++){
sl@0	2864	azArg[i] = z;
sl@0	2865	strcpy(z, argv[i]);
sl@0	2866	z += strlen(z)+1;
sl@0	2867	}
sl@0	2868
sl@0	2869	/* Identify the column names and the tokenizer and delimiter arguments
sl@0	2870	** in the argv[][] array.
sl@0	2871	*/
sl@0	2872	pSpec->zDb = azArg[1];
sl@0	2873	pSpec->zName = azArg[2];
sl@0	2874	pSpec->nColumn = 0;
sl@0	2875	pSpec->azColumn = azArg;
sl@0	2876	zTokenizer = "tokenize simple";
sl@0	2877	for(i=3; i<argc; ++i){
sl@0	2878	if( startsWith(azArg[i],"tokenize") ){
sl@0	2879	zTokenizer = azArg[i];
sl@0	2880	}else{
sl@0	2881	z = azArg[pSpec->nColumn] = firstToken(azArg[i], &zDummy);
sl@0	2882	pSpec->nColumn++;
sl@0	2883	}
sl@0	2884	}
sl@0	2885	if( pSpec->nColumn==0 ){
sl@0	2886	azArg[0] = "content";
sl@0	2887	pSpec->nColumn = 1;
sl@0	2888	}
sl@0	2889
sl@0	2890	/*
sl@0	2891	** Construct the list of content column names.
sl@0	2892	**
sl@0	2893	** Each content column name will be of the form cNNAAAA
sl@0	2894	** where NN is the column number and AAAA is the sanitized
sl@0	2895	** column name. "sanitized" means that special characters are
sl@0	2896	** converted to "_". The cNN prefix guarantees that all column
sl@0	2897	** names are unique.
sl@0	2898	**
sl@0	2899	** The AAAA suffix is not strictly necessary. It is included
sl@0	2900	** for the convenience of people who might examine the generated
sl@0	2901	** %_content table and wonder what the columns are used for.
sl@0	2902	*/
sl@0	2903	pSpec->azContentColumn = sqlite3_malloc( pSpec->nColumn * sizeof(char *) );
sl@0	2904	if( pSpec->azContentColumn==0 ){
sl@0	2905	clearTableSpec(pSpec);
sl@0	2906	return SQLITE_NOMEM;
sl@0	2907	}
sl@0	2908	for(i=0; i<pSpec->nColumn; i++){
sl@0	2909	char *p;
sl@0	2910	pSpec->azContentColumn[i] = sqlite3_mprintf("c%d%s", i, azArg[i]);
sl@0	2911	for (p = pSpec->azContentColumn[i]; *p ; ++p) {
sl@0	2912	if( !safe_isalnum(p) ) p = '_';
sl@0	2913	}
sl@0	2914	}
sl@0	2915
sl@0	2916	/*
sl@0	2917	** Parse the tokenizer specification string.
sl@0	2918	*/
sl@0	2919	pSpec->azTokenizer = tokenizeString(zTokenizer, &n);
sl@0	2920	tokenListToIdList(pSpec->azTokenizer);
sl@0	2921
sl@0	2922	return SQLITE_OK;
sl@0	2923	}
sl@0	2924
/*
** Generate a CREATE TABLE statement that describes the schema of
** the virtual table.  Return a pointer to this schema string.
**
** The statement lists the nColumn user columns, then a HIDDEN column
** named after the table, then a HIDDEN "docid" column.
**
** Space is obtained from sqlite3_mprintf() and should be freed
** using sqlite3_free().
*/
static char *fulltextSchema(
  int nColumn,                  /* Number of columns */
  const char *const* azColumn,  /* List of columns */
  const char *zTableName        /* Name of the table */
){
  int i;
  char *zSchema, *zNext;
  const char *zSep = "(";
  zSchema = sqlite3_mprintf("CREATE TABLE x");
  for(i=0; i<nColumn; i++){
    zNext = sqlite3_mprintf("%s%s%Q", zSchema, zSep, azColumn[i]);
    sqlite3_free(zSchema);
    zSchema = zNext;
    zSep = ",";
  }
  zNext = sqlite3_mprintf("%s,%Q HIDDEN", zSchema, zTableName);
  sqlite3_free(zSchema);
  zSchema = zNext;
  zNext = sqlite3_mprintf("%s,docid HIDDEN)", zSchema);
  sqlite3_free(zSchema);
  return zNext;
}
sl@0	2954
sl@0	2955	/*
sl@0	2956	** Build a new sqlite3_vtab structure that will describe the
sl@0	2957	** fulltext index defined by spec.
sl@0	2958	*/
sl@0	2959	static int constructVtab(
sl@0	2960	sqlite3 db, / The SQLite database connection */
sl@0	2961	fts3Hash pHash, / Hash table containing tokenizers */
sl@0	2962	TableSpec spec, / Parsed spec information from parseSpec() */
sl@0	2963	sqlite3_vtab *ppVTab, / Write the resulting vtab structure here */
sl@0	2964	char *pzErr / Write any error message here */
sl@0	2965	){
sl@0	2966	int rc;
sl@0	2967	int n;
sl@0	2968	fulltext_vtab *v = 0;
sl@0	2969	const sqlite3_tokenizer_module *m = NULL;
sl@0	2970	char *schema;
sl@0	2971
sl@0	2972	char const zTok; / Name of tokenizer to use for this fts table */
sl@0	2973	int nTok; /* Length of zTok, including nul terminator */
sl@0	2974
sl@0	2975	v = (fulltext_vtab *) sqlite3_malloc(sizeof(fulltext_vtab));
sl@0	2976	if( v==0 ) return SQLITE_NOMEM;
sl@0	2977	CLEAR(v);
sl@0	2978	/* sqlite will initialize v->base */
sl@0	2979	v->db = db;
sl@0	2980	v->zDb = spec->zDb; /* Freed when azColumn is freed */
sl@0	2981	v->zName = spec->zName; /* Freed when azColumn is freed */
sl@0	2982	v->nColumn = spec->nColumn;
sl@0	2983	v->azContentColumn = spec->azContentColumn;
sl@0	2984	spec->azContentColumn = 0;
sl@0	2985	v->azColumn = spec->azColumn;
sl@0	2986	spec->azColumn = 0;
sl@0	2987
sl@0	2988	if( spec->azTokenizer==0 ){
sl@0	2989	return SQLITE_NOMEM;
sl@0	2990	}
sl@0	2991
sl@0	2992	zTok = spec->azTokenizer[0];
sl@0	2993	if( !zTok ){
sl@0	2994	zTok = "simple";
sl@0	2995	}
sl@0	2996	nTok = strlen(zTok)+1;
sl@0	2997
sl@0	2998	m = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zTok, nTok);
sl@0	2999	if( !m ){
sl@0	3000	*pzErr = sqlite3_mprintf("unknown tokenizer: %s", spec->azTokenizer[0]);
sl@0	3001	rc = SQLITE_ERROR;
sl@0	3002	goto err;
sl@0	3003	}
sl@0	3004
sl@0	3005	for(n=0; spec->azTokenizer[n]; n++){}
sl@0	3006	if( n ){
sl@0	3007	rc = m->xCreate(n-1, (const charconst)&spec->azTokenizer[1],
sl@0	3008	&v->pTokenizer);
sl@0	3009	}else{
sl@0	3010	rc = m->xCreate(0, 0, &v->pTokenizer);
sl@0	3011	}
sl@0	3012	if( rc!=SQLITE_OK ) goto err;
sl@0	3013	v->pTokenizer->pModule = m;
sl@0	3014
sl@0	3015	/* TODO: verify the existence of backing tables foo_content, foo_term */
sl@0	3016
sl@0	3017	schema = fulltextSchema(v->nColumn, (const charconst)v->azColumn,
sl@0	3018	spec->zName);
sl@0	3019	rc = sqlite3_declare_vtab(db, schema);
sl@0	3020	sqlite3_free(schema);
sl@0	3021	if( rc!=SQLITE_OK ) goto err;
sl@0	3022
sl@0	3023	memset(v->pFulltextStatements, 0, sizeof(v->pFulltextStatements));
sl@0	3024
sl@0	3025	/* Indicate that the buffer is not live. */
sl@0	3026	v->nPendingData = -1;
sl@0	3027
sl@0	3028	*ppVTab = &v->base;
sl@0	3029	FTSTRACE(("FTS3 Connect %p\n", v));
sl@0	3030
sl@0	3031	return rc;
sl@0	3032
sl@0	3033	err:
sl@0	3034	fulltext_vtab_destroy(v);
sl@0	3035	return rc;
sl@0	3036	}
sl@0	3037
sl@0	3038	static int fulltextConnect(
sl@0	3039	sqlite3 *db,
sl@0	3040	void *pAux,
sl@0	3041	int argc, const char constargv,
sl@0	3042	sqlite3_vtab **ppVTab,
sl@0	3043	char **pzErr
sl@0	3044	){
sl@0	3045	TableSpec spec;
sl@0	3046	int rc = parseSpec(&spec, argc, argv, pzErr);
sl@0	3047	if( rc!=SQLITE_OK ) return rc;
sl@0	3048
sl@0	3049	rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);
sl@0	3050	clearTableSpec(&spec);
sl@0	3051	return rc;
sl@0	3052	}
sl@0	3053
sl@0	3054	/* The %_content table holds the text of each document, with
sl@0	3055	** the docid column exposed as the SQLite rowid for the table.
sl@0	3056	*/
sl@0	3057	/* TODO(shess) This comment needs elaboration to match the updated
sl@0	3058	** code. Work it into the top-of-file comment at that time.
sl@0	3059	*/
sl@0	3060	static int fulltextCreate(sqlite3 db, void pAux,
sl@0	3061	int argc, const char * const *argv,
sl@0	3062	sqlite3_vtab ppVTab, char pzErr){
sl@0	3063	int rc;
sl@0	3064	TableSpec spec;
sl@0	3065	StringBuffer schema;
sl@0	3066	FTSTRACE(("FTS3 Create\n"));
sl@0	3067
sl@0	3068	rc = parseSpec(&spec, argc, argv, pzErr);
sl@0	3069	if( rc!=SQLITE_OK ) return rc;
sl@0	3070
sl@0	3071	initStringBuffer(&schema);
sl@0	3072	append(&schema, "CREATE TABLE %_content(");
sl@0	3073	append(&schema, " docid INTEGER PRIMARY KEY,");
sl@0	3074	appendList(&schema, spec.nColumn, spec.azContentColumn);
sl@0	3075	append(&schema, ")");
sl@0	3076	rc = sql_exec(db, spec.zDb, spec.zName, stringBufferData(&schema));
sl@0	3077	stringBufferDestroy(&schema);
sl@0	3078	if( rc!=SQLITE_OK ) goto out;
sl@0	3079
sl@0	3080	rc = sql_exec(db, spec.zDb, spec.zName,
sl@0	3081	"create table %_segments("
sl@0	3082	" blockid INTEGER PRIMARY KEY,"
sl@0	3083	" block blob"
sl@0	3084	");"
sl@0	3085	);
sl@0	3086	if( rc!=SQLITE_OK ) goto out;
sl@0	3087
sl@0	3088	rc = sql_exec(db, spec.zDb, spec.zName,
sl@0	3089	"create table %_segdir("
sl@0	3090	" level integer,"
sl@0	3091	" idx integer,"
sl@0	3092	" start_block integer,"
sl@0	3093	" leaves_end_block integer,"
sl@0	3094	" end_block integer,"
sl@0	3095	" root blob,"
sl@0	3096	" primary key(level, idx)"
sl@0	3097	");");
sl@0	3098	if( rc!=SQLITE_OK ) goto out;
sl@0	3099
sl@0	3100	rc = constructVtab(db, (fts3Hash *)pAux, &spec, ppVTab, pzErr);
sl@0	3101
sl@0	3102	out:
sl@0	3103	clearTableSpec(&spec);
sl@0	3104	return rc;
sl@0	3105	}
sl@0	3106
sl@0	3107	/* Decide how to handle an SQL query. */
sl@0	3108	static int fulltextBestIndex(sqlite3_vtab pVTab, sqlite3_index_info pInfo){
sl@0	3109	fulltext_vtab v = (fulltext_vtab )pVTab;
sl@0	3110	int i;
sl@0	3111	FTSTRACE(("FTS3 BestIndex\n"));
sl@0	3112
sl@0	3113	for(i=0; i<pInfo->nConstraint; ++i){
sl@0	3114	const struct sqlite3_index_constraint *pConstraint;
sl@0	3115	pConstraint = &pInfo->aConstraint[i];
sl@0	3116	if( pConstraint->usable ) {
sl@0	3117	if( (pConstraint->iColumn==-1 \|\| pConstraint->iColumn==v->nColumn+1) &&
sl@0	3118	pConstraint->op==SQLITE_INDEX_CONSTRAINT_EQ ){
sl@0	3119	pInfo->idxNum = QUERY_DOCID; /* lookup by docid */
sl@0	3120	FTSTRACE(("FTS3 QUERY_DOCID\n"));
sl@0	3121	} else if( pConstraint->iColumn>=0 && pConstraint->iColumn<=v->nColumn &&
sl@0	3122	pConstraint->op==SQLITE_INDEX_CONSTRAINT_MATCH ){
sl@0	3123	/* full-text search */
sl@0	3124	pInfo->idxNum = QUERY_FULLTEXT + pConstraint->iColumn;
sl@0	3125	FTSTRACE(("FTS3 QUERY_FULLTEXT %d\n", pConstraint->iColumn));
sl@0	3126	} else continue;
sl@0	3127
sl@0	3128	pInfo->aConstraintUsage[i].argvIndex = 1;
sl@0	3129	pInfo->aConstraintUsage[i].omit = 1;
sl@0	3130
sl@0	3131	/* An arbitrary value for now.
sl@0	3132	* TODO: Perhaps docid matches should be considered cheaper than
sl@0	3133	* full-text searches. */
sl@0	3134	pInfo->estimatedCost = 1.0;
sl@0	3135
sl@0	3136	return SQLITE_OK;
sl@0	3137	}
sl@0	3138	}
sl@0	3139	pInfo->idxNum = QUERY_GENERIC;
sl@0	3140	return SQLITE_OK;
sl@0	3141	}
sl@0	3142
sl@0	3143	static int fulltextDisconnect(sqlite3_vtab *pVTab){
sl@0	3144	FTSTRACE(("FTS3 Disconnect %p\n", pVTab));
sl@0	3145	fulltext_vtab_destroy((fulltext_vtab *)pVTab);
sl@0	3146	return SQLITE_OK;
sl@0	3147	}
sl@0	3148
sl@0	3149	static int fulltextDestroy(sqlite3_vtab *pVTab){
sl@0	3150	fulltext_vtab v = (fulltext_vtab )pVTab;
sl@0	3151	int rc;
sl@0	3152
sl@0	3153	FTSTRACE(("FTS3 Destroy %p\n", pVTab));
sl@0	3154	rc = sql_exec(v->db, v->zDb, v->zName,
sl@0	3155	"drop table if exists %_content;"
sl@0	3156	"drop table if exists %_segments;"
sl@0	3157	"drop table if exists %_segdir;"
sl@0	3158	);
sl@0	3159	if( rc!=SQLITE_OK ) return rc;
sl@0	3160
sl@0	3161	fulltext_vtab_destroy((fulltext_vtab *)pVTab);
sl@0	3162	return SQLITE_OK;
sl@0	3163	}
sl@0	3164
sl@0	3165	static int fulltextOpen(sqlite3_vtab pVTab, sqlite3_vtab_cursor *ppCursor){
sl@0	3166	fulltext_cursor *c;
sl@0	3167
sl@0	3168	c = (fulltext_cursor *) sqlite3_malloc(sizeof(fulltext_cursor));
sl@0	3169	if( c ){
sl@0	3170	memset(c, 0, sizeof(fulltext_cursor));
sl@0	3171	/* sqlite will initialize c->base */
sl@0	3172	*ppCursor = &c->base;
sl@0	3173	FTSTRACE(("FTS3 Open %p: %p\n", pVTab, c));
sl@0	3174	return SQLITE_OK;
sl@0	3175	}else{
sl@0	3176	return SQLITE_NOMEM;
sl@0	3177	}
sl@0	3178	}
sl@0	3179
sl@0	3180
sl@0	3181	/* Free all of the dynamically allocated memory held by *q
sl@0	3182	*/
sl@0	3183	static void queryClear(Query *q){
sl@0	3184	int i;
sl@0	3185	for(i = 0; i < q->nTerms; ++i){
sl@0	3186	sqlite3_free(q->pTerms[i].pTerm);
sl@0	3187	}
sl@0	3188	sqlite3_free(q->pTerms);
sl@0	3189	CLEAR(q);
sl@0	3190	}
sl@0	3191
sl@0	3192	/* Free all of the dynamically allocated memory held by the
sl@0	3193	** Snippet
sl@0	3194	*/
sl@0	3195	static void snippetClear(Snippet *p){
sl@0	3196	sqlite3_free(p->aMatch);
sl@0	3197	sqlite3_free(p->zOffset);
sl@0	3198	sqlite3_free(p->zSnippet);
sl@0	3199	CLEAR(p);
sl@0	3200	}
sl@0	3201	/*
sl@0	3202	** Append a single entry to the p->aMatch[] log.
sl@0	3203	*/
sl@0	3204	static void snippetAppendMatch(
sl@0	3205	Snippet p, / Append the entry to this snippet */
sl@0	3206	int iCol, int iTerm, /* The column and query term */
sl@0	3207	int iToken, /* Matching token in document */
sl@0	3208	int iStart, int nByte /* Offset and size of the match */
sl@0	3209	){
sl@0	3210	int i;
sl@0	3211	struct snippetMatch *pMatch;
sl@0	3212	if( p->nMatch+1>=p->nAlloc ){
sl@0	3213	p->nAlloc = p->nAlloc*2 + 10;
sl@0	3214	p->aMatch = sqlite3_realloc(p->aMatch, p->nAlloc*sizeof(p->aMatch[0]) );
sl@0	3215	if( p->aMatch==0 ){
sl@0	3216	p->nMatch = 0;
sl@0	3217	p->nAlloc = 0;
sl@0	3218	return;
sl@0	3219	}
sl@0	3220	}
sl@0	3221	i = p->nMatch++;
sl@0	3222	pMatch = &p->aMatch[i];
sl@0	3223	pMatch->iCol = iCol;
sl@0	3224	pMatch->iTerm = iTerm;
sl@0	3225	pMatch->iToken = iToken;
sl@0	3226	pMatch->iStart = iStart;
sl@0	3227	pMatch->nByte = nByte;
sl@0	3228	}
sl@0	3229
sl@0	3230	/*
sl@0	3231	** Sizing information for the circular buffer used in snippetOffsetsOfColumn()
sl@0	3232	*/
sl@0	3233	#define FTS3_ROTOR_SZ (32)
sl@0	3234	#define FTS3_ROTOR_MASK (FTS3_ROTOR_SZ-1)
sl@0	3235
sl@0	3236	/*
sl@0	3237	** Add entries to pSnippet->aMatch[] for every match that occurs against
sl@0	3238	** document zDoc[0..nDoc-1] which is stored in column iColumn.
sl@0	3239	*/
sl@0	3240	static void snippetOffsetsOfColumn(
sl@0	3241	Query *pQuery,
sl@0	3242	Snippet *pSnippet,
sl@0	3243	int iColumn,
sl@0	3244	const char *zDoc,
sl@0	3245	int nDoc
sl@0	3246	){
sl@0	3247	const sqlite3_tokenizer_module pTModule; / The tokenizer module */
sl@0	3248	sqlite3_tokenizer pTokenizer; / The specific tokenizer */
sl@0	3249	sqlite3_tokenizer_cursor pTCursor; / Tokenizer cursor */
sl@0	3250	fulltext_vtab pVtab; / The full text index */
sl@0	3251	int nColumn; /* Number of columns in the index */
sl@0	3252	const QueryTerm aTerm; / Query string terms */
sl@0	3253	int nTerm; /* Number of query string terms */
sl@0	3254	int i, j; /* Loop counters */
sl@0	3255	int rc; /* Return code */
sl@0	3256	unsigned int match, prevMatch; /* Phrase search bitmasks */
sl@0	3257	const char zToken; / Next token from the tokenizer */
sl@0	3258	int nToken; /* Size of zToken */
sl@0	3259	int iBegin, iEnd, iPos; /* Offsets of beginning and end */
sl@0	3260
sl@0	3261	/* The following variables keep a circular buffer of the last
sl@0	3262	** few tokens */
sl@0	3263	unsigned int iRotor = 0; /* Index of current token */
sl@0	3264	int iRotorBegin[FTS3_ROTOR_SZ]; /* Beginning offset of token */
sl@0	3265	int iRotorLen[FTS3_ROTOR_SZ]; /* Length of token */
sl@0	3266
sl@0	3267	pVtab = pQuery->pFts;
sl@0	3268	nColumn = pVtab->nColumn;
sl@0	3269	pTokenizer = pVtab->pTokenizer;
sl@0	3270	pTModule = pTokenizer->pModule;
sl@0	3271	rc = pTModule->xOpen(pTokenizer, zDoc, nDoc, &pTCursor);
sl@0	3272	if( rc ) return;
sl@0	3273	pTCursor->pTokenizer = pTokenizer;
sl@0	3274	aTerm = pQuery->pTerms;
sl@0	3275	nTerm = pQuery->nTerms;
sl@0	3276	if( nTerm>=FTS3_ROTOR_SZ ){
sl@0	3277	nTerm = FTS3_ROTOR_SZ - 1;
sl@0	3278	}
sl@0	3279	prevMatch = 0;
sl@0	3280	while(1){
sl@0	3281	rc = pTModule->xNext(pTCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
sl@0	3282	if( rc ) break;
sl@0	3283	iRotorBegin[iRotor&FTS3_ROTOR_MASK] = iBegin;
sl@0	3284	iRotorLen[iRotor&FTS3_ROTOR_MASK] = iEnd-iBegin;
sl@0	3285	match = 0;
sl@0	3286	for(i=0; i<nTerm; i++){
sl@0	3287	int iCol;
sl@0	3288	iCol = aTerm[i].iColumn;
sl@0	3289	if( iCol>=0 && iCol<nColumn && iCol!=iColumn ) continue;
sl@0	3290	if( aTerm[i].nTerm>nToken ) continue;
sl@0	3291	if( !aTerm[i].isPrefix && aTerm[i].nTerm<nToken ) continue;
sl@0	3292	assert( aTerm[i].nTerm<=nToken );
sl@0	3293	if( memcmp(aTerm[i].pTerm, zToken, aTerm[i].nTerm) ) continue;
sl@0	3294	if( aTerm[i].iPhrase>1 && (prevMatch & (1<<i))==0 ) continue;
sl@0	3295	match \|= 1<<i;
sl@0	3296	if( i==nTerm-1 \|\| aTerm[i+1].iPhrase==1 ){
sl@0	3297	for(j=aTerm[i].iPhrase-1; j>=0; j--){
sl@0	3298	int k = (iRotor-j) & FTS3_ROTOR_MASK;
sl@0	3299	snippetAppendMatch(pSnippet, iColumn, i-j, iPos-j,
sl@0	3300	iRotorBegin[k], iRotorLen[k]);
sl@0	3301	}
sl@0	3302	}
sl@0	3303	}
sl@0	3304	prevMatch = match<<1;
sl@0	3305	iRotor++;
sl@0	3306	}
sl@0	3307	pTModule->xClose(pTCursor);
sl@0	3308	}
sl@0	3309
sl@0	3310	/*
sl@0	3311	** Remove entries from the pSnippet structure to account for the NEAR
sl@0	3312	** operator. When this is called, pSnippet contains the list of token
sl@0	3313	** offsets produced by treating all NEAR operators as AND operators.
sl@0	3314	** This function removes any entries that should not be present after
sl@0	3315	** accounting for the NEAR restriction. For example, if the queried
sl@0	3316	** document is:
sl@0	3317	**
sl@0	3318	** "A B C D E A"
sl@0	3319	**
sl@0	3320	** and the query is:
sl@0	3321	**
sl@0	3322	** A NEAR/0 E
sl@0	3323	**
sl@0	3324	** then when this function is called the Snippet contains token offsets
sl@0	3325	** 0, 4 and 5. This function removes the "0" entry (because the first A
sl@0	3326	** is not near enough to an E).
sl@0	3327	*/
sl@0	3328	static void trimSnippetOffsetsForNear(Query pQuery, Snippet pSnippet){
sl@0	3329	int ii;
sl@0	3330	int iDir = 1;
sl@0	3331
sl@0	3332	while(iDir>-2) {
sl@0	3333	assert( iDir==1 \|\| iDir==-1 );
sl@0	3334	for(ii=0; ii<pSnippet->nMatch; ii++){
sl@0	3335	int jj;
sl@0	3336	int nNear;
sl@0	3337	struct snippetMatch *pMatch = &pSnippet->aMatch[ii];
sl@0	3338	QueryTerm *pQueryTerm = &pQuery->pTerms[pMatch->iTerm];
sl@0	3339
sl@0	3340	if( (pMatch->iTerm+iDir)<0
sl@0	3341	\|\| (pMatch->iTerm+iDir)>=pQuery->nTerms
sl@0	3342	){
sl@0	3343	continue;
sl@0	3344	}
sl@0	3345
sl@0	3346	nNear = pQueryTerm->nNear;
sl@0	3347	if( iDir<0 ){
sl@0	3348	nNear = pQueryTerm[-1].nNear;
sl@0	3349	}
sl@0	3350
sl@0	3351	if( pMatch->iTerm>=0 && nNear ){
sl@0	3352	int isOk = 0;
sl@0	3353	int iNextTerm = pMatch->iTerm+iDir;
sl@0	3354	int iPrevTerm = iNextTerm;
sl@0	3355
sl@0	3356	int iEndToken;
sl@0	3357	int iStartToken;
sl@0	3358
sl@0	3359	if( iDir<0 ){
sl@0	3360	int nPhrase = 1;
sl@0	3361	iStartToken = pMatch->iToken;
sl@0	3362	while( (pMatch->iTerm+nPhrase)<pQuery->nTerms
sl@0	3363	&& pQuery->pTerms[pMatch->iTerm+nPhrase].iPhrase>1
sl@0	3364	){
sl@0	3365	nPhrase++;
sl@0	3366	}
sl@0	3367	iEndToken = iStartToken + nPhrase - 1;
sl@0	3368	}else{
sl@0	3369	iEndToken = pMatch->iToken;
sl@0	3370	iStartToken = pMatch->iToken+1-pQueryTerm->iPhrase;
sl@0	3371	}
sl@0	3372
sl@0	3373	while( pQuery->pTerms[iNextTerm].iPhrase>1 ){
sl@0	3374	iNextTerm--;
sl@0	3375	}
sl@0	3376	while( (iPrevTerm+1)<pQuery->nTerms &&
sl@0	3377	pQuery->pTerms[iPrevTerm+1].iPhrase>1
sl@0	3378	){
sl@0	3379	iPrevTerm++;
sl@0	3380	}
sl@0	3381
sl@0	3382	for(jj=0; isOk==0 && jj<pSnippet->nMatch; jj++){
sl@0	3383	struct snippetMatch *p = &pSnippet->aMatch[jj];
sl@0	3384	if( p->iCol==pMatch->iCol && ((
sl@0	3385	p->iTerm==iNextTerm &&
sl@0	3386	p->iToken>iEndToken &&
sl@0	3387	p->iToken<=iEndToken+nNear
sl@0	3388	) \|\| (
sl@0	3389	p->iTerm==iPrevTerm &&
sl@0	3390	p->iToken<iStartToken &&
sl@0	3391	p->iToken>=iStartToken-nNear
sl@0	3392	))){
sl@0	3393	isOk = 1;
sl@0	3394	}
sl@0	3395	}
sl@0	3396	if( !isOk ){
sl@0	3397	for(jj=1-pQueryTerm->iPhrase; jj<=0; jj++){
sl@0	3398	pMatch[jj].iTerm = -1;
sl@0	3399	}
sl@0	3400	ii = -1;
sl@0	3401	iDir = 1;
sl@0	3402	}
sl@0	3403	}
sl@0	3404	}
sl@0	3405	iDir -= 2;
sl@0	3406	}
sl@0	3407	}
sl@0	3408
sl@0	3409	/*
sl@0	3410	** Compute all offsets for the current row of the query.
sl@0	3411	** If the offsets have already been computed, this routine is a no-op.
sl@0	3412	*/
sl@0	3413	static void snippetAllOffsets(fulltext_cursor *p){
sl@0	3414	int nColumn;
sl@0	3415	int iColumn, i;
sl@0	3416	int iFirst, iLast;
sl@0	3417	fulltext_vtab *pFts;
sl@0	3418
sl@0	3419	if( p->snippet.nMatch ) return;
sl@0	3420	if( p->q.nTerms==0 ) return;
sl@0	3421	pFts = p->q.pFts;
sl@0	3422	nColumn = pFts->nColumn;
sl@0	3423	iColumn = (p->iCursorType - QUERY_FULLTEXT);
sl@0	3424	if( iColumn<0 \|\| iColumn>=nColumn ){
sl@0	3425	iFirst = 0;
sl@0	3426	iLast = nColumn-1;
sl@0	3427	}else{
sl@0	3428	iFirst = iColumn;
sl@0	3429	iLast = iColumn;
sl@0	3430	}
sl@0	3431	for(i=iFirst; i<=iLast; i++){
sl@0	3432	const char *zDoc;
sl@0	3433	int nDoc;
sl@0	3434	zDoc = (const char*)sqlite3_column_text(p->pStmt, i+1);
sl@0	3435	nDoc = sqlite3_column_bytes(p->pStmt, i+1);
sl@0	3436	snippetOffsetsOfColumn(&p->q, &p->snippet, i, zDoc, nDoc);
sl@0	3437	}
sl@0	3438
sl@0	3439	trimSnippetOffsetsForNear(&p->q, &p->snippet);
sl@0	3440	}
sl@0	3441
sl@0	3442	/*
sl@0	3443	** Convert the information in the aMatch[] array of the snippet
sl@0	3444	** into the string zOffset[0..nOffset-1].
sl@0	3445	*/
sl@0	3446	static void snippetOffsetText(Snippet *p){
sl@0	3447	int i;
sl@0	3448	int cnt = 0;
sl@0	3449	StringBuffer sb;
sl@0	3450	char zBuf[200];
sl@0	3451	if( p->zOffset ) return;
sl@0	3452	initStringBuffer(&sb);
sl@0	3453	for(i=0; i<p->nMatch; i++){
sl@0	3454	struct snippetMatch *pMatch = &p->aMatch[i];
sl@0	3455	if( pMatch->iTerm>=0 ){
sl@0	3456	/* If snippetMatch.iTerm is less than 0, then the match was
sl@0	3457	** discarded as part of processing the NEAR operator (see the
sl@0	3458	** trimSnippetOffsetsForNear() function for details). Ignore
sl@0	3459	** it in this case
sl@0	3460	*/
sl@0	3461	zBuf[0] = ' ';
sl@0	3462	sqlite3_snprintf(sizeof(zBuf)-1, &zBuf[cnt>0], "%d %d %d %d",
sl@0	3463	pMatch->iCol, pMatch->iTerm, pMatch->iStart, pMatch->nByte);
sl@0	3464	append(&sb, zBuf);
sl@0	3465	cnt++;
sl@0	3466	}
sl@0	3467	}
sl@0	3468	p->zOffset = stringBufferData(&sb);
sl@0	3469	p->nOffset = stringBufferLength(&sb);
sl@0	3470	}
sl@0	3471
sl@0	3472	/*
sl@0	3473	** zDoc[0..nDoc-1] is phrase of text. aMatch[0..nMatch-1] are a set
sl@0	3474	** of matching words some of which might be in zDoc. zDoc is column
sl@0	3475	** number iCol.
sl@0	3476	**
sl@0	3477	** iBreak is suggested spot in zDoc where we could begin or end an
sl@0	3478	** excerpt. Return a value similar to iBreak but possibly adjusted
sl@0	3479	** to be a little left or right so that the break point is better.
sl@0	3480	*/
sl@0	3481	static int wordBoundary(
sl@0	3482	int iBreak, /* The suggested break point */
sl@0	3483	const char zDoc, / Document text */
sl@0	3484	int nDoc, /* Number of bytes in zDoc[] */
sl@0	3485	struct snippetMatch aMatch, / Matching words */
sl@0	3486	int nMatch, /* Number of entries in aMatch[] */
sl@0	3487	int iCol /* The column number for zDoc[] */
sl@0	3488	){
sl@0	3489	int i;
sl@0	3490	if( iBreak<=10 ){
sl@0	3491	return 0;
sl@0	3492	}
sl@0	3493	if( iBreak>=nDoc-10 ){
sl@0	3494	return nDoc;
sl@0	3495	}
sl@0	3496	for(i=0; i<nMatch && aMatch[i].iCol<iCol; i++){}
sl@0	3497	while( i<nMatch && aMatch[i].iStart+aMatch[i].nByte<iBreak ){ i++; }
sl@0	3498	if( i<nMatch ){
sl@0	3499	if( aMatch[i].iStart<iBreak+10 ){
sl@0	3500	return aMatch[i].iStart;
sl@0	3501	}
sl@0	3502	if( i>0 && aMatch[i-1].iStart+aMatch[i-1].nByte>=iBreak ){
sl@0	3503	return aMatch[i-1].iStart;
sl@0	3504	}
sl@0	3505	}
sl@0	3506	for(i=1; i<=10; i++){
sl@0	3507	if( safe_isspace(zDoc[iBreak-i]) ){
sl@0	3508	return iBreak - i + 1;
sl@0	3509	}
sl@0	3510	if( safe_isspace(zDoc[iBreak+i]) ){
sl@0	3511	return iBreak + i + 1;
sl@0	3512	}
sl@0	3513	}
sl@0	3514	return iBreak;
sl@0	3515	}
sl@0	3516
sl@0	3517
sl@0	3518
sl@0	3519	/*
sl@0	3520	** Allowed values for Snippet.aMatch[].snStatus
sl@0	3521	*/
sl@0	3522	#define SNIPPET_IGNORE 0 /* It is ok to omit this match from the snippet */
sl@0	3523	#define SNIPPET_DESIRED 1 /* We want to include this match in the snippet */
sl@0	3524
sl@0	3525	/*
sl@0	3526	** Generate the text of a snippet.
sl@0	3527	*/
sl@0	3528	static void snippetText(
sl@0	3529	fulltext_cursor pCursor, / The cursor we need the snippet for */
sl@0	3530	const char zStartMark, / Markup to appear before each match */
sl@0	3531	const char zEndMark, / Markup to appear after each match */
sl@0	3532	const char zEllipsis / Ellipsis mark */
sl@0	3533	){
sl@0	3534	int i, j;
sl@0	3535	struct snippetMatch *aMatch;
sl@0	3536	int nMatch;
sl@0	3537	int nDesired;
sl@0	3538	StringBuffer sb;
sl@0	3539	int tailCol;
sl@0	3540	int tailOffset;
sl@0	3541	int iCol;
sl@0	3542	int nDoc;
sl@0	3543	const char *zDoc;
sl@0	3544	int iStart, iEnd;
sl@0	3545	int tailEllipsis = 0;
sl@0	3546	int iMatch;
sl@0	3547
sl@0	3548
sl@0	3549	sqlite3_free(pCursor->snippet.zSnippet);
sl@0	3550	pCursor->snippet.zSnippet = 0;
sl@0	3551	aMatch = pCursor->snippet.aMatch;
sl@0	3552	nMatch = pCursor->snippet.nMatch;
sl@0	3553	initStringBuffer(&sb);
sl@0	3554
sl@0	3555	for(i=0; i<nMatch; i++){
sl@0	3556	aMatch[i].snStatus = SNIPPET_IGNORE;
sl@0	3557	}
sl@0	3558	nDesired = 0;
sl@0	3559	for(i=0; i<pCursor->q.nTerms; i++){
sl@0	3560	for(j=0; j<nMatch; j++){
sl@0	3561	if( aMatch[j].iTerm==i ){
sl@0	3562	aMatch[j].snStatus = SNIPPET_DESIRED;
sl@0	3563	nDesired++;
sl@0	3564	break;
sl@0	3565	}
sl@0	3566	}
sl@0	3567	}
sl@0	3568
sl@0	3569	iMatch = 0;
sl@0	3570	tailCol = -1;
sl@0	3571	tailOffset = 0;
sl@0	3572	for(i=0; i<nMatch && nDesired>0; i++){
sl@0	3573	if( aMatch[i].snStatus!=SNIPPET_DESIRED ) continue;
sl@0	3574	nDesired--;
sl@0	3575	iCol = aMatch[i].iCol;
sl@0	3576	zDoc = (const char*)sqlite3_column_text(pCursor->pStmt, iCol+1);
sl@0	3577	nDoc = sqlite3_column_bytes(pCursor->pStmt, iCol+1);
sl@0	3578	iStart = aMatch[i].iStart - 40;
sl@0	3579	iStart = wordBoundary(iStart, zDoc, nDoc, aMatch, nMatch, iCol);
sl@0	3580	if( iStart<=10 ){
sl@0	3581	iStart = 0;
sl@0	3582	}
sl@0	3583	if( iCol==tailCol && iStart<=tailOffset+20 ){
sl@0	3584	iStart = tailOffset;
sl@0	3585	}
sl@0	3586	if( (iCol!=tailCol && tailCol>=0) \|\| iStart!=tailOffset ){
sl@0	3587	trimWhiteSpace(&sb);
sl@0	3588	appendWhiteSpace(&sb);
sl@0	3589	append(&sb, zEllipsis);
sl@0	3590	appendWhiteSpace(&sb);
sl@0	3591	}
sl@0	3592	iEnd = aMatch[i].iStart + aMatch[i].nByte + 40;
sl@0	3593	iEnd = wordBoundary(iEnd, zDoc, nDoc, aMatch, nMatch, iCol);
sl@0	3594	if( iEnd>=nDoc-10 ){
sl@0	3595	iEnd = nDoc;
sl@0	3596	tailEllipsis = 0;
sl@0	3597	}else{
sl@0	3598	tailEllipsis = 1;
sl@0	3599	}
sl@0	3600	while( iMatch<nMatch && aMatch[iMatch].iCol<iCol ){ iMatch++; }
sl@0	3601	while( iStart<iEnd ){
sl@0	3602	while( iMatch<nMatch && aMatch[iMatch].iStart<iStart
sl@0	3603	&& aMatch[iMatch].iCol<=iCol ){
sl@0	3604	iMatch++;
sl@0	3605	}
sl@0	3606	if( iMatch<nMatch && aMatch[iMatch].iStart<iEnd
sl@0	3607	&& aMatch[iMatch].iCol==iCol ){
sl@0	3608	nappend(&sb, &zDoc[iStart], aMatch[iMatch].iStart - iStart);
sl@0	3609	iStart = aMatch[iMatch].iStart;
sl@0	3610	append(&sb, zStartMark);
sl@0	3611	nappend(&sb, &zDoc[iStart], aMatch[iMatch].nByte);
sl@0	3612	append(&sb, zEndMark);
sl@0	3613	iStart += aMatch[iMatch].nByte;
sl@0	3614	for(j=iMatch+1; j<nMatch; j++){
sl@0	3615	if( aMatch[j].iTerm==aMatch[iMatch].iTerm
sl@0	3616	&& aMatch[j].snStatus==SNIPPET_DESIRED ){
sl@0	3617	nDesired--;
sl@0	3618	aMatch[j].snStatus = SNIPPET_IGNORE;
sl@0	3619	}
sl@0	3620	}
sl@0	3621	}else{
sl@0	3622	nappend(&sb, &zDoc[iStart], iEnd - iStart);
sl@0	3623	iStart = iEnd;
sl@0	3624	}
sl@0	3625	}
sl@0	3626	tailCol = iCol;
sl@0	3627	tailOffset = iEnd;
sl@0	3628	}
sl@0	3629	trimWhiteSpace(&sb);
sl@0	3630	if( tailEllipsis ){
sl@0	3631	appendWhiteSpace(&sb);
sl@0	3632	append(&sb, zEllipsis);
sl@0	3633	}
sl@0	3634	pCursor->snippet.zSnippet = stringBufferData(&sb);
sl@0	3635	pCursor->snippet.nSnippet = stringBufferLength(&sb);
sl@0	3636	}
sl@0	3637
sl@0	3638
sl@0	3639	/*
sl@0	3640	** Close the cursor. For additional information see the documentation
sl@0	3641	** on the xClose method of the virtual table interface.
sl@0	3642	*/
sl@0	3643	static int fulltextClose(sqlite3_vtab_cursor *pCursor){
sl@0	3644	fulltext_cursor c = (fulltext_cursor ) pCursor;
sl@0	3645	FTSTRACE(("FTS3 Close %p\n", c));
sl@0	3646	sqlite3_finalize(c->pStmt);
sl@0	3647	queryClear(&c->q);
sl@0	3648	snippetClear(&c->snippet);
sl@0	3649	if( c->result.nData!=0 ) dlrDestroy(&c->reader);
sl@0	3650	dataBufferDestroy(&c->result);
sl@0	3651	sqlite3_free(c);
sl@0	3652	return SQLITE_OK;
sl@0	3653	}
sl@0	3654
sl@0	3655	static int fulltextNext(sqlite3_vtab_cursor *pCursor){
sl@0	3656	fulltext_cursor c = (fulltext_cursor ) pCursor;
sl@0	3657	int rc;
sl@0	3658
sl@0	3659	FTSTRACE(("FTS3 Next %p\n", pCursor));
sl@0	3660	snippetClear(&c->snippet);
sl@0	3661	if( c->iCursorType < QUERY_FULLTEXT ){
sl@0	3662	/* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
sl@0	3663	rc = sqlite3_step(c->pStmt);
sl@0	3664	switch( rc ){
sl@0	3665	case SQLITE_ROW:
sl@0	3666	c->eof = 0;
sl@0	3667	return SQLITE_OK;
sl@0	3668	case SQLITE_DONE:
sl@0	3669	c->eof = 1;
sl@0	3670	return SQLITE_OK;
sl@0	3671	default:
sl@0	3672	c->eof = 1;
sl@0	3673	return rc;
sl@0	3674	}
sl@0	3675	} else { /* full-text query */
sl@0	3676	rc = sqlite3_reset(c->pStmt);
sl@0	3677	if( rc!=SQLITE_OK ) return rc;
sl@0	3678
sl@0	3679	if( c->result.nData==0 \|\| dlrAtEnd(&c->reader) ){
sl@0	3680	c->eof = 1;
sl@0	3681	return SQLITE_OK;
sl@0	3682	}
sl@0	3683	rc = sqlite3_bind_int64(c->pStmt, 1, dlrDocid(&c->reader));
sl@0	3684	dlrStep(&c->reader);
sl@0	3685	if( rc!=SQLITE_OK ) return rc;
sl@0	3686	/* TODO(shess) Handle SQLITE_SCHEMA AND SQLITE_BUSY. */
sl@0	3687	rc = sqlite3_step(c->pStmt);
sl@0	3688	if( rc==SQLITE_ROW ){ /* the case we expect */
sl@0	3689	c->eof = 0;
sl@0	3690	return SQLITE_OK;
sl@0	3691	}
sl@0	3692	/* an error occurred; abort */
sl@0	3693	return rc==SQLITE_DONE ? SQLITE_ERROR : rc;
sl@0	3694	}
sl@0	3695	}
sl@0	3696
sl@0	3697
sl@0	3698	/* TODO(shess) If we pushed LeafReader to the top of the file, or to
sl@0	3699	** another file, term_select() could be pushed above
sl@0	3700	** docListOfTerm().
sl@0	3701	*/
sl@0	3702	static int termSelect(fulltext_vtab *v, int iColumn,
sl@0	3703	const char *pTerm, int nTerm, int isPrefix,
sl@0	3704	DocListType iType, DataBuffer *out);
sl@0	3705
sl@0	3706	/* Return a DocList corresponding to the query term pTerm. If pTerm
sl@0	3707	** is the first term of a phrase query, go ahead and evaluate the phrase
sl@0	3708	** query and return the doclist for the entire phrase query.
sl@0	3709	**
sl@0	3710	** The resulting DL_DOCIDS doclist is stored in pResult, which is
sl@0	3711	** overwritten.
sl@0	3712	*/
sl@0	3713	static int docListOfTerm(
sl@0	3714	fulltext_vtab v, / The full text index */
sl@0	3715	int iColumn, /* column to restrict to. No restriction if >=nColumn */
sl@0	3716	QueryTerm pQTerm, / Term we are looking for, or 1st term of a phrase */
sl@0	3717	DataBuffer pResult / Write the result here */
sl@0	3718	){
sl@0	3719	DataBuffer left, right, new;
sl@0	3720	int i, rc;
sl@0	3721
sl@0	3722	/* No phrase search if no position info. */
sl@0	3723	assert( pQTerm->nPhrase==0 \|\| DL_DEFAULT!=DL_DOCIDS );
sl@0	3724
sl@0	3725	/* This code should never be called with buffered updates. */
sl@0	3726	assert( v->nPendingData<0 );
sl@0	3727
sl@0	3728	dataBufferInit(&left, 0);
sl@0	3729	rc = termSelect(v, iColumn, pQTerm->pTerm, pQTerm->nTerm, pQTerm->isPrefix,
sl@0	3730	(0<pQTerm->nPhrase ? DL_POSITIONS : DL_DOCIDS), &left);
sl@0	3731	if( rc ) return rc;
sl@0	3732	for(i=1; i<=pQTerm->nPhrase && left.nData>0; i++){
sl@0	3733	/* If this token is connected to the next by a NEAR operator, and
sl@0	3734	** the next token is the start of a phrase, then set nPhraseRight
sl@0	3735	** to the number of tokens in the phrase. Otherwise leave it at 1.
sl@0	3736	*/
sl@0	3737	int nPhraseRight = 1;
sl@0	3738	while( (i+nPhraseRight)<=pQTerm->nPhrase
sl@0	3739	&& pQTerm[i+nPhraseRight].nNear==0
sl@0	3740	){
sl@0	3741	nPhraseRight++;
sl@0	3742	}
sl@0	3743
sl@0	3744	dataBufferInit(&right, 0);
sl@0	3745	rc = termSelect(v, iColumn, pQTerm[i].pTerm, pQTerm[i].nTerm,
sl@0	3746	pQTerm[i].isPrefix, DL_POSITIONS, &right);
sl@0	3747	if( rc ){
sl@0	3748	dataBufferDestroy(&left);
sl@0	3749	return rc;
sl@0	3750	}
sl@0	3751	dataBufferInit(&new, 0);
sl@0	3752	docListPhraseMerge(left.pData, left.nData, right.pData, right.nData,
sl@0	3753	pQTerm[i-1].nNear, pQTerm[i-1].iPhrase + nPhraseRight,
sl@0	3754	((i<pQTerm->nPhrase) ? DL_POSITIONS : DL_DOCIDS),
sl@0	3755	&new);
sl@0	3756	dataBufferDestroy(&left);
sl@0	3757	dataBufferDestroy(&right);
sl@0	3758	left = new;
sl@0	3759	}
sl@0	3760	*pResult = left;
sl@0	3761	return SQLITE_OK;
sl@0	3762	}
sl@0	3763
sl@0	3764	/* Add a new term pTerm[0..nTerm-1] to the query *q.
sl@0	3765	*/
sl@0	3766	static void queryAdd(Query q, const char pTerm, int nTerm){
sl@0	3767	QueryTerm *t;
sl@0	3768	++q->nTerms;
sl@0	3769	q->pTerms = sqlite3_realloc(q->pTerms, q->nTerms * sizeof(q->pTerms[0]));
sl@0	3770	if( q->pTerms==0 ){
sl@0	3771	q->nTerms = 0;
sl@0	3772	return;
sl@0	3773	}
sl@0	3774	t = &q->pTerms[q->nTerms - 1];
sl@0	3775	CLEAR(t);
sl@0	3776	t->pTerm = sqlite3_malloc(nTerm+1);
sl@0	3777	memcpy(t->pTerm, pTerm, nTerm);
sl@0	3778	t->pTerm[nTerm] = 0;
sl@0	3779	t->nTerm = nTerm;
sl@0	3780	t->isOr = q->nextIsOr;
sl@0	3781	t->isPrefix = 0;
sl@0	3782	q->nextIsOr = 0;
sl@0	3783	t->iColumn = q->nextColumn;
sl@0	3784	q->nextColumn = q->dfltColumn;
sl@0	3785	}
sl@0	3786
sl@0	3787	/*
sl@0	3788	** Check to see if the string zToken[0...nToken-1] matches any
sl@0	3789	** column name in the virtual table. If it does,
sl@0	3790	** return the zero-indexed column number. If not, return -1.
sl@0	3791	*/
sl@0	3792	static int checkColumnSpecifier(
sl@0	3793	fulltext_vtab pVtab, / The virtual table */
sl@0	3794	const char zToken, / Text of the token */
sl@0	3795	int nToken /* Number of characters in the token */
sl@0	3796	){
sl@0	3797	int i;
sl@0	3798	for(i=0; i<pVtab->nColumn; i++){
sl@0	3799	if( memcmp(pVtab->azColumn[i], zToken, nToken)==0
sl@0	3800	&& pVtab->azColumn[i][nToken]==0 ){
sl@0	3801	return i;
sl@0	3802	}
sl@0	3803	}
sl@0	3804	return -1;
sl@0	3805	}
sl@0	3806
sl@0	3807	/*
sl@0	3808	** Parse the text at zSegment[0..nSegment-1]. Add additional terms
sl@0	3809	** to the query being assemblied in pQuery.
sl@0	3810	**
sl@0	3811	** inPhrase is true if zSegment[0..nSegement-1] is contained within
sl@0	3812	** double-quotes. If inPhrase is true, then the first term
sl@0	3813	** is marked with the number of terms in the phrase less one and
sl@0	3814	** OR and "-" syntax is ignored. If inPhrase is false, then every
sl@0	3815	** term found is marked with nPhrase=0 and OR and "-" syntax is significant.
sl@0	3816	*/
sl@0	3817	static int tokenizeSegment(
sl@0	3818	sqlite3_tokenizer pTokenizer, / The tokenizer to use */
sl@0	3819	const char zSegment, int nSegment, / Query expression being parsed */
sl@0	3820	int inPhrase, /* True if within "..." */
sl@0	3821	Query pQuery / Append results here */
sl@0	3822	){
sl@0	3823	const sqlite3_tokenizer_module *pModule = pTokenizer->pModule;
sl@0	3824	sqlite3_tokenizer_cursor *pCursor;
sl@0	3825	int firstIndex = pQuery->nTerms;
sl@0	3826	int iCol;
sl@0	3827	int nTerm = 1;
sl@0	3828
sl@0	3829	int rc = pModule->xOpen(pTokenizer, zSegment, nSegment, &pCursor);
sl@0	3830	if( rc!=SQLITE_OK ) return rc;
sl@0	3831	pCursor->pTokenizer = pTokenizer;
sl@0	3832
sl@0	3833	while( 1 ){
sl@0	3834	const char *zToken;
sl@0	3835	int nToken, iBegin, iEnd, iPos;
sl@0	3836
sl@0	3837	rc = pModule->xNext(pCursor,
sl@0	3838	&zToken, &nToken,
sl@0	3839	&iBegin, &iEnd, &iPos);
sl@0	3840	if( rc!=SQLITE_OK ) break;
sl@0	3841	if( !inPhrase &&
sl@0	3842	zSegment[iEnd]==':' &&
sl@0	3843	(iCol = checkColumnSpecifier(pQuery->pFts, zToken, nToken))>=0 ){
sl@0	3844	pQuery->nextColumn = iCol;
sl@0	3845	continue;
sl@0	3846	}
sl@0	3847	if( !inPhrase && pQuery->nTerms>0 && nToken==2
sl@0	3848	&& zSegment[iBegin+0]=='O'
sl@0	3849	&& zSegment[iBegin+1]=='R'
sl@0	3850	){
sl@0	3851	pQuery->nextIsOr = 1;
sl@0	3852	continue;
sl@0	3853	}
sl@0	3854	if( !inPhrase && pQuery->nTerms>0 && !pQuery->nextIsOr && nToken==4
sl@0	3855	&& memcmp(&zSegment[iBegin], "NEAR", 4)==0
sl@0	3856	){
sl@0	3857	QueryTerm *pTerm = &pQuery->pTerms[pQuery->nTerms-1];
sl@0	3858	if( (iBegin+6)<nSegment
sl@0	3859	&& zSegment[iBegin+4] == '/'
sl@0	3860	&& isdigit(zSegment[iBegin+5])
sl@0	3861	){
sl@0	3862	int k;
sl@0	3863	pTerm->nNear = 0;
sl@0	3864	for(k=5; (iBegin+k)<=nSegment && isdigit(zSegment[iBegin+k]); k++){
sl@0	3865	pTerm->nNear = pTerm->nNear*10 + (zSegment[iBegin+k] - '0');
sl@0	3866	}
sl@0	3867	pModule->xNext(pCursor, &zToken, &nToken, &iBegin, &iEnd, &iPos);
sl@0	3868	} else {
sl@0	3869	pTerm->nNear = SQLITE_FTS3_DEFAULT_NEAR_PARAM;
sl@0	3870	}
sl@0	3871	pTerm->nNear++;
sl@0	3872	continue;
sl@0	3873	}
sl@0	3874
sl@0	3875	queryAdd(pQuery, zToken, nToken);
sl@0	3876	if( !inPhrase && iBegin>0 && zSegment[iBegin-1]=='-' ){
sl@0	3877	pQuery->pTerms[pQuery->nTerms-1].isNot = 1;
sl@0	3878	}
sl@0	3879	if( iEnd<nSegment && zSegment[iEnd]=='*' ){
sl@0	3880	pQuery->pTerms[pQuery->nTerms-1].isPrefix = 1;
sl@0	3881	}
sl@0	3882	pQuery->pTerms[pQuery->nTerms-1].iPhrase = nTerm;
sl@0	3883	if( inPhrase ){
sl@0	3884	nTerm++;
sl@0	3885	}
sl@0	3886	}
sl@0	3887
sl@0	3888	if( inPhrase && pQuery->nTerms>firstIndex ){
sl@0	3889	pQuery->pTerms[firstIndex].nPhrase = pQuery->nTerms - firstIndex - 1;
sl@0	3890	}
sl@0	3891
sl@0	3892	return pModule->xClose(pCursor);
sl@0	3893	}
sl@0	3894
sl@0	3895	/* Parse a query string, yielding a Query object pQuery.
sl@0	3896	**
sl@0	3897	** The calling function will need to queryClear() to clean up
sl@0	3898	** the dynamically allocated memory held by pQuery.
sl@0	3899	*/
sl@0	3900	static int parseQuery(
sl@0	3901	fulltext_vtab v, / The fulltext index */
sl@0	3902	const char zInput, / Input text of the query string */
sl@0	3903	int nInput, /* Size of the input text */
sl@0	3904	int dfltColumn, /* Default column of the index to match against */
sl@0	3905	Query pQuery / Write the parse results here. */
sl@0	3906	){
sl@0	3907	int iInput, inPhrase = 0;
sl@0	3908	int ii;
sl@0	3909	QueryTerm *aTerm;
sl@0	3910
sl@0	3911	if( zInput==0 ) nInput = 0;
sl@0	3912	if( nInput<0 ) nInput = strlen(zInput);
sl@0	3913	pQuery->nTerms = 0;
sl@0	3914	pQuery->pTerms = NULL;
sl@0	3915	pQuery->nextIsOr = 0;
sl@0	3916	pQuery->nextColumn = dfltColumn;
sl@0	3917	pQuery->dfltColumn = dfltColumn;
sl@0	3918	pQuery->pFts = v;
sl@0	3919
sl@0	3920	for(iInput=0; iInput<nInput; ++iInput){
sl@0	3921	int i;
sl@0	3922	for(i=iInput; i<nInput && zInput[i]!='"'; ++i){}
sl@0	3923	if( i>iInput ){
sl@0	3924	tokenizeSegment(v->pTokenizer, zInput+iInput, i-iInput, inPhrase,
sl@0	3925	pQuery);
sl@0	3926	}
sl@0	3927	iInput = i;
sl@0	3928	if( i<nInput ){
sl@0	3929	assert( zInput[i]=='"' );
sl@0	3930	inPhrase = !inPhrase;
sl@0	3931	}
sl@0	3932	}
sl@0	3933
sl@0	3934	if( inPhrase ){
sl@0	3935	/* unmatched quote */
sl@0	3936	queryClear(pQuery);
sl@0	3937	return SQLITE_ERROR;
sl@0	3938	}
sl@0	3939
sl@0	3940	/* Modify the values of the QueryTerm.nPhrase variables to account for
sl@0	3941	** the NEAR operator. For the purposes of QueryTerm.nPhrase, phrases
sl@0	3942	** and tokens connected by the NEAR operator are handled as a single
sl@0	3943	** phrase. See comments above the QueryTerm structure for details.
sl@0	3944	*/
sl@0	3945	aTerm = pQuery->pTerms;
sl@0	3946	for(ii=0; ii<pQuery->nTerms; ii++){
sl@0	3947	if( aTerm[ii].nNear \|\| aTerm[ii].nPhrase ){
sl@0	3948	while (aTerm[ii+aTerm[ii].nPhrase].nNear) {
sl@0	3949	aTerm[ii].nPhrase += (1 + aTerm[ii+aTerm[ii].nPhrase+1].nPhrase);
sl@0	3950	}
sl@0	3951	}
sl@0	3952	}
sl@0	3953
sl@0	3954	return SQLITE_OK;
sl@0	3955	}
sl@0	3956
sl@0	3957	/* TODO(shess) Refactor the code to remove this forward decl. */
sl@0	3958	static int flushPendingTerms(fulltext_vtab *v);
sl@0	3959
sl@0	3960	/* Perform a full-text query using the search expression in
sl@0	3961	** zInput[0..nInput-1]. Return a list of matching documents
sl@0	3962	** in pResult.
sl@0	3963	**
sl@0	3964	** Queries must match column iColumn. Or if iColumn>=nColumn
sl@0	3965	** they are allowed to match against any column.
sl@0	3966	*/
sl@0	3967	static int fulltextQuery(
sl@0	3968	fulltext_vtab v, / The full text index */
sl@0	3969	int iColumn, /* Match against this column by default */
sl@0	3970	const char zInput, / The query string */
sl@0	3971	int nInput, /* Number of bytes in zInput[] */
sl@0	3972	DataBuffer pResult, / Write the result doclist here */
sl@0	3973	Query pQuery / Put parsed query string here */
sl@0	3974	){
sl@0	3975	int i, iNext, rc;
sl@0	3976	DataBuffer left, right, or, new;
sl@0	3977	int nNot = 0;
sl@0	3978	QueryTerm *aTerm;
sl@0	3979
sl@0	3980	/* TODO(shess) Instead of flushing pendingTerms, we could query for
sl@0	3981	** the relevant term and merge the doclist into what we receive from
sl@0	3982	** the database. Wait and see if this is a common issue, first.
sl@0	3983	**
sl@0	3984	** A good reason not to flush is to not generate update-related
sl@0	3985	** error codes from here.
sl@0	3986	*/
sl@0	3987
sl@0	3988	/* Flush any buffered updates before executing the query. */
sl@0	3989	rc = flushPendingTerms(v);
sl@0	3990	if( rc!=SQLITE_OK ) return rc;
sl@0	3991
sl@0	3992	/* TODO(shess) I think that the queryClear() calls below are not
sl@0	3993	** necessary, because fulltextClose() already clears the query.
sl@0	3994	*/
sl@0	3995	rc = parseQuery(v, zInput, nInput, iColumn, pQuery);
sl@0	3996	if( rc!=SQLITE_OK ) return rc;
sl@0	3997
sl@0	3998	/* Empty or NULL queries return no results. */
sl@0	3999	if( pQuery->nTerms==0 ){
sl@0	4000	dataBufferInit(pResult, 0);
sl@0	4001	return SQLITE_OK;
sl@0	4002	}
sl@0	4003
sl@0	4004	/* Merge AND terms. */
sl@0	4005	/* TODO(shess) I think we can early-exit if( i>nNot && left.nData==0 ). */
sl@0	4006	aTerm = pQuery->pTerms;
sl@0	4007	for(i = 0; i<pQuery->nTerms; i=iNext){
sl@0	4008	if( aTerm[i].isNot ){
sl@0	4009	/* Handle all NOT terms in a separate pass */
sl@0	4010	nNot++;
sl@0	4011	iNext = i + aTerm[i].nPhrase+1;
sl@0	4012	continue;
sl@0	4013	}
sl@0	4014	iNext = i + aTerm[i].nPhrase + 1;
sl@0	4015	rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
sl@0	4016	if( rc ){
sl@0	4017	if( i!=nNot ) dataBufferDestroy(&left);
sl@0	4018	queryClear(pQuery);
sl@0	4019	return rc;
sl@0	4020	}
sl@0	4021	while( iNext<pQuery->nTerms && aTerm[iNext].isOr ){
sl@0	4022	rc = docListOfTerm(v, aTerm[iNext].iColumn, &aTerm[iNext], &or);
sl@0	4023	iNext += aTerm[iNext].nPhrase + 1;
sl@0	4024	if( rc ){
sl@0	4025	if( i!=nNot ) dataBufferDestroy(&left);
sl@0	4026	dataBufferDestroy(&right);
sl@0	4027	queryClear(pQuery);
sl@0	4028	return rc;
sl@0	4029	}
sl@0	4030	dataBufferInit(&new, 0);
sl@0	4031	docListOrMerge(right.pData, right.nData, or.pData, or.nData, &new);
sl@0	4032	dataBufferDestroy(&right);
sl@0	4033	dataBufferDestroy(&or);
sl@0	4034	right = new;
sl@0	4035	}
sl@0	4036	if( i==nNot ){ /* first term processed. */
sl@0	4037	left = right;
sl@0	4038	}else{
sl@0	4039	dataBufferInit(&new, 0);
sl@0	4040	docListAndMerge(left.pData, left.nData, right.pData, right.nData, &new);
sl@0	4041	dataBufferDestroy(&right);
sl@0	4042	dataBufferDestroy(&left);
sl@0	4043	left = new;
sl@0	4044	}
sl@0	4045	}
sl@0	4046
sl@0	4047	if( nNot==pQuery->nTerms ){
sl@0	4048	/* We do not yet know how to handle a query of only NOT terms */
sl@0	4049	return SQLITE_ERROR;
sl@0	4050	}
sl@0	4051
sl@0	4052	/* Do the EXCEPT terms */
sl@0	4053	for(i=0; i<pQuery->nTerms; i += aTerm[i].nPhrase + 1){
sl@0	4054	if( !aTerm[i].isNot ) continue;
sl@0	4055	rc = docListOfTerm(v, aTerm[i].iColumn, &aTerm[i], &right);
sl@0	4056	if( rc ){
sl@0	4057	queryClear(pQuery);
sl@0	4058	dataBufferDestroy(&left);
sl@0	4059	return rc;
sl@0	4060	}
sl@0	4061	dataBufferInit(&new, 0);
sl@0	4062	docListExceptMerge(left.pData, left.nData, right.pData, right.nData, &new);
sl@0	4063	dataBufferDestroy(&right);
sl@0	4064	dataBufferDestroy(&left);
sl@0	4065	left = new;
sl@0	4066	}
sl@0	4067
sl@0	4068	*pResult = left;
sl@0	4069	return rc;
sl@0	4070	}
sl@0	4071
sl@0	4072	/*
sl@0	4073	** This is the xFilter interface for the virtual table. See
sl@0	4074	** the virtual table xFilter method documentation for additional
sl@0	4075	** information.
sl@0	4076	**
sl@0	4077	** If idxNum==QUERY_GENERIC then do a full table scan against
sl@0	4078	** the %_content table.
sl@0	4079	**
sl@0	4080	** If idxNum==QUERY_DOCID then do a docid lookup for a single entry
sl@0	4081	** in the %_content table.
sl@0	4082	**
sl@0	4083	** If idxNum>=QUERY_FULLTEXT then use the full text index. The
sl@0	4084	** column on the left-hand side of the MATCH operator is column
sl@0	4085	** number idxNum-QUERY_FULLTEXT, 0 indexed. argv[0] is the right-hand
sl@0	4086	** side of the MATCH operator.
sl@0	4087	*/
sl@0	4088	/* TODO(shess) Upgrade the cursor initialization and destruction to
sl@0	4089	** account for fulltextFilter() being called multiple times on the
sl@0	4090	** same cursor. The current solution is very fragile. Apply fix to
sl@0	4091	** fts3 as appropriate.
sl@0	4092	*/
sl@0	4093	static int fulltextFilter(
sl@0	4094	sqlite3_vtab_cursor pCursor, / The cursor used for this query */
sl@0	4095	int idxNum, const char idxStr, / Which indexing scheme to use */
sl@0	4096	int argc, sqlite3_value *argv / Arguments for the indexing scheme */
sl@0	4097	){
sl@0	4098	fulltext_cursor c = (fulltext_cursor ) pCursor;
sl@0	4099	fulltext_vtab *v = cursor_vtab(c);
sl@0	4100	int rc;
sl@0	4101
sl@0	4102	FTSTRACE(("FTS3 Filter %p\n",pCursor));
sl@0	4103
sl@0	4104	/* If the cursor has a statement that was not prepared according to
sl@0	4105	** idxNum, clear it. I believe all calls to fulltextFilter with a
sl@0	4106	** given cursor will have the same idxNum , but in this case it's
sl@0	4107	** easy to be safe.
sl@0	4108	*/
sl@0	4109	if( c->pStmt && c->iCursorType!=idxNum ){
sl@0	4110	sqlite3_finalize(c->pStmt);
sl@0	4111	c->pStmt = NULL;
sl@0	4112	}
sl@0	4113
sl@0	4114	/* Get a fresh statement appropriate to idxNum. */
sl@0	4115	/* TODO(shess): Add a prepared-statement cache in the vt structure.
sl@0	4116	** The cache must handle multiple open cursors. Easier to cache the
sl@0	4117	** statement variants at the vt to reduce malloc/realloc/free here.
sl@0	4118	** Or we could have a StringBuffer variant which allowed stack
sl@0	4119	** construction for small values.
sl@0	4120	*/
sl@0	4121	if( !c->pStmt ){
sl@0	4122	StringBuffer sb;
sl@0	4123	initStringBuffer(&sb);
sl@0	4124	append(&sb, "SELECT docid, ");
sl@0	4125	appendList(&sb, v->nColumn, v->azContentColumn);
sl@0	4126	append(&sb, " FROM %_content");
sl@0	4127	if( idxNum!=QUERY_GENERIC ) append(&sb, " WHERE docid = ?");
sl@0	4128	rc = sql_prepare(v->db, v->zDb, v->zName, &c->pStmt,
sl@0	4129	stringBufferData(&sb));
sl@0	4130	stringBufferDestroy(&sb);
sl@0	4131	if( rc!=SQLITE_OK ) return rc;
sl@0	4132	c->iCursorType = idxNum;
sl@0	4133	}else{
sl@0	4134	sqlite3_reset(c->pStmt);
sl@0	4135	assert( c->iCursorType==idxNum );
sl@0	4136	}
sl@0	4137
sl@0	4138	switch( idxNum ){
sl@0	4139	case QUERY_GENERIC:
sl@0	4140	break;
sl@0	4141
sl@0	4142	case QUERY_DOCID:
sl@0	4143	rc = sqlite3_bind_int64(c->pStmt, 1, sqlite3_value_int64(argv[0]));
sl@0	4144	if( rc!=SQLITE_OK ) return rc;
sl@0	4145	break;
sl@0	4146
sl@0	4147	default: /* full-text search */
sl@0	4148	{
sl@0	4149	const char zQuery = (const char )sqlite3_value_text(argv[0]);
sl@0	4150	assert( idxNum<=QUERY_FULLTEXT+v->nColumn);
sl@0	4151	assert( argc==1 );
sl@0	4152	queryClear(&c->q);
sl@0	4153	if( c->result.nData!=0 ){
sl@0	4154	/* This case happens if the same cursor is used repeatedly. */
sl@0	4155	dlrDestroy(&c->reader);
sl@0	4156	dataBufferReset(&c->result);
sl@0	4157	}else{
sl@0	4158	dataBufferInit(&c->result, 0);
sl@0	4159	}
sl@0	4160	rc = fulltextQuery(v, idxNum-QUERY_FULLTEXT, zQuery, -1, &c->result, &c->q);
sl@0	4161	if( rc!=SQLITE_OK ) return rc;
sl@0	4162	if( c->result.nData!=0 ){
sl@0	4163	dlrInit(&c->reader, DL_DOCIDS, c->result.pData, c->result.nData);
sl@0	4164	}
sl@0	4165	break;
sl@0	4166	}
sl@0	4167	}
sl@0	4168
sl@0	4169	return fulltextNext(pCursor);
sl@0	4170	}
sl@0	4171
sl@0	4172	/* This is the xEof method of the virtual table. The SQLite core
sl@0	4173	** calls this routine to find out if it has reached the end of
sl@0	4174	** a query's results set.
sl@0	4175	*/
sl@0	4176	static int fulltextEof(sqlite3_vtab_cursor *pCursor){
sl@0	4177	fulltext_cursor c = (fulltext_cursor ) pCursor;
sl@0	4178	return c->eof;
sl@0	4179	}
sl@0	4180
sl@0	4181	/* This is the xColumn method of the virtual table. The SQLite
sl@0	4182	** core calls this method during a query when it needs the value
sl@0	4183	** of a column from the virtual table. This method needs to use
sl@0	4184	** one of the sqlite3_result_*() routines to store the requested
sl@0	4185	** value back in the pContext.
sl@0	4186	*/
sl@0	4187	static int fulltextColumn(sqlite3_vtab_cursor *pCursor,
sl@0	4188	sqlite3_context *pContext, int idxCol){
sl@0	4189	fulltext_cursor c = (fulltext_cursor ) pCursor;
sl@0	4190	fulltext_vtab *v = cursor_vtab(c);
sl@0	4191
sl@0	4192	if( idxCol<v->nColumn ){
sl@0	4193	sqlite3_value *pVal = sqlite3_column_value(c->pStmt, idxCol+1);
sl@0	4194	sqlite3_result_value(pContext, pVal);
sl@0	4195	}else if( idxCol==v->nColumn ){
sl@0	4196	/* The extra column whose name is the same as the table.
sl@0	4197	** Return a blob which is a pointer to the cursor
sl@0	4198	*/
sl@0	4199	sqlite3_result_blob(pContext, &c, sizeof(c), SQLITE_TRANSIENT);
sl@0	4200	}else if( idxCol==v->nColumn+1 ){
sl@0	4201	/* The docid column, which is an alias for rowid. */
sl@0	4202	sqlite3_value *pVal = sqlite3_column_value(c->pStmt, 0);
sl@0	4203	sqlite3_result_value(pContext, pVal);
sl@0	4204	}
sl@0	4205	return SQLITE_OK;
sl@0	4206	}
sl@0	4207
sl@0	4208	/* This is the xRowid method. The SQLite core calls this routine to
sl@0	4209	** retrieve the rowid for the current row of the result set. fts3
sl@0	4210	** exposes %_content.docid as the rowid for the virtual table. The
sl@0	4211	** rowid should be written to *pRowid.
sl@0	4212	*/
sl@0	4213	static int fulltextRowid(sqlite3_vtab_cursor pCursor, sqlite_int64 pRowid){
sl@0	4214	fulltext_cursor c = (fulltext_cursor ) pCursor;
sl@0	4215
sl@0	4216	*pRowid = sqlite3_column_int64(c->pStmt, 0);
sl@0	4217	return SQLITE_OK;
sl@0	4218	}
sl@0	4219
sl@0	4220	/* Add all terms in [zText] to pendingTerms table. If [iColumn] > 0,
sl@0	4221	** we also store positions and offsets in the hash table using that
sl@0	4222	** column number.
sl@0	4223	*/
sl@0	4224	static int buildTerms(fulltext_vtab *v, sqlite_int64 iDocid,
sl@0	4225	const char *zText, int iColumn){
sl@0	4226	sqlite3_tokenizer *pTokenizer = v->pTokenizer;
sl@0	4227	sqlite3_tokenizer_cursor *pCursor;
sl@0	4228	const char *pToken;
sl@0	4229	int nTokenBytes;
sl@0	4230	int iStartOffset, iEndOffset, iPosition;
sl@0	4231	int rc;
sl@0	4232
sl@0	4233	rc = pTokenizer->pModule->xOpen(pTokenizer, zText, -1, &pCursor);
sl@0	4234	if( rc!=SQLITE_OK ) return rc;
sl@0	4235
sl@0	4236	pCursor->pTokenizer = pTokenizer;
sl@0	4237	while( SQLITE_OK==(rc=pTokenizer->pModule->xNext(pCursor,
sl@0	4238	&pToken, &nTokenBytes,
sl@0	4239	&iStartOffset, &iEndOffset,
sl@0	4240	&iPosition)) ){
sl@0	4241	DLCollector *p;
sl@0	4242	int nData; /* Size of doclist before our update. */
sl@0	4243
sl@0	4244	/* Positions can't be negative; we use -1 as a terminator
sl@0	4245	* internally. Token can't be NULL or empty. */
sl@0	4246	if( iPosition<0 \|\| pToken == NULL \|\| nTokenBytes == 0 ){
sl@0	4247	rc = SQLITE_ERROR;
sl@0	4248	break;
sl@0	4249	}
sl@0	4250
sl@0	4251	p = fts3HashFind(&v->pendingTerms, pToken, nTokenBytes);
sl@0	4252	if( p==NULL ){
sl@0	4253	nData = 0;
sl@0	4254	p = dlcNew(iDocid, DL_DEFAULT);
sl@0	4255	fts3HashInsert(&v->pendingTerms, pToken, nTokenBytes, p);
sl@0	4256
sl@0	4257	/* Overhead for our hash table entry, the key, and the value. */
sl@0	4258	v->nPendingData += sizeof(struct fts3HashElem)+sizeof(*p)+nTokenBytes;
sl@0	4259	}else{
sl@0	4260	nData = p->b.nData;
sl@0	4261	if( p->dlw.iPrevDocid!=iDocid ) dlcNext(p, iDocid);
sl@0	4262	}
sl@0	4263	if( iColumn>=0 ){
sl@0	4264	dlcAddPos(p, iColumn, iPosition, iStartOffset, iEndOffset);
sl@0	4265	}
sl@0	4266
sl@0	4267	/* Accumulate data added by dlcNew or dlcNext, and dlcAddPos. */
sl@0	4268	v->nPendingData += p->b.nData-nData;
sl@0	4269	}
sl@0	4270
sl@0	4271	/* TODO(shess) Check return? Should this be able to cause errors at
sl@0	4272	** this point? Actually, same question about sqlite3_finalize(),
sl@0	4273	** though one could argue that failure there means that the data is
sl@0	4274	** not durable. ponder
sl@0	4275	*/
sl@0	4276	pTokenizer->pModule->xClose(pCursor);
sl@0	4277	if( SQLITE_DONE == rc ) return SQLITE_OK;
sl@0	4278	return rc;
sl@0	4279	}
sl@0	4280
sl@0	4281	/* Add doclists for all terms in [pValues] to pendingTerms table. */
sl@0	4282	static int insertTerms(fulltext_vtab *v, sqlite_int64 iDocid,
sl@0	4283	sqlite3_value **pValues){
sl@0	4284	int i;
sl@0	4285	for(i = 0; i < v->nColumn ; ++i){
sl@0	4286	char zText = (char)sqlite3_value_text(pValues[i]);
sl@0	4287	int rc = buildTerms(v, iDocid, zText, i);
sl@0	4288	if( rc!=SQLITE_OK ) return rc;
sl@0	4289	}
sl@0	4290	return SQLITE_OK;
sl@0	4291	}
sl@0	4292
sl@0	4293	/* Add empty doclists for all terms in the given row's content to
sl@0	4294	** pendingTerms.
sl@0	4295	*/
sl@0	4296	static int deleteTerms(fulltext_vtab *v, sqlite_int64 iDocid){
sl@0	4297	const char **pValues;
sl@0	4298	int i, rc;
sl@0	4299
sl@0	4300	/* TODO(shess) Should we allow such tables at all? */
sl@0	4301	if( DL_DEFAULT==DL_DOCIDS ) return SQLITE_ERROR;
sl@0	4302
sl@0	4303	rc = content_select(v, iDocid, &pValues);
sl@0	4304	if( rc!=SQLITE_OK ) return rc;
sl@0	4305
sl@0	4306	for(i = 0 ; i < v->nColumn; ++i) {
sl@0	4307	rc = buildTerms(v, iDocid, pValues[i], -1);
sl@0	4308	if( rc!=SQLITE_OK ) break;
sl@0	4309	}
sl@0	4310
sl@0	4311	freeStringArray(v->nColumn, pValues);
sl@0	4312	return SQLITE_OK;
sl@0	4313	}
sl@0	4314
sl@0	4315	/* TODO(shess) Refactor the code to remove this forward decl. */
sl@0	4316	static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid);
sl@0	4317
sl@0	4318	/* Insert a row into the %_content table; set *piDocid to be the ID of the
sl@0	4319	** new row. Add doclists for terms to pendingTerms.
sl@0	4320	*/
sl@0	4321	static int index_insert(fulltext_vtab v, sqlite3_value pRequestDocid,
sl@0	4322	sqlite3_value *pValues, sqlite_int64 piDocid){
sl@0	4323	int rc;
sl@0	4324
sl@0	4325	rc = content_insert(v, pRequestDocid, pValues); /* execute an SQL INSERT */
sl@0	4326	if( rc!=SQLITE_OK ) return rc;
sl@0	4327
sl@0	4328	/* docid column is an alias for rowid. */
sl@0	4329	*piDocid = sqlite3_last_insert_rowid(v->db);
sl@0	4330	rc = initPendingTerms(v, *piDocid);
sl@0	4331	if( rc!=SQLITE_OK ) return rc;
sl@0	4332
sl@0	4333	return insertTerms(v, *piDocid, pValues);
sl@0	4334	}
sl@0	4335
sl@0	4336	/* Delete a row from the %_content table; add empty doclists for terms
sl@0	4337	** to pendingTerms.
sl@0	4338	*/
sl@0	4339	static int index_delete(fulltext_vtab *v, sqlite_int64 iRow){
sl@0	4340	int rc = initPendingTerms(v, iRow);
sl@0	4341	if( rc!=SQLITE_OK ) return rc;
sl@0	4342
sl@0	4343	rc = deleteTerms(v, iRow);
sl@0	4344	if( rc!=SQLITE_OK ) return rc;
sl@0	4345
sl@0	4346	return content_delete(v, iRow); /* execute an SQL DELETE */
sl@0	4347	}
sl@0	4348
sl@0	4349	/* Update a row in the %_content table; add delete doclists to
sl@0	4350	** pendingTerms for old terms not in the new data, add insert doclists
sl@0	4351	** to pendingTerms for terms in the new data.
sl@0	4352	*/
sl@0	4353	static int index_update(fulltext_vtab *v, sqlite_int64 iRow,
sl@0	4354	sqlite3_value **pValues){
sl@0	4355	int rc = initPendingTerms(v, iRow);
sl@0	4356	if( rc!=SQLITE_OK ) return rc;
sl@0	4357
sl@0	4358	/* Generate an empty doclist for each term that previously appeared in this
sl@0	4359	* row. */
sl@0	4360	rc = deleteTerms(v, iRow);
sl@0	4361	if( rc!=SQLITE_OK ) return rc;
sl@0	4362
sl@0	4363	rc = content_update(v, pValues, iRow); /* execute an SQL UPDATE */
sl@0	4364	if( rc!=SQLITE_OK ) return rc;
sl@0	4365
sl@0	4366	/* Now add positions for terms which appear in the updated row. */
sl@0	4367	return insertTerms(v, iRow, pValues);
sl@0	4368	}
sl@0	4369
sl@0	4370	/*******************************************************************/
sl@0	4371	/* InteriorWriter is used to collect terms and block references into
sl@0	4372	** interior nodes in %_segments. See commentary at top of file for
sl@0	4373	** format.
sl@0	4374	*/
sl@0	4375
/* Size threshold, in bytes, past which an interior node overflows
** into a new block.
*/
#define INTERIOR_MAX 2048

/* Lower bound on the number of terms in an interior node (the root is
** exempt).  Keeps very long terms from producing a skinny tree.  The
** value has to be positive so that the tree always makes progress;
** the resulting minimum fanout is INTERIOR_MIN_TERMS+1.
*/
#define INTERIOR_MIN_TERMS 7
#if !(INTERIOR_MIN_TERMS>=1)
# error INTERIOR_MIN_TERMS must be greater than 0.
#endif

/* ROOT_MAX bounds the amount of data kept inline in the segment
** directory.
*/
/* TODO(shess) Push ROOT_MAX down to whoever is writing things.  It's
** only defined here so that interiorWriterRootInfo() and
** leafWriterRootInfo() both see it; were it passed in by the caller,
** no define would be needed at all.
*/
#define ROOT_MAX 1024
#if VARINT_MAX*2>ROOT_MAX
# error ROOT_MAX must have enough space for a header.
#endif
sl@0	4401
sl@0	4402	/* InteriorBlock stores a linked-list of interior blocks while a lower
sl@0	4403	** layer is being constructed.
sl@0	4404	*/
sl@0	4405	typedef struct InteriorBlock {
sl@0	4406	DataBuffer term; /* Leftmost term in block's subtree. */
sl@0	4407	DataBuffer data; /* Accumulated data for the block. */
sl@0	4408	struct InteriorBlock *next;
sl@0	4409	} InteriorBlock;
sl@0	4410
sl@0	4411	static InteriorBlock *interiorBlockNew(int iHeight, sqlite_int64 iChildBlock,
sl@0	4412	const char *pTerm, int nTerm){
sl@0	4413	InteriorBlock *block = sqlite3_malloc(sizeof(InteriorBlock));
sl@0	4414	char c[VARINT_MAX+VARINT_MAX];
sl@0	4415	int n;
sl@0	4416
sl@0	4417	if( block ){
sl@0	4418	memset(block, 0, sizeof(*block));
sl@0	4419	dataBufferInit(&block->term, 0);
sl@0	4420	dataBufferReplace(&block->term, pTerm, nTerm);
sl@0	4421
sl@0	4422	n = fts3PutVarint(c, iHeight);
sl@0	4423	n += fts3PutVarint(c+n, iChildBlock);
sl@0	4424	dataBufferInit(&block->data, INTERIOR_MAX);
sl@0	4425	dataBufferReplace(&block->data, c, n);
sl@0	4426	}
sl@0	4427	return block;
sl@0	4428	}
sl@0	4429
sl@0	4430	#ifndef NDEBUG
sl@0	4431	/* Verify that the data is readable as an interior node. */
sl@0	4432	static void interiorBlockValidate(InteriorBlock *pBlock){
sl@0	4433	const char *pData = pBlock->data.pData;
sl@0	4434	int nData = pBlock->data.nData;
sl@0	4435	int n, iDummy;
sl@0	4436	sqlite_int64 iBlockid;
sl@0	4437
sl@0	4438	assert( nData>0 );
sl@0	4439	assert( pData!=0 );
sl@0	4440	assert( pData+nData>pData );
sl@0	4441
sl@0	4442	/* Must lead with height of node as a varint(n), n>0 */
sl@0	4443	n = fts3GetVarint32(pData, &iDummy);
sl@0	4444	assert( n>0 );
sl@0	4445	assert( iDummy>0 );
sl@0	4446	assert( n<nData );
sl@0	4447	pData += n;
sl@0	4448	nData -= n;
sl@0	4449
sl@0	4450	/* Must contain iBlockid. */
sl@0	4451	n = fts3GetVarint(pData, &iBlockid);
sl@0	4452	assert( n>0 );
sl@0	4453	assert( n<=nData );
sl@0	4454	pData += n;
sl@0	4455	nData -= n;
sl@0	4456
sl@0	4457	/* Zero or more terms of positive length */
sl@0	4458	if( nData!=0 ){
sl@0	4459	/* First term is not delta-encoded. */
sl@0	4460	n = fts3GetVarint32(pData, &iDummy);
sl@0	4461	assert( n>0 );
sl@0	4462	assert( iDummy>0 );
sl@0	4463	assert( n+iDummy>0);
sl@0	4464	assert( n+iDummy<=nData );
sl@0	4465	pData += n+iDummy;
sl@0	4466	nData -= n+iDummy;
sl@0	4467
sl@0	4468	/* Following terms delta-encoded. */
sl@0	4469	while( nData!=0 ){
sl@0	4470	/* Length of shared prefix. */
sl@0	4471	n = fts3GetVarint32(pData, &iDummy);
sl@0	4472	assert( n>0 );
sl@0	4473	assert( iDummy>=0 );
sl@0	4474	assert( n<nData );
sl@0	4475	pData += n;
sl@0	4476	nData -= n;
sl@0	4477
sl@0	4478	/* Length and data of distinct suffix. */
sl@0	4479	n = fts3GetVarint32(pData, &iDummy);
sl@0	4480	assert( n>0 );
sl@0	4481	assert( iDummy>0 );
sl@0	4482	assert( n+iDummy>0);
sl@0	4483	assert( n+iDummy<=nData );
sl@0	4484	pData += n+iDummy;
sl@0	4485	nData -= n+iDummy;
sl@0	4486	}
sl@0	4487	}
sl@0	4488	}
sl@0	4489	#define ASSERT_VALID_INTERIOR_BLOCK(x) interiorBlockValidate(x)
sl@0	4490	#else
sl@0	4491	#define ASSERT_VALID_INTERIOR_BLOCK(x) assert( 1 )
sl@0	4492	#endif
sl@0	4493
sl@0	4494	typedef struct InteriorWriter {
sl@0	4495	int iHeight; /* from 0 at leaves. */
sl@0	4496	InteriorBlock first, last;
sl@0	4497	struct InteriorWriter *parentWriter;
sl@0	4498
sl@0	4499	DataBuffer term; /* Last term written to block "last". */
sl@0	4500	sqlite_int64 iOpeningChildBlock; /* First child block in block "last". */
sl@0	4501	#ifndef NDEBUG
sl@0	4502	sqlite_int64 iLastChildBlock; /* for consistency checks. */
sl@0	4503	#endif
sl@0	4504	} InteriorWriter;
sl@0	4505
sl@0	4506	/* Initialize an interior node where pTerm[nTerm] marks the leftmost
sl@0	4507	** term in the tree. iChildBlock is the leftmost child block at the
sl@0	4508	** next level down the tree.
sl@0	4509	*/
sl@0	4510	static void interiorWriterInit(int iHeight, const char *pTerm, int nTerm,
sl@0	4511	sqlite_int64 iChildBlock,
sl@0	4512	InteriorWriter *pWriter){
sl@0	4513	InteriorBlock *block;
sl@0	4514	assert( iHeight>0 );
sl@0	4515	CLEAR(pWriter);
sl@0	4516
sl@0	4517	pWriter->iHeight = iHeight;
sl@0	4518	pWriter->iOpeningChildBlock = iChildBlock;
sl@0	4519	#ifndef NDEBUG
sl@0	4520	pWriter->iLastChildBlock = iChildBlock;
sl@0	4521	#endif
sl@0	4522	block = interiorBlockNew(iHeight, iChildBlock, pTerm, nTerm);
sl@0	4523	pWriter->last = pWriter->first = block;
sl@0	4524	ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
sl@0	4525	dataBufferInit(&pWriter->term, 0);
sl@0	4526	}
sl@0	4527
sl@0	4528	/* Append the child node rooted at iChildBlock to the interior node,
sl@0	4529	** with pTerm[nTerm] as the leftmost term in iChildBlock's subtree.
sl@0	4530	*/
sl@0	4531	static void interiorWriterAppend(InteriorWriter *pWriter,
sl@0	4532	const char *pTerm, int nTerm,
sl@0	4533	sqlite_int64 iChildBlock){
sl@0	4534	char c[VARINT_MAX+VARINT_MAX];
sl@0	4535	int n, nPrefix = 0;
sl@0	4536
sl@0	4537	ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
sl@0	4538
sl@0	4539	/* The first term written into an interior node is actually
sl@0	4540	** associated with the second child added (the first child was added
sl@0	4541	** in interiorWriterInit, or in the if clause at the bottom of this
sl@0	4542	** function). That term gets encoded straight up, with nPrefix left
sl@0	4543	** at 0.
sl@0	4544	*/
sl@0	4545	if( pWriter->term.nData==0 ){
sl@0	4546	n = fts3PutVarint(c, nTerm);
sl@0	4547	}else{
sl@0	4548	while( nPrefix<pWriter->term.nData &&
sl@0	4549	pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
sl@0	4550	nPrefix++;
sl@0	4551	}
sl@0	4552
sl@0	4553	n = fts3PutVarint(c, nPrefix);
sl@0	4554	n += fts3PutVarint(c+n, nTerm-nPrefix);
sl@0	4555	}
sl@0	4556
sl@0	4557	#ifndef NDEBUG
sl@0	4558	pWriter->iLastChildBlock++;
sl@0	4559	#endif
sl@0	4560	assert( pWriter->iLastChildBlock==iChildBlock );
sl@0	4561
sl@0	4562	/* Overflow to a new block if the new term makes the current block
sl@0	4563	** too big, and the current block already has enough terms.
sl@0	4564	*/
sl@0	4565	if( pWriter->last->data.nData+n+nTerm-nPrefix>INTERIOR_MAX &&
sl@0	4566	iChildBlock-pWriter->iOpeningChildBlock>INTERIOR_MIN_TERMS ){
sl@0	4567	pWriter->last->next = interiorBlockNew(pWriter->iHeight, iChildBlock,
sl@0	4568	pTerm, nTerm);
sl@0	4569	pWriter->last = pWriter->last->next;
sl@0	4570	pWriter->iOpeningChildBlock = iChildBlock;
sl@0	4571	dataBufferReset(&pWriter->term);
sl@0	4572	}else{
sl@0	4573	dataBufferAppend2(&pWriter->last->data, c, n,
sl@0	4574	pTerm+nPrefix, nTerm-nPrefix);
sl@0	4575	dataBufferReplace(&pWriter->term, pTerm, nTerm);
sl@0	4576	}
sl@0	4577	ASSERT_VALID_INTERIOR_BLOCK(pWriter->last);
sl@0	4578	}
sl@0	4579
sl@0	4580	/* Free the space used by pWriter, including the linked-list of
sl@0	4581	** InteriorBlocks, and parentWriter, if present.
sl@0	4582	*/
sl@0	4583	static int interiorWriterDestroy(InteriorWriter *pWriter){
sl@0	4584	InteriorBlock *block = pWriter->first;
sl@0	4585
sl@0	4586	while( block!=NULL ){
sl@0	4587	InteriorBlock *b = block;
sl@0	4588	block = block->next;
sl@0	4589	dataBufferDestroy(&b->term);
sl@0	4590	dataBufferDestroy(&b->data);
sl@0	4591	sqlite3_free(b);
sl@0	4592	}
sl@0	4593	if( pWriter->parentWriter!=NULL ){
sl@0	4594	interiorWriterDestroy(pWriter->parentWriter);
sl@0	4595	sqlite3_free(pWriter->parentWriter);
sl@0	4596	}
sl@0	4597	dataBufferDestroy(&pWriter->term);
sl@0	4598	SCRAMBLE(pWriter);
sl@0	4599	return SQLITE_OK;
sl@0	4600	}
sl@0	4601
sl@0	4602	/* If pWriter can fit entirely in ROOT_MAX, return it as the root info
sl@0	4603	** directly, leaving *piEndBlockid unchanged. Otherwise, flush
sl@0	4604	** pWriter to %_segments, building a new layer of interior nodes, and
sl@0	4605	** recursively ask for their root into.
sl@0	4606	*/
sl@0	4607	static int interiorWriterRootInfo(fulltext_vtab v, InteriorWriter pWriter,
sl@0	4608	char *ppRootInfo, int pnRootInfo,
sl@0	4609	sqlite_int64 *piEndBlockid){
sl@0	4610	InteriorBlock *block = pWriter->first;
sl@0	4611	sqlite_int64 iBlockid = 0;
sl@0	4612	int rc;
sl@0	4613
sl@0	4614	/* If we can fit the segment inline */
sl@0	4615	if( block==pWriter->last && block->data.nData<ROOT_MAX ){
sl@0	4616	*ppRootInfo = block->data.pData;
sl@0	4617	*pnRootInfo = block->data.nData;
sl@0	4618	return SQLITE_OK;
sl@0	4619	}
sl@0	4620
sl@0	4621	/* Flush the first block to %_segments, and create a new level of
sl@0	4622	** interior node.
sl@0	4623	*/
sl@0	4624	ASSERT_VALID_INTERIOR_BLOCK(block);
sl@0	4625	rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
sl@0	4626	if( rc!=SQLITE_OK ) return rc;
sl@0	4627	*piEndBlockid = iBlockid;
sl@0	4628
sl@0	4629	pWriter->parentWriter = sqlite3_malloc(sizeof(*pWriter->parentWriter));
sl@0	4630	interiorWriterInit(pWriter->iHeight+1,
sl@0	4631	block->term.pData, block->term.nData,
sl@0	4632	iBlockid, pWriter->parentWriter);
sl@0	4633
sl@0	4634	/* Flush additional blocks and append to the higher interior
sl@0	4635	** node.
sl@0	4636	*/
sl@0	4637	for(block=block->next; block!=NULL; block=block->next){
sl@0	4638	ASSERT_VALID_INTERIOR_BLOCK(block);
sl@0	4639	rc = block_insert(v, block->data.pData, block->data.nData, &iBlockid);
sl@0	4640	if( rc!=SQLITE_OK ) return rc;
sl@0	4641	*piEndBlockid = iBlockid;
sl@0	4642
sl@0	4643	interiorWriterAppend(pWriter->parentWriter,
sl@0	4644	block->term.pData, block->term.nData, iBlockid);
sl@0	4645	}
sl@0	4646
sl@0	4647	/* Parent node gets the chance to be the root. */
sl@0	4648	return interiorWriterRootInfo(v, pWriter->parentWriter,
sl@0	4649	ppRootInfo, pnRootInfo, piEndBlockid);
sl@0	4650	}
sl@0	4651
sl@0	4652	/****************************************************************/
sl@0	4653	/* InteriorReader is used to read off the data from an interior node
sl@0	4654	** (see comment at top of file for the format).
sl@0	4655	*/
sl@0	4656	typedef struct InteriorReader {
sl@0	4657	const char *pData;
sl@0	4658	int nData;
sl@0	4659
sl@0	4660	DataBuffer term; /* previous term, for decoding term delta. */
sl@0	4661
sl@0	4662	sqlite_int64 iBlockid;
sl@0	4663	} InteriorReader;
sl@0	4664
sl@0	4665	static void interiorReaderDestroy(InteriorReader *pReader){
sl@0	4666	dataBufferDestroy(&pReader->term);
sl@0	4667	SCRAMBLE(pReader);
sl@0	4668	}
sl@0	4669
sl@0	4670	/* TODO(shess) The assertions are great, but what if we're in NDEBUG
sl@0	4671	** and the blob is empty or otherwise contains suspect data?
sl@0	4672	*/
sl@0	4673	static void interiorReaderInit(const char *pData, int nData,
sl@0	4674	InteriorReader *pReader){
sl@0	4675	int n, nTerm;
sl@0	4676
sl@0	4677	/* Require at least the leading flag byte */
sl@0	4678	assert( nData>0 );
sl@0	4679	assert( pData[0]!='\0' );
sl@0	4680
sl@0	4681	CLEAR(pReader);
sl@0	4682
sl@0	4683	/* Decode the base blockid, and set the cursor to the first term. */
sl@0	4684	n = fts3GetVarint(pData+1, &pReader->iBlockid);
sl@0	4685	assert( 1+n<=nData );
sl@0	4686	pReader->pData = pData+1+n;
sl@0	4687	pReader->nData = nData-(1+n);
sl@0	4688
sl@0	4689	/* A single-child interior node (such as when a leaf node was too
sl@0	4690	** large for the segment directory) won't have any terms.
sl@0	4691	** Otherwise, decode the first term.
sl@0	4692	*/
sl@0	4693	if( pReader->nData==0 ){
sl@0	4694	dataBufferInit(&pReader->term, 0);
sl@0	4695	}else{
sl@0	4696	n = fts3GetVarint32(pReader->pData, &nTerm);
sl@0	4697	dataBufferInit(&pReader->term, nTerm);
sl@0	4698	dataBufferReplace(&pReader->term, pReader->pData+n, nTerm);
sl@0	4699	assert( n+nTerm<=pReader->nData );
sl@0	4700	pReader->pData += n+nTerm;
sl@0	4701	pReader->nData -= n+nTerm;
sl@0	4702	}
sl@0	4703	}
sl@0	4704
sl@0	4705	static int interiorReaderAtEnd(InteriorReader *pReader){
sl@0	4706	return pReader->term.nData==0;
sl@0	4707	}
sl@0	4708
sl@0	4709	static sqlite_int64 interiorReaderCurrentBlockid(InteriorReader *pReader){
sl@0	4710	return pReader->iBlockid;
sl@0	4711	}
sl@0	4712
sl@0	4713	static int interiorReaderTermBytes(InteriorReader *pReader){
sl@0	4714	assert( !interiorReaderAtEnd(pReader) );
sl@0	4715	return pReader->term.nData;
sl@0	4716	}
sl@0	4717	static const char interiorReaderTerm(InteriorReader pReader){
sl@0	4718	assert( !interiorReaderAtEnd(pReader) );
sl@0	4719	return pReader->term.pData;
sl@0	4720	}
sl@0	4721
sl@0	4722	/* Step forward to the next term in the node. */
sl@0	4723	static void interiorReaderStep(InteriorReader *pReader){
sl@0	4724	assert( !interiorReaderAtEnd(pReader) );
sl@0	4725
sl@0	4726	/* If the last term has been read, signal eof, else construct the
sl@0	4727	** next term.
sl@0	4728	*/
sl@0	4729	if( pReader->nData==0 ){
sl@0	4730	dataBufferReset(&pReader->term);
sl@0	4731	}else{
sl@0	4732	int n, nPrefix, nSuffix;
sl@0	4733
sl@0	4734	n = fts3GetVarint32(pReader->pData, &nPrefix);
sl@0	4735	n += fts3GetVarint32(pReader->pData+n, &nSuffix);
sl@0	4736
sl@0	4737	/* Truncate the current term and append suffix data. */
sl@0	4738	pReader->term.nData = nPrefix;
sl@0	4739	dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
sl@0	4740
sl@0	4741	assert( n+nSuffix<=pReader->nData );
sl@0	4742	pReader->pData += n+nSuffix;
sl@0	4743	pReader->nData -= n+nSuffix;
sl@0	4744	}
sl@0	4745	pReader->iBlockid++;
sl@0	4746	}
sl@0	4747
sl@0	4748	/* Compare the current term to pTerm[nTerm], returning strcmp-style
sl@0	4749	** results. If isPrefix, equality means equal through nTerm bytes.
sl@0	4750	*/
sl@0	4751	static int interiorReaderTermCmp(InteriorReader *pReader,
sl@0	4752	const char *pTerm, int nTerm, int isPrefix){
sl@0	4753	const char *pReaderTerm = interiorReaderTerm(pReader);
sl@0	4754	int nReaderTerm = interiorReaderTermBytes(pReader);
sl@0	4755	int c, n = nReaderTerm<nTerm ? nReaderTerm : nTerm;
sl@0	4756
sl@0	4757	if( n==0 ){
sl@0	4758	if( nReaderTerm>0 ) return -1;
sl@0	4759	if( nTerm>0 ) return 1;
sl@0	4760	return 0;
sl@0	4761	}
sl@0	4762
sl@0	4763	c = memcmp(pReaderTerm, pTerm, n);
sl@0	4764	if( c!=0 ) return c;
sl@0	4765	if( isPrefix && n==nTerm ) return 0;
sl@0	4766	return nReaderTerm - nTerm;
sl@0	4767	}
sl@0	4768
sl@0	4769	/****************************************************************/
sl@0	4770	/* LeafWriter is used to collect terms and associated doclist data
sl@0	4771	** into leaf blocks in %_segments (see top of file for format info).
sl@0	4772	** Expected usage is:
sl@0	4773	**
sl@0	4774	** LeafWriter writer;
sl@0	4775	** leafWriterInit(0, 0, &writer);
sl@0	4776	** while( sorted_terms_left_to_process ){
sl@0	4777	** // data is doclist data for that term.
sl@0	4778	** rc = leafWriterStep(v, &writer, pTerm, nTerm, pData, nData);
sl@0	4779	** if( rc!=SQLITE_OK ) goto err;
sl@0	4780	** }
sl@0	4781	** rc = leafWriterFinalize(v, &writer);
sl@0	4782	**err:
sl@0	4783	** leafWriterDestroy(&writer);
sl@0	4784	** return rc;
sl@0	4785	**
sl@0	4786	** leafWriterStep() may write a collected leaf out to %_segments.
sl@0	4787	** leafWriterFinalize() finishes writing any buffered data and stores
sl@0	4788	** a root node in %_segdir. leafWriterDestroy() frees all buffers and
sl@0	4789	** InteriorWriters allocated as part of writing this segment.
sl@0	4790	**
sl@0	4791	** TODO(shess) Document leafWriterStepMerge().
sl@0	4792	*/
sl@0	4793
/* A term whose data reaches this many bytes gets a block of its own. */
#define STANDALONE_MIN 1024

/* Upper bound that leaf blocks are kept under. */
#define LEAF_MAX 2048
sl@0	4799
sl@0	4800	typedef struct LeafWriter {
sl@0	4801	int iLevel;
sl@0	4802	int idx;
sl@0	4803	sqlite_int64 iStartBlockid; /* needed to create the root info */
sl@0	4804	sqlite_int64 iEndBlockid; /* when we're done writing. */
sl@0	4805
sl@0	4806	DataBuffer term; /* previous encoded term */
sl@0	4807	DataBuffer data; /* encoding buffer */
sl@0	4808
sl@0	4809	/* bytes of first term in the current node which distinguishes that
sl@0	4810	** term from the last term of the previous node.
sl@0	4811	*/
sl@0	4812	int nTermDistinct;
sl@0	4813
sl@0	4814	InteriorWriter parentWriter; /* if we overflow */
sl@0	4815	int has_parent;
sl@0	4816	} LeafWriter;
sl@0	4817
sl@0	4818	static void leafWriterInit(int iLevel, int idx, LeafWriter *pWriter){
sl@0	4819	CLEAR(pWriter);
sl@0	4820	pWriter->iLevel = iLevel;
sl@0	4821	pWriter->idx = idx;
sl@0	4822
sl@0	4823	dataBufferInit(&pWriter->term, 32);
sl@0	4824
sl@0	4825	/* Start out with a reasonably sized block, though it can grow. */
sl@0	4826	dataBufferInit(&pWriter->data, LEAF_MAX);
sl@0	4827	}
sl@0	4828
sl@0	4829	#ifndef NDEBUG
sl@0	4830	/* Verify that the data is readable as a leaf node. */
sl@0	4831	static void leafNodeValidate(const char *pData, int nData){
sl@0	4832	int n, iDummy;
sl@0	4833
sl@0	4834	if( nData==0 ) return;
sl@0	4835	assert( nData>0 );
sl@0	4836	assert( pData!=0 );
sl@0	4837	assert( pData+nData>pData );
sl@0	4838
sl@0	4839	/* Must lead with a varint(0) */
sl@0	4840	n = fts3GetVarint32(pData, &iDummy);
sl@0	4841	assert( iDummy==0 );
sl@0	4842	assert( n>0 );
sl@0	4843	assert( n<nData );
sl@0	4844	pData += n;
sl@0	4845	nData -= n;
sl@0	4846
sl@0	4847	/* Leading term length and data must fit in buffer. */
sl@0	4848	n = fts3GetVarint32(pData, &iDummy);
sl@0	4849	assert( n>0 );
sl@0	4850	assert( iDummy>0 );
sl@0	4851	assert( n+iDummy>0 );
sl@0	4852	assert( n+iDummy<nData );
sl@0	4853	pData += n+iDummy;
sl@0	4854	nData -= n+iDummy;
sl@0	4855
sl@0	4856	/* Leading term's doclist length and data must fit. */
sl@0	4857	n = fts3GetVarint32(pData, &iDummy);
sl@0	4858	assert( n>0 );
sl@0	4859	assert( iDummy>0 );
sl@0	4860	assert( n+iDummy>0 );
sl@0	4861	assert( n+iDummy<=nData );
sl@0	4862	ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
sl@0	4863	pData += n+iDummy;
sl@0	4864	nData -= n+iDummy;
sl@0	4865
sl@0	4866	/* Verify that trailing terms and doclists also are readable. */
sl@0	4867	while( nData!=0 ){
sl@0	4868	n = fts3GetVarint32(pData, &iDummy);
sl@0	4869	assert( n>0 );
sl@0	4870	assert( iDummy>=0 );
sl@0	4871	assert( n<nData );
sl@0	4872	pData += n;
sl@0	4873	nData -= n;
sl@0	4874	n = fts3GetVarint32(pData, &iDummy);
sl@0	4875	assert( n>0 );
sl@0	4876	assert( iDummy>0 );
sl@0	4877	assert( n+iDummy>0 );
sl@0	4878	assert( n+iDummy<nData );
sl@0	4879	pData += n+iDummy;
sl@0	4880	nData -= n+iDummy;
sl@0	4881
sl@0	4882	n = fts3GetVarint32(pData, &iDummy);
sl@0	4883	assert( n>0 );
sl@0	4884	assert( iDummy>0 );
sl@0	4885	assert( n+iDummy>0 );
sl@0	4886	assert( n+iDummy<=nData );
sl@0	4887	ASSERT_VALID_DOCLIST(DL_DEFAULT, pData+n, iDummy, NULL);
sl@0	4888	pData += n+iDummy;
sl@0	4889	nData -= n+iDummy;
sl@0	4890	}
sl@0	4891	}
sl@0	4892	#define ASSERT_VALID_LEAF_NODE(p, n) leafNodeValidate(p, n)
sl@0	4893	#else
sl@0	4894	#define ASSERT_VALID_LEAF_NODE(p, n) assert( 1 )
sl@0	4895	#endif
sl@0	4896
sl@0	4897	/* Flush the current leaf node to %_segments, and adding the resulting
sl@0	4898	** blockid and the starting term to the interior node which will
sl@0	4899	** contain it.
sl@0	4900	*/
sl@0	4901	static int leafWriterInternalFlush(fulltext_vtab v, LeafWriter pWriter,
sl@0	4902	int iData, int nData){
sl@0	4903	sqlite_int64 iBlockid = 0;
sl@0	4904	const char *pStartingTerm;
sl@0	4905	int nStartingTerm, rc, n;
sl@0	4906
sl@0	4907	/* Must have the leading varint(0) flag, plus at least some
sl@0	4908	** valid-looking data.
sl@0	4909	*/
sl@0	4910	assert( nData>2 );
sl@0	4911	assert( iData>=0 );
sl@0	4912	assert( iData+nData<=pWriter->data.nData );
sl@0	4913	ASSERT_VALID_LEAF_NODE(pWriter->data.pData+iData, nData);
sl@0	4914
sl@0	4915	rc = block_insert(v, pWriter->data.pData+iData, nData, &iBlockid);
sl@0	4916	if( rc!=SQLITE_OK ) return rc;
sl@0	4917	assert( iBlockid!=0 );
sl@0	4918
sl@0	4919	/* Reconstruct the first term in the leaf for purposes of building
sl@0	4920	** the interior node.
sl@0	4921	*/
sl@0	4922	n = fts3GetVarint32(pWriter->data.pData+iData+1, &nStartingTerm);
sl@0	4923	pStartingTerm = pWriter->data.pData+iData+1+n;
sl@0	4924	assert( pWriter->data.nData>iData+1+n+nStartingTerm );
sl@0	4925	assert( pWriter->nTermDistinct>0 );
sl@0	4926	assert( pWriter->nTermDistinct<=nStartingTerm );
sl@0	4927	nStartingTerm = pWriter->nTermDistinct;
sl@0	4928
sl@0	4929	if( pWriter->has_parent ){
sl@0	4930	interiorWriterAppend(&pWriter->parentWriter,
sl@0	4931	pStartingTerm, nStartingTerm, iBlockid);
sl@0	4932	}else{
sl@0	4933	interiorWriterInit(1, pStartingTerm, nStartingTerm, iBlockid,
sl@0	4934	&pWriter->parentWriter);
sl@0	4935	pWriter->has_parent = 1;
sl@0	4936	}
sl@0	4937
sl@0	4938	/* Track the span of this segment's leaf nodes. */
sl@0	4939	if( pWriter->iEndBlockid==0 ){
sl@0	4940	pWriter->iEndBlockid = pWriter->iStartBlockid = iBlockid;
sl@0	4941	}else{
sl@0	4942	pWriter->iEndBlockid++;
sl@0	4943	assert( iBlockid==pWriter->iEndBlockid );
sl@0	4944	}
sl@0	4945
sl@0	4946	return SQLITE_OK;
sl@0	4947	}
sl@0	4948	static int leafWriterFlush(fulltext_vtab v, LeafWriter pWriter){
sl@0	4949	int rc = leafWriterInternalFlush(v, pWriter, 0, pWriter->data.nData);
sl@0	4950	if( rc!=SQLITE_OK ) return rc;
sl@0	4951
sl@0	4952	/* Re-initialize the output buffer. */
sl@0	4953	dataBufferReset(&pWriter->data);
sl@0	4954
sl@0	4955	return SQLITE_OK;
sl@0	4956	}
sl@0	4957
sl@0	4958	/* Fetch the root info for the segment. If the entire leaf fits
sl@0	4959	** within ROOT_MAX, then it will be returned directly, otherwise it
sl@0	4960	** will be flushed and the root info will be returned from the
sl@0	4961	** interior node. *piEndBlockid is set to the blockid of the last
sl@0	4962	** interior or leaf node written to disk (0 if none are written at
sl@0	4963	** all).
sl@0	4964	*/
sl@0	4965	static int leafWriterRootInfo(fulltext_vtab v, LeafWriter pWriter,
sl@0	4966	char *ppRootInfo, int pnRootInfo,
sl@0	4967	sqlite_int64 *piEndBlockid){
sl@0	4968	/* we can fit the segment entirely inline */
sl@0	4969	if( !pWriter->has_parent && pWriter->data.nData<ROOT_MAX ){
sl@0	4970	*ppRootInfo = pWriter->data.pData;
sl@0	4971	*pnRootInfo = pWriter->data.nData;
sl@0	4972	*piEndBlockid = 0;
sl@0	4973	return SQLITE_OK;
sl@0	4974	}
sl@0	4975
sl@0	4976	/* Flush remaining leaf data. */
sl@0	4977	if( pWriter->data.nData>0 ){
sl@0	4978	int rc = leafWriterFlush(v, pWriter);
sl@0	4979	if( rc!=SQLITE_OK ) return rc;
sl@0	4980	}
sl@0	4981
sl@0	4982	/* We must have flushed a leaf at some point. */
sl@0	4983	assert( pWriter->has_parent );
sl@0	4984
sl@0	4985	/* Tenatively set the end leaf blockid as the end blockid. If the
sl@0	4986	** interior node can be returned inline, this will be the final
sl@0	4987	** blockid, otherwise it will be overwritten by
sl@0	4988	** interiorWriterRootInfo().
sl@0	4989	*/
sl@0	4990	*piEndBlockid = pWriter->iEndBlockid;
sl@0	4991
sl@0	4992	return interiorWriterRootInfo(v, &pWriter->parentWriter,
sl@0	4993	ppRootInfo, pnRootInfo, piEndBlockid);
sl@0	4994	}
sl@0	4995
sl@0	4996	/* Collect the rootInfo data and store it into the segment directory.
sl@0	4997	** This has the effect of flushing the segment's leaf data to
sl@0	4998	** %_segments, and also flushing any interior nodes to %_segments.
sl@0	4999	*/
sl@0	5000	static int leafWriterFinalize(fulltext_vtab v, LeafWriter pWriter){
sl@0	5001	sqlite_int64 iEndBlockid;
sl@0	5002	char *pRootInfo;
sl@0	5003	int rc, nRootInfo;
sl@0	5004
sl@0	5005	rc = leafWriterRootInfo(v, pWriter, &pRootInfo, &nRootInfo, &iEndBlockid);
sl@0	5006	if( rc!=SQLITE_OK ) return rc;
sl@0	5007
sl@0	5008	/* Don't bother storing an entirely empty segment. */
sl@0	5009	if( iEndBlockid==0 && nRootInfo==0 ) return SQLITE_OK;
sl@0	5010
sl@0	5011	return segdir_set(v, pWriter->iLevel, pWriter->idx,
sl@0	5012	pWriter->iStartBlockid, pWriter->iEndBlockid,
sl@0	5013	iEndBlockid, pRootInfo, nRootInfo);
sl@0	5014	}
sl@0	5015
sl@0	5016	static void leafWriterDestroy(LeafWriter *pWriter){
sl@0	5017	if( pWriter->has_parent ) interiorWriterDestroy(&pWriter->parentWriter);
sl@0	5018	dataBufferDestroy(&pWriter->term);
sl@0	5019	dataBufferDestroy(&pWriter->data);
sl@0	5020	}
sl@0	5021
sl@0	5022	/* Encode a term into the leafWriter, delta-encoding as appropriate.
sl@0	5023	** Returns the length of the new term which distinguishes it from the
sl@0	5024	** previous term, which can be used to set nTermDistinct when a node
sl@0	5025	** boundary is crossed.
sl@0	5026	*/
sl@0	5027	static int leafWriterEncodeTerm(LeafWriter *pWriter,
sl@0	5028	const char *pTerm, int nTerm){
sl@0	5029	char c[VARINT_MAX+VARINT_MAX];
sl@0	5030	int n, nPrefix = 0;
sl@0	5031
sl@0	5032	assert( nTerm>0 );
sl@0	5033	while( nPrefix<pWriter->term.nData &&
sl@0	5034	pTerm[nPrefix]==pWriter->term.pData[nPrefix] ){
sl@0	5035	nPrefix++;
sl@0	5036	/* Failing this implies that the terms weren't in order. */
sl@0	5037	assert( nPrefix<nTerm );
sl@0	5038	}
sl@0	5039
sl@0	5040	if( pWriter->data.nData==0 ){
sl@0	5041	/* Encode the node header and leading term as:
sl@0	5042	** varint(0)
sl@0	5043	** varint(nTerm)
sl@0	5044	** char pTerm[nTerm]
sl@0	5045	*/
sl@0	5046	n = fts3PutVarint(c, '\0');
sl@0	5047	n += fts3PutVarint(c+n, nTerm);
sl@0	5048	dataBufferAppend2(&pWriter->data, c, n, pTerm, nTerm);
sl@0	5049	}else{
sl@0	5050	/* Delta-encode the term as:
sl@0	5051	** varint(nPrefix)
sl@0	5052	** varint(nSuffix)
sl@0	5053	** char pTermSuffix[nSuffix]
sl@0	5054	*/
sl@0	5055	n = fts3PutVarint(c, nPrefix);
sl@0	5056	n += fts3PutVarint(c+n, nTerm-nPrefix);
sl@0	5057	dataBufferAppend2(&pWriter->data, c, n, pTerm+nPrefix, nTerm-nPrefix);
sl@0	5058	}
sl@0	5059	dataBufferReplace(&pWriter->term, pTerm, nTerm);
sl@0	5060
sl@0	5061	return nPrefix+1;
sl@0	5062	}
sl@0	5063
sl@0	5064	/* Used to avoid a memmove when a large amount of doclist data is in
sl@0	5065	** the buffer. This constructs a node and term header before
sl@0	5066	** iDoclistData and flushes the resulting complete node using
sl@0	5067	** leafWriterInternalFlush().
sl@0	5068	*/
sl@0	5069	static int leafWriterInlineFlush(fulltext_vtab v, LeafWriter pWriter,
sl@0	5070	const char *pTerm, int nTerm,
sl@0	5071	int iDoclistData){
sl@0	5072	char c[VARINT_MAX+VARINT_MAX];
sl@0	5073	int iData, n = fts3PutVarint(c, 0);
sl@0	5074	n += fts3PutVarint(c+n, nTerm);
sl@0	5075
sl@0	5076	/* There should always be room for the header. Even if pTerm shared
sl@0	5077	** a substantial prefix with the previous term, the entire prefix
sl@0	5078	** could be constructed from earlier data in the doclist, so there
sl@0	5079	** should be room.
sl@0	5080	*/
sl@0	5081	assert( iDoclistData>=n+nTerm );
sl@0	5082
sl@0	5083	iData = iDoclistData-(n+nTerm);
sl@0	5084	memcpy(pWriter->data.pData+iData, c, n);
sl@0	5085	memcpy(pWriter->data.pData+iData+n, pTerm, nTerm);
sl@0	5086
sl@0	5087	return leafWriterInternalFlush(v, pWriter, iData, pWriter->data.nData-iData);
sl@0	5088	}
sl@0	5089
sl@0	5090	/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
sl@0	5091	** %_segments.
sl@0	5092	*/
sl@0	5093	static int leafWriterStepMerge(fulltext_vtab v, LeafWriter pWriter,
sl@0	5094	const char *pTerm, int nTerm,
sl@0	5095	DLReader *pReaders, int nReaders){
sl@0	5096	char c[VARINT_MAX+VARINT_MAX];
sl@0	5097	int iTermData = pWriter->data.nData, iDoclistData;
sl@0	5098	int i, nData, n, nActualData, nActual, rc, nTermDistinct;
sl@0	5099
sl@0	5100	ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
sl@0	5101	nTermDistinct = leafWriterEncodeTerm(pWriter, pTerm, nTerm);
sl@0	5102
sl@0	5103	/* Remember nTermDistinct if opening a new node. */
sl@0	5104	if( iTermData==0 ) pWriter->nTermDistinct = nTermDistinct;
sl@0	5105
sl@0	5106	iDoclistData = pWriter->data.nData;
sl@0	5107
sl@0	5108	/* Estimate the length of the merged doclist so we can leave space
sl@0	5109	** to encode it.
sl@0	5110	*/
sl@0	5111	for(i=0, nData=0; i<nReaders; i++){
sl@0	5112	nData += dlrAllDataBytes(&pReaders[i]);
sl@0	5113	}
sl@0	5114	n = fts3PutVarint(c, nData);
sl@0	5115	dataBufferAppend(&pWriter->data, c, n);
sl@0	5116
sl@0	5117	docListMerge(&pWriter->data, pReaders, nReaders);
sl@0	5118	ASSERT_VALID_DOCLIST(DL_DEFAULT,
sl@0	5119	pWriter->data.pData+iDoclistData+n,
sl@0	5120	pWriter->data.nData-iDoclistData-n, NULL);
sl@0	5121
sl@0	5122	/* The actual amount of doclist data at this point could be smaller
sl@0	5123	** than the length we encoded. Additionally, the space required to
sl@0	5124	** encode this length could be smaller. For small doclists, this is
sl@0	5125	** not a big deal, we can just use memmove() to adjust things.
sl@0	5126	*/
sl@0	5127	nActualData = pWriter->data.nData-(iDoclistData+n);
sl@0	5128	nActual = fts3PutVarint(c, nActualData);
sl@0	5129	assert( nActualData<=nData );
sl@0	5130	assert( nActual<=n );
sl@0	5131
sl@0	5132	/* If the new doclist is big enough for force a standalone leaf
sl@0	5133	** node, we can immediately flush it inline without doing the
sl@0	5134	** memmove().
sl@0	5135	*/
sl@0	5136	/* TODO(shess) This test matches leafWriterStep(), which does this
sl@0	5137	** test before it knows the cost to varint-encode the term and
sl@0	5138	** doclist lengths. At some point, change to
sl@0	5139	** pWriter->data.nData-iTermData>STANDALONE_MIN.
sl@0	5140	*/
sl@0	5141	if( nTerm+nActualData>STANDALONE_MIN ){
sl@0	5142	/* Push leaf node from before this term. */
sl@0	5143	if( iTermData>0 ){
sl@0	5144	rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
sl@0	5145	if( rc!=SQLITE_OK ) return rc;
sl@0	5146
sl@0	5147	pWriter->nTermDistinct = nTermDistinct;
sl@0	5148	}
sl@0	5149
sl@0	5150	/* Fix the encoded doclist length. */
sl@0	5151	iDoclistData += n - nActual;
sl@0	5152	memcpy(pWriter->data.pData+iDoclistData, c, nActual);
sl@0	5153
sl@0	5154	/* Push the standalone leaf node. */
sl@0	5155	rc = leafWriterInlineFlush(v, pWriter, pTerm, nTerm, iDoclistData);
sl@0	5156	if( rc!=SQLITE_OK ) return rc;
sl@0	5157
sl@0	5158	/* Leave the node empty. */
sl@0	5159	dataBufferReset(&pWriter->data);
sl@0	5160
sl@0	5161	return rc;
sl@0	5162	}
sl@0	5163
sl@0	5164	/* At this point, we know that the doclist was small, so do the
sl@0	5165	** memmove if indicated.
sl@0	5166	*/
sl@0	5167	if( nActual<n ){
sl@0	5168	memmove(pWriter->data.pData+iDoclistData+nActual,
sl@0	5169	pWriter->data.pData+iDoclistData+n,
sl@0	5170	pWriter->data.nData-(iDoclistData+n));
sl@0	5171	pWriter->data.nData -= n-nActual;
sl@0	5172	}
sl@0	5173
sl@0	5174	/* Replace written length with actual length. */
sl@0	5175	memcpy(pWriter->data.pData+iDoclistData, c, nActual);
sl@0	5176
sl@0	5177	/* If the node is too large, break things up. */
sl@0	5178	/* TODO(shess) This test matches leafWriterStep(), which does this
sl@0	5179	** test before it knows the cost to varint-encode the term and
sl@0	5180	** doclist lengths. At some point, change to
sl@0	5181	** pWriter->data.nData>LEAF_MAX.
sl@0	5182	*/
sl@0	5183	if( iTermData+nTerm+nActualData>LEAF_MAX ){
sl@0	5184	/* Flush out the leading data as a node */
sl@0	5185	rc = leafWriterInternalFlush(v, pWriter, 0, iTermData);
sl@0	5186	if( rc!=SQLITE_OK ) return rc;
sl@0	5187
sl@0	5188	pWriter->nTermDistinct = nTermDistinct;
sl@0	5189
sl@0	5190	/* Rebuild header using the current term */
sl@0	5191	n = fts3PutVarint(pWriter->data.pData, 0);
sl@0	5192	n += fts3PutVarint(pWriter->data.pData+n, nTerm);
sl@0	5193	memcpy(pWriter->data.pData+n, pTerm, nTerm);
sl@0	5194	n += nTerm;
sl@0	5195
sl@0	5196	/* There should always be room, because the previous encoding
sl@0	5197	** included all data necessary to construct the term.
sl@0	5198	*/
sl@0	5199	assert( n<iDoclistData );
sl@0	5200	/* So long as STANDALONE_MIN is half or less of LEAF_MAX, the
sl@0	5201	** following memcpy() is safe (as opposed to needing a memmove).
sl@0	5202	*/
sl@0	5203	assert( 2*STANDALONE_MIN<=LEAF_MAX );
sl@0	5204	assert( n+pWriter->data.nData-iDoclistData<iDoclistData );
sl@0	5205	memcpy(pWriter->data.pData+n,
sl@0	5206	pWriter->data.pData+iDoclistData,
sl@0	5207	pWriter->data.nData-iDoclistData);
sl@0	5208	pWriter->data.nData -= iDoclistData-n;
sl@0	5209	}
sl@0	5210	ASSERT_VALID_LEAF_NODE(pWriter->data.pData, pWriter->data.nData);
sl@0	5211
sl@0	5212	return SQLITE_OK;
sl@0	5213	}
sl@0	5214
sl@0	5215	/* Push pTerm[nTerm] along with the doclist data to the leaf layer of
sl@0	5216	** %_segments.
sl@0	5217	*/
sl@0	5218	/* TODO(shess) Revise writeZeroSegment() so that doclists are
sl@0	5219	** constructed directly in pWriter->data.
sl@0	5220	*/
sl@0	5221	static int leafWriterStep(fulltext_vtab v, LeafWriter pWriter,
sl@0	5222	const char *pTerm, int nTerm,
sl@0	5223	const char *pData, int nData){
sl@0	5224	int rc;
sl@0	5225	DLReader reader;
sl@0	5226
sl@0	5227	dlrInit(&reader, DL_DEFAULT, pData, nData);
sl@0	5228	rc = leafWriterStepMerge(v, pWriter, pTerm, nTerm, &reader, 1);
sl@0	5229	dlrDestroy(&reader);
sl@0	5230
sl@0	5231	return rc;
sl@0	5232	}
sl@0	5233
sl@0	5234
sl@0	5235	/****************************************************************/
sl@0	5236	/* LeafReader is used to iterate over an individual leaf node. */
sl@0	5237	typedef struct LeafReader {
sl@0	5238	DataBuffer term; /* copy of current term. */
sl@0	5239
sl@0	5240	const char pData; / data for current term. */
sl@0	5241	int nData;
sl@0	5242	} LeafReader;
sl@0	5243
sl@0	5244	static void leafReaderDestroy(LeafReader *pReader){
sl@0	5245	dataBufferDestroy(&pReader->term);
sl@0	5246	SCRAMBLE(pReader);
sl@0	5247	}
sl@0	5248
sl@0	5249	static int leafReaderAtEnd(LeafReader *pReader){
sl@0	5250	return pReader->nData<=0;
sl@0	5251	}
sl@0	5252
sl@0	5253	/* Access the current term. */
sl@0	5254	static int leafReaderTermBytes(LeafReader *pReader){
sl@0	5255	return pReader->term.nData;
sl@0	5256	}
sl@0	5257	static const char leafReaderTerm(LeafReader pReader){
sl@0	5258	assert( pReader->term.nData>0 );
sl@0	5259	return pReader->term.pData;
sl@0	5260	}
sl@0	5261
sl@0	5262	/* Access the doclist data for the current term. */
sl@0	5263	static int leafReaderDataBytes(LeafReader *pReader){
sl@0	5264	int nData;
sl@0	5265	assert( pReader->term.nData>0 );
sl@0	5266	fts3GetVarint32(pReader->pData, &nData);
sl@0	5267	return nData;
sl@0	5268	}
sl@0	5269	static const char leafReaderData(LeafReader pReader){
sl@0	5270	int n, nData;
sl@0	5271	assert( pReader->term.nData>0 );
sl@0	5272	n = fts3GetVarint32(pReader->pData, &nData);
sl@0	5273	return pReader->pData+n;
sl@0	5274	}
sl@0	5275
sl@0	5276	static void leafReaderInit(const char *pData, int nData,
sl@0	5277	LeafReader *pReader){
sl@0	5278	int nTerm, n;
sl@0	5279
sl@0	5280	assert( nData>0 );
sl@0	5281	assert( pData[0]=='\0' );
sl@0	5282
sl@0	5283	CLEAR(pReader);
sl@0	5284
sl@0	5285	/* Read the first term, skipping the header byte. */
sl@0	5286	n = fts3GetVarint32(pData+1, &nTerm);
sl@0	5287	dataBufferInit(&pReader->term, nTerm);
sl@0	5288	dataBufferReplace(&pReader->term, pData+1+n, nTerm);
sl@0	5289
sl@0	5290	/* Position after the first term. */
sl@0	5291	assert( 1+n+nTerm<nData );
sl@0	5292	pReader->pData = pData+1+n+nTerm;
sl@0	5293	pReader->nData = nData-1-n-nTerm;
sl@0	5294	}
sl@0	5295
sl@0	5296	/* Step the reader forward to the next term. */
sl@0	5297	static void leafReaderStep(LeafReader *pReader){
sl@0	5298	int n, nData, nPrefix, nSuffix;
sl@0	5299	assert( !leafReaderAtEnd(pReader) );
sl@0	5300
sl@0	5301	/* Skip previous entry's data block. */
sl@0	5302	n = fts3GetVarint32(pReader->pData, &nData);
sl@0	5303	assert( n+nData<=pReader->nData );
sl@0	5304	pReader->pData += n+nData;
sl@0	5305	pReader->nData -= n+nData;
sl@0	5306
sl@0	5307	if( !leafReaderAtEnd(pReader) ){
sl@0	5308	/* Construct the new term using a prefix from the old term plus a
sl@0	5309	** suffix from the leaf data.
sl@0	5310	*/
sl@0	5311	n = fts3GetVarint32(pReader->pData, &nPrefix);
sl@0	5312	n += fts3GetVarint32(pReader->pData+n, &nSuffix);
sl@0	5313	assert( n+nSuffix<pReader->nData );
sl@0	5314	pReader->term.nData = nPrefix;
sl@0	5315	dataBufferAppend(&pReader->term, pReader->pData+n, nSuffix);
sl@0	5316
sl@0	5317	pReader->pData += n+nSuffix;
sl@0	5318	pReader->nData -= n+nSuffix;
sl@0	5319	}
sl@0	5320	}
sl@0	5321
sl@0	5322	/* strcmp-style comparison of pReader's current term against pTerm.
sl@0	5323	** If isPrefix, equality means equal through nTerm bytes.
sl@0	5324	*/
sl@0	5325	static int leafReaderTermCmp(LeafReader *pReader,
sl@0	5326	const char *pTerm, int nTerm, int isPrefix){
sl@0	5327	int c, n = pReader->term.nData<nTerm ? pReader->term.nData : nTerm;
sl@0	5328	if( n==0 ){
sl@0	5329	if( pReader->term.nData>0 ) return -1;
sl@0	5330	if(nTerm>0 ) return 1;
sl@0	5331	return 0;
sl@0	5332	}
sl@0	5333
sl@0	5334	c = memcmp(pReader->term.pData, pTerm, n);
sl@0	5335	if( c!=0 ) return c;
sl@0	5336	if( isPrefix && n==nTerm ) return 0;
sl@0	5337	return pReader->term.nData - nTerm;
sl@0	5338	}
sl@0	5339
sl@0	5340
sl@0	5341	/****************************************************************/
sl@0	5342	/* LeavesReader wraps LeafReader to allow iterating over the entire
sl@0	5343	** leaf layer of the tree.
sl@0	5344	*/
sl@0	5345	typedef struct LeavesReader {
sl@0	5346	int idx; /* Index within the segment. */
sl@0	5347
sl@0	5348	sqlite3_stmt pStmt; / Statement we're streaming leaves from. */
sl@0	5349	int eof; /* we've seen SQLITE_DONE from pStmt. */
sl@0	5350
sl@0	5351	LeafReader leafReader; /* reader for the current leaf. */
sl@0	5352	DataBuffer rootData; /* root data for inline. */
sl@0	5353	} LeavesReader;
sl@0	5354
sl@0	5355	/* Access the current term. */
sl@0	5356	static int leavesReaderTermBytes(LeavesReader *pReader){
sl@0	5357	assert( !pReader->eof );
sl@0	5358	return leafReaderTermBytes(&pReader->leafReader);
sl@0	5359	}
sl@0	5360	static const char leavesReaderTerm(LeavesReader pReader){
sl@0	5361	assert( !pReader->eof );
sl@0	5362	return leafReaderTerm(&pReader->leafReader);
sl@0	5363	}
sl@0	5364
sl@0	5365	/* Access the doclist data for the current term. */
sl@0	5366	static int leavesReaderDataBytes(LeavesReader *pReader){
sl@0	5367	assert( !pReader->eof );
sl@0	5368	return leafReaderDataBytes(&pReader->leafReader);
sl@0	5369	}
sl@0	5370	static const char leavesReaderData(LeavesReader pReader){
sl@0	5371	assert( !pReader->eof );
sl@0	5372	return leafReaderData(&pReader->leafReader);
sl@0	5373	}
sl@0	5374
sl@0	5375	static int leavesReaderAtEnd(LeavesReader *pReader){
sl@0	5376	return pReader->eof;
sl@0	5377	}
sl@0	5378
sl@0	5379	/* loadSegmentLeaves() may not read all the way to SQLITE_DONE, thus
sl@0	5380	** leaving the statement handle open, which locks the table.
sl@0	5381	*/
sl@0	5382	/* TODO(shess) This "solution" is not satisfactory. Really, there
sl@0	5383	** should be check-in function for all statement handles which
sl@0	5384	** arranges to call sqlite3_reset(). This most likely will require
sl@0	5385	** modification to control flow all over the place, though, so for now
sl@0	5386	** just punt.
sl@0	5387	**
sl@0	5388	** Note the the current system assumes that segment merges will run to
sl@0	5389	** completion, which is why this particular probably hasn't arisen in
sl@0	5390	** this case. Probably a brittle assumption.
sl@0	5391	*/
sl@0	5392	static int leavesReaderReset(LeavesReader *pReader){
sl@0	5393	return sqlite3_reset(pReader->pStmt);
sl@0	5394	}
sl@0	5395
sl@0	5396	static void leavesReaderDestroy(LeavesReader *pReader){
sl@0	5397	/* If idx is -1, that means we're using a non-cached statement
sl@0	5398	** handle in the optimize() case, so we need to release it.
sl@0	5399	*/
sl@0	5400	if( pReader->pStmt!=NULL && pReader->idx==-1 ){
sl@0	5401	sqlite3_finalize(pReader->pStmt);
sl@0	5402	}
sl@0	5403	leafReaderDestroy(&pReader->leafReader);
sl@0	5404	dataBufferDestroy(&pReader->rootData);
sl@0	5405	SCRAMBLE(pReader);
sl@0	5406	}
sl@0	5407
sl@0	5408	/* Initialize pReader with the given root data (if iStartBlockid==0
sl@0	5409	** the leaf data was entirely contained in the root), or from the
sl@0	5410	** stream of blocks between iStartBlockid and iEndBlockid, inclusive.
sl@0	5411	*/
sl@0	5412	static int leavesReaderInit(fulltext_vtab *v,
sl@0	5413	int idx,
sl@0	5414	sqlite_int64 iStartBlockid,
sl@0	5415	sqlite_int64 iEndBlockid,
sl@0	5416	const char *pRootData, int nRootData,
sl@0	5417	LeavesReader *pReader){
sl@0	5418	CLEAR(pReader);
sl@0	5419	pReader->idx = idx;
sl@0	5420
sl@0	5421	dataBufferInit(&pReader->rootData, 0);
sl@0	5422	if( iStartBlockid==0 ){
sl@0	5423	/* Entire leaf level fit in root data. */
sl@0	5424	dataBufferReplace(&pReader->rootData, pRootData, nRootData);
sl@0	5425	leafReaderInit(pReader->rootData.pData, pReader->rootData.nData,
sl@0	5426	&pReader->leafReader);
sl@0	5427	}else{
sl@0	5428	sqlite3_stmt *s;
sl@0	5429	int rc = sql_get_leaf_statement(v, idx, &s);
sl@0	5430	if( rc!=SQLITE_OK ) return rc;
sl@0	5431
sl@0	5432	rc = sqlite3_bind_int64(s, 1, iStartBlockid);
sl@0	5433	if( rc!=SQLITE_OK ) return rc;
sl@0	5434
sl@0	5435	rc = sqlite3_bind_int64(s, 2, iEndBlockid);
sl@0	5436	if( rc!=SQLITE_OK ) return rc;
sl@0	5437
sl@0	5438	rc = sqlite3_step(s);
sl@0	5439	if( rc==SQLITE_DONE ){
sl@0	5440	pReader->eof = 1;
sl@0	5441	return SQLITE_OK;
sl@0	5442	}
sl@0	5443	if( rc!=SQLITE_ROW ) return rc;
sl@0	5444
sl@0	5445	pReader->pStmt = s;
sl@0	5446	leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
sl@0	5447	sqlite3_column_bytes(pReader->pStmt, 0),
sl@0	5448	&pReader->leafReader);
sl@0	5449	}
sl@0	5450	return SQLITE_OK;
sl@0	5451	}
sl@0	5452
sl@0	5453	/* Step the current leaf forward to the next term. If we reach the
sl@0	5454	** end of the current leaf, step forward to the next leaf block.
sl@0	5455	*/
sl@0	5456	static int leavesReaderStep(fulltext_vtab v, LeavesReader pReader){
sl@0	5457	assert( !leavesReaderAtEnd(pReader) );
sl@0	5458	leafReaderStep(&pReader->leafReader);
sl@0	5459
sl@0	5460	if( leafReaderAtEnd(&pReader->leafReader) ){
sl@0	5461	int rc;
sl@0	5462	if( pReader->rootData.pData ){
sl@0	5463	pReader->eof = 1;
sl@0	5464	return SQLITE_OK;
sl@0	5465	}
sl@0	5466	rc = sqlite3_step(pReader->pStmt);
sl@0	5467	if( rc!=SQLITE_ROW ){
sl@0	5468	pReader->eof = 1;
sl@0	5469	return rc==SQLITE_DONE ? SQLITE_OK : rc;
sl@0	5470	}
sl@0	5471	leafReaderDestroy(&pReader->leafReader);
sl@0	5472	leafReaderInit(sqlite3_column_blob(pReader->pStmt, 0),
sl@0	5473	sqlite3_column_bytes(pReader->pStmt, 0),
sl@0	5474	&pReader->leafReader);
sl@0	5475	}
sl@0	5476	return SQLITE_OK;
sl@0	5477	}
sl@0	5478
sl@0	5479	/* Order LeavesReaders by their term, ignoring idx. Readers at eof
sl@0	5480	** always sort to the end.
sl@0	5481	*/
sl@0	5482	static int leavesReaderTermCmp(LeavesReader lr1, LeavesReader lr2){
sl@0	5483	if( leavesReaderAtEnd(lr1) ){
sl@0	5484	if( leavesReaderAtEnd(lr2) ) return 0;
sl@0	5485	return 1;
sl@0	5486	}
sl@0	5487	if( leavesReaderAtEnd(lr2) ) return -1;
sl@0	5488
sl@0	5489	return leafReaderTermCmp(&lr1->leafReader,
sl@0	5490	leavesReaderTerm(lr2), leavesReaderTermBytes(lr2),
sl@0	5491	0);
sl@0	5492	}
sl@0	5493
sl@0	5494	/* Similar to leavesReaderTermCmp(), with additional ordering by idx
sl@0	5495	** so that older segments sort before newer segments.
sl@0	5496	*/
sl@0	5497	static int leavesReaderCmp(LeavesReader lr1, LeavesReader lr2){
sl@0	5498	int c = leavesReaderTermCmp(lr1, lr2);
sl@0	5499	if( c!=0 ) return c;
sl@0	5500	return lr1->idx-lr2->idx;
sl@0	5501	}
sl@0	5502
sl@0	5503	/* Assume that pLr[1]..pLr[nLr] are sorted. Bubble pLr[0] into its
sl@0	5504	** sorted position.
sl@0	5505	*/
sl@0	5506	static void leavesReaderReorder(LeavesReader *pLr, int nLr){
sl@0	5507	while( nLr>1 && leavesReaderCmp(pLr, pLr+1)>0 ){
sl@0	5508	LeavesReader tmp = pLr[0];
sl@0	5509	pLr[0] = pLr[1];
sl@0	5510	pLr[1] = tmp;
sl@0	5511	nLr--;
sl@0	5512	pLr++;
sl@0	5513	}
sl@0	5514	}
sl@0	5515
sl@0	5516	/* Initializes pReaders with the segments from level iLevel, returning
sl@0	5517	** the number of segments in *piReaders. Leaves pReaders in sorted
sl@0	5518	** order.
sl@0	5519	*/
sl@0	5520	static int leavesReadersInit(fulltext_vtab *v, int iLevel,
sl@0	5521	LeavesReader pReaders, int piReaders){
sl@0	5522	sqlite3_stmt *s;
sl@0	5523	int i, rc = sql_get_statement(v, SEGDIR_SELECT_LEVEL_STMT, &s);
sl@0	5524	if( rc!=SQLITE_OK ) return rc;
sl@0	5525
sl@0	5526	rc = sqlite3_bind_int(s, 1, iLevel);
sl@0	5527	if( rc!=SQLITE_OK ) return rc;
sl@0	5528
sl@0	5529	i = 0;
sl@0	5530	while( (rc = sqlite3_step(s))==SQLITE_ROW ){
sl@0	5531	sqlite_int64 iStart = sqlite3_column_int64(s, 0);
sl@0	5532	sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
sl@0	5533	const char *pRootData = sqlite3_column_blob(s, 2);
sl@0	5534	int nRootData = sqlite3_column_bytes(s, 2);
sl@0	5535
sl@0	5536	assert( i<MERGE_COUNT );
sl@0	5537	rc = leavesReaderInit(v, i, iStart, iEnd, pRootData, nRootData,
sl@0	5538	&pReaders[i]);
sl@0	5539	if( rc!=SQLITE_OK ) break;
sl@0	5540
sl@0	5541	i++;
sl@0	5542	}
sl@0	5543	if( rc!=SQLITE_DONE ){
sl@0	5544	while( i-->0 ){
sl@0	5545	leavesReaderDestroy(&pReaders[i]);
sl@0	5546	}
sl@0	5547	return rc;
sl@0	5548	}
sl@0	5549
sl@0	5550	*piReaders = i;
sl@0	5551
sl@0	5552	/* Leave our results sorted by term, then age. */
sl@0	5553	while( i-- ){
sl@0	5554	leavesReaderReorder(pReaders+i, *piReaders-i);
sl@0	5555	}
sl@0	5556	return SQLITE_OK;
sl@0	5557	}
sl@0	5558
sl@0	5559	/* Merge doclists from pReaders[nReaders] into a single doclist, which
sl@0	5560	** is written to pWriter. Assumes pReaders is ordered oldest to
sl@0	5561	** newest.
sl@0	5562	*/
sl@0	5563	/* TODO(shess) Consider putting this inline in segmentMerge(). */
sl@0	5564	static int leavesReadersMerge(fulltext_vtab *v,
sl@0	5565	LeavesReader *pReaders, int nReaders,
sl@0	5566	LeafWriter *pWriter){
sl@0	5567	DLReader dlReaders[MERGE_COUNT];
sl@0	5568	const char *pTerm = leavesReaderTerm(pReaders);
sl@0	5569	int i, nTerm = leavesReaderTermBytes(pReaders);
sl@0	5570
sl@0	5571	assert( nReaders<=MERGE_COUNT );
sl@0	5572
sl@0	5573	for(i=0; i<nReaders; i++){
sl@0	5574	dlrInit(&dlReaders[i], DL_DEFAULT,
sl@0	5575	leavesReaderData(pReaders+i),
sl@0	5576	leavesReaderDataBytes(pReaders+i));
sl@0	5577	}
sl@0	5578
sl@0	5579	return leafWriterStepMerge(v, pWriter, pTerm, nTerm, dlReaders, nReaders);
sl@0	5580	}
sl@0	5581
sl@0	5582	/* Forward ref due to mutual recursion with segdirNextIndex(). */
sl@0	5583	static int segmentMerge(fulltext_vtab *v, int iLevel);
sl@0	5584
sl@0	5585	/* Put the next available index at iLevel into *pidx. If iLevel
sl@0	5586	** already has MERGE_COUNT segments, they are merged to a higher
sl@0	5587	** level to make room.
sl@0	5588	*/
sl@0	5589	static int segdirNextIndex(fulltext_vtab v, int iLevel, int pidx){
sl@0	5590	int rc = segdir_max_index(v, iLevel, pidx);
sl@0	5591	if( rc==SQLITE_DONE ){ /* No segments at iLevel. */
sl@0	5592	*pidx = 0;
sl@0	5593	}else if( rc==SQLITE_ROW ){
sl@0	5594	if( *pidx==(MERGE_COUNT-1) ){
sl@0	5595	rc = segmentMerge(v, iLevel);
sl@0	5596	if( rc!=SQLITE_OK ) return rc;
sl@0	5597	*pidx = 0;
sl@0	5598	}else{
sl@0	5599	(*pidx)++;
sl@0	5600	}
sl@0	5601	}else{
sl@0	5602	return rc;
sl@0	5603	}
sl@0	5604	return SQLITE_OK;
sl@0	5605	}
sl@0	5606
sl@0	5607	/* Merge MERGE_COUNT segments at iLevel into a new segment at
sl@0	5608	** iLevel+1. If iLevel+1 is already full of segments, those will be
sl@0	5609	** merged to make room.
sl@0	5610	*/
sl@0	5611	static int segmentMerge(fulltext_vtab *v, int iLevel){
sl@0	5612	LeafWriter writer;
sl@0	5613	LeavesReader lrs[MERGE_COUNT];
sl@0	5614	int i, rc, idx = 0;
sl@0	5615
sl@0	5616	/* Determine the next available segment index at the next level,
sl@0	5617	** merging as necessary.
sl@0	5618	*/
sl@0	5619	rc = segdirNextIndex(v, iLevel+1, &idx);
sl@0	5620	if( rc!=SQLITE_OK ) return rc;
sl@0	5621
sl@0	5622	/* TODO(shess) This assumes that we'll always see exactly
sl@0	5623	** MERGE_COUNT segments to merge at a given level. That will be
sl@0	5624	** broken if we allow the developer to request preemptive or
sl@0	5625	** deferred merging.
sl@0	5626	*/
sl@0	5627	memset(&lrs, '\0', sizeof(lrs));
sl@0	5628	rc = leavesReadersInit(v, iLevel, lrs, &i);
sl@0	5629	if( rc!=SQLITE_OK ) return rc;
sl@0	5630	assert( i==MERGE_COUNT );
sl@0	5631
sl@0	5632	leafWriterInit(iLevel+1, idx, &writer);
sl@0	5633
sl@0	5634	/* Since leavesReaderReorder() pushes readers at eof to the end,
sl@0	5635	** when the first reader is empty, all will be empty.
sl@0	5636	*/
sl@0	5637	while( !leavesReaderAtEnd(lrs) ){
sl@0	5638	/* Figure out how many readers share their next term. */
sl@0	5639	for(i=1; i<MERGE_COUNT && !leavesReaderAtEnd(lrs+i); i++){
sl@0	5640	if( 0!=leavesReaderTermCmp(lrs, lrs+i) ) break;
sl@0	5641	}
sl@0	5642
sl@0	5643	rc = leavesReadersMerge(v, lrs, i, &writer);
sl@0	5644	if( rc!=SQLITE_OK ) goto err;
sl@0	5645
sl@0	5646	/* Step forward those that were merged. */
sl@0	5647	while( i-->0 ){
sl@0	5648	rc = leavesReaderStep(v, lrs+i);
sl@0	5649	if( rc!=SQLITE_OK ) goto err;
sl@0	5650
sl@0	5651	/* Reorder by term, then by age. */
sl@0	5652	leavesReaderReorder(lrs+i, MERGE_COUNT-i);
sl@0	5653	}
sl@0	5654	}
sl@0	5655
sl@0	5656	for(i=0; i<MERGE_COUNT; i++){
sl@0	5657	leavesReaderDestroy(&lrs[i]);
sl@0	5658	}
sl@0	5659
sl@0	5660	rc = leafWriterFinalize(v, &writer);
sl@0	5661	leafWriterDestroy(&writer);
sl@0	5662	if( rc!=SQLITE_OK ) return rc;
sl@0	5663
sl@0	5664	/* Delete the merged segment data. */
sl@0	5665	return segdir_delete(v, iLevel);
sl@0	5666
sl@0	5667	err:
sl@0	5668	for(i=0; i<MERGE_COUNT; i++){
sl@0	5669	leavesReaderDestroy(&lrs[i]);
sl@0	5670	}
sl@0	5671	leafWriterDestroy(&writer);
sl@0	5672	return rc;
sl@0	5673	}
sl@0	5674
sl@0	5675	/* Accumulate the union of acc and pData into acc. /
sl@0	5676	static void docListAccumulateUnion(DataBuffer *acc,
sl@0	5677	const char *pData, int nData) {
sl@0	5678	DataBuffer tmp = *acc;
sl@0	5679	dataBufferInit(acc, tmp.nData+nData);
sl@0	5680	docListUnion(tmp.pData, tmp.nData, pData, nData, acc);
sl@0	5681	dataBufferDestroy(&tmp);
sl@0	5682	}
sl@0	5683
sl@0	5684	/* TODO(shess) It might be interesting to explore different merge
sl@0	5685	** strategies, here. For instance, since this is a sorted merge, we
sl@0	5686	** could easily merge many doclists in parallel. With some
sl@0	5687	** comprehension of the storage format, we could merge all of the
sl@0	5688	** doclists within a leaf node directly from the leaf node's storage.
sl@0	5689	** It may be worthwhile to merge smaller doclists before larger
sl@0	5690	** doclists, since they can be traversed more quickly - but the
sl@0	5691	** results may have less overlap, making them more expensive in a
sl@0	5692	** different way.
sl@0	5693	*/
sl@0	5694
sl@0	5695	/* Scan pReader for pTerm/nTerm, and merge the term's doclist over
sl@0	5696	** out (any doclists with duplicate docids overwrite those in out).
sl@0	5697	** Internal function for loadSegmentLeaf().
sl@0	5698	*/
sl@0	5699	static int loadSegmentLeavesInt(fulltext_vtab v, LeavesReader pReader,
sl@0	5700	const char *pTerm, int nTerm, int isPrefix,
sl@0	5701	DataBuffer *out){
sl@0	5702	/* doclist data is accumulated into pBuffers similar to how one does
sl@0	5703	** increment in binary arithmetic. If index 0 is empty, the data is
sl@0	5704	** stored there. If there is data there, it is merged and the
sl@0	5705	** results carried into position 1, with further merge-and-carry
sl@0	5706	** until an empty position is found.
sl@0	5707	*/
sl@0	5708	DataBuffer *pBuffers = NULL;
sl@0	5709	int nBuffers = 0, nMaxBuffers = 0, rc;
sl@0	5710
sl@0	5711	assert( nTerm>0 );
sl@0	5712
sl@0	5713	for(rc=SQLITE_OK; rc==SQLITE_OK && !leavesReaderAtEnd(pReader);
sl@0	5714	rc=leavesReaderStep(v, pReader)){
sl@0	5715	/* TODO(shess) Really want leavesReaderTermCmp(), but that name is
sl@0	5716	** already taken to compare the terms of two LeavesReaders. Think
sl@0	5717	** on a better name. [Meanwhile, break encapsulation rather than
sl@0	5718	** use a confusing name.]
sl@0	5719	*/
sl@0	5720	int c = leafReaderTermCmp(&pReader->leafReader, pTerm, nTerm, isPrefix);
sl@0	5721	if( c>0 ) break; /* Past any possible matches. */
sl@0	5722	if( c==0 ){
sl@0	5723	const char *pData = leavesReaderData(pReader);
sl@0	5724	int iBuffer, nData = leavesReaderDataBytes(pReader);
sl@0	5725
sl@0	5726	/* Find the first empty buffer. */
sl@0	5727	for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
sl@0	5728	if( 0==pBuffers[iBuffer].nData ) break;
sl@0	5729	}
sl@0	5730
sl@0	5731	/* Out of buffers, add an empty one. */
sl@0	5732	if( iBuffer==nBuffers ){
sl@0	5733	if( nBuffers==nMaxBuffers ){
sl@0	5734	DataBuffer *p;
sl@0	5735	nMaxBuffers += 20;
sl@0	5736
sl@0	5737	/* Manual realloc so we can handle NULL appropriately. */
sl@0	5738	p = sqlite3_malloc(nMaxBufferssizeof(pBuffers));
sl@0	5739	if( p==NULL ){
sl@0	5740	rc = SQLITE_NOMEM;
sl@0	5741	break;
sl@0	5742	}
sl@0	5743
sl@0	5744	if( nBuffers>0 ){
sl@0	5745	assert(pBuffers!=NULL);
sl@0	5746	memcpy(p, pBuffers, nBufferssizeof(pBuffers));
sl@0	5747	sqlite3_free(pBuffers);
sl@0	5748	}
sl@0	5749	pBuffers = p;
sl@0	5750	}
sl@0	5751	dataBufferInit(&(pBuffers[nBuffers]), 0);
sl@0	5752	nBuffers++;
sl@0	5753	}
sl@0	5754
sl@0	5755	/* At this point, must have an empty at iBuffer. */
sl@0	5756	assert(iBuffer<nBuffers && pBuffers[iBuffer].nData==0);
sl@0	5757
sl@0	5758	/* If empty was first buffer, no need for merge logic. */
sl@0	5759	if( iBuffer==0 ){
sl@0	5760	dataBufferReplace(&(pBuffers[0]), pData, nData);
sl@0	5761	}else{
sl@0	5762	/* pAcc is the empty buffer the merged data will end up in. */
sl@0	5763	DataBuffer *pAcc = &(pBuffers[iBuffer]);
sl@0	5764	DataBuffer *p = &(pBuffers[0]);
sl@0	5765
sl@0	5766	/* Handle position 0 specially to avoid need to prime pAcc
sl@0	5767	** with pData/nData.
sl@0	5768	*/
sl@0	5769	dataBufferSwap(p, pAcc);
sl@0	5770	docListAccumulateUnion(pAcc, pData, nData);
sl@0	5771
sl@0	5772	/* Accumulate remaining doclists into pAcc. */
sl@0	5773	for(++p; p<pAcc; ++p){
sl@0	5774	docListAccumulateUnion(pAcc, p->pData, p->nData);
sl@0	5775
sl@0	5776	/* dataBufferReset() could allow a large doclist to blow up
sl@0	5777	** our memory requirements.
sl@0	5778	*/
sl@0	5779	if( p->nCapacity<1024 ){
sl@0	5780	dataBufferReset(p);
sl@0	5781	}else{
sl@0	5782	dataBufferDestroy(p);
sl@0	5783	dataBufferInit(p, 0);
sl@0	5784	}
sl@0	5785	}
sl@0	5786	}
sl@0	5787	}
sl@0	5788	}
sl@0	5789
sl@0	5790	/* Union all the doclists together into out. /
sl@0	5791	/* TODO(shess) What if out is big? Sigh. /
sl@0	5792	if( rc==SQLITE_OK && nBuffers>0 ){
sl@0	5793	int iBuffer;
sl@0	5794	for(iBuffer=0; iBuffer<nBuffers; ++iBuffer){
sl@0	5795	if( pBuffers[iBuffer].nData>0 ){
sl@0	5796	if( out->nData==0 ){
sl@0	5797	dataBufferSwap(out, &(pBuffers[iBuffer]));
sl@0	5798	}else{
sl@0	5799	docListAccumulateUnion(out, pBuffers[iBuffer].pData,
sl@0	5800	pBuffers[iBuffer].nData);
sl@0	5801	}
sl@0	5802	}
sl@0	5803	}
sl@0	5804	}
sl@0	5805
sl@0	5806	while( nBuffers-- ){
sl@0	5807	dataBufferDestroy(&(pBuffers[nBuffers]));
sl@0	5808	}
sl@0	5809	if( pBuffers!=NULL ) sqlite3_free(pBuffers);
sl@0	5810
sl@0	5811	return rc;
sl@0	5812	}
sl@0	5813
sl@0	5814	/* Call loadSegmentLeavesInt() with pData/nData as input. */
sl@0	5815	static int loadSegmentLeaf(fulltext_vtab v, const char pData, int nData,
sl@0	5816	const char *pTerm, int nTerm, int isPrefix,
sl@0	5817	DataBuffer *out){
sl@0	5818	LeavesReader reader;
sl@0	5819	int rc;
sl@0	5820
sl@0	5821	assert( nData>1 );
sl@0	5822	assert( *pData=='\0' );
sl@0	5823	rc = leavesReaderInit(v, 0, 0, 0, pData, nData, &reader);
sl@0	5824	if( rc!=SQLITE_OK ) return rc;
sl@0	5825
sl@0	5826	rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
sl@0	5827	leavesReaderReset(&reader);
sl@0	5828	leavesReaderDestroy(&reader);
sl@0	5829	return rc;
sl@0	5830	}
sl@0	5831
sl@0	5832	/* Call loadSegmentLeavesInt() with the leaf nodes from iStartLeaf to
sl@0	5833	** iEndLeaf (inclusive) as input, and merge the resulting doclist into
sl@0	5834	** out.
sl@0	5835	*/
sl@0	5836	static int loadSegmentLeaves(fulltext_vtab *v,
sl@0	5837	sqlite_int64 iStartLeaf, sqlite_int64 iEndLeaf,
sl@0	5838	const char *pTerm, int nTerm, int isPrefix,
sl@0	5839	DataBuffer *out){
sl@0	5840	int rc;
sl@0	5841	LeavesReader reader;
sl@0	5842
sl@0	5843	assert( iStartLeaf<=iEndLeaf );
sl@0	5844	rc = leavesReaderInit(v, 0, iStartLeaf, iEndLeaf, NULL, 0, &reader);
sl@0	5845	if( rc!=SQLITE_OK ) return rc;
sl@0	5846
sl@0	5847	rc = loadSegmentLeavesInt(v, &reader, pTerm, nTerm, isPrefix, out);
sl@0	5848	leavesReaderReset(&reader);
sl@0	5849	leavesReaderDestroy(&reader);
sl@0	5850	return rc;
sl@0	5851	}
sl@0	5852
sl@0	5853	/* Taking pData/nData as an interior node, find the sequence of child
sl@0	5854	** nodes which could include pTerm/nTerm/isPrefix. Note that the
sl@0	5855	** interior node terms logically come between the blocks, so there is
sl@0	5856	** one more blockid than there are terms (that block contains terms >=
sl@0	5857	** the last interior-node term).
sl@0	5858	*/
sl@0	5859	/* TODO(shess) The calling code may already know that the end child is
sl@0	5860	** not worth calculating, because the end may be in a later sibling
sl@0	5861	** node. Consider whether breaking symmetry is worthwhile. I suspect
sl@0	5862	** it is not worthwhile.
sl@0	5863	*/
sl@0	5864	static void getChildrenContaining(const char *pData, int nData,
sl@0	5865	const char *pTerm, int nTerm, int isPrefix,
sl@0	5866	sqlite_int64 *piStartChild,
sl@0	5867	sqlite_int64 *piEndChild){
sl@0	5868	InteriorReader reader;
sl@0	5869
sl@0	5870	assert( nData>1 );
sl@0	5871	assert( *pData!='\0' );
sl@0	5872	interiorReaderInit(pData, nData, &reader);
sl@0	5873
sl@0	5874	/* Scan for the first child which could contain pTerm/nTerm. */
sl@0	5875	while( !interiorReaderAtEnd(&reader) ){
sl@0	5876	if( interiorReaderTermCmp(&reader, pTerm, nTerm, 0)>0 ) break;
sl@0	5877	interiorReaderStep(&reader);
sl@0	5878	}
sl@0	5879	*piStartChild = interiorReaderCurrentBlockid(&reader);
sl@0	5880
sl@0	5881	/* Keep scanning to find a term greater than our term, using prefix
sl@0	5882	** comparison if indicated. If isPrefix is false, this will be the
sl@0	5883	** same blockid as the starting block.
sl@0	5884	*/
sl@0	5885	while( !interiorReaderAtEnd(&reader) ){
sl@0	5886	if( interiorReaderTermCmp(&reader, pTerm, nTerm, isPrefix)>0 ) break;
sl@0	5887	interiorReaderStep(&reader);
sl@0	5888	}
sl@0	5889	*piEndChild = interiorReaderCurrentBlockid(&reader);
sl@0	5890
sl@0	5891	interiorReaderDestroy(&reader);
sl@0	5892
sl@0	5893	/* Children must ascend, and if !prefix, both must be the same. */
sl@0	5894	assert( piEndChild>=piStartChild );
sl@0	5895	assert( isPrefix \|\| piStartChild==piEndChild );
sl@0	5896	}
sl@0	5897
sl@0	5898	/* Read block at iBlockid and pass it with other params to
sl@0	5899	** getChildrenContaining().
sl@0	5900	*/
sl@0	5901	static int loadAndGetChildrenContaining(
sl@0	5902	fulltext_vtab *v,
sl@0	5903	sqlite_int64 iBlockid,
sl@0	5904	const char *pTerm, int nTerm, int isPrefix,
sl@0	5905	sqlite_int64 piStartChild, sqlite_int64 piEndChild
sl@0	5906	){
sl@0	5907	sqlite3_stmt *s = NULL;
sl@0	5908	int rc;
sl@0	5909
sl@0	5910	assert( iBlockid!=0 );
sl@0	5911	assert( pTerm!=NULL );
sl@0	5912	assert( nTerm!=0 ); /* TODO(shess) Why not allow this? */
sl@0	5913	assert( piStartChild!=NULL );
sl@0	5914	assert( piEndChild!=NULL );
sl@0	5915
sl@0	5916	rc = sql_get_statement(v, BLOCK_SELECT_STMT, &s);
sl@0	5917	if( rc!=SQLITE_OK ) return rc;
sl@0	5918
sl@0	5919	rc = sqlite3_bind_int64(s, 1, iBlockid);
sl@0	5920	if( rc!=SQLITE_OK ) return rc;
sl@0	5921
sl@0	5922	rc = sqlite3_step(s);
sl@0	5923	if( rc==SQLITE_DONE ) return SQLITE_ERROR;
sl@0	5924	if( rc!=SQLITE_ROW ) return rc;
sl@0	5925
sl@0	5926	getChildrenContaining(sqlite3_column_blob(s, 0), sqlite3_column_bytes(s, 0),
sl@0	5927	pTerm, nTerm, isPrefix, piStartChild, piEndChild);
sl@0	5928
sl@0	5929	/* We expect only one row. We must execute another sqlite3_step()
sl@0	5930	* to complete the iteration; otherwise the table will remain
sl@0	5931	* locked. */
sl@0	5932	rc = sqlite3_step(s);
sl@0	5933	if( rc==SQLITE_ROW ) return SQLITE_ERROR;
sl@0	5934	if( rc!=SQLITE_DONE ) return rc;
sl@0	5935
sl@0	5936	return SQLITE_OK;
sl@0	5937	}
sl@0	5938
sl@0	5939	/* Traverse the tree represented by pData[nData] looking for
sl@0	5940	** pTerm[nTerm], placing its doclist into *out. This is internal to
sl@0	5941	** loadSegment() to make error-handling cleaner.
sl@0	5942	*/
sl@0	5943	static int loadSegmentInt(fulltext_vtab v, const char pData, int nData,
sl@0	5944	sqlite_int64 iLeavesEnd,
sl@0	5945	const char *pTerm, int nTerm, int isPrefix,
sl@0	5946	DataBuffer *out){
sl@0	5947	/* Special case where root is a leaf. */
sl@0	5948	if( *pData=='\0' ){
sl@0	5949	return loadSegmentLeaf(v, pData, nData, pTerm, nTerm, isPrefix, out);
sl@0	5950	}else{
sl@0	5951	int rc;
sl@0	5952	sqlite_int64 iStartChild, iEndChild;
sl@0	5953
sl@0	5954	/* Process pData as an interior node, then loop down the tree
sl@0	5955	** until we find the set of leaf nodes to scan for the term.
sl@0	5956	*/
sl@0	5957	getChildrenContaining(pData, nData, pTerm, nTerm, isPrefix,
sl@0	5958	&iStartChild, &iEndChild);
sl@0	5959	while( iStartChild>iLeavesEnd ){
sl@0	5960	sqlite_int64 iNextStart, iNextEnd;
sl@0	5961	rc = loadAndGetChildrenContaining(v, iStartChild, pTerm, nTerm, isPrefix,
sl@0	5962	&iNextStart, &iNextEnd);
sl@0	5963	if( rc!=SQLITE_OK ) return rc;
sl@0	5964
sl@0	5965	/* If we've branched, follow the end branch, too. */
sl@0	5966	if( iStartChild!=iEndChild ){
sl@0	5967	sqlite_int64 iDummy;
sl@0	5968	rc = loadAndGetChildrenContaining(v, iEndChild, pTerm, nTerm, isPrefix,
sl@0	5969	&iDummy, &iNextEnd);
sl@0	5970	if( rc!=SQLITE_OK ) return rc;
sl@0	5971	}
sl@0	5972
sl@0	5973	assert( iNextStart<=iNextEnd );
sl@0	5974	iStartChild = iNextStart;
sl@0	5975	iEndChild = iNextEnd;
sl@0	5976	}
sl@0	5977	assert( iStartChild<=iLeavesEnd );
sl@0	5978	assert( iEndChild<=iLeavesEnd );
sl@0	5979
sl@0	5980	/* Scan through the leaf segments for doclists. */
sl@0	5981	return loadSegmentLeaves(v, iStartChild, iEndChild,
sl@0	5982	pTerm, nTerm, isPrefix, out);
sl@0	5983	}
sl@0	5984	}
sl@0	5985
sl@0	5986	/* Call loadSegmentInt() to collect the doclist for pTerm/nTerm, then
sl@0	5987	** merge its doclist over *out (any duplicate doclists read from the
sl@0	5988	** segment rooted at pData will overwrite those in *out).
sl@0	5989	*/
sl@0	5990	/* TODO(shess) Consider changing this to determine the depth of the
sl@0	5991	** leaves using either the first characters of interior nodes (when
sl@0	5992	** ==1, we're one level above the leaves), or the first character of
sl@0	5993	** the root (which will describe the height of the tree directly).
sl@0	5994	** Either feels somewhat tricky to me.
sl@0	5995	*/
sl@0	5996	/* TODO(shess) The current merge is likely to be slow for large
sl@0	5997	** doclists (though it should process from newest/smallest to
sl@0	5998	** oldest/largest, so it may not be that bad). It might be useful to
sl@0	5999	** modify things to allow for N-way merging. This could either be
sl@0	6000	** within a segment, with pairwise merges across segments, or across
sl@0	6001	** all segments at once.
sl@0	6002	*/
sl@0	6003	static int loadSegment(fulltext_vtab v, const char pData, int nData,
sl@0	6004	sqlite_int64 iLeavesEnd,
sl@0	6005	const char *pTerm, int nTerm, int isPrefix,
sl@0	6006	DataBuffer *out){
sl@0	6007	DataBuffer result;
sl@0	6008	int rc;
sl@0	6009
sl@0	6010	assert( nData>1 );
sl@0	6011
sl@0	6012	/* This code should never be called with buffered updates. */
sl@0	6013	assert( v->nPendingData<0 );
sl@0	6014
sl@0	6015	dataBufferInit(&result, 0);
sl@0	6016	rc = loadSegmentInt(v, pData, nData, iLeavesEnd,
sl@0	6017	pTerm, nTerm, isPrefix, &result);
sl@0	6018	if( rc==SQLITE_OK && result.nData>0 ){
sl@0	6019	if( out->nData==0 ){
sl@0	6020	DataBuffer tmp = *out;
sl@0	6021	*out = result;
sl@0	6022	result = tmp;
sl@0	6023	}else{
sl@0	6024	DataBuffer merged;
sl@0	6025	DLReader readers[2];
sl@0	6026
sl@0	6027	dlrInit(&readers[0], DL_DEFAULT, out->pData, out->nData);
sl@0	6028	dlrInit(&readers[1], DL_DEFAULT, result.pData, result.nData);
sl@0	6029	dataBufferInit(&merged, out->nData+result.nData);
sl@0	6030	docListMerge(&merged, readers, 2);
sl@0	6031	dataBufferDestroy(out);
sl@0	6032	*out = merged;
sl@0	6033	dlrDestroy(&readers[0]);
sl@0	6034	dlrDestroy(&readers[1]);
sl@0	6035	}
sl@0	6036	}
sl@0	6037	dataBufferDestroy(&result);
sl@0	6038	return rc;
sl@0	6039	}
sl@0	6040
sl@0	6041	/* Scan the database and merge together the posting lists for the term
sl@0	6042	** into *out.
sl@0	6043	*/
sl@0	6044	static int termSelect(fulltext_vtab *v, int iColumn,
sl@0	6045	const char *pTerm, int nTerm, int isPrefix,
sl@0	6046	DocListType iType, DataBuffer *out){
sl@0	6047	DataBuffer doclist;
sl@0	6048	sqlite3_stmt *s;
sl@0	6049	int rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
sl@0	6050	if( rc!=SQLITE_OK ) return rc;
sl@0	6051
sl@0	6052	/* This code should never be called with buffered updates. */
sl@0	6053	assert( v->nPendingData<0 );
sl@0	6054
sl@0	6055	dataBufferInit(&doclist, 0);
sl@0	6056
sl@0	6057	/* Traverse the segments from oldest to newest so that newer doclist
sl@0	6058	** elements for given docids overwrite older elements.
sl@0	6059	*/
sl@0	6060	while( (rc = sqlite3_step(s))==SQLITE_ROW ){
sl@0	6061	const char *pData = sqlite3_column_blob(s, 2);
sl@0	6062	const int nData = sqlite3_column_bytes(s, 2);
sl@0	6063	const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
sl@0	6064	rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, isPrefix,
sl@0	6065	&doclist);
sl@0	6066	if( rc!=SQLITE_OK ) goto err;
sl@0	6067	}
sl@0	6068	if( rc==SQLITE_DONE ){
sl@0	6069	if( doclist.nData!=0 ){
sl@0	6070	/* TODO(shess) The old term_select_all() code applied the column
sl@0	6071	** restrict as we merged segments, leading to smaller buffers.
sl@0	6072	** This is probably worthwhile to bring back, once the new storage
sl@0	6073	** system is checked in.
sl@0	6074	*/
sl@0	6075	if( iColumn==v->nColumn) iColumn = -1;
sl@0	6076	docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
sl@0	6077	iColumn, iType, out);
sl@0	6078	}
sl@0	6079	rc = SQLITE_OK;
sl@0	6080	}
sl@0	6081
sl@0	6082	err:
sl@0	6083	dataBufferDestroy(&doclist);
sl@0	6084	return rc;
sl@0	6085	}
sl@0	6086
sl@0	6087	/****************************************************************/
sl@0	6088	/* Used to hold hashtable data for sorting. */
sl@0	6089	typedef struct TermData {
sl@0	6090	const char *pTerm;
sl@0	6091	int nTerm;
sl@0	6092	DLCollector *pCollector;
sl@0	6093	} TermData;
sl@0	6094
sl@0	6095	/* Orders TermData elements in strcmp fashion ( <0 for less-than, 0
sl@0	6096	** for equal, >0 for greater-than).
sl@0	6097	*/
sl@0	6098	static int termDataCmp(const void av, const void bv){
sl@0	6099	const TermData a = (const TermData )av;
sl@0	6100	const TermData b = (const TermData )bv;
sl@0	6101	int n = a->nTerm<b->nTerm ? a->nTerm : b->nTerm;
sl@0	6102	int c = memcmp(a->pTerm, b->pTerm, n);
sl@0	6103	if( c!=0 ) return c;
sl@0	6104	return a->nTerm-b->nTerm;
sl@0	6105	}
sl@0	6106
sl@0	6107	/* Order pTerms data by term, then write a new level 0 segment using
sl@0	6108	** LeafWriter.
sl@0	6109	*/
sl@0	6110	static int writeZeroSegment(fulltext_vtab v, fts3Hash pTerms){
sl@0	6111	fts3HashElem *e;
sl@0	6112	int idx, rc, i, n;
sl@0	6113	TermData *pData;
sl@0	6114	LeafWriter writer;
sl@0	6115	DataBuffer dl;
sl@0	6116
sl@0	6117	/* Determine the next index at level 0, merging as necessary. */
sl@0	6118	rc = segdirNextIndex(v, 0, &idx);
sl@0	6119	if( rc!=SQLITE_OK ) return rc;
sl@0	6120
sl@0	6121	n = fts3HashCount(pTerms);
sl@0	6122	pData = sqlite3_malloc(n*sizeof(TermData));
sl@0	6123
sl@0	6124	for(i = 0, e = fts3HashFirst(pTerms); e; i++, e = fts3HashNext(e)){
sl@0	6125	assert( i<n );
sl@0	6126	pData[i].pTerm = fts3HashKey(e);
sl@0	6127	pData[i].nTerm = fts3HashKeysize(e);
sl@0	6128	pData[i].pCollector = fts3HashData(e);
sl@0	6129	}
sl@0	6130	assert( i==n );
sl@0	6131
sl@0	6132	/* TODO(shess) Should we allow user-defined collation sequences,
sl@0	6133	** here? I think we only need that once we support prefix searches.
sl@0	6134	*/
sl@0	6135	if( n>1 ) qsort(pData, n, sizeof(*pData), termDataCmp);
sl@0	6136
sl@0	6137	/* TODO(shess) Refactor so that we can write directly to the segment
sl@0	6138	** DataBuffer, as happens for segment merges.
sl@0	6139	*/
sl@0	6140	leafWriterInit(0, idx, &writer);
sl@0	6141	dataBufferInit(&dl, 0);
sl@0	6142	for(i=0; i<n; i++){
sl@0	6143	dataBufferReset(&dl);
sl@0	6144	dlcAddDoclist(pData[i].pCollector, &dl);
sl@0	6145	rc = leafWriterStep(v, &writer,
sl@0	6146	pData[i].pTerm, pData[i].nTerm, dl.pData, dl.nData);
sl@0	6147	if( rc!=SQLITE_OK ) goto err;
sl@0	6148	}
sl@0	6149	rc = leafWriterFinalize(v, &writer);
sl@0	6150
sl@0	6151	err:
sl@0	6152	dataBufferDestroy(&dl);
sl@0	6153	sqlite3_free(pData);
sl@0	6154	leafWriterDestroy(&writer);
sl@0	6155	return rc;
sl@0	6156	}
sl@0	6157
sl@0	6158	/* If pendingTerms has data, free it. */
sl@0	6159	static int clearPendingTerms(fulltext_vtab *v){
sl@0	6160	if( v->nPendingData>=0 ){
sl@0	6161	fts3HashElem *e;
sl@0	6162	for(e=fts3HashFirst(&v->pendingTerms); e; e=fts3HashNext(e)){
sl@0	6163	dlcDelete(fts3HashData(e));
sl@0	6164	}
sl@0	6165	fts3HashClear(&v->pendingTerms);
sl@0	6166	v->nPendingData = -1;
sl@0	6167	}
sl@0	6168	return SQLITE_OK;
sl@0	6169	}
sl@0	6170
sl@0	6171	/* If pendingTerms has data, flush it to a level-zero segment, and
sl@0	6172	** free it.
sl@0	6173	*/
sl@0	6174	static int flushPendingTerms(fulltext_vtab *v){
sl@0	6175	if( v->nPendingData>=0 ){
sl@0	6176	int rc = writeZeroSegment(v, &v->pendingTerms);
sl@0	6177	if( rc==SQLITE_OK ) clearPendingTerms(v);
sl@0	6178	return rc;
sl@0	6179	}
sl@0	6180	return SQLITE_OK;
sl@0	6181	}
sl@0	6182
sl@0	6183	/* If pendingTerms is "too big", or docid is out of order, flush it.
sl@0	6184	** Regardless, be certain that pendingTerms is initialized for use.
sl@0	6185	*/
sl@0	6186	static int initPendingTerms(fulltext_vtab *v, sqlite_int64 iDocid){
sl@0	6187	/* TODO(shess) Explore whether partially flushing the buffer on
sl@0	6188	** forced-flush would provide better performance. I suspect that if
sl@0	6189	** we ordered the doclists by size and flushed the largest until the
sl@0	6190	** buffer was half empty, that would let the less frequent terms
sl@0	6191	** generate longer doclists.
sl@0	6192	*/
sl@0	6193	if( iDocid<=v->iPrevDocid \|\| v->nPendingData>kPendingThreshold ){
sl@0	6194	int rc = flushPendingTerms(v);
sl@0	6195	if( rc!=SQLITE_OK ) return rc;
sl@0	6196	}
sl@0	6197	if( v->nPendingData<0 ){
sl@0	6198	fts3HashInit(&v->pendingTerms, FTS3_HASH_STRING, 1);
sl@0	6199	v->nPendingData = 0;
sl@0	6200	}
sl@0	6201	v->iPrevDocid = iDocid;
sl@0	6202	return SQLITE_OK;
sl@0	6203	}
sl@0	6204
sl@0	6205	/* This function implements the xUpdate callback; it is the top-level entry
sl@0	6206	* point for inserting, deleting or updating a row in a full-text table. */
sl@0	6207	static int fulltextUpdate(sqlite3_vtab pVtab, int nArg, sqlite3_value *ppArg,
sl@0	6208	sqlite_int64 *pRowid){
sl@0	6209	fulltext_vtab v = (fulltext_vtab ) pVtab;
sl@0	6210	int rc;
sl@0	6211
sl@0	6212	FTSTRACE(("FTS3 Update %p\n", pVtab));
sl@0	6213
sl@0	6214	if( nArg<2 ){
sl@0	6215	rc = index_delete(v, sqlite3_value_int64(ppArg[0]));
sl@0	6216	if( rc==SQLITE_OK ){
sl@0	6217	/* If we just deleted the last row in the table, clear out the
sl@0	6218	** index data.
sl@0	6219	*/
sl@0	6220	rc = content_exists(v);
sl@0	6221	if( rc==SQLITE_ROW ){
sl@0	6222	rc = SQLITE_OK;
sl@0	6223	}else if( rc==SQLITE_DONE ){
sl@0	6224	/* Clear the pending terms so we don't flush a useless level-0
sl@0	6225	** segment when the transaction closes.
sl@0	6226	*/
sl@0	6227	rc = clearPendingTerms(v);
sl@0	6228	if( rc==SQLITE_OK ){
sl@0	6229	rc = segdir_delete_all(v);
sl@0	6230	}
sl@0	6231	}
sl@0	6232	}
sl@0	6233	} else if( sqlite3_value_type(ppArg[0]) != SQLITE_NULL ){
sl@0	6234	/* An update:
sl@0	6235	* ppArg[0] = old rowid
sl@0	6236	* ppArg[1] = new rowid
sl@0	6237	* ppArg[2..2+v->nColumn-1] = values
sl@0	6238	* ppArg[2+v->nColumn] = value for magic column (we ignore this)
sl@0	6239	* ppArg[2+v->nColumn+1] = value for docid
sl@0	6240	*/
sl@0	6241	sqlite_int64 rowid = sqlite3_value_int64(ppArg[0]);
sl@0	6242	if( sqlite3_value_type(ppArg[1]) != SQLITE_INTEGER \|\|
sl@0	6243	sqlite3_value_int64(ppArg[1]) != rowid ){
sl@0	6244	rc = SQLITE_ERROR; /* we don't allow changing the rowid */
sl@0	6245	}else if( sqlite3_value_type(ppArg[2+v->nColumn+1]) != SQLITE_INTEGER \|\|
sl@0	6246	sqlite3_value_int64(ppArg[2+v->nColumn+1]) != rowid ){
sl@0	6247	rc = SQLITE_ERROR; /* we don't allow changing the docid */
sl@0	6248	}else{
sl@0	6249	assert( nArg==2+v->nColumn+2);
sl@0	6250	rc = index_update(v, rowid, &ppArg[2]);
sl@0	6251	}
sl@0	6252	} else {
sl@0	6253	/* An insert:
sl@0	6254	* ppArg[1] = requested rowid
sl@0	6255	* ppArg[2..2+v->nColumn-1] = values
sl@0	6256	* ppArg[2+v->nColumn] = value for magic column (we ignore this)
sl@0	6257	* ppArg[2+v->nColumn+1] = value for docid
sl@0	6258	*/
sl@0	6259	sqlite3_value *pRequestDocid = ppArg[2+v->nColumn+1];
sl@0	6260	assert( nArg==2+v->nColumn+2);
sl@0	6261	if( SQLITE_NULL != sqlite3_value_type(pRequestDocid) &&
sl@0	6262	SQLITE_NULL != sqlite3_value_type(ppArg[1]) ){
sl@0	6263	/* TODO(shess) Consider allowing this to work if the values are
sl@0	6264	** identical. I'm inclined to discourage that usage, though,
sl@0	6265	** given that both rowid and docid are special columns. Better
sl@0	6266	** would be to define one or the other as the default winner,
sl@0	6267	** but should it be fts3-centric (docid) or SQLite-centric
sl@0	6268	** (rowid)?
sl@0	6269	*/
sl@0	6270	rc = SQLITE_ERROR;
sl@0	6271	}else{
sl@0	6272	if( SQLITE_NULL == sqlite3_value_type(pRequestDocid) ){
sl@0	6273	pRequestDocid = ppArg[1];
sl@0	6274	}
sl@0	6275	rc = index_insert(v, pRequestDocid, &ppArg[2], pRowid);
sl@0	6276	}
sl@0	6277	}
sl@0	6278
sl@0	6279	return rc;
sl@0	6280	}
sl@0	6281
sl@0	6282	static int fulltextSync(sqlite3_vtab *pVtab){
sl@0	6283	FTSTRACE(("FTS3 xSync()\n"));
sl@0	6284	return flushPendingTerms((fulltext_vtab *)pVtab);
sl@0	6285	}
sl@0	6286
sl@0	6287	static int fulltextBegin(sqlite3_vtab *pVtab){
sl@0	6288	fulltext_vtab v = (fulltext_vtab ) pVtab;
sl@0	6289	FTSTRACE(("FTS3 xBegin()\n"));
sl@0	6290
sl@0	6291	/* Any buffered updates should have been cleared by the previous
sl@0	6292	** transaction.
sl@0	6293	*/
sl@0	6294	assert( v->nPendingData<0 );
sl@0	6295	return clearPendingTerms(v);
sl@0	6296	}
sl@0	6297
sl@0	6298	static int fulltextCommit(sqlite3_vtab *pVtab){
sl@0	6299	fulltext_vtab v = (fulltext_vtab ) pVtab;
sl@0	6300	FTSTRACE(("FTS3 xCommit()\n"));
sl@0	6301
sl@0	6302	/* Buffered updates should have been cleared by fulltextSync(). */
sl@0	6303	assert( v->nPendingData<0 );
sl@0	6304	return clearPendingTerms(v);
sl@0	6305	}
sl@0	6306
sl@0	6307	static int fulltextRollback(sqlite3_vtab *pVtab){
sl@0	6308	FTSTRACE(("FTS3 xRollback()\n"));
sl@0	6309	return clearPendingTerms((fulltext_vtab *)pVtab);
sl@0	6310	}
sl@0	6311
sl@0	6312	/*
sl@0	6313	** Implementation of the snippet() function for FTS3
sl@0	6314	*/
sl@0	6315	static void snippetFunc(
sl@0	6316	sqlite3_context *pContext,
sl@0	6317	int argc,
sl@0	6318	sqlite3_value **argv
sl@0	6319	){
sl@0	6320	fulltext_cursor *pCursor;
sl@0	6321	if( argc<1 ) return;
sl@0	6322	if( sqlite3_value_type(argv[0])!=SQLITE_BLOB \|\|
sl@0	6323	sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
sl@0	6324	sqlite3_result_error(pContext, "illegal first argument to html_snippet",-1);
sl@0	6325	}else{
sl@0	6326	const char *zStart = "<b>";
sl@0	6327	const char *zEnd = "</b>";
sl@0	6328	const char *zEllipsis = "<b>...</b>";
sl@0	6329	memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
sl@0	6330	if( argc>=2 ){
sl@0	6331	zStart = (const char*)sqlite3_value_text(argv[1]);
sl@0	6332	if( argc>=3 ){
sl@0	6333	zEnd = (const char*)sqlite3_value_text(argv[2]);
sl@0	6334	if( argc>=4 ){
sl@0	6335	zEllipsis = (const char*)sqlite3_value_text(argv[3]);
sl@0	6336	}
sl@0	6337	}
sl@0	6338	}
sl@0	6339	snippetAllOffsets(pCursor);
sl@0	6340	snippetText(pCursor, zStart, zEnd, zEllipsis);
sl@0	6341	sqlite3_result_text(pContext, pCursor->snippet.zSnippet,
sl@0	6342	pCursor->snippet.nSnippet, SQLITE_STATIC);
sl@0	6343	}
sl@0	6344	}
sl@0	6345
sl@0	6346	/*
sl@0	6347	** Implementation of the offsets() function for FTS3
sl@0	6348	*/
sl@0	6349	static void snippetOffsetsFunc(
sl@0	6350	sqlite3_context *pContext,
sl@0	6351	int argc,
sl@0	6352	sqlite3_value **argv
sl@0	6353	){
sl@0	6354	fulltext_cursor *pCursor;
sl@0	6355	if( argc<1 ) return;
sl@0	6356	if( sqlite3_value_type(argv[0])!=SQLITE_BLOB \|\|
sl@0	6357	sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
sl@0	6358	sqlite3_result_error(pContext, "illegal first argument to offsets",-1);
sl@0	6359	}else{
sl@0	6360	memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
sl@0	6361	snippetAllOffsets(pCursor);
sl@0	6362	snippetOffsetText(&pCursor->snippet);
sl@0	6363	sqlite3_result_text(pContext,
sl@0	6364	pCursor->snippet.zOffset, pCursor->snippet.nOffset,
sl@0	6365	SQLITE_STATIC);
sl@0	6366	}
sl@0	6367	}
sl@0	6368
sl@0	6369	/* OptLeavesReader is nearly identical to LeavesReader, except that
sl@0	6370	** where LeavesReader is geared towards the merging of complete
sl@0	6371	** segment levels (with exactly MERGE_COUNT segments), OptLeavesReader
sl@0	6372	** is geared towards implementation of the optimize() function, and
sl@0	6373	** can merge all segments simultaneously. This version may be
sl@0	6374	** somewhat less efficient than LeavesReader because it merges into an
sl@0	6375	** accumulator rather than doing an N-way merge, but since segment
sl@0	6376	** size grows exponentially (so segment count logrithmically) this is
sl@0	6377	** probably not an immediate problem.
sl@0	6378	*/
sl@0	6379	/* TODO(shess): Prove that assertion, or extend the merge code to
sl@0	6380	** merge tree fashion (like the prefix-searching code does).
sl@0	6381	*/
sl@0	6382	/* TODO(shess): OptLeavesReader and LeavesReader could probably be
sl@0	6383	** merged with little or no loss of performance for LeavesReader. The
sl@0	6384	** merged code would need to handle >MERGE_COUNT segments, and would
sl@0	6385	** also need to be able to optionally optimize away deletes.
sl@0	6386	*/
sl@0	6387	typedef struct OptLeavesReader {
sl@0	6388	/* Segment number, to order readers by age. */
sl@0	6389	int segment;
sl@0	6390	LeavesReader reader;
sl@0	6391	} OptLeavesReader;
sl@0	6392
sl@0	6393	static int optLeavesReaderAtEnd(OptLeavesReader *pReader){
sl@0	6394	return leavesReaderAtEnd(&pReader->reader);
sl@0	6395	}
sl@0	6396	static int optLeavesReaderTermBytes(OptLeavesReader *pReader){
sl@0	6397	return leavesReaderTermBytes(&pReader->reader);
sl@0	6398	}
sl@0	6399	static const char optLeavesReaderData(OptLeavesReader pReader){
sl@0	6400	return leavesReaderData(&pReader->reader);
sl@0	6401	}
sl@0	6402	static int optLeavesReaderDataBytes(OptLeavesReader *pReader){
sl@0	6403	return leavesReaderDataBytes(&pReader->reader);
sl@0	6404	}
sl@0	6405	static const char optLeavesReaderTerm(OptLeavesReader pReader){
sl@0	6406	return leavesReaderTerm(&pReader->reader);
sl@0	6407	}
sl@0	6408	static int optLeavesReaderStep(fulltext_vtab v, OptLeavesReader pReader){
sl@0	6409	return leavesReaderStep(v, &pReader->reader);
sl@0	6410	}
sl@0	6411	static int optLeavesReaderTermCmp(OptLeavesReader lr1, OptLeavesReader lr2){
sl@0	6412	return leavesReaderTermCmp(&lr1->reader, &lr2->reader);
sl@0	6413	}
sl@0	6414	/* Order by term ascending, segment ascending (oldest to newest), with
sl@0	6415	** exhausted readers to the end.
sl@0	6416	*/
sl@0	6417	static int optLeavesReaderCmp(OptLeavesReader lr1, OptLeavesReader lr2){
sl@0	6418	int c = optLeavesReaderTermCmp(lr1, lr2);
sl@0	6419	if( c!=0 ) return c;
sl@0	6420	return lr1->segment-lr2->segment;
sl@0	6421	}
sl@0	6422	/* Bubble pLr[0] to appropriate place in pLr[1..nLr-1]. Assumes that
sl@0	6423	** pLr[1..nLr-1] is already sorted.
sl@0	6424	*/
sl@0	6425	static void optLeavesReaderReorder(OptLeavesReader *pLr, int nLr){
sl@0	6426	while( nLr>1 && optLeavesReaderCmp(pLr, pLr+1)>0 ){
sl@0	6427	OptLeavesReader tmp = pLr[0];
sl@0	6428	pLr[0] = pLr[1];
sl@0	6429	pLr[1] = tmp;
sl@0	6430	nLr--;
sl@0	6431	pLr++;
sl@0	6432	}
sl@0	6433	}
sl@0	6434
sl@0	6435	/* optimize() helper function. Put the readers in order and iterate
sl@0	6436	** through them, merging doclists for matching terms into pWriter.
sl@0	6437	** Returns SQLITE_OK on success, or the SQLite error code which
sl@0	6438	** prevented success.
sl@0	6439	*/
sl@0	6440	static int optimizeInternal(fulltext_vtab *v,
sl@0	6441	OptLeavesReader *readers, int nReaders,
sl@0	6442	LeafWriter *pWriter){
sl@0	6443	int i, rc = SQLITE_OK;
sl@0	6444	DataBuffer doclist, merged, tmp;
sl@0	6445
sl@0	6446	/* Order the readers. */
sl@0	6447	i = nReaders;
sl@0	6448	while( i-- > 0 ){
sl@0	6449	optLeavesReaderReorder(&readers[i], nReaders-i);
sl@0	6450	}
sl@0	6451
sl@0	6452	dataBufferInit(&doclist, LEAF_MAX);
sl@0	6453	dataBufferInit(&merged, LEAF_MAX);
sl@0	6454
sl@0	6455	/* Exhausted readers bubble to the end, so when the first reader is
sl@0	6456	** at eof, all are at eof.
sl@0	6457	*/
sl@0	6458	while( !optLeavesReaderAtEnd(&readers[0]) ){
sl@0	6459
sl@0	6460	/* Figure out how many readers share the next term. */
sl@0	6461	for(i=1; i<nReaders && !optLeavesReaderAtEnd(&readers[i]); i++){
sl@0	6462	if( 0!=optLeavesReaderTermCmp(&readers[0], &readers[i]) ) break;
sl@0	6463	}
sl@0	6464
sl@0	6465	/* Special-case for no merge. */
sl@0	6466	if( i==1 ){
sl@0	6467	/* Trim deletions from the doclist. */
sl@0	6468	dataBufferReset(&merged);
sl@0	6469	docListTrim(DL_DEFAULT,
sl@0	6470	optLeavesReaderData(&readers[0]),
sl@0	6471	optLeavesReaderDataBytes(&readers[0]),
sl@0	6472	-1, DL_DEFAULT, &merged);
sl@0	6473	}else{
sl@0	6474	DLReader dlReaders[MERGE_COUNT];
sl@0	6475	int iReader, nReaders;
sl@0	6476
sl@0	6477	/* Prime the pipeline with the first reader's doclist. After
sl@0	6478	** one pass index 0 will reference the accumulated doclist.
sl@0	6479	*/
sl@0	6480	dlrInit(&dlReaders[0], DL_DEFAULT,
sl@0	6481	optLeavesReaderData(&readers[0]),
sl@0	6482	optLeavesReaderDataBytes(&readers[0]));
sl@0	6483	iReader = 1;
sl@0	6484
sl@0	6485	assert( iReader<i ); /* Must execute the loop at least once. */
sl@0	6486	while( iReader<i ){
sl@0	6487	/* Merge 16 inputs per pass. */
sl@0	6488	for( nReaders=1; iReader<i && nReaders<MERGE_COUNT;
sl@0	6489	iReader++, nReaders++ ){
sl@0	6490	dlrInit(&dlReaders[nReaders], DL_DEFAULT,
sl@0	6491	optLeavesReaderData(&readers[iReader]),
sl@0	6492	optLeavesReaderDataBytes(&readers[iReader]));
sl@0	6493	}
sl@0	6494
sl@0	6495	/* Merge doclists and swap result into accumulator. */
sl@0	6496	dataBufferReset(&merged);
sl@0	6497	docListMerge(&merged, dlReaders, nReaders);
sl@0	6498	tmp = merged;
sl@0	6499	merged = doclist;
sl@0	6500	doclist = tmp;
sl@0	6501
sl@0	6502	while( nReaders-- > 0 ){
sl@0	6503	dlrDestroy(&dlReaders[nReaders]);
sl@0	6504	}
sl@0	6505
sl@0	6506	/* Accumulated doclist to reader 0 for next pass. */
sl@0	6507	dlrInit(&dlReaders[0], DL_DEFAULT, doclist.pData, doclist.nData);
sl@0	6508	}
sl@0	6509
sl@0	6510	/* Destroy reader that was left in the pipeline. */
sl@0	6511	dlrDestroy(&dlReaders[0]);
sl@0	6512
sl@0	6513	/* Trim deletions from the doclist. */
sl@0	6514	dataBufferReset(&merged);
sl@0	6515	docListTrim(DL_DEFAULT, doclist.pData, doclist.nData,
sl@0	6516	-1, DL_DEFAULT, &merged);
sl@0	6517	}
sl@0	6518
sl@0	6519	/* Only pass doclists with hits (skip if all hits deleted). */
sl@0	6520	if( merged.nData>0 ){
sl@0	6521	rc = leafWriterStep(v, pWriter,
sl@0	6522	optLeavesReaderTerm(&readers[0]),
sl@0	6523	optLeavesReaderTermBytes(&readers[0]),
sl@0	6524	merged.pData, merged.nData);
sl@0	6525	if( rc!=SQLITE_OK ) goto err;
sl@0	6526	}
sl@0	6527
sl@0	6528	/* Step merged readers to next term and reorder. */
sl@0	6529	while( i-- > 0 ){
sl@0	6530	rc = optLeavesReaderStep(v, &readers[i]);
sl@0	6531	if( rc!=SQLITE_OK ) goto err;
sl@0	6532
sl@0	6533	optLeavesReaderReorder(&readers[i], nReaders-i);
sl@0	6534	}
sl@0	6535	}
sl@0	6536
sl@0	6537	err:
sl@0	6538	dataBufferDestroy(&doclist);
sl@0	6539	dataBufferDestroy(&merged);
sl@0	6540	return rc;
sl@0	6541	}
sl@0	6542
sl@0	6543	/* Implement optimize() function for FTS3. optimize(t) merges all
sl@0	6544	** segments in the fts index into a single segment. 't' is the magic
sl@0	6545	** table-named column.
sl@0	6546	*/
sl@0	6547	static void optimizeFunc(sqlite3_context *pContext,
sl@0	6548	int argc, sqlite3_value **argv){
sl@0	6549	fulltext_cursor *pCursor;
sl@0	6550	if( argc>1 ){
sl@0	6551	sqlite3_result_error(pContext, "excess arguments to optimize()",-1);
sl@0	6552	}else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB \|\|
sl@0	6553	sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
sl@0	6554	sqlite3_result_error(pContext, "illegal first argument to optimize",-1);
sl@0	6555	}else{
sl@0	6556	fulltext_vtab *v;
sl@0	6557	int i, rc, iMaxLevel;
sl@0	6558	OptLeavesReader *readers;
sl@0	6559	int nReaders;
sl@0	6560	LeafWriter writer;
sl@0	6561	sqlite3_stmt *s;
sl@0	6562
sl@0	6563	memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
sl@0	6564	v = cursor_vtab(pCursor);
sl@0	6565
sl@0	6566	/* Flush any buffered updates before optimizing. */
sl@0	6567	rc = flushPendingTerms(v);
sl@0	6568	if( rc!=SQLITE_OK ) goto err;
sl@0	6569
sl@0	6570	rc = segdir_count(v, &nReaders, &iMaxLevel);
sl@0	6571	if( rc!=SQLITE_OK ) goto err;
sl@0	6572	if( nReaders==0 \|\| nReaders==1 ){
sl@0	6573	sqlite3_result_text(pContext, "Index already optimal", -1,
sl@0	6574	SQLITE_STATIC);
sl@0	6575	return;
sl@0	6576	}
sl@0	6577
sl@0	6578	rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
sl@0	6579	if( rc!=SQLITE_OK ) goto err;
sl@0	6580
sl@0	6581	readers = sqlite3_malloc(nReaders*sizeof(readers[0]));
sl@0	6582	if( readers==NULL ) goto err;
sl@0	6583
sl@0	6584	/* Note that there will already be a segment at this position
sl@0	6585	** until we call segdir_delete() on iMaxLevel.
sl@0	6586	*/
sl@0	6587	leafWriterInit(iMaxLevel, 0, &writer);
sl@0	6588
sl@0	6589	i = 0;
sl@0	6590	while( (rc = sqlite3_step(s))==SQLITE_ROW ){
sl@0	6591	sqlite_int64 iStart = sqlite3_column_int64(s, 0);
sl@0	6592	sqlite_int64 iEnd = sqlite3_column_int64(s, 1);
sl@0	6593	const char *pRootData = sqlite3_column_blob(s, 2);
sl@0	6594	int nRootData = sqlite3_column_bytes(s, 2);
sl@0	6595
sl@0	6596	assert( i<nReaders );
sl@0	6597	rc = leavesReaderInit(v, -1, iStart, iEnd, pRootData, nRootData,
sl@0	6598	&readers[i].reader);
sl@0	6599	if( rc!=SQLITE_OK ) break;
sl@0	6600
sl@0	6601	readers[i].segment = i;
sl@0	6602	i++;
sl@0	6603	}
sl@0	6604
sl@0	6605	/* If we managed to succesfully read them all, optimize them. */
sl@0	6606	if( rc==SQLITE_DONE ){
sl@0	6607	assert( i==nReaders );
sl@0	6608	rc = optimizeInternal(v, readers, nReaders, &writer);
sl@0	6609	}
sl@0	6610
sl@0	6611	while( i-- > 0 ){
sl@0	6612	leavesReaderDestroy(&readers[i].reader);
sl@0	6613	}
sl@0	6614	sqlite3_free(readers);
sl@0	6615
sl@0	6616	/* If we've successfully gotten to here, delete the old segments
sl@0	6617	** and flush the interior structure of the new segment.
sl@0	6618	*/
sl@0	6619	if( rc==SQLITE_OK ){
sl@0	6620	for( i=0; i<=iMaxLevel; i++ ){
sl@0	6621	rc = segdir_delete(v, i);
sl@0	6622	if( rc!=SQLITE_OK ) break;
sl@0	6623	}
sl@0	6624
sl@0	6625	if( rc==SQLITE_OK ) rc = leafWriterFinalize(v, &writer);
sl@0	6626	}
sl@0	6627
sl@0	6628	leafWriterDestroy(&writer);
sl@0	6629
sl@0	6630	if( rc!=SQLITE_OK ) goto err;
sl@0	6631
sl@0	6632	sqlite3_result_text(pContext, "Index optimized", -1, SQLITE_STATIC);
sl@0	6633	return;
sl@0	6634
sl@0	6635	/* TODO(shess): Error-handling needs to be improved along the
sl@0	6636	** lines of the dump_ functions.
sl@0	6637	*/
sl@0	6638	err:
sl@0	6639	{
sl@0	6640	char buf[512];
sl@0	6641	sqlite3_snprintf(sizeof(buf), buf, "Error in optimize: %s",
sl@0	6642	sqlite3_errmsg(sqlite3_context_db_handle(pContext)));
sl@0	6643	sqlite3_result_error(pContext, buf, -1);
sl@0	6644	}
sl@0	6645	}
sl@0	6646	}
sl@0	6647
sl@0	6648	#ifdef SQLITE_TEST
sl@0	6649	/* Generate an error of the form "<prefix>: <msg>". If msg is NULL,
sl@0	6650	** pull the error from the context's db handle.
sl@0	6651	*/
sl@0	6652	static void generateError(sqlite3_context *pContext,
sl@0	6653	const char prefix, const char msg){
sl@0	6654	char buf[512];
sl@0	6655	if( msg==NULL ) msg = sqlite3_errmsg(sqlite3_context_db_handle(pContext));
sl@0	6656	sqlite3_snprintf(sizeof(buf), buf, "%s: %s", prefix, msg);
sl@0	6657	sqlite3_result_error(pContext, buf, -1);
sl@0	6658	}
sl@0	6659
sl@0	6660	/* Helper function to collect the set of terms in the segment into
sl@0	6661	** pTerms. The segment is defined by the leaf nodes between
sl@0	6662	** iStartBlockid and iEndBlockid, inclusive, or by the contents of
sl@0	6663	** pRootData if iStartBlockid is 0 (in which case the entire segment
sl@0	6664	** fit in a leaf).
sl@0	6665	*/
sl@0	6666	static int collectSegmentTerms(fulltext_vtab v, sqlite3_stmt s,
sl@0	6667	fts3Hash *pTerms){
sl@0	6668	const sqlite_int64 iStartBlockid = sqlite3_column_int64(s, 0);
sl@0	6669	const sqlite_int64 iEndBlockid = sqlite3_column_int64(s, 1);
sl@0	6670	const char *pRootData = sqlite3_column_blob(s, 2);
sl@0	6671	const int nRootData = sqlite3_column_bytes(s, 2);
sl@0	6672	LeavesReader reader;
sl@0	6673	int rc = leavesReaderInit(v, 0, iStartBlockid, iEndBlockid,
sl@0	6674	pRootData, nRootData, &reader);
sl@0	6675	if( rc!=SQLITE_OK ) return rc;
sl@0	6676
sl@0	6677	while( rc==SQLITE_OK && !leavesReaderAtEnd(&reader) ){
sl@0	6678	const char *pTerm = leavesReaderTerm(&reader);
sl@0	6679	const int nTerm = leavesReaderTermBytes(&reader);
sl@0	6680	void *oldValue = sqlite3Fts3HashFind(pTerms, pTerm, nTerm);
sl@0	6681	void newValue = (void )((char *)oldValue+1);
sl@0	6682
sl@0	6683	/* From the comment before sqlite3Fts3HashInsert in fts3_hash.c,
sl@0	6684	** the data value passed is returned in case of malloc failure.
sl@0	6685	*/
sl@0	6686	if( newValue==sqlite3Fts3HashInsert(pTerms, pTerm, nTerm, newValue) ){
sl@0	6687	rc = SQLITE_NOMEM;
sl@0	6688	}else{
sl@0	6689	rc = leavesReaderStep(v, &reader);
sl@0	6690	}
sl@0	6691	}
sl@0	6692
sl@0	6693	leavesReaderDestroy(&reader);
sl@0	6694	return rc;
sl@0	6695	}
sl@0	6696
sl@0	6697	/* Helper function to build the result string for dump_terms(). */
sl@0	6698	static int generateTermsResult(sqlite3_context pContext, fts3Hash pTerms){
sl@0	6699	int iTerm, nTerms, nResultBytes, iByte;
sl@0	6700	char *result;
sl@0	6701	TermData *pData;
sl@0	6702	fts3HashElem *e;
sl@0	6703
sl@0	6704	/* Iterate pTerms to generate an array of terms in pData for
sl@0	6705	** sorting.
sl@0	6706	*/
sl@0	6707	nTerms = fts3HashCount(pTerms);
sl@0	6708	assert( nTerms>0 );
sl@0	6709	pData = sqlite3_malloc(nTerms*sizeof(TermData));
sl@0	6710	if( pData==NULL ) return SQLITE_NOMEM;
sl@0	6711
sl@0	6712	nResultBytes = 0;
sl@0	6713	for(iTerm = 0, e = fts3HashFirst(pTerms); e; iTerm++, e = fts3HashNext(e)){
sl@0	6714	nResultBytes += fts3HashKeysize(e)+1; /* Term plus trailing space */
sl@0	6715	assert( iTerm<nTerms );
sl@0	6716	pData[iTerm].pTerm = fts3HashKey(e);
sl@0	6717	pData[iTerm].nTerm = fts3HashKeysize(e);
sl@0	6718	pData[iTerm].pCollector = fts3HashData(e); /* unused */
sl@0	6719	}
sl@0	6720	assert( iTerm==nTerms );
sl@0	6721
sl@0	6722	assert( nResultBytes>0 ); /* nTerms>0, nResultsBytes must be, too. */
sl@0	6723	result = sqlite3_malloc(nResultBytes);
sl@0	6724	if( result==NULL ){
sl@0	6725	sqlite3_free(pData);
sl@0	6726	return SQLITE_NOMEM;
sl@0	6727	}
sl@0	6728
sl@0	6729	if( nTerms>1 ) qsort(pData, nTerms, sizeof(*pData), termDataCmp);
sl@0	6730
sl@0	6731	/* Read the terms in order to build the result. */
sl@0	6732	iByte = 0;
sl@0	6733	for(iTerm=0; iTerm<nTerms; ++iTerm){
sl@0	6734	memcpy(result+iByte, pData[iTerm].pTerm, pData[iTerm].nTerm);
sl@0	6735	iByte += pData[iTerm].nTerm;
sl@0	6736	result[iByte++] = ' ';
sl@0	6737	}
sl@0	6738	assert( iByte==nResultBytes );
sl@0	6739	assert( result[nResultBytes-1]==' ' );
sl@0	6740	result[nResultBytes-1] = '\0';
sl@0	6741
sl@0	6742	/* Passes away ownership of result. */
sl@0	6743	sqlite3_result_text(pContext, result, nResultBytes-1, sqlite3_free);
sl@0	6744	sqlite3_free(pData);
sl@0	6745	return SQLITE_OK;
sl@0	6746	}
sl@0	6747
sl@0	6748	/* Implements dump_terms() for use in inspecting the fts3 index from
sl@0	6749	** tests. TEXT result containing the ordered list of terms joined by
sl@0	6750	** spaces. dump_terms(t, level, idx) dumps the terms for the segment
sl@0	6751	** specified by level, idx (in %_segdir), while dump_terms(t) dumps
sl@0	6752	** all terms in the index. In both cases t is the fts table's magic
sl@0	6753	** table-named column.
sl@0	6754	*/
sl@0	6755	static void dumpTermsFunc(
sl@0	6756	sqlite3_context *pContext,
sl@0	6757	int argc, sqlite3_value **argv
sl@0	6758	){
sl@0	6759	fulltext_cursor *pCursor;
sl@0	6760	if( argc!=3 && argc!=1 ){
sl@0	6761	generateError(pContext, "dump_terms", "incorrect arguments");
sl@0	6762	}else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB \|\|
sl@0	6763	sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
sl@0	6764	generateError(pContext, "dump_terms", "illegal first argument");
sl@0	6765	}else{
sl@0	6766	fulltext_vtab *v;
sl@0	6767	fts3Hash terms;
sl@0	6768	sqlite3_stmt *s = NULL;
sl@0	6769	int rc;
sl@0	6770
sl@0	6771	memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
sl@0	6772	v = cursor_vtab(pCursor);
sl@0	6773
sl@0	6774	/* If passed only the cursor column, get all segments. Otherwise
sl@0	6775	** get the segment described by the following two arguments.
sl@0	6776	*/
sl@0	6777	if( argc==1 ){
sl@0	6778	rc = sql_get_statement(v, SEGDIR_SELECT_ALL_STMT, &s);
sl@0	6779	}else{
sl@0	6780	rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
sl@0	6781	if( rc==SQLITE_OK ){
sl@0	6782	rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[1]));
sl@0	6783	if( rc==SQLITE_OK ){
sl@0	6784	rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[2]));
sl@0	6785	}
sl@0	6786	}
sl@0	6787	}
sl@0	6788
sl@0	6789	if( rc!=SQLITE_OK ){
sl@0	6790	generateError(pContext, "dump_terms", NULL);
sl@0	6791	return;
sl@0	6792	}
sl@0	6793
sl@0	6794	/* Collect the terms for each segment. */
sl@0	6795	sqlite3Fts3HashInit(&terms, FTS3_HASH_STRING, 1);
sl@0	6796	while( (rc = sqlite3_step(s))==SQLITE_ROW ){
sl@0	6797	rc = collectSegmentTerms(v, s, &terms);
sl@0	6798	if( rc!=SQLITE_OK ) break;
sl@0	6799	}
sl@0	6800
sl@0	6801	if( rc!=SQLITE_DONE ){
sl@0	6802	sqlite3_reset(s);
sl@0	6803	generateError(pContext, "dump_terms", NULL);
sl@0	6804	}else{
sl@0	6805	const int nTerms = fts3HashCount(&terms);
sl@0	6806	if( nTerms>0 ){
sl@0	6807	rc = generateTermsResult(pContext, &terms);
sl@0	6808	if( rc==SQLITE_NOMEM ){
sl@0	6809	generateError(pContext, "dump_terms", "out of memory");
sl@0	6810	}else{
sl@0	6811	assert( rc==SQLITE_OK );
sl@0	6812	}
sl@0	6813	}else if( argc==3 ){
sl@0	6814	/* The specific segment asked for could not be found. */
sl@0	6815	generateError(pContext, "dump_terms", "segment not found");
sl@0	6816	}else{
sl@0	6817	/* No segments found. */
sl@0	6818	/* TODO(shess): It should be impossible to reach this. This
sl@0	6819	** case can only happen for an empty table, in which case
sl@0	6820	** SQLite has no rows to call this function on.
sl@0	6821	*/
sl@0	6822	sqlite3_result_null(pContext);
sl@0	6823	}
sl@0	6824	}
sl@0	6825	sqlite3Fts3HashClear(&terms);
sl@0	6826	}
sl@0	6827	}
sl@0	6828
sl@0	6829	/* Expand the DL_DEFAULT doclist in pData into a text result in
sl@0	6830	** pContext.
sl@0	6831	*/
sl@0	6832	static void createDoclistResult(sqlite3_context *pContext,
sl@0	6833	const char *pData, int nData){
sl@0	6834	DataBuffer dump;
sl@0	6835	DLReader dlReader;
sl@0	6836
sl@0	6837	assert( pData!=NULL && nData>0 );
sl@0	6838
sl@0	6839	dataBufferInit(&dump, 0);
sl@0	6840	dlrInit(&dlReader, DL_DEFAULT, pData, nData);
sl@0	6841	for( ; !dlrAtEnd(&dlReader); dlrStep(&dlReader) ){
sl@0	6842	char buf[256];
sl@0	6843	PLReader plReader;
sl@0	6844
sl@0	6845	plrInit(&plReader, &dlReader);
sl@0	6846	if( DL_DEFAULT==DL_DOCIDS \|\| plrAtEnd(&plReader) ){
sl@0	6847	sqlite3_snprintf(sizeof(buf), buf, "[%lld] ", dlrDocid(&dlReader));
sl@0	6848	dataBufferAppend(&dump, buf, strlen(buf));
sl@0	6849	}else{
sl@0	6850	int iColumn = plrColumn(&plReader);
sl@0	6851
sl@0	6852	sqlite3_snprintf(sizeof(buf), buf, "[%lld %d[",
sl@0	6853	dlrDocid(&dlReader), iColumn);
sl@0	6854	dataBufferAppend(&dump, buf, strlen(buf));
sl@0	6855
sl@0	6856	for( ; !plrAtEnd(&plReader); plrStep(&plReader) ){
sl@0	6857	if( plrColumn(&plReader)!=iColumn ){
sl@0	6858	iColumn = plrColumn(&plReader);
sl@0	6859	sqlite3_snprintf(sizeof(buf), buf, "] %d[", iColumn);
sl@0	6860	assert( dump.nData>0 );
sl@0	6861	dump.nData--; /* Overwrite trailing space. */
sl@0	6862	assert( dump.pData[dump.nData]==' ');
sl@0	6863	dataBufferAppend(&dump, buf, strlen(buf));
sl@0	6864	}
sl@0	6865	if( DL_DEFAULT==DL_POSITIONS_OFFSETS ){
sl@0	6866	sqlite3_snprintf(sizeof(buf), buf, "%d,%d,%d ",
sl@0	6867	plrPosition(&plReader),
sl@0	6868	plrStartOffset(&plReader), plrEndOffset(&plReader));
sl@0	6869	}else if( DL_DEFAULT==DL_POSITIONS ){
sl@0	6870	sqlite3_snprintf(sizeof(buf), buf, "%d ", plrPosition(&plReader));
sl@0	6871	}else{
sl@0	6872	assert( NULL=="Unhandled DL_DEFAULT value");
sl@0	6873	}
sl@0	6874	dataBufferAppend(&dump, buf, strlen(buf));
sl@0	6875	}
sl@0	6876	plrDestroy(&plReader);
sl@0	6877
sl@0	6878	assert( dump.nData>0 );
sl@0	6879	dump.nData--; /* Overwrite trailing space. */
sl@0	6880	assert( dump.pData[dump.nData]==' ');
sl@0	6881	dataBufferAppend(&dump, "]] ", 3);
sl@0	6882	}
sl@0	6883	}
sl@0	6884	dlrDestroy(&dlReader);
sl@0	6885
sl@0	6886	assert( dump.nData>0 );
sl@0	6887	dump.nData--; /* Overwrite trailing space. */
sl@0	6888	assert( dump.pData[dump.nData]==' ');
sl@0	6889	dump.pData[dump.nData] = '\0';
sl@0	6890	assert( dump.nData>0 );
sl@0	6891
sl@0	6892	/* Passes ownership of dump's buffer to pContext. */
sl@0	6893	sqlite3_result_text(pContext, dump.pData, dump.nData, sqlite3_free);
sl@0	6894	dump.pData = NULL;
sl@0	6895	dump.nData = dump.nCapacity = 0;
sl@0	6896	}
sl@0	6897
sl@0	6898	/* Implements dump_doclist() for use in inspecting the fts3 index from
sl@0	6899	** tests. TEXT result containing a string representation of the
sl@0	6900	** doclist for the indicated term. dump_doclist(t, term, level, idx)
sl@0	6901	** dumps the doclist for term from the segment specified by level, idx
sl@0	6902	** (in %_segdir), while dump_doclist(t, term) dumps the logical
sl@0	6903	** doclist for the term across all segments. The per-segment doclist
sl@0	6904	** can contain deletions, while the full-index doclist will not
sl@0	6905	** (deletions are omitted).
sl@0	6906	**
sl@0	6907	** Result formats differ with the setting of DL_DEFAULTS. Examples:
sl@0	6908	**
sl@0	6909	** DL_DOCIDS: [1] [3] [7]
sl@0	6910	** DL_POSITIONS: [1 0[0 4] 1[17]] [3 1[5]]
sl@0	6911	** DL_POSITIONS_OFFSETS: [1 0[0,0,3 4,23,26] 1[17,102,105]] [3 1[5,20,23]]
sl@0	6912	**
sl@0	6913	** In each case the number after the outer '[' is the docid. In the
sl@0	6914	** latter two cases, the number before the inner '[' is the column
sl@0	6915	** associated with the values within. For DL_POSITIONS the numbers
sl@0	6916	** within are the positions, for DL_POSITIONS_OFFSETS they are the
sl@0	6917	** position, the start offset, and the end offset.
sl@0	6918	*/
sl@0	6919	static void dumpDoclistFunc(
sl@0	6920	sqlite3_context *pContext,
sl@0	6921	int argc, sqlite3_value **argv
sl@0	6922	){
sl@0	6923	fulltext_cursor *pCursor;
sl@0	6924	if( argc!=2 && argc!=4 ){
sl@0	6925	generateError(pContext, "dump_doclist", "incorrect arguments");
sl@0	6926	}else if( sqlite3_value_type(argv[0])!=SQLITE_BLOB \|\|
sl@0	6927	sqlite3_value_bytes(argv[0])!=sizeof(pCursor) ){
sl@0	6928	generateError(pContext, "dump_doclist", "illegal first argument");
sl@0	6929	}else if( sqlite3_value_text(argv[1])==NULL \|\|
sl@0	6930	sqlite3_value_text(argv[1])[0]=='\0' ){
sl@0	6931	generateError(pContext, "dump_doclist", "empty second argument");
sl@0	6932	}else{
sl@0	6933	const char pTerm = (const char )sqlite3_value_text(argv[1]);
sl@0	6934	const int nTerm = strlen(pTerm);
sl@0	6935	fulltext_vtab *v;
sl@0	6936	int rc;
sl@0	6937	DataBuffer doclist;
sl@0	6938
sl@0	6939	memcpy(&pCursor, sqlite3_value_blob(argv[0]), sizeof(pCursor));
sl@0	6940	v = cursor_vtab(pCursor);
sl@0	6941
sl@0	6942	dataBufferInit(&doclist, 0);
sl@0	6943
sl@0	6944	/* termSelect() yields the same logical doclist that queries are
sl@0	6945	** run against.
sl@0	6946	*/
sl@0	6947	if( argc==2 ){
sl@0	6948	rc = termSelect(v, v->nColumn, pTerm, nTerm, 0, DL_DEFAULT, &doclist);
sl@0	6949	}else{
sl@0	6950	sqlite3_stmt *s = NULL;
sl@0	6951
sl@0	6952	/* Get our specific segment's information. */
sl@0	6953	rc = sql_get_statement(v, SEGDIR_SELECT_SEGMENT_STMT, &s);
sl@0	6954	if( rc==SQLITE_OK ){
sl@0	6955	rc = sqlite3_bind_int(s, 1, sqlite3_value_int(argv[2]));
sl@0	6956	if( rc==SQLITE_OK ){
sl@0	6957	rc = sqlite3_bind_int(s, 2, sqlite3_value_int(argv[3]));
sl@0	6958	}
sl@0	6959	}
sl@0	6960
sl@0	6961	if( rc==SQLITE_OK ){
sl@0	6962	rc = sqlite3_step(s);
sl@0	6963
sl@0	6964	if( rc==SQLITE_DONE ){
sl@0	6965	dataBufferDestroy(&doclist);
sl@0	6966	generateError(pContext, "dump_doclist", "segment not found");
sl@0	6967	return;
sl@0	6968	}
sl@0	6969
sl@0	6970	/* Found a segment, load it into doclist. */
sl@0	6971	if( rc==SQLITE_ROW ){
sl@0	6972	const sqlite_int64 iLeavesEnd = sqlite3_column_int64(s, 1);
sl@0	6973	const char *pData = sqlite3_column_blob(s, 2);
sl@0	6974	const int nData = sqlite3_column_bytes(s, 2);
sl@0	6975
sl@0	6976	/* loadSegment() is used by termSelect() to load each
sl@0	6977	** segment's data.
sl@0	6978	*/
sl@0	6979	rc = loadSegment(v, pData, nData, iLeavesEnd, pTerm, nTerm, 0,
sl@0	6980	&doclist);
sl@0	6981	if( rc==SQLITE_OK ){
sl@0	6982	rc = sqlite3_step(s);
sl@0	6983
sl@0	6984	/* Should not have more than one matching segment. */
sl@0	6985	if( rc!=SQLITE_DONE ){
sl@0	6986	sqlite3_reset(s);
sl@0	6987	dataBufferDestroy(&doclist);
sl@0	6988	generateError(pContext, "dump_doclist", "invalid segdir");
sl@0	6989	return;
sl@0	6990	}
sl@0	6991	rc = SQLITE_OK;
sl@0	6992	}
sl@0	6993	}
sl@0	6994	}
sl@0	6995
sl@0	6996	sqlite3_reset(s);
sl@0	6997	}
sl@0	6998
sl@0	6999	if( rc==SQLITE_OK ){
sl@0	7000	if( doclist.nData>0 ){
sl@0	7001	createDoclistResult(pContext, doclist.pData, doclist.nData);
sl@0	7002	}else{
sl@0	7003	/* TODO(shess): This can happen if the term is not present, or
sl@0	7004	** if all instances of the term have been deleted and this is
sl@0	7005	** an all-index dump. It may be interesting to distinguish
sl@0	7006	** these cases.
sl@0	7007	*/
sl@0	7008	sqlite3_result_text(pContext, "", 0, SQLITE_STATIC);
sl@0	7009	}
sl@0	7010	}else if( rc==SQLITE_NOMEM ){
sl@0	7011	/* Handle out-of-memory cases specially because if they are
sl@0	7012	** generated in fts3 code they may not be reflected in the db
sl@0	7013	** handle.
sl@0	7014	*/
sl@0	7015	/* TODO(shess): Handle this more comprehensively.
sl@0	7016	** sqlite3ErrStr() has what I need, but is internal.
sl@0	7017	*/
sl@0	7018	generateError(pContext, "dump_doclist", "out of memory");
sl@0	7019	}else{
sl@0	7020	generateError(pContext, "dump_doclist", NULL);
sl@0	7021	}
sl@0	7022
sl@0	7023	dataBufferDestroy(&doclist);
sl@0	7024	}
sl@0	7025	}
sl@0	7026	#endif
sl@0	7027
sl@0	7028	/*
sl@0	7029	** This routine implements the xFindFunction method for the FTS3
sl@0	7030	** virtual table.
sl@0	7031	*/
sl@0	7032	static int fulltextFindFunction(
sl@0	7033	sqlite3_vtab *pVtab,
sl@0	7034	int nArg,
sl@0	7035	const char *zName,
sl@0	7036	void (*pxFunc)(sqlite3_context,int,sqlite3_value**),
sl@0	7037	void **ppArg
sl@0	7038	){
sl@0	7039	if( strcmp(zName,"snippet")==0 ){
sl@0	7040	*pxFunc = snippetFunc;
sl@0	7041	return 1;
sl@0	7042	}else if( strcmp(zName,"offsets")==0 ){
sl@0	7043	*pxFunc = snippetOffsetsFunc;
sl@0	7044	return 1;
sl@0	7045	}else if( strcmp(zName,"optimize")==0 ){
sl@0	7046	*pxFunc = optimizeFunc;
sl@0	7047	return 1;
sl@0	7048	#ifdef SQLITE_TEST
sl@0	7049	/* NOTE(shess): These functions are present only for testing
sl@0	7050	** purposes. No particular effort is made to optimize their
sl@0	7051	** execution or how they build their results.
sl@0	7052	*/
sl@0	7053	}else if( strcmp(zName,"dump_terms")==0 ){
sl@0	7054	/* fprintf(stderr, "Found dump_terms\n"); */
sl@0	7055	*pxFunc = dumpTermsFunc;
sl@0	7056	return 1;
sl@0	7057	}else if( strcmp(zName,"dump_doclist")==0 ){
sl@0	7058	/* fprintf(stderr, "Found dump_doclist\n"); */
sl@0	7059	*pxFunc = dumpDoclistFunc;
sl@0	7060	return 1;
sl@0	7061	#endif
sl@0	7062	}
sl@0	7063	return 0;
sl@0	7064	}
sl@0	7065
sl@0	7066	/*
sl@0	7067	** Rename an fts3 table.
sl@0	7068	*/
sl@0	7069	static int fulltextRename(
sl@0	7070	sqlite3_vtab *pVtab,
sl@0	7071	const char *zName
sl@0	7072	){
sl@0	7073	fulltext_vtab p = (fulltext_vtab )pVtab;
sl@0	7074	int rc = SQLITE_NOMEM;
sl@0	7075	char *zSql = sqlite3_mprintf(
sl@0	7076	"ALTER TABLE %Q.'%q_content' RENAME TO '%q_content';"
sl@0	7077	"ALTER TABLE %Q.'%q_segments' RENAME TO '%q_segments';"
sl@0	7078	"ALTER TABLE %Q.'%q_segdir' RENAME TO '%q_segdir';"
sl@0	7079	, p->zDb, p->zName, zName
sl@0	7080	, p->zDb, p->zName, zName
sl@0	7081	, p->zDb, p->zName, zName
sl@0	7082	);
sl@0	7083	if( zSql ){
sl@0	7084	rc = sqlite3_exec(p->db, zSql, 0, 0, 0);
sl@0	7085	sqlite3_free(zSql);
sl@0	7086	}
sl@0	7087	return rc;
sl@0	7088	}
sl@0	7089
sl@0	7090	static const sqlite3_module fts3Module = {
sl@0	7091	/* iVersion */ 0,
sl@0	7092	/* xCreate */ fulltextCreate,
sl@0	7093	/* xConnect */ fulltextConnect,
sl@0	7094	/* xBestIndex */ fulltextBestIndex,
sl@0	7095	/* xDisconnect */ fulltextDisconnect,
sl@0	7096	/* xDestroy */ fulltextDestroy,
sl@0	7097	/* xOpen */ fulltextOpen,
sl@0	7098	/* xClose */ fulltextClose,
sl@0	7099	/* xFilter */ fulltextFilter,
sl@0	7100	/* xNext */ fulltextNext,
sl@0	7101	/* xEof */ fulltextEof,
sl@0	7102	/* xColumn */ fulltextColumn,
sl@0	7103	/* xRowid */ fulltextRowid,
sl@0	7104	/* xUpdate */ fulltextUpdate,
sl@0	7105	/* xBegin */ fulltextBegin,
sl@0	7106	/* xSync */ fulltextSync,
sl@0	7107	/* xCommit */ fulltextCommit,
sl@0	7108	/* xRollback */ fulltextRollback,
sl@0	7109	/* xFindFunction */ fulltextFindFunction,
sl@0	7110	/* xRename */ fulltextRename,
sl@0	7111	};
sl@0	7112
sl@0	7113	static void hashDestroy(void *p){
sl@0	7114	fts3Hash pHash = (fts3Hash )p;
sl@0	7115	sqlite3Fts3HashClear(pHash);
sl@0	7116	sqlite3_free(pHash);
sl@0	7117	}
sl@0	7118
sl@0	7119	/*
sl@0	7120	** The fts3 built-in tokenizers - "simple" and "porter" - are implemented
sl@0	7121	** in files fts3_tokenizer1.c and fts3_porter.c respectively. The following
sl@0	7122	** two forward declarations are for functions declared in these files
sl@0	7123	** used to retrieve the respective implementations.
sl@0	7124	**
sl@0	7125	** Calling sqlite3Fts3SimpleTokenizerModule() sets the value pointed
sl@0	7126	** to by the argument to point a the "simple" tokenizer implementation.
sl@0	7127	** Function ...PorterTokenizerModule() sets *pModule to point to the
sl@0	7128	** porter tokenizer/stemmer implementation.
sl@0	7129	*/
sl@0	7130	void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
sl@0	7131	void sqlite3Fts3PorterTokenizerModule(sqlite3_tokenizer_module const**ppModule);
sl@0	7132	void sqlite3Fts3IcuTokenizerModule(sqlite3_tokenizer_module const**ppModule);
sl@0	7133
sl@0	7134	int sqlite3Fts3InitHashTable(sqlite3 , fts3Hash , const char *);
sl@0	7135
sl@0	7136	/*
sl@0	7137	** Initialise the fts3 extension. If this extension is built as part
sl@0	7138	** of the sqlite library, then this function is called directly by
sl@0	7139	** SQLite. If fts3 is built as a dynamically loadable extension, this
sl@0	7140	** function is called by the sqlite3_extension_init() entry point.
sl@0	7141	*/
sl@0	7142	int sqlite3Fts3Init(sqlite3 *db){
sl@0	7143	int rc = SQLITE_OK;
sl@0	7144	fts3Hash *pHash = 0;
sl@0	7145	const sqlite3_tokenizer_module *pSimple = 0;
sl@0	7146	const sqlite3_tokenizer_module *pPorter = 0;
sl@0	7147	const sqlite3_tokenizer_module *pIcu = 0;
sl@0	7148
sl@0	7149	sqlite3Fts3SimpleTokenizerModule(&pSimple);
sl@0	7150	sqlite3Fts3PorterTokenizerModule(&pPorter);
sl@0	7151	#ifdef SQLITE_ENABLE_ICU
sl@0	7152	sqlite3Fts3IcuTokenizerModule(&pIcu);
sl@0	7153	#endif
sl@0	7154
sl@0	7155	/* Allocate and initialise the hash-table used to store tokenizers. */
sl@0	7156	pHash = sqlite3_malloc(sizeof(fts3Hash));
sl@0	7157	if( !pHash ){
sl@0	7158	rc = SQLITE_NOMEM;
sl@0	7159	}else{
sl@0	7160	sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
sl@0	7161	}
sl@0	7162
sl@0	7163	/* Load the built-in tokenizers into the hash table */
sl@0	7164	if( rc==SQLITE_OK ){
sl@0	7165	if( sqlite3Fts3HashInsert(pHash, "simple", 7, (void *)pSimple)
sl@0	7166	\|\| sqlite3Fts3HashInsert(pHash, "porter", 7, (void *)pPorter)
sl@0	7167	\|\| (pIcu && sqlite3Fts3HashInsert(pHash, "icu", 4, (void *)pIcu))
sl@0	7168	){
sl@0	7169	rc = SQLITE_NOMEM;
sl@0	7170	}
sl@0	7171	}
sl@0	7172
sl@0	7173	/* Create the virtual table wrapper around the hash-table and overload
sl@0	7174	** the two scalar functions. If this is successful, register the
sl@0	7175	** module with sqlite.
sl@0	7176	*/
sl@0	7177	if( SQLITE_OK==rc
sl@0	7178	&& SQLITE_OK==(rc = sqlite3Fts3InitHashTable(db, pHash, "fts3_tokenizer"))
sl@0	7179	&& SQLITE_OK==(rc = sqlite3_overload_function(db, "snippet", -1))
sl@0	7180	&& SQLITE_OK==(rc = sqlite3_overload_function(db, "offsets", -1))
sl@0	7181	&& SQLITE_OK==(rc = sqlite3_overload_function(db, "optimize", -1))
sl@0	7182	#ifdef SQLITE_TEST
sl@0	7183	&& SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_terms", -1))
sl@0	7184	&& SQLITE_OK==(rc = sqlite3_overload_function(db, "dump_doclist", -1))
sl@0	7185	#endif
sl@0	7186	){
sl@0	7187	return sqlite3_create_module_v2(
sl@0	7188	db, "fts3", &fts3Module, (void *)pHash, hashDestroy
sl@0	7189	);
sl@0	7190	}
sl@0	7191
sl@0	7192	/* An error has occured. Delete the hash table and return the error code. */
sl@0	7193	assert( rc!=SQLITE_OK );
sl@0	7194	if( pHash ){
sl@0	7195	sqlite3Fts3HashClear(pHash);
sl@0	7196	sqlite3_free(pHash);
sl@0	7197	}
sl@0	7198	return rc;
sl@0	7199	}
sl@0	7200
sl@0	7201	#if !SQLITE_CORE
sl@0	7202	int sqlite3_extension_init(
sl@0	7203	sqlite3 *db,
sl@0	7204	char **pzErrMsg,
sl@0	7205	const sqlite3_api_routines *pApi
sl@0	7206	){
sl@0	7207	SQLITE_EXTENSION_INIT2(pApi)
sl@0	7208	return sqlite3Fts3Init(db);
sl@0	7209	}
sl@0	7210	#endif
sl@0	7211
sl@0	7212	#endif /* !defined(SQLITE_CORE) \|\| defined(SQLITE_ENABLE_FTS3) */

author	sl@SLION-WIN7.fritz.box
	Fri, 15 Jun 2012 03:10:57 +0200
changeset 0	bde4ae8d615e
permissions	-rw-r--r--