/*
** The author disclaims copyright to this source code.
**
*************************************************************************
** Implementation of the "simple" full-text-search tokenizer.
*/

/*
** The code in this file is only compiled if:
**
**     * The FTS1 module is being built as an extension
**       (in which case SQLITE_CORE is not defined), or
**
**     * The FTS1 module is being built into the core of
**       SQLite (in which case SQLITE_ENABLE_FTS1 is defined).
*/
#if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1)


#include <assert.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <ctype.h>

#include "fts1_tokenizer.h"

typedef struct simple_tokenizer {
  sqlite3_tokenizer base;
  char delim[128];             /* flag ASCII delimiters */
} simple_tokenizer;

typedef struct simple_tokenizer_cursor {
  sqlite3_tokenizer_cursor base;
  const char *pInput;          /* input we are tokenizing */
  int nBytes;                  /* size of the input */
  int iOffset;                 /* current position in pInput */
  int iToken;                  /* index of next token to be returned */
  char *pToken;                /* storage for current token */
  int nTokenAllocated;         /* space allocated to zToken buffer */
} simple_tokenizer_cursor;


/* Forward declaration */
static const sqlite3_tokenizer_module simpleTokenizerModule;

static int isDelim(simple_tokenizer *t, unsigned char c){
  return c<0x80 && t->delim[c];
}

/*
** Create a new tokenizer instance.
*/
static int simpleCreate(
  int argc, const char * const *argv,
  sqlite3_tokenizer **ppTokenizer
){
  simple_tokenizer *t;

  t = (simple_tokenizer *) calloc(sizeof(*t), 1);
  if( t==NULL ) return SQLITE_NOMEM;

  /* TODO(shess) Delimiters need to remain the same from run to run,
  ** else we need to reindex.  One solution would be a meta-table to
  ** track such information in the database, then we'd only want this
  ** information on the initial create.
  */
  if( argc>1 ){
    int i, n = strlen(argv[1]);
    for(i=0; i<n; i++){
      unsigned char ch = argv[1][i];
      /* We explicitly don't support UTF-8 delimiters for now. */
      if( ch>=0x80 ){
        free(t);
        return SQLITE_ERROR;
      }
      t->delim[ch] = 1;
    }
  } else {
    /* Mark non-alphanumeric ASCII characters as delimiters */
    int i;
    for(i=1; i<0x80; i++){
      t->delim[i] = !isalnum(i);
    }
  }

  *ppTokenizer = &t->base;
  return SQLITE_OK;
}

/*
** Destroy a tokenizer
*/
static int simpleDestroy(sqlite3_tokenizer *pTokenizer){
  free(pTokenizer);
  return SQLITE_OK;
}

/*
** Prepare to begin tokenizing a particular string.  The input
** string to be tokenized is pInput[0..nBytes-1].  A cursor
** used to incrementally tokenize this string is returned in
** *ppCursor.
*/
static int simpleOpen(
  sqlite3_tokenizer *pTokenizer,         /* The tokenizer */
  const char *pInput, int nBytes,        /* String to be tokenized */
  sqlite3_tokenizer_cursor **ppCursor    /* OUT: Tokenization cursor */
){
  simple_tokenizer_cursor *c;

  c = (simple_tokenizer_cursor *) malloc(sizeof(*c));
  if( c==NULL ) return SQLITE_NOMEM;

  c->pInput = pInput;
  if( pInput==0 ){
    c->nBytes = 0;
  }else if( nBytes<0 ){
    c->nBytes = (int)strlen(pInput);
  }else{
    c->nBytes = nBytes;
  }
  c->iOffset = 0;                 /* start tokenizing at the beginning */
  c->iToken = 0;
  c->pToken = NULL;               /* no space allocated, yet. */
  c->nTokenAllocated = 0;

  *ppCursor = &c->base;
  return SQLITE_OK;
}

/*
** Close a tokenization cursor previously opened by a call to
** simpleOpen() above.
*/
static int simpleClose(sqlite3_tokenizer_cursor *pCursor){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  free(c->pToken);
  free(c);
  return SQLITE_OK;
}

/*
** Extract the next token from a tokenization cursor.  The cursor must
** have been opened by a prior call to simpleOpen().
*/
static int simpleNext(
  sqlite3_tokenizer_cursor *pCursor,  /* Cursor returned by simpleOpen */
  const char **ppToken,               /* OUT: *ppToken is the token text */
  int *pnBytes,                       /* OUT: Number of bytes in token */
  int *piStartOffset,                 /* OUT: Starting offset of token */
  int *piEndOffset,                   /* OUT: Ending offset of token */
  int *piPosition                     /* OUT: Position integer of token */
){
  simple_tokenizer_cursor *c = (simple_tokenizer_cursor *) pCursor;
  simple_tokenizer *t = (simple_tokenizer *) pCursor->pTokenizer;
  unsigned char *p = (unsigned char *)c->pInput;

  while( c->iOffset<c->nBytes ){
    int iStartOffset;

    /* Scan past delimiter characters */
    while( c->iOffset<c->nBytes && isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    /* Count non-delimiter characters. */
    iStartOffset = c->iOffset;
    while( c->iOffset<c->nBytes && !isDelim(t, p[c->iOffset]) ){
      c->iOffset++;
    }

    if( c->iOffset>iStartOffset ){
      int i, n = c->iOffset-iStartOffset;
      if( n>c->nTokenAllocated ){
        c->nTokenAllocated = n+20;
        c->pToken = realloc(c->pToken, c->nTokenAllocated);
        if( c->pToken==NULL ) return SQLITE_NOMEM;
      }
      for(i=0; i<n; i++){
        /* TODO(shess) This needs expansion to handle UTF-8
        ** case-insensitivity.
        */
        unsigned char ch = p[iStartOffset+i];
        c->pToken[i] = ch<0x80 ? tolower(ch) : ch;
      }
      *ppToken = c->pToken;
      *pnBytes = n;
      *piStartOffset = iStartOffset;
      *piEndOffset = c->iOffset;
      *piPosition = c->iToken++;

      return SQLITE_OK;
    }
  }
  return SQLITE_DONE;
}

/*
** The set of routines that implement the simple tokenizer
*/
static const sqlite3_tokenizer_module simpleTokenizerModule = {
  0,
  simpleCreate,
  simpleDestroy,
  simpleOpen,
  simpleClose,
  simpleNext,
};

/*
** Allocate a new simple tokenizer.  Return a pointer to the new
** tokenizer in *ppModule
*/
void sqlite3Fts1SimpleTokenizerModule(
  sqlite3_tokenizer_module const**ppModule
){
  *ppModule = &simpleTokenizerModule;
}

#endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1) */
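/*
** Illustrative sketch (not part of the original FTS1 sources): how a caller
** might drive the simple tokenizer directly through its module vtable.
** The guard macro FTS1_TOKENIZER_USAGE_EXAMPLE is hypothetical and never
** defined by any build, so this block is compiled out by default; it also
** repeats the FTS1 condition so it only compiles when the declarations
** above are visible.
*/
#if defined(FTS1_TOKENIZER_USAGE_EXAMPLE) && \
    (!defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS1))
static void fts1SimpleTokenizerExample(const char *zText){
  const sqlite3_tokenizer_module *pModule;
  sqlite3_tokenizer *pTokenizer = 0;
  sqlite3_tokenizer_cursor *pCursor = 0;
  const char *zToken;
  int nToken, iStart, iEnd, iPos;

  sqlite3Fts1SimpleTokenizerModule(&pModule);

  /* argc==0: fall back to the default delimiter set (non-alphanumeric
  ** ASCII).  A delimiter string could instead be supplied via argv[1]. */
  if( pModule->xCreate(0, 0, &pTokenizer)!=SQLITE_OK ) return;

  if( pModule->xOpen(pTokenizer, zText, -1, &pCursor)==SQLITE_OK ){
    /* The FTS1 core links the cursor back to its tokenizer after xOpen;
    ** simpleNext() dereferences pCursor->pTokenizer, so a direct caller
    ** must do the same. */
    pCursor->pTokenizer = pTokenizer;
    while( pModule->xNext(pCursor, &zToken, &nToken,
                          &iStart, &iEnd, &iPos)==SQLITE_OK ){
      printf("%d: %.*s [%d,%d)\n", iPos, nToken, zToken, iStart, iEnd);
    }
    pModule->xClose(pCursor);
  }
  pModule->xDestroy(pTokenizer);
}
#endif /* FTS1_TOKENIZER_USAGE_EXAMPLE */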