os/persistentdata/persistentstorage/sqlite3api/SQLite/fts3_tokenizer.c
author sl
Tue, 10 Jun 2014 14:32:02 +0200
changeset 1 260cb5ec6c19
permissions -rw-r--r--
Update contrib.
     1 /*
     2 ** 2007 June 22
     3 **
     4 ** The author disclaims copyright to this source code.  In place of
     5 ** a legal notice, here is a blessing:
     6 **
     7 **    May you do good and not evil.
     8 **    May you find forgiveness for yourself and forgive others.
     9 **    May you share freely, never taking more than you give.
    10 **
    11 ******************************************************************************
    12 **
    13 ** This is part of an SQLite module implementing full-text search.
    14 ** This particular file implements the generic tokenizer interface.
    15 */
    16 
    17 /*
    18 ** The code in this file is only compiled if:
    19 **
    20 **     * The FTS3 module is being built as an extension
    21 **       (in which case SQLITE_CORE is not defined), or
    22 **
    23 **     * The FTS3 module is being built into the core of
    24 **       SQLite (in which case SQLITE_ENABLE_FTS3 is defined).
    25 */
    26 #if !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3)
    27 
    28 #include "sqlite3ext.h"
    29 #ifndef SQLITE_CORE
    30   SQLITE_EXTENSION_INIT1
    31 #endif
    32 
    33 #include "fts3_hash.h"
    34 #include "fts3_tokenizer.h"
    35 #include <assert.h>
    36 
    37 /*
    38 ** Implementation of the SQL scalar function for accessing the underlying 
    39 ** hash table. This function may be called as follows:
    40 **
    41 **   SELECT <function-name>(<key-name>);
    42 **   SELECT <function-name>(<key-name>, <pointer>);
    43 **
    44 ** where <function-name> is the name passed as the second argument
    45 ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer').
    46 **
    47 ** If the <pointer> argument is specified, it must be a blob value
    48 ** containing a pointer to be stored as the hash data corresponding
    49 ** to the string <key-name>. If <pointer> is not specified, then
    50 ** the string <key-name> must already exist in the has table. Otherwise,
    51 ** an error is returned.
    52 **
    53 ** Whether or not the <pointer> argument is specified, the value returned
    54 ** is a blob containing the pointer stored as the hash data corresponding
    55 ** to string <key-name> (after the hash-table is updated, if applicable).
    56 */
    57 static void scalarFunc(
    58   sqlite3_context *context,
    59   int argc,
    60   sqlite3_value **argv
    61 ){
    62   fts3Hash *pHash;
    63   void *pPtr = 0;
    64   const unsigned char *zName;
    65   int nName;
    66 
    67   assert( argc==1 || argc==2 );
    68 
    69   pHash = (fts3Hash *)sqlite3_user_data(context);
    70 
    71   zName = sqlite3_value_text(argv[0]);
    72   nName = sqlite3_value_bytes(argv[0])+1;
    73 
    74   if( argc==2 ){
    75     void *pOld;
    76     int n = sqlite3_value_bytes(argv[1]);
    77     if( n!=sizeof(pPtr) ){
    78       sqlite3_result_error(context, "argument type mismatch", -1);
    79       return;
    80     }
    81     pPtr = *(void **)sqlite3_value_blob(argv[1]);
    82     pOld = sqlite3Fts3HashInsert(pHash, (void *)zName, nName, pPtr);
    83     if( pOld==pPtr ){
    84       sqlite3_result_error(context, "out of memory", -1);
    85       return;
    86     }
    87   }else{
    88     pPtr = sqlite3Fts3HashFind(pHash, zName, nName);
    89     if( !pPtr ){
    90       char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
    91       sqlite3_result_error(context, zErr, -1);
    92       sqlite3_free(zErr);
    93       return;
    94     }
    95   }
    96 
    97   sqlite3_result_blob(context, (void *)&pPtr, sizeof(pPtr), SQLITE_TRANSIENT);
    98 }
    99 
   100 #ifdef SQLITE_TEST
   101 
   102 #include "tcl.h"
   103 #include <string.h>
   104 
   105 /*
   106 ** Implementation of a special SQL scalar function for testing tokenizers 
   107 ** designed to be used in concert with the Tcl testing framework. This
   108 ** function must be called with two arguments:
   109 **
   110 **   SELECT <function-name>(<key-name>, <input-string>);
   111 **   SELECT <function-name>(<key-name>, <pointer>);
   112 **
   113 ** where <function-name> is the name passed as the second argument
   114 ** to the sqlite3Fts3InitHashTable() function (e.g. 'fts3_tokenizer')
   115 ** concatenated with the string '_test' (e.g. 'fts3_tokenizer_test').
   116 **
   117 ** The return value is a string that may be interpreted as a Tcl
   118 ** list. For each token in the <input-string>, three elements are
   119 ** added to the returned list. The first is the token position, the 
   120 ** second is the token text (folded, stemmed, etc.) and the third is the
   121 ** substring of <input-string> associated with the token. For example, 
   122 ** using the built-in "simple" tokenizer:
   123 **
   124 **   SELECT fts_tokenizer_test('simple', 'I don't see how');
   125 **
   126 ** will return the string:
   127 **
   128 **   "{0 i I 1 dont don't 2 see see 3 how how}"
   129 **   
   130 */
   131 static void testFunc(
   132   sqlite3_context *context,
   133   int argc,
   134   sqlite3_value **argv
   135 ){
   136   fts3Hash *pHash;
   137   sqlite3_tokenizer_module *p;
   138   sqlite3_tokenizer *pTokenizer = 0;
   139   sqlite3_tokenizer_cursor *pCsr = 0;
   140 
   141   const char *zErr = 0;
   142 
   143   const char *zName;
   144   int nName;
   145   const char *zInput;
   146   int nInput;
   147 
   148   const char *zArg = 0;
   149 
   150   const char *zToken;
   151   int nToken;
   152   int iStart;
   153   int iEnd;
   154   int iPos;
   155 
   156   Tcl_Obj *pRet;
   157 
   158   assert( argc==2 || argc==3 );
   159 
   160   nName = sqlite3_value_bytes(argv[0]);
   161   zName = (const char *)sqlite3_value_text(argv[0]);
   162   nInput = sqlite3_value_bytes(argv[argc-1]);
   163   zInput = (const char *)sqlite3_value_text(argv[argc-1]);
   164 
   165   if( argc==3 ){
   166     zArg = (const char *)sqlite3_value_text(argv[1]);
   167   }
   168 
   169   pHash = (fts3Hash *)sqlite3_user_data(context);
   170   p = (sqlite3_tokenizer_module *)sqlite3Fts3HashFind(pHash, zName, nName+1);
   171 
   172   if( !p ){
   173     char *zErr = sqlite3_mprintf("unknown tokenizer: %s", zName);
   174     sqlite3_result_error(context, zErr, -1);
   175     sqlite3_free(zErr);
   176     return;
   177   }
   178 
   179   pRet = Tcl_NewObj();
   180   Tcl_IncrRefCount(pRet);
   181 
   182   if( SQLITE_OK!=p->xCreate(zArg ? 1 : 0, &zArg, &pTokenizer) ){
   183     zErr = "error in xCreate()";
   184     goto finish;
   185   }
   186   pTokenizer->pModule = p;
   187   if( SQLITE_OK!=p->xOpen(pTokenizer, zInput, nInput, &pCsr) ){
   188     zErr = "error in xOpen()";
   189     goto finish;
   190   }
   191   pCsr->pTokenizer = pTokenizer;
   192 
   193   while( SQLITE_OK==p->xNext(pCsr, &zToken, &nToken, &iStart, &iEnd, &iPos) ){
   194     Tcl_ListObjAppendElement(0, pRet, Tcl_NewIntObj(iPos));
   195     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
   196     zToken = &zInput[iStart];
   197     nToken = iEnd-iStart;
   198     Tcl_ListObjAppendElement(0, pRet, Tcl_NewStringObj(zToken, nToken));
   199   }
   200 
   201   if( SQLITE_OK!=p->xClose(pCsr) ){
   202     zErr = "error in xClose()";
   203     goto finish;
   204   }
   205   if( SQLITE_OK!=p->xDestroy(pTokenizer) ){
   206     zErr = "error in xDestroy()";
   207     goto finish;
   208   }
   209 
   210 finish:
   211   if( zErr ){
   212     sqlite3_result_error(context, zErr, -1);
   213   }else{
   214     sqlite3_result_text(context, Tcl_GetString(pRet), -1, SQLITE_TRANSIENT);
   215   }
   216   Tcl_DecrRefCount(pRet);
   217 }
   218 
   219 static
   220 int registerTokenizer(
   221   sqlite3 *db, 
   222   char *zName, 
   223   const sqlite3_tokenizer_module *p
   224 ){
   225   int rc;
   226   sqlite3_stmt *pStmt;
   227   const char zSql[] = "SELECT fts3_tokenizer(?, ?)";
   228 
   229   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
   230   if( rc!=SQLITE_OK ){
   231     return rc;
   232   }
   233 
   234   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
   235   sqlite3_bind_blob(pStmt, 2, &p, sizeof(p), SQLITE_STATIC);
   236   sqlite3_step(pStmt);
   237 
   238   return sqlite3_finalize(pStmt);
   239 }
   240 
   241 static
   242 int queryTokenizer(
   243   sqlite3 *db, 
   244   char *zName,  
   245   const sqlite3_tokenizer_module **pp
   246 ){
   247   int rc;
   248   sqlite3_stmt *pStmt;
   249   const char zSql[] = "SELECT fts3_tokenizer(?)";
   250 
   251   *pp = 0;
   252   rc = sqlite3_prepare_v2(db, zSql, -1, &pStmt, 0);
   253   if( rc!=SQLITE_OK ){
   254     return rc;
   255   }
   256 
   257   sqlite3_bind_text(pStmt, 1, zName, -1, SQLITE_STATIC);
   258   if( SQLITE_ROW==sqlite3_step(pStmt) ){
   259     if( sqlite3_column_type(pStmt, 0)==SQLITE_BLOB ){
   260       memcpy(pp, sqlite3_column_blob(pStmt, 0), sizeof(*pp));
   261     }
   262   }
   263 
   264   return sqlite3_finalize(pStmt);
   265 }
   266 
   267 void sqlite3Fts3SimpleTokenizerModule(sqlite3_tokenizer_module const**ppModule);
   268 
   269 /*
   270 ** Implementation of the scalar function fts3_tokenizer_internal_test().
   271 ** This function is used for testing only, it is not included in the
   272 ** build unless SQLITE_TEST is defined.
   273 **
   274 ** The purpose of this is to test that the fts3_tokenizer() function
   275 ** can be used as designed by the C-code in the queryTokenizer and
   276 ** registerTokenizer() functions above. These two functions are repeated
   277 ** in the README.tokenizer file as an example, so it is important to
   278 ** test them.
   279 **
   280 ** To run the tests, evaluate the fts3_tokenizer_internal_test() scalar
   281 ** function with no arguments. An assert() will fail if a problem is
   282 ** detected. i.e.:
   283 **
   284 **     SELECT fts3_tokenizer_internal_test();
   285 **
   286 */
   287 static void intTestFunc(
   288   sqlite3_context *context,
   289   int argc,
   290   sqlite3_value **argv
   291 ){
   292   int rc;
   293   const sqlite3_tokenizer_module *p1;
   294   const sqlite3_tokenizer_module *p2;
   295   sqlite3 *db = (sqlite3 *)sqlite3_user_data(context);
   296 
   297   /* Test the query function */
   298   sqlite3Fts3SimpleTokenizerModule(&p1);
   299   rc = queryTokenizer(db, "simple", &p2);
   300   assert( rc==SQLITE_OK );
   301   assert( p1==p2 );
   302   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
   303   assert( rc==SQLITE_ERROR );
   304   assert( p2==0 );
   305   assert( 0==strcmp(sqlite3_errmsg(db), "unknown tokenizer: nosuchtokenizer") );
   306 
   307   /* Test the storage function */
   308   rc = registerTokenizer(db, "nosuchtokenizer", p1);
   309   assert( rc==SQLITE_OK );
   310   rc = queryTokenizer(db, "nosuchtokenizer", &p2);
   311   assert( rc==SQLITE_OK );
   312   assert( p2==p1 );
   313 
   314   sqlite3_result_text(context, "ok", -1, SQLITE_STATIC);
   315 }
   316 
   317 #endif
   318 
   319 /*
   320 ** Set up SQL objects in database db used to access the contents of
   321 ** the hash table pointed to by argument pHash. The hash table must
   322 ** been initialised to use string keys, and to take a private copy 
   323 ** of the key when a value is inserted. i.e. by a call similar to:
   324 **
   325 **    sqlite3Fts3HashInit(pHash, FTS3_HASH_STRING, 1);
   326 **
   327 ** This function adds a scalar function (see header comment above
   328 ** scalarFunc() in this file for details) and, if ENABLE_TABLE is
   329 ** defined at compilation time, a temporary virtual table (see header 
   330 ** comment above struct HashTableVtab) to the database schema. Both 
   331 ** provide read/write access to the contents of *pHash.
   332 **
   333 ** The third argument to this function, zName, is used as the name
   334 ** of both the scalar and, if created, the virtual table.
   335 */
   336 int sqlite3Fts3InitHashTable(
   337   sqlite3 *db, 
   338   fts3Hash *pHash, 
   339   const char *zName
   340 ){
   341   int rc = SQLITE_OK;
   342   void *p = (void *)pHash;
   343   const int any = SQLITE_ANY;
   344   char *zTest = 0;
   345   char *zTest2 = 0;
   346 
   347 #ifdef SQLITE_TEST
   348   void *pdb = (void *)db;
   349   zTest = sqlite3_mprintf("%s_test", zName);
   350   zTest2 = sqlite3_mprintf("%s_internal_test", zName);
   351   if( !zTest || !zTest2 ){
   352     rc = SQLITE_NOMEM;
   353   }
   354 #endif
   355 
   356   if( rc!=SQLITE_OK
   357    || (rc = sqlite3_create_function(db, zName, 1, any, p, scalarFunc, 0, 0))
   358    || (rc = sqlite3_create_function(db, zName, 2, any, p, scalarFunc, 0, 0))
   359 #ifdef SQLITE_TEST
   360    || (rc = sqlite3_create_function(db, zTest, 2, any, p, testFunc, 0, 0))
   361    || (rc = sqlite3_create_function(db, zTest, 3, any, p, testFunc, 0, 0))
   362    || (rc = sqlite3_create_function(db, zTest2, 0, any, pdb, intTestFunc, 0, 0))
   363 #endif
   364   );
   365 
   366   sqlite3_free(zTest);
   367   sqlite3_free(zTest2);
   368   return rc;
   369 }
   370 
   371 #endif /* !defined(SQLITE_CORE) || defined(SQLITE_ENABLE_FTS3) */