os/persistentdata/persistentstorage/sqlite3api/SQLite/fts3_tokenizer.h
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
     1 /*
     2 ** 2006 July 10
     3 **
     4 ** The author disclaims copyright to this source code.
     5 **
     6 *************************************************************************
     7 ** Defines the interface to tokenizers used by fulltext-search.  There
     8 ** are three basic components:
     9 **
    10 ** sqlite3_tokenizer_module is a singleton defining the tokenizer
    11 ** interface functions.  This is essentially the class structure for
    12 ** tokenizers.
    13 **
    14 ** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
    15 ** including customization information defined at creation time.
    16 **
    17 ** sqlite3_tokenizer_cursor is created by a tokenizer to produce
    18 ** tokens from a particular input.
    19 */
    20 #ifndef _FTS3_TOKENIZER_H_
    21 #define _FTS3_TOKENIZER_H_
    22 
    23 /* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
    24 ** If tokenizers are to be allowed to call sqlite3_*() functions, then
    25 ** we will need a way to register the API consistently.
    26 */
    27 #include "sqlite3.h"
    28 
    29 /*
    30 ** Structures used by the tokenizer interface. When a new tokenizer
    31 ** implementation is registered, the caller provides a pointer to
    32 ** an sqlite3_tokenizer_module containing pointers to the callback
    33 ** functions that make up an implementation.
    34 **
    35 ** When an fts3 table is created, any arguments supplied in
    36 ** the tokenizer clause of the CREATE VIRTUAL TABLE statement are passed to
    37 ** the sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
    38 ** implementation. The xCreate() function in turn returns an 
    39 ** sqlite3_tokenizer structure representing the specific tokenizer to
    40 ** be used for the fts3 table (customized by the tokenizer clause arguments).
    41 **
    42 ** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
    43 ** method is called. It returns an sqlite3_tokenizer_cursor object
    44 ** that may be used to tokenize a specific input buffer based on
    45 ** the tokenization rules supplied by a specific sqlite3_tokenizer
    46 ** object.
    47 */
    48 typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;  /* Callback table for a tokenizer implementation */
    49 typedef struct sqlite3_tokenizer sqlite3_tokenizer;                /* One tokenizer, as returned by xCreate() */
    50 typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;  /* Cursor over one input buffer, as returned by xOpen() */
    51 
    52 struct sqlite3_tokenizer_module {
         /*
         ** Table of callbacks that together implement one tokenizer: xCreate
         ** and xDestroy manage tokenizer objects, xOpen and xClose manage
         ** cursors over an input buffer, and xNext returns tokens one at a
         ** time from a cursor.
         */
    53 
    54   /*
    55   ** Structure version. Should always be set to 0.
    56   */
    57   int iVersion;
    58 
    59   /*
    60   ** Create a new tokenizer. The values in the argv[] array are the
    61   ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
    62   ** TABLE statement that created the fts3 table. For example, if
    63   ** the following SQL is executed:
    64   **
    65   **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
    66   **
    67   ** then argc is set to 2, and the argv[] array contains pointers
    68   ** to the strings "arg1" and "arg2".
    69   **
    70   ** This method should return either SQLITE_OK (0), or an SQLite error
    71   ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
    72   ** to point at the newly created tokenizer structure. The generic
    73   ** sqlite3_tokenizer.pModule variable should not be initialised by
    74   ** this callback. The caller will do so.
         **
         ** Every successful xCreate() call is later matched by exactly one
         ** xDestroy() call (see xDestroy below).
    75   */
    76   int (*xCreate)(
    77     int argc,                           /* Size of argv array */
    78     const char *const*argv,             /* Tokenizer argument strings */
    79     sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
    80   );
    81 
    82   /*
    83   ** Destroy an existing tokenizer. The fts3 module calls this method
    84   ** exactly once for each successful call to xCreate().
         **
         ** NOTE(review): the meaning of the int return value is not stated
         ** here; presumably SQLITE_OK or an SQLite error code, as for
         ** xCreate() -- confirm against the fts3 caller.
    85   */
    86   int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
    87 
    88   /*
    89   ** Create a tokenizer cursor to tokenize an input buffer. The caller
    90   ** is responsible for ensuring that the input buffer remains valid
    91   ** until the cursor is closed (using the xClose() method).
         **
         ** pInput and nBytes describe the input buffer, and *ppCursor is the
         ** output parameter for the created cursor. Per the TODO(shess) note
         ** above xNext, the current implementation requires pInput to be
         ** nul-terminated.
         **
         ** NOTE(review): the return value is presumably SQLITE_OK or an
         ** SQLite error code, as for xCreate() -- not stated here, confirm.
    92   */
    93   int (*xOpen)(
    94     sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
    95     const char *pInput, int nBytes,      /* Input buffer */
    96     sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
    97   );
    98 
    99   /*
   100   ** Destroy an existing tokenizer cursor. The fts3 module calls this
   101   ** method exactly once for each successful call to xOpen().
         **
         ** NOTE(review): the meaning of the int return value is not stated
         ** here -- confirm against the fts3 caller.
   102   */
   103   int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
   104 
   105   /*
   106   ** Retrieve the next token from the tokenizer cursor pCursor. This
   107   ** method should either return SQLITE_OK and set the values of the
   108   ** "OUT" variables identified below, or SQLITE_DONE to indicate that
   109   ** the end of the buffer has been reached, or an SQLite error code.
   110   **
   111   ** *ppToken should be set to point at a buffer containing the
   112   ** normalized version of the token (i.e. after any case-folding and/or
   113   ** stemming has been performed). *pnBytes should be set to the length
   114   ** of this buffer in bytes. The input text that generated the token is
   115   ** identified by the byte offsets returned in *piStartOffset and
   116   ** *piEndOffset.
         **
         ** *piPosition should be set to the number of tokens returned
         ** before this one.
   117   **
   118   ** The buffer *ppToken is set to point at is managed by the tokenizer
   119   ** implementation. It is only required to be valid until the next call
   120   ** to xNext() or xClose().
         **
         ** NOTE(review): whether *piEndOffset is the offset of the last byte
         ** of the token or one past it is not stated here -- confirm against
         ** a tokenizer implementation.
   121   */
   122   /* TODO(shess) current implementation requires pInput to be
   123   ** nul-terminated.  This should either be fixed, or pInput/nBytes
   124   ** should be converted to zInput.
   125   */
   126   int (*xNext)(
   127     sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
   128     const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
   129     int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
   130     int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
   131     int *piPosition      /* OUT: Number of tokens returned before this one */
   132   );
   133 };
   134 
   135 struct sqlite3_tokenizer {
         /*
         ** Generic part of a tokenizer object, as returned through
         ** *ppTokenizer by sqlite3_tokenizer_module.xCreate(). pModule is
         ** filled in by the caller of xCreate(), not by xCreate() itself.
         **
         ** NOTE(review): implementations presumably embed this struct at the
         ** start of a larger private struct to add their own fields --
         ** confirm against a tokenizer implementation.
         */
   136   const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
   137   /* Tokenizer implementations will typically add additional fields */
   138 };
   139 
   140 struct sqlite3_tokenizer_cursor {
         /*
         ** Generic part of a tokenizer cursor, as returned through *ppCursor
         ** by sqlite3_tokenizer_module.xOpen() and consumed by xNext() and
         ** xClose().
         **
         ** NOTE(review): this header does not say whether xOpen() or its
         ** caller is responsible for setting pTokenizer -- confirm against
         ** the fts3 caller.
         */
   141   sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
   142   /* Tokenizer implementations will typically add additional fields */
   143 };
   144 
   145 #endif /* _FTS3_TOKENIZER_H_ */