os/persistentdata/persistentstorage/sqlite3api/SQLite/fts3_tokenizer.h
author sl@SLION-WIN7.fritz.box
Fri, 15 Jun 2012 03:10:57 +0200
changeset 0 bde4ae8d615e
permissions -rw-r--r--
First public contribution.
sl@0
     1
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search.  There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions.  This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is created by a tokenizer to generate
** tokens from a particular input.
*/
sl@0
    20
#ifndef _FTS3_TOKENIZER_H_
sl@0
    21
#define _FTS3_TOKENIZER_H_
sl@0
    22
sl@0
    23
/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
*/
sl@0
    27
#include "sqlite3.h"
sl@0
    28
sl@0
    29
/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
sl@0
    48
/*
** Typedef names for the three tokenizer interface structures, declared
** up front so that the structure definitions can refer to one another.
*/
typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
typedef struct sqlite3_tokenizer sqlite3_tokenizer;
typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
sl@0
    51
sl@0
    52
struct sqlite3_tokenizer_module {
sl@0
    53
sl@0
    54
  /*
sl@0
    55
  ** Structure version. Should always be set to 0.
sl@0
    56
  */
sl@0
    57
  int iVersion;
sl@0
    58
sl@0
    59
  /*
sl@0
    60
  ** Create a new tokenizer. The values in the argv[] array are the
sl@0
    61
  ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
sl@0
    62
  ** TABLE statement that created the fts3 table. For example, if
sl@0
    63
  ** the following SQL is executed:
sl@0
    64
  **
sl@0
    65
  **   CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
sl@0
    66
  **
sl@0
    67
  ** then argc is set to 2, and the argv[] array contains pointers
sl@0
    68
  ** to the strings "arg1" and "arg2".
sl@0
    69
  **
sl@0
    70
  ** This method should return either SQLITE_OK (0), or an SQLite error 
sl@0
    71
  ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
sl@0
    72
  ** to point at the newly created tokenizer structure. The generic
sl@0
    73
  ** sqlite3_tokenizer.pModule variable should not be initialised by
sl@0
    74
  ** this callback. The caller will do so.
sl@0
    75
  */
sl@0
    76
  int (*xCreate)(
sl@0
    77
    int argc,                           /* Size of argv array */
sl@0
    78
    const char *const*argv,             /* Tokenizer argument strings */
sl@0
    79
    sqlite3_tokenizer **ppTokenizer     /* OUT: Created tokenizer */
sl@0
    80
  );
sl@0
    81
sl@0
    82
  /*
sl@0
    83
  ** Destroy an existing tokenizer. The fts3 module calls this method
sl@0
    84
  ** exactly once for each successful call to xCreate().
sl@0
    85
  */
sl@0
    86
  int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
sl@0
    87
sl@0
    88
  /*
sl@0
    89
  ** Create a tokenizer cursor to tokenize an input buffer. The caller
sl@0
    90
  ** is responsible for ensuring that the input buffer remains valid
sl@0
    91
  ** until the cursor is closed (using the xClose() method). 
sl@0
    92
  */
sl@0
    93
  int (*xOpen)(
sl@0
    94
    sqlite3_tokenizer *pTokenizer,       /* Tokenizer object */
sl@0
    95
    const char *pInput, int nBytes,      /* Input buffer */
sl@0
    96
    sqlite3_tokenizer_cursor **ppCursor  /* OUT: Created tokenizer cursor */
sl@0
    97
  );
sl@0
    98
sl@0
    99
  /*
sl@0
   100
  ** Destroy an existing tokenizer cursor. The fts3 module calls this 
sl@0
   101
  ** method exactly once for each successful call to xOpen().
sl@0
   102
  */
sl@0
   103
  int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
sl@0
   104
sl@0
   105
  /*
sl@0
   106
  ** Retrieve the next token from the tokenizer cursor pCursor. This
sl@0
   107
  ** method should either return SQLITE_OK and set the values of the
sl@0
   108
  ** "OUT" variables identified below, or SQLITE_DONE to indicate that
sl@0
   109
  ** the end of the buffer has been reached, or an SQLite error code.
sl@0
   110
  **
sl@0
   111
  ** *ppToken should be set to point at a buffer containing the 
sl@0
   112
  ** normalized version of the token (i.e. after any case-folding and/or
sl@0
   113
  ** stemming has been performed). *pnBytes should be set to the length
sl@0
   114
  ** of this buffer in bytes. The input text that generated the token is
sl@0
   115
  ** identified by the byte offsets returned in *piStartOffset and
sl@0
   116
  ** *piEndOffset.
sl@0
   117
  **
sl@0
   118
  ** The buffer *ppToken is set to point at is managed by the tokenizer
sl@0
   119
  ** implementation. It is only required to be valid until the next call
sl@0
   120
  ** to xNext() or xClose(). 
sl@0
   121
  */
sl@0
   122
  /* TODO(shess) current implementation requires pInput to be
sl@0
   123
  ** nul-terminated.  This should either be fixed, or pInput/nBytes
sl@0
   124
  ** should be converted to zInput.
sl@0
   125
  */
sl@0
   126
  int (*xNext)(
sl@0
   127
    sqlite3_tokenizer_cursor *pCursor,   /* Tokenizer cursor */
sl@0
   128
    const char **ppToken, int *pnBytes,  /* OUT: Normalized text for token */
sl@0
   129
    int *piStartOffset,  /* OUT: Byte offset of token in input buffer */
sl@0
   130
    int *piEndOffset,    /* OUT: Byte offset of end of token in input buffer */
sl@0
   131
    int *piPosition      /* OUT: Number of tokens returned before this one */
sl@0
   132
  );
sl@0
   133
};
sl@0
   134
sl@0
   135
struct sqlite3_tokenizer {
sl@0
   136
  const sqlite3_tokenizer_module *pModule;  /* The module for this tokenizer */
sl@0
   137
  /* Tokenizer implementations will typically add additional fields */
sl@0
   138
};
sl@0
   139
sl@0
   140
struct sqlite3_tokenizer_cursor {
sl@0
   141
  sqlite3_tokenizer *pTokenizer;       /* Tokenizer for this cursor. */
sl@0
   142
  /* Tokenizer implementations will typically add additional fields */
sl@0
   143
};
sl@0
   144
sl@0
   145
#endif /* _FTS3_TOKENIZER_H_ */