/* fts3_tokenizer.h (6.2 KB) */
/*
** 2006 July 10
**
** The author disclaims copyright to this source code.
**
*************************************************************************
** Defines the interface to tokenizers used by fulltext-search. There
** are three basic components:
**
** sqlite3_tokenizer_module is a singleton defining the tokenizer
** interface functions. This is essentially the class structure for
** tokenizers.
**
** sqlite3_tokenizer is used to define a particular tokenizer, perhaps
** including customization information defined at creation time.
**
** sqlite3_tokenizer_cursor is created by a tokenizer to produce
** tokens from a particular input.
*/
#ifndef _FTS3_TOKENIZER_H_
#define _FTS3_TOKENIZER_H_
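/* TODO(shess) Only used for SQLITE_OK and SQLITE_DONE at this time.
** If tokenizers are to be allowed to call sqlite3_*() functions, then
** we will need a way to register the API consistently.
**
** NOTE(review): this comment appears to describe an #include of the
** SQLite API header that is not present in this copy of the file;
** confirm whether the include was dropped.
*/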
/*
** Structures used by the tokenizer interface. When a new tokenizer
** implementation is registered, the caller provides a pointer to
** an sqlite3_tokenizer_module containing pointers to the callback
** functions that make up an implementation.
**
** When an fts3 table is created, it passes any arguments passed to
** the tokenizer clause of the CREATE VIRTUAL TABLE statement to the
** sqlite3_tokenizer_module.xCreate() function of the requested tokenizer
** implementation. The xCreate() function in turn returns an
** sqlite3_tokenizer structure representing the specific tokenizer to
** be used for the fts3 table (customized by the tokenizer clause arguments).
**
** To tokenize an input buffer, the sqlite3_tokenizer_module.xOpen()
** method is called. It returns an sqlite3_tokenizer_cursor object
** that may be used to tokenize a specific input buffer based on
** the tokenization rules supplied by a specific sqlite3_tokenizer
** object.
*/
  45. typedef struct sqlite3_tokenizer_module sqlite3_tokenizer_module;
  46. typedef struct sqlite3_tokenizer sqlite3_tokenizer;
  47. typedef struct sqlite3_tokenizer_cursor sqlite3_tokenizer_cursor;
  48. struct sqlite3_tokenizer_module {
  49. /*
  50. ** Structure version. Should always be set to 0 or 1.
  51. */
  52. int iVersion;
  53. /*
  54. ** Create a new tokenizer. The values in the argv[] array are the
  55. ** arguments passed to the "tokenizer" clause of the CREATE VIRTUAL
  56. ** TABLE statement that created the fts3 table. For example, if
  57. ** the following SQL is executed:
  58. **
  59. ** CREATE .. USING fts3( ... , tokenizer <tokenizer-name> arg1 arg2)
  60. **
  61. ** then argc is set to 2, and the argv[] array contains pointers
  62. ** to the strings "arg1" and "arg2".
  63. **
  64. ** This method should return either SQLITE_OK (0), or an SQLite error
  65. ** code. If SQLITE_OK is returned, then *ppTokenizer should be set
  66. ** to point at the newly created tokenizer structure. The generic
  67. ** sqlite3_tokenizer.pModule variable should not be initialised by
  68. ** this callback. The caller will do so.
  69. */
  70. int (*xCreate)(
  71. int argc, /* Size of argv array */
  72. const char *const*argv, /* Tokenizer argument strings */
  73. sqlite3_tokenizer **ppTokenizer /* OUT: Created tokenizer */
  74. );
  75. /*
  76. ** Destroy an existing tokenizer. The fts3 module calls this method
  77. ** exactly once for each successful call to xCreate().
  78. */
  79. int (*xDestroy)(sqlite3_tokenizer *pTokenizer);
  80. /*
  81. ** Create a tokenizer cursor to tokenize an input buffer. The caller
  82. ** is responsible for ensuring that the input buffer remains valid
  83. ** until the cursor is closed (using the xClose() method).
  84. */
  85. int (*xOpen)(
  86. sqlite3_tokenizer *pTokenizer, /* Tokenizer object */
  87. const char *pInput, int nBytes, /* Input buffer */
  88. sqlite3_tokenizer_cursor **ppCursor /* OUT: Created tokenizer cursor */
  89. );
  90. /*
  91. ** Destroy an existing tokenizer cursor. The fts3 module calls this
  92. ** method exactly once for each successful call to xOpen().
  93. */
  94. int (*xClose)(sqlite3_tokenizer_cursor *pCursor);
  95. /*
  96. ** Retrieve the next token from the tokenizer cursor pCursor. This
  97. ** method should either return SQLITE_OK and set the values of the
  98. ** "OUT" variables identified below, or SQLITE_DONE to indicate that
  99. ** the end of the buffer has been reached, or an SQLite error code.
  100. **
  101. ** *ppToken should be set to point at a buffer containing the
  102. ** normalized version of the token (i.e. after any case-folding and/or
  103. ** stemming has been performed). *pnBytes should be set to the length
  104. ** of this buffer in bytes. The input text that generated the token is
  105. ** identified by the byte offsets returned in *piStartOffset and
  106. ** *piEndOffset. *piStartOffset should be set to the index of the first
  107. ** byte of the token in the input buffer. *piEndOffset should be set
  108. ** to the index of the first byte just past the end of the token in
  109. ** the input buffer.
  110. **
  111. ** The buffer *ppToken is set to point at is managed by the tokenizer
  112. ** implementation. It is only required to be valid until the next call
  113. ** to xNext() or xClose().
  114. */
  115. /* TODO(shess) current implementation requires pInput to be
  116. ** nul-terminated. This should either be fixed, or pInput/nBytes
  117. ** should be converted to zInput.
  118. */
  119. int (*xNext)(
  120. sqlite3_tokenizer_cursor *pCursor, /* Tokenizer cursor */
  121. const char **ppToken, int *pnBytes, /* OUT: Normalized text for token */
  122. int *piStartOffset, /* OUT: Byte offset of token in input buffer */
  123. int *piEndOffset, /* OUT: Byte offset of end of token in input buffer */
  124. int *piPosition /* OUT: Number of tokens returned before this one */
  125. );
  126. /***********************************************************************
  127. ** Methods below this point are only available if iVersion>=1.
  128. */
  129. /*
  130. ** Configure the language id of a tokenizer cursor.
  131. */
  132. int (*xLanguageid)(sqlite3_tokenizer_cursor *pCsr, int iLangid);
  133. };
  134. struct sqlite3_tokenizer {
  135. const sqlite3_tokenizer_module *pModule; /* The module for this tokenizer */
  136. /* Tokenizer implementations will typically add additional fields */
  137. };
  138. struct sqlite3_tokenizer_cursor {
  139. sqlite3_tokenizer *pTokenizer; /* Tokenizer for this cursor. */
  140. /* Tokenizer implementations will typically add additional fields */
  141. };
  142. int fts3_global_term_cnt(int iTerm, int iCol);
  143. int fts3_term_cnt(int iTerm, int iCol);
#endif /* _FTS3_TOKENIZER_H_ */