Parcourir la source

Improve FTS tokenization performance

- `CFStringGetBytes` was calculating the UTF-8 offset from the beginning of the string to the current token again and again for each token. The longer the document, the longer this operation took. Most tokenizers are sequential, so this repeated calculation was unnecessary.
- Started caching the last token range and its UTF-8 offset to avoid this repeated calculation.
- Extremely long documents (1 million characters) could take 30-40 seconds to index on a fast machine. Now they are indexed in less than a second
- Non-sequential tokenizers are still supported: the cached range is reset if a token does not immediately follow the previous token.
Anton Sotkov il y a 9 ans
Parent
commit
96d1b6fb3e
2 fichiers modifiés avec 30 ajouts et 11 suppressions
  1. 3 1
      src/extra/fts3/FMDatabase+FTS3.h
  2. 27 10
      src/extra/fts3/FMDatabase+FTS3.m

+ 3 - 1
src/extra/fts3/FMDatabase+FTS3.h

@@ -59,11 +59,13 @@ typedef struct FMTokenizerCursor
 {
     void       *tokenizer;      /* Internal SQLite reference */
     CFStringRef inputString;    /* The input text being tokenized */
-    CFRange     currentRange;   /* The current offset within `inputString` */
+    CFRange     currentRange;   /* The current range within `inputString` */
     CFStringRef tokenString;    /* The contents of the current token */
     CFTypeRef   userObject;     /* Additional state for the cursor */
     int         tokenIndex;     /* Index of next token to be returned */
     UInt8       outputBuf[128]; /* Result for SQLite */
+    CFRange     previousRange;  /* Cached range of previous token within `inputString` */
+    CFRange     previousOffsetRange; /* Cached range of previous token as UTF-8 offset */
 } FMTokenizerCursor;
 
 @protocol FMTokenizerDelegate

+ 27 - 10
src/extra/fts3/FMDatabase+FTS3.m

@@ -96,6 +96,8 @@ static int FMDBTokenizerOpen(sqlite3_tokenizer *pTokenizer,         /* The token
     cursor->tokenString = NULL;
     cursor->userObject = NULL;
     cursor->outputBuf[0] = '\0';
+    cursor->previousRange = CFRangeMake(0, 0);
+    cursor->previousOffsetRange = CFRangeMake(0, 0);
         
     [tokenizer->delegate openTokenizerCursor:cursor];
 
@@ -146,27 +148,42 @@ static int FMDBTokenizerNext(sqlite3_tokenizer_cursor *pCursor,  /* Cursor retur
         return SQLITE_DONE;
     }
     
-    // The range from the tokenizer is in UTF-16 positions, we need give UTF-8 positions to SQLite.
-    CFIndex startOffset, endOffset, newBytesUsed;
-    CFRange rangeToStartToken = CFRangeMake(0, cursor->currentRange.location);
-    CFRange newTokenRange = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
-    
+    // The range from the tokenizer is in UTF-16 positions, we need give UTF-8 positions to SQLite
+    // Conversion to bytes is very expensive on longer strings. In order to avoid processing the same data over and over again for each token, we cache the previousRange and previousOffsetRange
+    // Not all tokenizers may process strings sequentially. Reset the cached ranges if necessary
+    if (cursor->currentRange.location < cursor->previousRange.location + cursor->previousRange.length) {
+        cursor->previousRange = CFRangeMake(0, 0);
+        cursor->previousOffsetRange = CFRangeMake(0, 0);
+    }
+
+    // First calculate the offset of current token range in original string
+    CFIndex locationOffset, lengthOffset;
+    const CFRange rangeToStartToken = CFRangeMake((cursor->previousRange.location + cursor->previousRange.length), cursor->currentRange.location - (cursor->previousRange.location + cursor->previousRange.length));
+
     // This will tell us how many UTF-8 bytes there are before the start of the token
     CFStringGetBytes(cursor->inputString, rangeToStartToken, kCFStringEncodingUTF8, '?', false,
-                     NULL, 0, &startOffset);
-
+                     NULL, 0, &locationOffset);
     // and how many UTF-8 bytes there are within the token in the original string
     CFStringGetBytes(cursor->inputString, cursor->currentRange, kCFStringEncodingUTF8, '?', false,
-                     NULL, 0, &endOffset);
+                     NULL, 0, &lengthOffset);
+
+    // Update the location offset
+    locationOffset += (cursor->previousOffsetRange.location + cursor->previousOffsetRange.length);
+
+    // Cache the data to reuse on next token
+    cursor->previousRange = cursor->currentRange;
+    cursor->previousOffsetRange = CFRangeMake(locationOffset, lengthOffset);
 
     // Determine how many bytes the new token string uses
+    CFIndex newBytesUsed;
+    const CFRange newTokenRange = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
     CFStringGetBytes(cursor->tokenString, newTokenRange, kCFStringEncodingUTF8, '?', false,
                      cursor->outputBuf, sizeof(cursor->outputBuf), &newBytesUsed);
     
     *pzToken = (char *) cursor->outputBuf;
     *pnBytes = (int) newBytesUsed;
-    *piStartOffset = (int) startOffset;
-    *piEndOffset = (int) (startOffset + endOffset);
+    *piStartOffset = (int) locationOffset;
+    *piEndOffset = (int) (locationOffset + lengthOffset);
     *piPosition = cursor->tokenIndex++;
     
     return SQLITE_OK;