Explorar el Código

Merge pull request #466 from antons/master

Improve FTS tokenization performance
August "Gus" Mueller hace 9 años
padre
commit
e6772b9585
Se han modificado 2 ficheros con 30 adiciones y 11 borrados
  1. + 3 - 1
      src/extra/fts3/FMDatabase+FTS3.h
  2. + 27 - 10
      src/extra/fts3/FMDatabase+FTS3.m

+ 3 - 1
src/extra/fts3/FMDatabase+FTS3.h

@@ -59,11 +59,13 @@ typedef struct FMTokenizerCursor
 {
     void       *tokenizer;      /* Internal SQLite reference */
     CFStringRef inputString;    /* The input text being tokenized */
-    CFRange     currentRange;   /* The current offset within `inputString` */
+    CFRange     currentRange;   /* The current range within `inputString` */
     CFStringRef tokenString;    /* The contents of the current token */
     CFTypeRef   userObject;     /* Additional state for the cursor */
     int         tokenIndex;     /* Index of next token to be returned */
     UInt8       outputBuf[128]; /* Result for SQLite */
+    CFRange     previousRange;  /* Cached range of previous token within `inputString` */
+    CFRange     previousOffsetRange; /* Cached range of previous token as UTF-8 offset */
 } FMTokenizerCursor;
 
 @protocol FMTokenizerDelegate

+ 27 - 10
src/extra/fts3/FMDatabase+FTS3.m

@@ -96,6 +96,8 @@ static int FMDBTokenizerOpen(sqlite3_tokenizer *pTokenizer,         /* The token
     cursor->tokenString = NULL;
     cursor->userObject = NULL;
     cursor->outputBuf[0] = '\0';
+    cursor->previousRange = CFRangeMake(0, 0);
+    cursor->previousOffsetRange = CFRangeMake(0, 0);
         
     [tokenizer->delegate openTokenizerCursor:cursor];
 
@@ -146,27 +148,42 @@ static int FMDBTokenizerNext(sqlite3_tokenizer_cursor *pCursor,  /* Cursor retur
         return SQLITE_DONE;
     }
     
-    // The range from the tokenizer is in UTF-16 positions, we need give UTF-8 positions to SQLite.
-    CFIndex startOffset, endOffset, newBytesUsed;
-    CFRange rangeToStartToken = CFRangeMake(0, cursor->currentRange.location);
-    CFRange newTokenRange = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
-    
+    // The range from the tokenizer is in UTF-16 positions, we need to give UTF-8 positions to SQLite
+    // Conversion to bytes is very expensive on longer strings. In order to avoid processing the same data over and over again for each token, we cache the previousRange and previousOffsetRange
+    // Not all tokenizers may process strings sequentially. Reset the cached ranges if necessary
+    if (cursor->currentRange.location < cursor->previousRange.location + cursor->previousRange.length) {
+        cursor->previousRange = CFRangeMake(0, 0);
+        cursor->previousOffsetRange = CFRangeMake(0, 0);
+    }
+
+    // First calculate the offset of current token range in original string
+    CFIndex locationOffset, lengthOffset;
+    const CFRange rangeToStartToken = CFRangeMake((cursor->previousRange.location + cursor->previousRange.length), cursor->currentRange.location - (cursor->previousRange.location + cursor->previousRange.length));
+
     // This will tell us how many UTF-8 bytes there are before the start of the token
     CFStringGetBytes(cursor->inputString, rangeToStartToken, kCFStringEncodingUTF8, '?', false,
-                     NULL, 0, &startOffset);
-
+                     NULL, 0, &locationOffset);
     // and how many UTF-8 bytes there are within the token in the original string
     CFStringGetBytes(cursor->inputString, cursor->currentRange, kCFStringEncodingUTF8, '?', false,
-                     NULL, 0, &endOffset);
+                     NULL, 0, &lengthOffset);
+
+    // Update the location offset
+    locationOffset += (cursor->previousOffsetRange.location + cursor->previousOffsetRange.length);
+
+    // Cache the data to reuse on next token
+    cursor->previousRange = cursor->currentRange;
+    cursor->previousOffsetRange = CFRangeMake(locationOffset, lengthOffset);
 
     // Determine how many bytes the new token string uses
+    CFIndex newBytesUsed;
+    const CFRange newTokenRange = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
     CFStringGetBytes(cursor->tokenString, newTokenRange, kCFStringEncodingUTF8, '?', false,
                      cursor->outputBuf, sizeof(cursor->outputBuf), &newBytesUsed);
     
     *pzToken = (char *) cursor->outputBuf;
     *pnBytes = (int) newBytesUsed;
-    *piStartOffset = (int) startOffset;
-    *piEndOffset = (int) (startOffset + endOffset);
+    *piStartOffset = (int) locationOffset;
+    *piEndOffset = (int) (locationOffset + lengthOffset);
     *piPosition = cursor->tokenIndex++;
     
     return SQLITE_OK;