Parcourir la source

Improve FTS tokenization performance

- `CFStringGetBytes` was calculating the UTF-8 offset from the beginning of the string to the current token again and again for each token. The longer the document, the longer this operation took. Most tokenizers are sequential, so this repeated calculation was unnecessary.
- Started caching the last token range and its UTF-8 offset to avoid this repeated calculation.
- Extremely long documents (1 million characters) could take 30-40 seconds to index on a fast machine. Now they are indexed in less than a second
- Non-sequential tokenizers are still supported: the cached range is reset if a token does not immediately follow the previous token.
Anton Sotkov il y a 9 ans
Parent
commit
96d1b6fb3e
2 fichiers modifiés avec 30 ajouts et 11 suppressions
  1. 3 1
      src/extra/fts3/FMDatabase+FTS3.h
  2. 27 10
      src/extra/fts3/FMDatabase+FTS3.m

+ 3 - 1
src/extra/fts3/FMDatabase+FTS3.h

@@ -59,11 +59,13 @@ typedef struct FMTokenizerCursor
 {
     void       *tokenizer;      /* Internal SQLite reference */
     CFStringRef inputString;    /* The input text being tokenized */
-    CFRange     currentRange;   /* The current offset within `inputString` */
+    CFRange     currentRange;   /* The current range within `inputString` */
     CFStringRef tokenString;    /* The contents of the current token */
     CFTypeRef   userObject;     /* Additional state for the cursor */
     int         tokenIndex;     /* Index of next token to be returned */
     UInt8       outputBuf[128]; /* Result for SQLite */
+    CFRange     previousRange;  /* Cached range of previous token within `inputString` */
+    CFRange     previousOffsetRange; /* Cached range of previous token as UTF-8 offset */
 } FMTokenizerCursor;
 
 @protocol FMTokenizerDelegate

+ 27 - 10
src/extra/fts3/FMDatabase+FTS3.m

@@ -96,6 +96,8 @@ static int FMDBTokenizerOpen(sqlite3_tokenizer *pTokenizer,         /* The token
     cursor->tokenString = NULL;
     cursor->userObject = NULL;
     cursor->outputBuf[0] = '\0';
+    cursor->previousRange = CFRangeMake(0, 0);
+    cursor->previousOffsetRange = CFRangeMake(0, 0);
         
     [tokenizer->delegate openTokenizerCursor:cursor];
 
@@ -146,27 +148,42 @@ static int FMDBTokenizerNext(sqlite3_tokenizer_cursor *pCursor,  /* Cursor retur
         return SQLITE_DONE;
     }
     
-    // The range from the tokenizer is in UTF-16 positions, we need give UTF-8 positions to SQLite.
-    CFIndex startOffset, endOffset, newBytesUsed;
-    CFRange rangeToStartToken = CFRangeMake(0, cursor->currentRange.location);
-    CFRange newTokenRange = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
-    
+    // The range from the tokenizer is in UTF-16 positions, we need give UTF-8 positions to SQLite
+    // Conversion to bytes is very expensive on longer strings. In order to avoid processing the same data over and over again for each token, we cache the previousRange and previousOffsetRange
+    // Not all tokenizers may process strings sequentially. Reset the cached ranges if necessary
+    if (cursor->currentRange.location < cursor->previousRange.location + cursor->previousRange.length) {
+        cursor->previousRange = CFRangeMake(0, 0);
+        cursor->previousOffsetRange = CFRangeMake(0, 0);
+    }
+
+    // First calculate the offset of current token range in original string
+    CFIndex locationOffset, lengthOffset;
+    const CFRange rangeToStartToken = CFRangeMake((cursor->previousRange.location + cursor->previousRange.length), cursor->currentRange.location - (cursor->previousRange.location + cursor->previousRange.length));
+
     // This will tell us how many UTF-8 bytes there are before the start of the token
     CFStringGetBytes(cursor->inputString, rangeToStartToken, kCFStringEncodingUTF8, '?', false,
-                     NULL, 0, &startOffset);
-
+                     NULL, 0, &locationOffset);
     // and how many UTF-8 bytes there are within the token in the original string
     CFStringGetBytes(cursor->inputString, cursor->currentRange, kCFStringEncodingUTF8, '?', false,
-                     NULL, 0, &endOffset);
+                     NULL, 0, &lengthOffset);
+
+    // Update the location offset
+    locationOffset += (cursor->previousOffsetRange.location + cursor->previousOffsetRange.length);
+
+    // Cache the data to reuse on next token
+    cursor->previousRange = cursor->currentRange;
+    cursor->previousOffsetRange = CFRangeMake(locationOffset, lengthOffset);
 
     // Determine how many bytes the new token string uses
+    CFIndex newBytesUsed;
+    const CFRange newTokenRange = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
     CFStringGetBytes(cursor->tokenString, newTokenRange, kCFStringEncodingUTF8, '?', false,
                      cursor->outputBuf, sizeof(cursor->outputBuf), &newBytesUsed);
     
     *pzToken = (char *) cursor->outputBuf;
     *pnBytes = (int) newBytesUsed;
-    *piStartOffset = (int) startOffset;
-    *piEndOffset = (int) (startOffset + endOffset);
+    *piStartOffset = (int) locationOffset;
+    *piEndOffset = (int) (locationOffset + lengthOffset);
     *piPosition = cursor->tokenIndex++;
     
     return SQLITE_OK;