Procházet zdrojové kódy

Can now access all the offsets() integers, not just the first.
Fixed byte position issues with custom tokenizers.

Andrew Goodale před 11 roky
rodič
revize
4d5ad5f819
3 změnil soubory, kde provedl 148 přidání a 37 odebrání
  1. 83 0
      Tests/FMDatabaseFTS3Tests.m
  2. 16 8
      src/fts3/FMDatabase+FTS3.h
  3. 49 29
      src/fts3/FMDatabase+FTS3.m

+ 83 - 0
Tests/FMDatabaseFTS3Tests.m

@@ -0,0 +1,83 @@
+//
+//  FMDatabaseFTS3Tests.m
+//  fmdb
+//
+//  Created by Seaview Software on 8/26/14.
+//
+//
+
+#import "FMDBTempDBTests.h"
+#import "FMDatabase+FTS3.h"
+#import "FMTokenizers.h"
+
+/**
+ Tests for the FTS3 additions: decoding of `offsets()` results and custom tokenizers.
+ */
+@interface FMDatabaseFTS3Tests : FMDBTempDBTests
+
+@end
+
+// Held at file scope so the tokenizer registered in +populateDatabase: is not
+// de-allocated when that method returns.
+static id<FMTokenizerDelegate> g_testTok = nil;
+
+@implementation FMDatabaseFTS3Tests
+
+/**
+ Creates the `mail` FTS3 table with two rows, and registers an `FMSimpleTokenizer`
+ under the name `testTok` for use by -testTokenizer.
+
+ @param db The database to populate.
+ */
++ (void)populateDatabase:(FMDatabase *)db
+{
+    [db executeUpdate:@"CREATE VIRTUAL TABLE mail USING fts3(subject, body)"];
+
+    [db executeUpdate:@"INSERT INTO mail VALUES('hello world', 'This message is a hello world message.')"];
+    [db executeUpdate:@"INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more serious mail')"];
+
+    // Create a tokenizer instance that will not be de-allocated when the method finishes.
+    g_testTok = [[FMSimpleTokenizer alloc] initWithLocale:NULL];
+    [FMDatabase registerTokenizer:g_testTok withName:@"testTok"];
+}
+
+- (void)setUp
+{
+    [super setUp];
+    // Put setup code here. This method is called before the invocation of each test method in the class.
+}
+
+- (void)tearDown
+{
+    // Put teardown code here. This method is called after the invocation of each test method in the class.
+    [super tearDown];
+}
+
+/**
+ Verifies that every group reported by `offsets()` is delivered to the enumeration block,
+ with the expected term number and byte range for each column.
+ */
+- (void)testOffsets
+{
+    FMResultSet *results = [self.db executeQuery:@"SELECT offsets(mail) FROM mail WHERE mail MATCH 'world'"];
+
+    // Fail loudly instead of silently skipping every assertion when the query
+    // fails (nil result set) or returns no row.
+    if (![results next]) {
+        XCTFail(@"Expected a row matching 'world': %@", [self.db lastErrorMessage]);
+        return;
+    }
+
+    FMTextOffsets *offsets = [results offsetsForColumnIndex:0];
+    XCTAssertNotNil(offsets, @"No offsets object was returned");
+
+    __block NSInteger matchCount = 0;
+
+    [offsets enumerateWithBlock:^(NSInteger columnNumber, NSInteger termNumber, NSRange matchRange) {
+        ++matchCount;
+
+        if (columnNumber == 0) {
+            // subject: 'hello world'
+            XCTAssertEqual(termNumber, 0L);
+            XCTAssertEqual(matchRange.location, 6UL);
+            XCTAssertEqual(matchRange.length, 5UL);
+        } else if (columnNumber == 1) {
+            // body: 'This message is a hello world message.'
+            XCTAssertEqual(termNumber, 0L);
+            XCTAssertEqual(matchRange.location, 24UL);
+            XCTAssertEqual(matchRange.length, 5UL);
+        } else {
+            XCTFail(@"Unexpected column number %ld", (long)columnNumber);
+        }
+    }];
+
+    // 'world' appears once in the subject and once in the body of the matching row,
+    // so an enumeration that never calls the block must not pass.
+    XCTAssertEqual(matchCount, (NSInteger)2, @"Expected one match per column");
+
+    [results close];
+}
+
+/**
+ Verifies that a virtual table can be created with the tokenizer registered as `testTok`,
+ and that text containing a non-ASCII character can be inserted and matched.
+ */
+- (void)testTokenizer
+{
+    [self.db installTokenizerModule];
+
+    BOOL ok = [self.db executeUpdate:@"CREATE VIRTUAL TABLE simple USING fts3(tokenize=fmdb testTok)"];
+    XCTAssertTrue(ok, @"Failed to create virtual table: %@", [self.db lastErrorMessage]);
+
+    // The FMSimpleTokenizer handles non-ASCII characters well, since it's based on CFStringTokenizer.
+    NSString *text = @"I like the band Queensrÿche. They are really great.";
+
+    ok = [self.db executeUpdate:@"INSERT INTO simple VALUES(?)", text];
+    XCTAssertTrue(ok, @"Failed to insert data: %@", [self.db lastErrorMessage]);
+
+    FMResultSet *results = [self.db executeQuery:@"SELECT * FROM simple WHERE simple MATCH ?", @"Queensrÿche"];
+    XCTAssertTrue([results next], @"Failed to find result");
+
+    [results close];
+}
+
+@end

+ 16 - 8
src/fts3/FMDatabase+FTS3.h

@@ -68,13 +68,21 @@ typedef struct FMTokenizerCursor
 
 #pragma mark
 
-struct FMTextOffsets
-{
-    uint32_t columnNumber;
-    uint32_t termNumber;
-    NSRange  matchRange;    // NOTE: This range is in bytes, not characters!
-};
-typedef struct FMTextOffsets FMTextOffsets;
+/**
+ The container of offset information, wrapping the raw text returned by the FTS3 `offsets()` function.
+
+ Instances are created with `initWithDBOffsets:`, which takes that raw text as a C string, and are
+ returned by `-[FMResultSet offsetsForColumnIndex:]`.
+ */
+@interface FMTextOffsets : NSObject
+
+- (instancetype)initWithDBOffsets:(const char *)offsets;
+
+/**
+ Enumerate each set of offsets in the result, calling `block` once per group of four integers.
+ The column number can be turned into a column name using `[FMResultSet columnNameForIndex:]`.
+ The `matchRange` is in UTF-8 byte positions (not `NSString` character indexes), so it must be
+ converted before it is used with `NSString` data.
+
+ @param block Receives the first integer of the group as `columnNumber`, the second as `termNumber`,
+              and the third and fourth as the location and length of `matchRange`.
+ */
+- (void)enumerateWithBlock:(void (^)(NSInteger columnNumber, NSInteger termNumber, NSRange matchRange))block;
+
+@end
 
 /**
  A category that adds support for the encoded data returned by FTS3 functions.
@@ -89,6 +97,6 @@ typedef struct FMTextOffsets FMTextOffsets;
  
 @return An `FMTextOffsets` object containing the offsets for the column.
  */
-- (FMTextOffsets)offsetsForColumnIndex:(int)columnIdx;
+- (FMTextOffsets *)offsetsForColumnIndex:(int)columnIdx;
 
 @end

+ 49 - 29
src/fts3/FMDatabase+FTS3.m

@@ -43,6 +43,10 @@ static int FMDBTokenizerCreate(int argc, const char * const *argv, sqlite3_token
     memset(tokenizer, 0, sizeof(*tokenizer));
     tokenizer->delegate = [g_delegateMap objectForKey:[NSString stringWithUTF8String:argv[0]]];
     
+    if (!tokenizer->delegate) {
+        return SQLITE_ERROR;
+    }
+    
     *ppTokenizer = &tokenizer->base;
     return SQLITE_OK;
 }
@@ -136,16 +140,22 @@ static int FMDBTokenizerNext(sqlite3_tokenizer_cursor *pCursor,  /* Cursor retur
         return SQLITE_DONE;
     }
     
-    CFRange range = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
-    CFIndex usedBytes = 0;
+    // The range from the tokenizer is in UTF-16 positions; we need to give UTF-8 byte positions to SQLite.
+    CFIndex usedBytes1, usedBytes2;
+    CFRange range1 = CFRangeMake(0, cursor->currentRange.location);
+    CFRange range2 = CFRangeMake(0, cursor->currentRange.length);
+    
+    // This will tell us how many UTF-8 bytes there are before the start of the token
+    CFStringGetBytes(cursor->inputString, range1, kCFStringEncodingUTF8, '?', false,
+                     NULL, 0, &usedBytes1);
     
-    CFStringGetBytes(cursor->tokenString, range, kCFStringEncodingUTF8, '?', false,
-                     cursor->outputBuf, sizeof(cursor->outputBuf), &usedBytes);
+    CFStringGetBytes(cursor->tokenString, range2, kCFStringEncodingUTF8, '?', false,
+                     cursor->outputBuf, sizeof(cursor->outputBuf), &usedBytes2);
     
     *pzToken = (char *) cursor->outputBuf;
-    *pnBytes = (int) usedBytes;
-    *piStartOffset = (int) cursor->currentRange.location;
-    *piEndOffset = (int) (cursor->currentRange.location + cursor->currentRange.length);
+    *pnBytes = (int) usedBytes2;
+    *piStartOffset = (int) usedBytes1;
+    *piEndOffset = (int) (usedBytes1 + usedBytes2);
     *piPosition = cursor->tokenIndex++;
     
     return SQLITE_OK;
@@ -210,33 +220,43 @@ - (BOOL)issueCommand:(NSString *)command forTable:(NSString *)tableName
 
 #pragma mark
 
+@implementation FMTextOffsets
+{
+    // Raw offsets() text; stays nil when the database supplied no text.
+    NSString *_rawOffsets;
+}
+
+/**
+ Stores the raw offsets() text for later enumeration.
+
+ @param rawOffsets The text as a NUL-terminated UTF-8 C string. May be NULL, in which case
+                   the receiver enumerates nothing.
+ */
+- (instancetype)initWithDBOffsets:(const char *)rawOffsets
+{
+    if ((self = [super init])) {
+        // +stringWithUTF8String: raises on NULL, and the pointer comes straight from
+        // sqlite3_column_text(), which returns NULL for a NULL column value.
+        if (rawOffsets != NULL) {
+            _rawOffsets = [NSString stringWithUTF8String:rawOffsets];
+        }
+    }
+    return self;
+}
+
+/**
+ Parses the stored text as consecutive groups of four unsigned integers and calls `block`
+ once per complete group. Parsing stops at the first group that cannot be read in full.
+
+ @param block Receives the first two integers as column and term numbers, and the last two as
+              the location and length of the match range. A nil block is ignored.
+ */
+- (void)enumerateWithBlock:(void (^)(NSInteger, NSInteger, NSRange))block
+{
+    const char *cursor = [_rawOffsets UTF8String];
+
+    // Nothing to scan (nil string gives a NULL pointer) or nothing to call.
+    if (cursor == NULL || block == nil) {
+        return;
+    }
+
+    // `unsigned int` is the exact type that the %u conversion writes to.
+    unsigned int fields[4];
+    int charsRead = 0;
+
+    // %n reports how many characters the group consumed, so the next pass starts after it.
+    while (sscanf(cursor, "%u %u %u %u%n",
+                  &fields[0], &fields[1], &fields[2], &fields[3], &charsRead) == 4) {
+
+        block(fields[0], fields[1], NSMakeRange(fields[2], fields[3]));
+        cursor += charsRead;
+    }
+}
+
+@end
+
 @implementation FMResultSet (FTS3)
 
-- (FMTextOffsets)offsetsForColumnIndex:(int)columnIdx
+- (FMTextOffsets *)offsetsForColumnIndex:(int)columnIdx
 {
-    // The offsets() value is a space separated string of 4 integers
-    int offsetInts[64], numInts;
+    // The offsets() value is a string of space-separated groups of 4 integers
     const char *rawOffsets = (const char *)sqlite3_column_text([_statement statement], columnIdx);
     
-    NSScanner *scanner = [NSScanner scannerWithString:[NSString stringWithUTF8String:rawOffsets]];
-    
-    for (numInts = 0; numInts < 64; ++numInts) {
-        if (![scanner scanInt:&offsetInts[numInts]]) {
-            break;
-        }
-    }
-    
-    FMTextOffsets offsets = { offsetInts[0], offsetInts[1], NSMakeRange(offsetInts[2], offsetInts[3]) };
-    
-    // Quick hack to make hit highlighting work for 2-word terms
-    if (numInts > 4 && (offsetInts[0] == offsetInts[4])) {
-        int nextWord = offsetInts[2] + offsetInts[3] + 1;   // 1 for a space
-        
-        if (offsetInts[2+4] == nextWord) {
-            offsets.matchRange = NSMakeRange(offsetInts[2], offsetInts[3] + 1 + offsetInts[3+4]);
-        }
-    }
-    return offsets;
+    return [[FMTextOffsets alloc] initWithDBOffsets:rawOffsets];
 }
 
 @end