Преглед на файлове

Can now access all the offsets() integers, not just the first.
Fixed byte position issues with custom tokenizers.

Andrew Goodale преди 11 години
родител
ревизия
4d5ad5f819
променени са 3 файла, в които са добавени 148 реда и са изтрити 37 реда
  1. 83 0
      Tests/FMDatabaseFTS3Tests.m
  2. 16 8
      src/fts3/FMDatabase+FTS3.h
  3. 49 29
      src/fts3/FMDatabase+FTS3.m

+ 83 - 0
Tests/FMDatabaseFTS3Tests.m

@@ -0,0 +1,83 @@
+//
+//  FMDatabaseFTS3Tests.m
+//  fmdb
+//
+//  Created by Seaview Software on 8/26/14.
+//
+//
+
+#import "FMDBTempDBTests.h"
+#import "FMDatabase+FTS3.h"
+#import "FMTokenizers.h"
+
+// Test case for the FTS3 additions declared in FMDatabase+FTS3.h.
+// Inherits from FMDBTempDBTests (imported from FMDBTempDBTests.h).
+@interface FMDatabaseFTS3Tests : FMDBTempDBTests
+
+@end
+
+// File-scope reference to the tokenizer that +populateDatabase: registers under the name
+// "testTok"; holding it here keeps the instance from being de-allocated when that method returns.
+static id<FMTokenizerDelegate> g_testTok = nil;
+
+@implementation FMDatabaseFTS3Tests
+
+/**
+ Creates the FTS3 fixture table `mail` with two rows, and registers the custom tokenizer
+ that -testTokenizer refers to by the name "testTok".
+ */
++ (void)populateDatabase:(FMDatabase *)db
+{
+    [db executeUpdate:@"CREATE VIRTUAL TABLE mail USING fts3(subject, body)"];
+
+    [db executeUpdate:@"INSERT INTO mail VALUES('hello world', 'This message is a hello world message.')"];
+    [db executeUpdate:@"INSERT INTO mail VALUES('urgent: serious', 'This mail is seen as a more serious mail')"];
+
+    // Create a tokenizer instance that will not be de-allocated when the method finishes.
+    g_testTok = [[FMSimpleTokenizer alloc] initWithLocale:NULL];
+    [FMDatabase registerTokenizer:g_testTok withName:@"testTok"];
+}
+
+- (void)setUp
+{
+    [super setUp];
+    // Put setup code here. This method is called before the invocation of each test method in the class.
+}
+
+- (void)tearDown
+{
+    // Put teardown code here. This method is called after the invocation of each test method in the class.
+    [super tearDown];
+}
+
+/**
+ Verifies that every group of offsets() integers is reported, not just the first one.
+ Only the first fixture row contains 'world': once in `subject` (column 0) and once in
+ `body` (column 1), so exactly two matches are expected.
+ */
+- (void)testOffsets
+{
+    FMResultSet *results = [self.db executeQuery:@"SELECT offsets(mail) FROM mail WHERE mail MATCH 'world'"];
+
+    // Fail loudly instead of silently skipping every assertion when the query returns nothing.
+    if (![results next]) {
+        XCTFail(@"Expected a row matching 'world': %@", [self.db lastErrorMessage]);
+        return;
+    }
+
+    FMTextOffsets *offsets = [results offsetsForColumnIndex:0];
+    __block NSUInteger matchCount = 0;
+
+    [offsets enumerateWithBlock:^(NSInteger columnNumber, NSInteger termNumber, NSRange matchRange) {
+        ++matchCount;
+
+        if (columnNumber == 0) {
+            XCTAssertEqual(termNumber, 0L);
+            XCTAssertEqual(matchRange.location, 6UL);
+            XCTAssertEqual(matchRange.length, 5UL);
+        } else if (columnNumber == 1) {
+            XCTAssertEqual(termNumber, 0L);
+            XCTAssertEqual(matchRange.location, 24UL);
+            XCTAssertEqual(matchRange.length, 5UL);
+        } else {
+            XCTFail(@"Unexpected column number %ld", (long)columnNumber);
+        }
+    }];
+
+    // Without this check the test would pass even if the block were never invoked.
+    XCTAssertEqual(matchCount, (NSUInteger)2, @"Expected one match in each of the two columns");
+}
+
+/**
+ Verifies that a table using the registered "testTok" tokenizer can index and then find
+ a token that contains a non-ASCII character.
+ */
+- (void)testTokenizer
+{
+    [self.db installTokenizerModule];
+
+    BOOL ok = [self.db executeUpdate:@"CREATE VIRTUAL TABLE simple USING fts3(tokenize=fmdb testTok)"];
+    XCTAssertTrue(ok, @"Failed to create virtual table: %@", [self.db lastErrorMessage]);
+
+    // The FMSimpleTokenizer handles non-ASCII characters well, since it's based on CFStringTokenizer.
+    NSString *text = @"I like the band Queensrÿche. They are really great.";
+
+    ok = [self.db executeUpdate:@"INSERT INTO simple VALUES(?)", text];
+    XCTAssertTrue(ok, @"Failed to insert data: %@", [self.db lastErrorMessage]);
+
+    FMResultSet *results = [self.db executeQuery:@"SELECT * FROM simple WHERE simple MATCH ?", @"Queensrÿche"];
+    XCTAssertNotNil(results, @"Query failed: %@", [self.db lastErrorMessage]);
+    XCTAssertTrue([results next], @"Failed to find result");
+}
+
+@end

+ 16 - 8
src/fts3/FMDatabase+FTS3.h

@@ -68,13 +68,21 @@ typedef struct FMTokenizerCursor
 
 #pragma mark
 
-struct FMTextOffsets
-{
-    uint32_t columnNumber;
-    uint32_t termNumber;
-    NSRange  matchRange;    // NOTE: This range is in bytes, not characters!
-};
-typedef struct FMTextOffsets FMTextOffsets;
+/**
+ The container of offset information, built from the text value of an FTS3 `offsets()` column.
+ */
+@interface FMTextOffsets : NSObject
+
+/**
+ Creates a container from raw offsets text.
+
+ @param offsets A C string of space-separated integers in groups of four
+ (column number, term number, byte offset, byte length).
+ */
+- (instancetype)initWithDBOffsets:(const char *)offsets;
+
+/**
+ Enumerate each set of offsets in the result. The column number can be turned into a column name
+ using `[FMResultSet columnNameForIndex:]`. The `matchRange` is in UTF-8 byte positions, so it must be
+ converted before it is used with `NSString` data.
+
+ @param block Invoked once per set of offsets with its column number, term number and match range.
+ */
+- (void)enumerateWithBlock:(void (^)(NSInteger columnNumber, NSInteger termNumber, NSRange matchRange))block;
+
+@end
 
 /**
  A category that adds support for the encoded data returned by FTS3 functions.
@@ -89,6 +97,6 @@ typedef struct FMTextOffsets FMTextOffsets;
  
  @return An `FMTextOffsets` object; use `enumerateWithBlock:` to read each set of offsets.
  */
-- (FMTextOffsets)offsetsForColumnIndex:(int)columnIdx;
+- (FMTextOffsets *)offsetsForColumnIndex:(int)columnIdx;
 
 @end

+ 49 - 29
src/fts3/FMDatabase+FTS3.m

@@ -43,6 +43,10 @@ static int FMDBTokenizerCreate(int argc, const char * const *argv, sqlite3_token
     memset(tokenizer, 0, sizeof(*tokenizer));
     tokenizer->delegate = [g_delegateMap objectForKey:[NSString stringWithUTF8String:argv[0]]];
     
+    if (!tokenizer->delegate) {
+        return SQLITE_ERROR;
+    }
+    
     *ppTokenizer = &tokenizer->base;
     return SQLITE_OK;
 }
@@ -136,16 +140,22 @@ static int FMDBTokenizerNext(sqlite3_tokenizer_cursor *pCursor,  /* Cursor retur
         return SQLITE_DONE;
     }
     
-    CFRange range = CFRangeMake(0, CFStringGetLength(cursor->tokenString));
-    CFIndex usedBytes = 0;
+    // The range from the tokenizer is in UTF-16 positions; we need to give UTF-8 byte positions to SQLite.
+    CFIndex usedBytes1, usedBytes2;
+    CFRange range1 = CFRangeMake(0, cursor->currentRange.location);
+    CFRange range2 = CFRangeMake(0, cursor->currentRange.length);
+    
+    // This will tell us how many UTF-8 bytes there are before the start of the token
+    CFStringGetBytes(cursor->inputString, range1, kCFStringEncodingUTF8, '?', false,
+                     NULL, 0, &usedBytes1);
     
-    CFStringGetBytes(cursor->tokenString, range, kCFStringEncodingUTF8, '?', false,
-                     cursor->outputBuf, sizeof(cursor->outputBuf), &usedBytes);
+    CFStringGetBytes(cursor->tokenString, range2, kCFStringEncodingUTF8, '?', false,
+                     cursor->outputBuf, sizeof(cursor->outputBuf), &usedBytes2);
     
     *pzToken = (char *) cursor->outputBuf;
-    *pnBytes = (int) usedBytes;
-    *piStartOffset = (int) cursor->currentRange.location;
-    *piEndOffset = (int) (cursor->currentRange.location + cursor->currentRange.length);
+    *pnBytes = (int) usedBytes2;
+    *piStartOffset = (int) usedBytes1;
+    *piEndOffset = (int) (usedBytes1 + usedBytes2);
     *piPosition = cursor->tokenIndex++;
     
     return SQLITE_OK;
@@ -210,33 +220,43 @@ - (BOOL)issueCommand:(NSString *)command forTable:(NSString *)tableName
 
 #pragma mark
 
+@implementation FMTextOffsets
+{
+    // Copy of the raw offsets text; stays nil when no text was supplied or it was not valid UTF-8.
+    NSString *_rawOffsets;
+}
+
+/**
+ Stores a copy of the raw offsets text for later enumeration.
+
+ @param rawOffsets A C string of space-separated integers in groups of four. May be NULL,
+ in which case the receiver simply enumerates nothing.
+ */
+- (instancetype)initWithDBOffsets:(const char *)rawOffsets
+{
+    if ((self = [super init])) {
+        // +stringWithUTF8String: raises an exception when handed NULL, and sqlite3_column_text()
+        // returns NULL for a NULL column value, so only convert when there is text to copy.
+        if (rawOffsets != NULL) {
+            _rawOffsets = [NSString stringWithUTF8String:rawOffsets];
+        }
+    }
+    return self;
+}
+
+/**
+ Parses the stored text four integers at a time and reports each complete group to the block.
+ Parsing stops at the first position where four integers can no longer be read.
+
+ @param block Receives the column number, the term number and the match range
+ (location = third integer, length = fourth integer) of each group. A nil block is ignored.
+ */
+- (void)enumerateWithBlock:(void (^)(NSInteger, NSInteger, NSRange))block
+{
+    const char *cursor = [_rawOffsets UTF8String];
+
+    // Nothing to parse without text, and invoking a nil block would crash.
+    if (cursor == NULL || block == nil) {
+        return;
+    }
+
+    // Plain `unsigned int` is exactly the type the %u conversions write to.
+    unsigned int columnNumber = 0, termNumber = 0, byteOffset = 0, byteLength = 0;
+    int charsRead = 0;
+
+    while (sscanf(cursor, "%u %u %u %u%n",
+                  &columnNumber, &termNumber, &byteOffset, &byteLength, &charsRead) == 4) {
+
+        block(columnNumber, termNumber, NSMakeRange(byteOffset, byteLength));
+        // %n recorded how many characters this group consumed; continue right after it.
+        cursor += charsRead;
+    }
+}
+
+@end
+
 @implementation FMResultSet (FTS3)
 
-- (FMTextOffsets)offsetsForColumnIndex:(int)columnIdx
+- (FMTextOffsets *)offsetsForColumnIndex:(int)columnIdx
 {
-    // The offsets() value is a space separated string of 4 integers
-    int offsetInts[64], numInts;
+    // The offsets() value is a space-separated list of groups of 4 integers
     const char *rawOffsets = (const char *)sqlite3_column_text([_statement statement], columnIdx);
     
-    NSScanner *scanner = [NSScanner scannerWithString:[NSString stringWithUTF8String:rawOffsets]];
-    
-    for (numInts = 0; numInts < 64; ++numInts) {
-        if (![scanner scanInt:&offsetInts[numInts]]) {
-            break;
-        }
-    }
-    
-    FMTextOffsets offsets = { offsetInts[0], offsetInts[1], NSMakeRange(offsetInts[2], offsetInts[3]) };
-    
-    // Quick hack to make hit highlighting work for 2-word terms
-    if (numInts > 4 && (offsetInts[0] == offsetInts[4])) {
-        int nextWord = offsetInts[2] + offsetInts[3] + 1;   // 1 for a space
-        
-        if (offsetInts[2+4] == nextWord) {
-            offsets.matchRange = NSMakeRange(offsetInts[2], offsetInts[3] + 1 + offsetInts[3+4]);
-        }
-    }
-    return offsets;
+    return [[FMTextOffsets alloc] initWithDBOffsets:rawOffsets];
 }
 
 @end