ソースを参照

Two tokenizer implementations

Andrew Goodale 11 年 前
コミット
e5ce8ab615
2 ファイル変更173 行追加0 行削除
  1. 45 0
      src/fts3/FMTokenizers.h
  2. 128 0
      src/fts3/FMTokenizers.m

+ 45 - 0
src/fts3/FMTokenizers.h

@@ -0,0 +1,45 @@
+//
+//  FMTokenizers.h
+//  fmdb
+//
+//  Created by Andrew on 4/9/14.
+//  Copyright (c) 2014 Andrew Goodale. All rights reserved.
+//
+
+#import <Foundation/Foundation.h>
+#import "FMDatabase+FTS3.h"
+
+/**
+ This is the base tokenizer implementation, using a CFStringTokenizer to find words.
+
+ The input string is split on word boundaries (`kCFStringTokenizerUnitWord`), and each word is
+ reported as a token after being lowercased with the tokenizer's locale.
+ */
+@interface FMSimpleTokenizer : NSObject <FMTokenizerDelegate>
+
+/**
+ Create the tokenizer with a given locale. The locale will be used to initialize the string tokenizer and to lowercase the parsed word.
+ The locale can be `NULL`, in which case the current locale will be used.
+
+ @param locale The locale used for tokenizing and lowercasing, or `NULL` for the current locale.
+               A non-NULL locale is retained by the tokenizer and released when it is deallocated.
+ @return The initialized tokenizer.
+ */
+- (instancetype)initWithLocale:(CFLocaleRef)locale;
+
+@end
+
+#pragma mark
+
+/**
+ This tokenizer wraps another ("base") tokenizer and adds support for a stop word list.
+
+ All cursor handling is forwarded to the base tokenizer; any token the base tokenizer produces
+ that is contained in `words` is skipped and never reported.
+
+ The `words` property is the set of stop words. It is copied on assignment, and tokens are
+ looked up in it with `-containsObject:`, so each word must match the base tokenizer's output
+ exactly (including case).
+ */
+@interface FMStopWordTokenizer : NSObject <FMTokenizerDelegate>
+
+@property (atomic, copy) NSSet *words;
+
+/**
+ Load a stop-word tokenizer using a file containing words delimited by newlines. The file should be encoded in UTF-8.
+
+ @param wordFileURL The URL of the stop word file. Must not be nil.
+ @param tokenizer   The base tokenizer that produces the tokens to be filtered. Must not be nil.
+ @param error       On failure to read the file, receives the error reported by `NSString`.
+ @return A new tokenizer, or nil if the file could not be read as UTF-8 text.
+ */
++ (instancetype)tokenizerWithFileURL:(NSURL *)wordFileURL baseTokenizer:(id<FMTokenizerDelegate>)tokenizer error:(NSError **)error;
+
+/**
+ Initialize an instance of the tokenizer using the set of words. The words should be lowercase if you're using the
+ `FMSimpleTokenizer` as the base, because that tokenizer lowercases every token it produces.
+
+ @param words     The stop words. The set is copied.
+ @param tokenizer The base tokenizer that produces the tokens to be filtered. Must not be nil.
+ @return The initialized tokenizer.
+ */
+- (instancetype)initWithWords:(NSSet *)words baseTokenizer:(id<FMTokenizerDelegate>)tokenizer;
+
+@end

+ 128 - 0
src/fts3/FMTokenizers.m

@@ -0,0 +1,128 @@
+//
+//  FMTokenizers.m
+//  fmdb
+//
+//  Created by Andrew on 4/9/14.
+//  Copyright (c) 2014 Andrew Goodale. All rights reserved.
+//
+
+#import "FMTokenizers.h"
+
+@implementation FMSimpleTokenizer
+{
+    // Locale used for word-boundary detection and for lowercasing tokens.
+    // Owned by this object: retained/copied in the initializer, released in -dealloc.
+    CFLocaleRef m_locale;
+}
+
+/**
+ Plain `-init` (and therefore `+new`) is equivalent to `-initWithLocale:` with a `NULL` locale,
+ i.e. the current locale is used. Without this override the locale ivar would stay NULL and
+ `-dealloc` would crash in `CFRelease`.
+ */
+- (instancetype)init
+{
+    return [self initWithLocale:NULL];
+}
+
+/**
+ Designated initializer.
+
+ @param locale The locale to tokenize and lowercase with, or NULL to use the current locale.
+ @return The initialized tokenizer.
+ */
+- (instancetype)initWithLocale:(CFLocaleRef)locale
+{
+    if ((self = [super init])) {
+        // Either branch leaves us holding one reference that -dealloc must balance.
+        m_locale = (locale != NULL) ? CFRetain(locale) : CFLocaleCopyCurrent();
+    }
+    return self;
+}
+
+- (void)dealloc
+{
+    // CFRelease() must never be called with NULL, so guard against a missing locale.
+    if (m_locale != NULL) {
+        CFRelease(m_locale);
+    }
+}
+
+/**
+ Prepare a cursor for tokenizing `cursor->inputString`.
+
+ Stores an empty mutable string in `cursor->tokenString` (reused as the buffer for every token)
+ and a word-unit CFStringTokenizer covering the whole input string in `cursor->userObject`.
+ */
+- (void)openTokenizerCursor:(FMTokenizerCursor *)cursor
+{
+    cursor->tokenString = CFStringCreateMutable(NULL, 0);
+    cursor->userObject = CFStringTokenizerCreate(NULL, cursor->inputString,
+                                                 CFRangeMake(0, CFStringGetLength(cursor->inputString)),
+                                                 kCFStringTokenizerUnitWord, m_locale);
+}
+
+/**
+ Advance the cursor to the next word of the input string.
+
+ On success `cursor->currentRange` holds the word's range within the input string and
+ `cursor->tokenString` holds the lowercased word.
+
+ @return YES when there are no more tokens, NO when a token was produced.
+ */
+- (BOOL)nextTokenForCursor:(FMTokenizerCursor *)cursor
+{
+    CFStringTokenizerRef tokenizer = (CFStringTokenizerRef) cursor->userObject;
+    CFMutableStringRef tokenString = (CFMutableStringRef) cursor->tokenString;
+
+    CFStringTokenizerTokenType tokenType = CFStringTokenizerAdvanceToNextToken(tokenizer);
+
+    if (tokenType == kCFStringTokenizerTokenNone) {
+        // No more tokens, we are finished.
+        return YES;
+    }
+
+    // Found a regular word. The token is the lowercase version of the word.
+    cursor->currentRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);
+
+    // The inline buffer approach is faster and uses less memory than CFStringCreateWithSubstring()
+    CFStringInlineBuffer inlineBuf;
+    CFStringInitInlineBuffer(cursor->inputString, &inlineBuf, cursor->currentRange);
+
+    // Empty the reusable token buffer before copying the new word into it.
+    CFStringDelete(tokenString, CFRangeMake(0, CFStringGetLength(tokenString)));
+
+    // Inline-buffer indexes are relative to the start of the range passed to the init call above.
+    const CFIndex tokenLength = cursor->currentRange.length;
+
+    for (CFIndex i = 0; i < tokenLength; ++i) {
+        UniChar nextChar = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i);
+        CFStringAppendCharacters(tokenString, &nextChar, 1);
+    }
+
+    CFStringLowercase(tokenString, m_locale);
+
+    return NO;
+}
+
+/**
+ Nothing to clean up here.
+ */
+- (void)closeTokenizerCursor:(FMTokenizerCursor *)cursor
+{
+    // FMDatabase will CFRelease the tokenString and the userObject.
+}
+
+@end
+
+#pragma mark
+
+@implementation FMStopWordTokenizer
+{
+    // The wrapped tokenizer that actually produces tokens; this class only filters its output.
+    id<FMTokenizerDelegate> m_baseTokenizer;
+}
+
+/**
+ Create a stop-word tokenizer from a UTF-8 text file containing one stop word per line.
+
+ Lines may be terminated by any newline sequence (LF, CRLF, CR, ...). Empty lines, including
+ the one produced by a trailing newline, are ignored.
+
+ @param wordFileURL The URL of the stop word file. Must not be nil.
+ @param tokenizer   The base tokenizer whose tokens are filtered. Must not be nil.
+ @param error       On failure to read the file, receives the error reported by `NSString`.
+ @return A new tokenizer, or nil if the file could not be read.
+ */
++ (instancetype)tokenizerWithFileURL:(NSURL *)wordFileURL
+                       baseTokenizer:(id<FMTokenizerDelegate>)tokenizer
+                               error:(NSError *__autoreleasing *)error
+{
+    NSParameterAssert(wordFileURL);
+
+    NSString *contents = [NSString stringWithContentsOfURL:wordFileURL encoding:NSUTF8StringEncoding error:error];
+
+    if (contents == nil) {
+        // Reading failed; `error` has already been filled in by NSString.
+        return nil;
+    }
+
+    // Split on every newline character rather than just "\n": with a CRLF file, splitting on
+    // "\n" alone would leave a trailing "\r" on each word, so no token would ever match.
+    NSArray *lines = [contents componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]];
+    NSMutableSet *stopWords = [NSMutableSet setWithArray:lines];
+
+    // Blank lines, a trailing newline, and the gap inside each CRLF pair all yield "".
+    [stopWords removeObject:@""];
+
+    return [[self alloc] initWithWords:stopWords baseTokenizer:tokenizer];
+}
+
+/**
+ Initialize the tokenizer with a set of stop words and the tokenizer to wrap.
+
+ @param words     The stop words. The set is copied.
+ @param tokenizer The base tokenizer whose tokens are filtered. Must not be nil.
+ @return The initialized tokenizer.
+ */
+- (instancetype)initWithWords:(NSSet *)words baseTokenizer:(id<FMTokenizerDelegate>)tokenizer
+{
+    NSParameterAssert(tokenizer);
+
+    if ((self = [super init])) {
+        _words = [words copy];
+        m_baseTokenizer = tokenizer;
+    }
+    return self;
+}
+
+/**
+ Forwarded unchanged to the base tokenizer, which owns the cursor state.
+ */
+- (void)openTokenizerCursor:(FMTokenizerCursor *)cursor
+{
+    [m_baseTokenizer openTokenizerCursor:cursor];
+}
+
+/**
+ Advance to the next token of the base tokenizer that is not a stop word.
+
+ @return YES when the base tokenizer has no more tokens, NO when a non-stop-word token is
+         available in the cursor.
+ */
+- (BOOL)nextTokenForCursor:(FMTokenizerCursor *)cursor
+{
+    // Read the (atomic, copy) property once so the whole call filters against a single set.
+    NSSet *stopWords = self.words;
+
+    BOOL done = [m_baseTokenizer nextTokenForCursor:cursor];
+
+    // Keep pulling tokens from the base tokenizer for as long as they are stop words.
+    while (!done && [stopWords containsObject:(__bridge id)(cursor->tokenString)]) {
+        done = [m_baseTokenizer nextTokenForCursor:cursor];
+    }
+
+    return done;
+}
+
+/**
+ Forwarded unchanged to the base tokenizer.
+ */
+- (void)closeTokenizerCursor:(FMTokenizerCursor *)cursor
+{
+    [m_baseTokenizer closeTokenizerCursor:cursor];
+}
+
+@end