| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128 |
- //
- // FMTokenizers.m
- // fmdb
- //
- // Created by Andrew on 4/9/14.
- // Copyright (c) 2014 Andrew Goodale. All rights reserved.
- //
- #import "FMTokenizers.h"
/**
 Word tokenizer built on CFStringTokenizer. Each token handed back through the
 cursor is the lowercased text of one word-unit range of the cursor's input string.
 */
@implementation FMSimpleTokenizer
{
    // Locale passed to CFStringTokenizerCreate() and CFStringLowercase().
    // Holds a +1 reference obtained in -initWithLocale:, balanced in -dealloc.
    CFLocaleRef m_locale;
}

/**
 Initializes the tokenizer with a locale.

 @param locale The locale to retain and use. When NULL, a copy of the current
               locale (CFLocaleCopyCurrent) is used instead.
 @return The initialized tokenizer, or nil if the superclass initializer returned nil.
 */
- (id)initWithLocale:(CFLocaleRef)locale
{
    if ((self = [super init])) {
        m_locale = (locale != NULL) ? (CFLocaleRef)CFRetain(locale) : CFLocaleCopyCurrent();
    }
    return self;
}

/**
 Releases the locale reference taken in -initWithLocale:.
 */
- (void)dealloc
{
    // m_locale is still NULL if the instance was created through plain -init
    // (which never runs -initWithLocale:). CFRelease(NULL) crashes, so guard it.
    if (m_locale != NULL) {
        CFRelease(m_locale);
    }
}

/**
 Prepares a cursor for tokenizing cursor->inputString.

 Stores a new empty mutable string in cursor->tokenString (reused as the scratch
 buffer for every token) and a new word-unit CFStringTokenizer spanning the whole
 input string in cursor->userObject.

 @param cursor The cursor to set up; its inputString must already be populated.
 */
- (void)openTokenizerCursor:(FMTokenizerCursor *)cursor
{
    cursor->tokenString = CFStringCreateMutable(NULL, 0);
    cursor->userObject = CFStringTokenizerCreate(NULL, cursor->inputString,
                                                 CFRangeMake(0, CFStringGetLength(cursor->inputString)),
                                                 kCFStringTokenizerUnitWord, m_locale);
}

/**
 Advances the cursor to the next word.

 On success cursor->currentRange is set to the word's range inside
 cursor->inputString and cursor->tokenString is overwritten with the lowercased word.

 @param cursor A cursor previously prepared by -openTokenizerCursor:.
 @return YES when there are no more tokens; NO when a token was produced.
 */
- (BOOL)nextTokenForCursor:(FMTokenizerCursor *)cursor
{
    CFStringTokenizerRef tokenizer = (CFStringTokenizerRef) cursor->userObject;
    CFMutableStringRef tokenString = (CFMutableStringRef) cursor->tokenString;

    if (CFStringTokenizerAdvanceToNextToken(tokenizer) == kCFStringTokenizerTokenNone) {
        // No more tokens, we are finished.
        return YES;
    }

    // Found a regular word. The token is the lowercase version of the word.
    const CFRange tokenRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);
    cursor->currentRange = tokenRange;

    // The inline buffer approach is faster and uses less memory than CFStringCreateWithSubstring()
    CFStringInlineBuffer inlineBuf;
    CFStringInitInlineBuffer(cursor->inputString, &inlineBuf, tokenRange);

    // Empty the scratch string left over from the previous token before refilling it.
    CFStringDelete(tokenString, CFRangeMake(0, CFStringGetLength(tokenString)));

    // Inline-buffer indices are relative to the start of tokenRange.
    // CFIndex (not int) matches the type of CFRange.length.
    for (CFIndex i = 0; i < tokenRange.length; ++i) {
        UniChar nextChar = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i);
        CFStringAppendCharacters(tokenString, &nextChar, 1);
    }

    CFStringLowercase(tokenString, m_locale);

    return NO;
}

/**
 Intentionally does nothing for this tokenizer.

 @param cursor The cursor being closed.
 */
- (void)closeTokenizerCursor:(FMTokenizerCursor *)cursor
{
    // FMDatabase will CFRelease the tokenString and the userObject.
}

@end
- #pragma mark
/**
 Tokenizer decorator that forwards all work to a base tokenizer and skips every
 token whose text is contained in the `words` set.
 */
@implementation FMStopWordTokenizer
{
    // The tokenizer that actually produces tokens; this class only filters its output.
    id<FMTokenizerDelegate> m_baseTokenizer;
}

/**
 Creates a stop-word tokenizer from a UTF-8 text file containing one word per line.

 Lines may end in LF, CR or CRLF. Surrounding whitespace is trimmed from each line
 and blank lines are ignored, so a trailing newline or Windows line endings do not
 produce stop words that can never match a token.

 @param wordFileURL URL of the word list. Must not be nil.
 @param tokenizer   The base tokenizer whose tokens are filtered. Must not be nil.
 @param error       On failure to read the file, receives the error reported by NSString.
 @return A new tokenizer, or nil if the file could not be read.
 */
+ (instancetype)tokenizerWithFileURL:(NSURL *)wordFileURL
                       baseTokenizer:(id<FMTokenizerDelegate>)tokenizer
                               error:(NSError *__autoreleasing *)error
{
    NSParameterAssert(wordFileURL);

    NSString *contents = [NSString stringWithContentsOfURL:wordFileURL
                                                  encoding:NSUTF8StringEncoding
                                                     error:error];
    if (contents == nil) {
        return nil;
    }

    NSCharacterSet *padding = [NSCharacterSet whitespaceAndNewlineCharacterSet];
    NSArray *lines = [contents componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]];
    NSMutableSet *stopWords = [NSMutableSet setWithCapacity:lines.count];

    for (NSString *line in lines) {
        NSString *word = [line stringByTrimmingCharactersInSet:padding];

        // Splitting "a\r\nb\n" yields empty pieces between CR and LF and after the
        // final newline; skip them rather than storing @"" as a stop word.
        if (word.length > 0) {
            [stopWords addObject:word];
        }
    }

    return [[self alloc] initWithWords:stopWords baseTokenizer:tokenizer];
}

/**
 Initializes the tokenizer with an explicit set of stop words.

 @param words     The stop words; the set is copied.
 @param tokenizer The base tokenizer whose tokens are filtered. Must not be nil.
 @return The initialized tokenizer, or nil if the superclass initializer returned nil.
 */
- (instancetype)initWithWords:(NSSet *)words baseTokenizer:(id<FMTokenizerDelegate>)tokenizer
{
    NSParameterAssert(tokenizer);

    if ((self = [super init])) {
        _words = [words copy];
        m_baseTokenizer = tokenizer;
    }
    return self;
}

/**
 Forwards cursor setup to the base tokenizer.

 @param cursor The cursor to prepare.
 */
- (void)openTokenizerCursor:(FMTokenizerCursor *)cursor
{
    [m_baseTokenizer openTokenizerCursor:cursor];
}

/**
 Advances to the next token from the base tokenizer that is not a stop word.

 @param cursor The cursor being advanced.
 @return The base tokenizer's "done" flag for the token the cursor ends on:
         YES when the base tokenizer ran out of tokens, NO when a non-stop-word
         token is available in the cursor.
 */
- (BOOL)nextTokenForCursor:(FMTokenizerCursor *)cursor
{
    BOOL done = [m_baseTokenizer nextTokenForCursor:cursor];

    // Keep pulling tokens while the current one is a stop word.
    while (!done && [self.words containsObject:(__bridge id)(cursor->tokenString)]) {
        done = [m_baseTokenizer nextTokenForCursor:cursor];
    }

    return done;
}

/**
 Forwards cursor teardown to the base tokenizer.

 @param cursor The cursor being closed.
 */
- (void)closeTokenizerCursor:(FMTokenizerCursor *)cursor
{
    [m_baseTokenizer closeTokenizerCursor:cursor];
}

@end
|