FMTokenizers.m 3.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128
  1. //
  2. // FMTokenizers.m
  3. // fmdb
  4. //
  5. // Created by Andrew on 4/9/14.
  6. // Copyright (c) 2014 Andrew Goodale. All rights reserved.
  7. //
  8. #import "FMTokenizers.h"
  /**
   Word tokenizer built on CFStringTokenizer. Each token is the lowercased text of
   one word-unit range of the cursor's input string.
   */
  @implementation FMSimpleTokenizer
  {
      // Locale handed to the string tokenizer and to CFStringLowercase().
      // Owned by this object once -initWithLocale: has run; NULL otherwise.
      CFLocaleRef m_locale;
  }

  /**
   Creates a tokenizer for the given locale.

   @param locale The locale to retain and use, or NULL to use a copy of the current locale.
   @return The initialized tokenizer, or nil if the superclass initializer fails.
   */
  - (id)initWithLocale:(CFLocaleRef)locale
  {
      if ((self = [super init])) {
          // Both branches yield a +1 reference that -dealloc balances.
          m_locale = (locale != NULL) ? CFRetain(locale) : CFLocaleCopyCurrent();
      }
      return self;
  }

  - (void)dealloc
  {
      // CFRelease() must not be passed NULL. m_locale is still NULL when the object
      // was created without going through -initWithLocale: (e.g. plain -init).
      if (m_locale != NULL) {
          CFRelease(m_locale);
      }
  }

  /**
   Prepares a cursor for tokenizing: stores an empty mutable string in
   cursor->tokenString and a word-unit CFStringTokenizer covering all of
   cursor->inputString in cursor->userObject.
   */
  - (void)openTokenizerCursor:(FMTokenizerCursor *)cursor
  {
      cursor->tokenString = CFStringCreateMutable(NULL, 0);
      cursor->userObject = CFStringTokenizerCreate(NULL, cursor->inputString,
                                                   CFRangeMake(0, CFStringGetLength(cursor->inputString)),
                                                   kCFStringTokenizerUnitWord, m_locale);
  }

  /**
   Advances the cursor to the next word.

   On success cursor->currentRange holds the word's range in the input string and
   cursor->tokenString holds the lowercased word.

   @return YES when there are no more tokens, NO when a token was produced.
   */
  - (BOOL)nextTokenForCursor:(FMTokenizerCursor *)cursor
  {
      CFStringTokenizerRef tokenizer = (CFStringTokenizerRef) cursor->userObject;
      CFMutableStringRef tokenString = (CFMutableStringRef) cursor->tokenString;

      CFStringTokenizerTokenType tokenType = CFStringTokenizerAdvanceToNextToken(tokenizer);

      if (tokenType == kCFStringTokenizerTokenNone) {
          // No more tokens, we are finished.
          return YES;
      }

      // Found a regular word. The token is the lowercase version of the word.
      cursor->currentRange = CFStringTokenizerGetCurrentTokenRange(tokenizer);

      // The inline buffer approach is faster and uses less memory than CFStringCreateWithSubstring()
      CFStringInlineBuffer inlineBuf;
      CFStringInitInlineBuffer(cursor->inputString, &inlineBuf, cursor->currentRange);

      // Reuse the same mutable string for every token: empty it, then refill it.
      CFStringDelete(tokenString, CFRangeMake(0, CFStringGetLength(tokenString)));

      // Index with CFIndex so the counter has the same width and signedness as the range length.
      const CFIndex tokenLength = cursor->currentRange.length;
      for (CFIndex i = 0; i < tokenLength; ++i) {
          // Inline-buffer indexes are relative to the start of the range given at init time.
          UniChar nextChar = CFStringGetCharacterFromInlineBuffer(&inlineBuf, i);
          CFStringAppendCharacters(tokenString, &nextChar, 1);
      }

      CFStringLowercase(tokenString, m_locale);

      return NO;
  }

  /**
   Nothing to clean up here for the cursor.
   */
  - (void)closeTokenizerCursor:(FMTokenizerCursor *)cursor
  {
      // FMDatabase will CFRelease the tokenString and the userObject.
  }

  @end
  58. #pragma mark
  /**
   Tokenizer that wraps another tokenizer and skips every token found in a set of
   stop words. All cursor handling is forwarded to the wrapped tokenizer.
   */
  @implementation FMStopWordTokenizer
  {
      // Tokenizer that does the actual tokenizing; this class only filters its output.
      id<FMTokenizerDelegate> m_baseTokenizer;
  }

  /**
   Creates a stop-word tokenizer from a UTF-8 text file containing one word per line.

   Lines may end in any newline sequence (LF, CRLF, ...); empty lines are ignored.

   @param wordFileURL URL of the word list. Must not be nil.
   @param tokenizer   The tokenizer whose tokens are filtered. Must not be nil.
   @param error       Receives the read error when the file cannot be loaded.
   @return A new tokenizer, or nil if the file could not be read.
   */
  + (instancetype)tokenizerWithFileURL:(NSURL *)wordFileURL
                         baseTokenizer:(id<FMTokenizerDelegate>)tokenizer
                                 error:(NSError *__autoreleasing *)error
  {
      NSParameterAssert(wordFileURL);

      NSString *contents = [NSString stringWithContentsOfURL:wordFileURL
                                                    encoding:NSUTF8StringEncoding
                                                       error:error];
      if (contents == nil) {
          return nil;
      }

      // Split on every newline character rather than just "\n", so a CRLF file does
      // not leave a trailing "\r" on each word (which would never match a token).
      NSArray *lines = [contents componentsSeparatedByCharactersInSet:[NSCharacterSet newlineCharacterSet]];
      NSMutableSet *stopWords = [NSMutableSet setWithCapacity:[lines count]];

      for (NSString *line in lines) {
          // Empty pieces come from blank lines, a trailing newline, or between "\r" and "\n".
          if ([line length] > 0) {
              [stopWords addObject:line];
          }
      }

      return [[self alloc] initWithWords:stopWords baseTokenizer:tokenizer];
  }

  /**
   Initializes the tokenizer with a set of stop words and the tokenizer to wrap.

   @param words     Tokens to skip; the set is copied.
   @param tokenizer The tokenizer whose tokens are filtered. Must not be nil.
   */
  - (instancetype)initWithWords:(NSSet *)words baseTokenizer:(id<FMTokenizerDelegate>)tokenizer
  {
      NSParameterAssert(tokenizer);

      if ((self = [super init])) {
          _words = [words copy];
          m_baseTokenizer = tokenizer;
      }
      return self;
  }

  /**
   Forwards cursor setup to the base tokenizer.
   */
  - (void)openTokenizerCursor:(FMTokenizerCursor *)cursor
  {
      [m_baseTokenizer openTokenizerCursor:cursor];
  }

  /**
   Advances the base tokenizer until it yields a token that is not a stop word.

   @return YES when the base tokenizer has no more tokens, NO when a token was produced.
   */
  - (BOOL)nextTokenForCursor:(FMTokenizerCursor *)cursor
  {
      BOOL done = [m_baseTokenizer nextTokenForCursor:cursor];

      // Keep pulling tokens while the current one is in the stop-word set.
      while (!done && [self.words containsObject:(__bridge id)(cursor->tokenString)]) {
          done = [m_baseTokenizer nextTokenForCursor:cursor];
      }

      return done;
  }

  /**
   Forwards cursor teardown to the base tokenizer.
   */
  - (void)closeTokenizerCursor:(FMTokenizerCursor *)cursor
  {
      [m_baseTokenizer closeTokenizerCursor:cursor];
  }

  @end