Class: Lurn::Text::WordTokenizer

Inherits:
Object
  • Object
show all
Defined in:
lib/lurn/text/word_tokenizer.rb

Constant Summary collapse

# Lower-case English function words (articles, pronouns, auxiliaries and
# their contractions). Frozen so the shared list cannot be changed at runtime.
STOP_WORDS =
%w[
  a about above after again against all am an and any are aren't as at be
  because been before being below between both but by can't cannot could
  couldn't did didn't do does doesn't doing don't down during each few for
  from further had hadn't has hasn't have haven't having he he'd he'll
  he's her here here's hers herself him himself his how how's i i'd i'll
  i'm i've if in into is isn't it it's its itself let's me more most
  mustn't my myself no nor not of off on once only or other ought our ours
].freeze

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options = {}) ⇒ WordTokenizer

Returns a new instance of WordTokenizer.



19
20
21
22
23
24
# File 'lib/lurn/text/word_tokenizer.rb', line 19

# Builds a tokenizer from an options hash.
#
# The hash is copied before defaults are filled in, so the caller's object
# is never modified and may be frozen. Each of the three known flags that is
# missing or falsy (nil/false) is stored as +false+; other keys are kept.
#
# @param options [Hash] optional flags :strip_punctuation, :strip_stopwords
#   and :stem_words
def initialize(options = {})
  @options = options.dup
  %i[strip_punctuation strip_stopwords stem_words].each do |flag|
    @options[flag] ||= false
  end
end

Instance Attribute Details

#options ⇒ Object

Returns the value of attribute options.



7
8
9
# File 'lib/lurn/text/word_tokenizer.rb', line 7

# Reader for the tokenizer's option hash.
#
# @return [Object] whatever is currently stored in @options (not a copy)
def options; @options; end

Instance Method Details

#to_h ⇒ Object



38
39
40
# File 'lib/lurn/text/word_tokenizer.rb', line 38

# Hash form of this tokenizer: the value returned by #options, handed back
# as-is (same object, no copy).
#
# @return [Object] the result of calling #options
def to_h; options; end

#tokenize(document) ⇒ Object



26
27
28
29
30
31
32
33
34
35
36
# File 'lib/lurn/text/word_tokenizer.rb', line 26

# Splits +document+ into word tokens.
#
# The text is always split on runs of whitespace. Further steps are applied
# when the matching entry in @options is truthy, in this order:
#
# 1. :strip_stopwords   - drop tokens listed in STOP_WORDS
# 2. :strip_punctuation - delete every [[:punct:]] character
# 3. :stem_words        - stem each token with the English Lingua::Stemmer
#
# With all flags off the result is simply the whitespace-separated tokens.
#
# @param document [String] text to tokenize
# @return [Array<String>] the resulting tokens
def tokenize(document)
  # split with no argument already collapses whitespace runs and ignores
  # leading/trailing whitespace, so no prior normalisation is needed.
  words = document.split

  if @options[:strip_stopwords]
    # Match case-insensitively and ignore punctuation stuck to the edges of a
    # token ("The," matches "the"), but keep inner apostrophes so that
    # contractions such as "don't" still match. This has to run before
    # punctuation is deleted, otherwise "don't" would already be "dont".
    words = words.reject do |word|
      STOP_WORDS.include?(word.downcase.gsub(/\A[[:punct:]]+|[[:punct:]]+\z/, ''))
    end
  end

  # Re-join and split again so tokens made only of punctuation disappear
  # rather than turning into empty strings.
  words = words.join(' ').gsub(/[[:punct:]]/, '').split if @options[:strip_punctuation]

  if @options[:stem_words]
    stemmer = Lingua::Stemmer.new(language: :en)
    words = words.map { |word| stemmer.stem(word) }
  end

  words
end