Class: Lda::Document

Inherits:
Object
  • Object
show all
Defined in:
lib/lda-ruby/document/document.rb,
ext/lda-ruby/lda-inference.c

Direct Known Subclasses

DataDocument, TextDocument

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(corpus) ⇒ Document

Returns a new instance of Document.



5
6
7
8
9
10
11
12
13
# File 'lib/lda-ruby/document/document.rb', line 5

# Build an empty document attached to +corpus+.
#
# All collections start out empty and both aggregates start at zero;
# call #recompute after filling +@words+ / +@counts+ to refresh them.
#
# @param corpus [Object] stored as-is and exposed through #corpus
def initialize(corpus)
  @corpus = corpus

  # Array literals instead of Array.new; same fresh, unshared arrays.
  @words  = []
  @counts = []
  @tokens = []

  # Cached aggregates, refreshed by #recompute.
  @length = 0
  @total  = 0
end

Instance Attribute Details

#corpus ⇒ Object (readonly)

Returns the value of attribute corpus.



3
4
5
# File 'lib/lda-ruby/document/document.rb', line 3

# Reader for +@corpus+, which #initialize sets from its +corpus+ argument.
#
# @return [Object] the current value of +@corpus+
def corpus
  @corpus
end

#counts ⇒ Object (readonly)

Returns the value of attribute counts.



3
4
5
# File 'lib/lda-ruby/document/document.rb', line 3

# Reader for +@counts+. It starts as an empty Array in #initialize, and
# #recompute sums its elements into +@total+.
# NOTE(review): presumably one count per entry of +@words+ — confirm against
# the subclasses that populate it.
#
# @return [Array] the current value of +@counts+
def counts
  @counts
end

#length ⇒ Object (readonly)

Returns the value of attribute length.



3
4
5
# File 'lib/lda-ruby/document/document.rb', line 3

# Reader for +@length+. It is 0 after #initialize and is set to
# +@words.size+ each time #recompute runs, so it can be stale in between.
#
# @return [Integer] the current value of +@length+
def length
  @length
end

#tokens ⇒ Object (readonly)

Returns the value of attribute tokens.



3
4
5
# File 'lib/lda-ruby/document/document.rb', line 3

# Reader for +@tokens+. It starts as an empty Array in #initialize and is
# replaced by the result of #handle whenever #tokenize is called.
#
# @return [Object] the current value of +@tokens+
def tokens
  @tokens
end

#total ⇒ Object (readonly)

Returns the value of attribute total.



3
4
5
# File 'lib/lda-ruby/document/document.rb', line 3

# Reader for +@total+. It is 0 after #initialize and is set to the sum of
# +@counts+ each time #recompute runs, so it can be stale in between.
#
# @return [Numeric] the current value of +@total+
def total
  @total
end

#words ⇒ Object (readonly)

Returns the value of attribute words.



3
4
5
# File 'lib/lda-ruby/document/document.rb', line 3

# Reader for +@words+. It starts as an empty Array in #initialize, and
# #recompute uses its size to set +@length+.
# NOTE(review): element type (word strings vs. vocabulary indices) is not
# visible here — confirm against the subclasses that populate it.
#
# @return [Array] the current value of +@words+
def words
  @words
end

Instance Method Details

#handle(tokens) ⇒ Object



27
28
29
# File 'lib/lda-ruby/document/document.rb', line 27

# Pass-through step applied to a token list; this implementation returns
# its argument unchanged. #tokenize sends the split words through this
# method before storing the result in +@tokens+.
# NOTE(review): presumably an override point for subclasses — confirm.
#
# @param tokens [Object] the token list to process
# @return [Object] +tokens+, unmodified
def handle(tokens)
  tokens
end

#has_text? ⇒ Boolean

Returns:

  • (Boolean)


23
24
25
# File 'lib/lda-ruby/document/document.rb', line 23

# Whether this document carries text; this implementation always answers
# +false+.
# NOTE(review): subclasses such as TextDocument presumably override this —
# confirm.
#
# @return [Boolean] always +false+ here
def has_text?
  false
end

#recompute ⇒ Object

Recompute the total and length values.



18
19
20
21
# File 'lib/lda-ruby/document/document.rb', line 18

# Refresh the cached aggregates from the current contents of the document:
# +@total+ becomes the sum of +@counts+ and +@length+ becomes the number of
# entries in +@words+.
#
# @return [Integer] the new +@length+ (value of the last assignment)
def recompute
  # Plain left fold starting at 0, so an empty +@counts+ yields 0.
  @total  = @counts.reduce(0, :+)
  @length = @words.size
end

#tokenize(text) ⇒ Object



31
32
33
34
# File 'lib/lda-ruby/document/document.rb', line 31

# Split +text+ into word tokens and store them in +@tokens+.
#
# A token is a maximal run of ASCII letters and apostrophes; every other
# character (digits, punctuation, whitespace, non-ASCII) only separates
# tokens. The raw token list is passed through #handle before being stored.
#
# @param text [String] the raw text to tokenize
# @return [Object] whatever #handle returned, which is also the new +@tokens+
def tokenize(text)
  # One scan yields the same runs as blanking out the unwanted characters,
  # squeezing whitespace and splitting on it.
  raw_words = text.scan(/[A-Za-z']+/)
  @tokens = handle(raw_words)
end