Class: Cass::Context

Inherits:
Object
  • Object
show all
Defined in:
lib/cass/context.rb

Overview

Represents the context of a document, i.e., a list of words to analyze, along with an index.

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(doc, opts) ⇒ Context

Returns a new instance of Context.



8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
# File 'lib/cass/context.rb', line 8

# Builds the context word list from a document and indexes it.
#
# doc  - object whose #lines returns an array of strings; the lines are
#        joined and split on whitespace to obtain the tokens.
# opts - Hash with String keys, all optional:
#        'min_prop'     - lower token-frequency bound as a proportion of the
#                         total token count (defaults to 0).
#        'max_prop'     - upper token-frequency bound as a proportion of the
#                         total token count (defaults to 1).
#        'stop_file'    - path to a whitespace-separated stopword file; those
#                         words are removed. Ignored when nil or empty.
#        'context_size' - when given, a random subset of at most this many
#                         words is kept. Ignored when nil.
#
# Aborts the process with an error message if the stopword file cannot be read.
# Progress messages are printed only when a truthy VERBOSE constant is defined.
def initialize(doc, opts)
  min_prop = opts['min_prop'] || 0
  max_prop = opts['max_prop'] || 1
  verbose = defined?(VERBOSE) && VERBOSE
  if verbose
    puts "Creating new context..."
    puts "Using all words with token frequency in range of #{min_prop} and #{max_prop}."
  end

  # split(' ') ignores leading whitespace, so a document that starts with
  # blanks does not contribute an empty string as a "word".
  words = doc.lines.join(' ').split(' ')
  nwords = words.size
  puts "Found #{nwords} words." if verbose

  if min_prop > 0 || max_prop < 1
    counts = Hash.new(0)
    words.each { |w| counts[w] += 1 }
    # Convert the proportions into absolute token-count thresholds.
    min_t = (min_prop * nwords).round
    max_t = (max_prop * nwords).round
    words = counts.reject { |_w, c| c < min_t || c > max_t }.keys
  else
    words.uniq!
  end

  # words = words - doc.targets
  stop_file = opts['stop_file']
  if stop_file && !stop_file.empty?
    begin
      # File.read opens and closes the file itself, so no handle is leaked.
      stopwords = File.read(stop_file).split(' ')
    rescue
      abort("Error: could not open stopword file #{stop_file}!")
    end
    puts "Removing #{stopwords.size} stopwords from context." if verbose
    words -= stopwords
  end

  # Random subset when a size is requested; otherwise keep every word.
  context_size = opts['context_size']
  @words = context_size ? words.sample(context_size) : words
  index_words
  puts "Using #{@words.size} words as context." if verbose
end

Instance Attribute Details

#index ⇒ Object

Returns the value of attribute index.



6
7
8
# File 'lib/cass/context.rb', line 6

# Reader for @index, the Hash that index_words fills with
# word => position-in-@words entries.
def index; @index; end

#words ⇒ Object

Returns the value of attribute words.



6
7
8
# File 'lib/cass/context.rb', line 6

# Reader for @words, the array of context words.
def words; @words; end

Instance Method Details

#[](el) ⇒ Object

Convenience accessor for getting either a word in the context or its index in the array. If an integer is passed, returns the word at that position; if a string is passed, returns the index of that word in the array.



50
51
52
# File 'lib/cass/context.rb', line 50

# Two-way lookup between words and their positions.
#
# el - an Integer position, or a word.
#
# Returns the word at position el when el is an Integer; otherwise returns
# the index stored for el in @index (nil when el is not in the context).
def [](el)
  # is_a? rather than an exact class comparison: on Rubies where integers
  # are Fixnum/Bignum, `el.class == Integer` is never true.
  el.is_a?(Integer) ? @words[el] : @index[el]
end

#index_words ⇒ Object

Index the context. Necessary when words are updated manually.



42
43
44
45
# File 'lib/cass/context.rb', line 42

# Rebuilds @index from scratch as a Hash mapping each entry of @words to
# its position. When a word occurs more than once, the last position wins.
def index_words
  @index = {}
  @words.each_with_index { |word, position| @index[word] = position }
end

#key?(k) ⇒ Boolean

Returns true if a word is in the context, false otherwise.

Returns:

  • (Boolean)


55
56
57
# File 'lib/cass/context.rb', line 55

# Membership test against the keys of @index.
#
# k - the word to look up.
#
# Returns true when k is a key of @index, false otherwise.
def key?(k)
  @index.include?(k)
end

#size ⇒ Object

Number of words in the context.



60
61
62
# File 'lib/cass/context.rb', line 60

# Returns how many entries @words currently holds.
def size
  @words.length
end