Class: Bae::Classifier

Inherits:
Object
  • Object
show all
Defined in:
lib/bae/classifier.rb

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize ⇒ Classifier

Returns a new instance of Classifier.



7
8
9
10
11
12
13
# File 'lib/bae/classifier.rb', line 7

# Builds an untrained classifier with empty bookkeeping state.
#
# - @frequency_table: Hash whose missing keys are stored as a fresh empty Array
# - @label_instance_count / @label_index: Hashes whose missing keys are stored as 0
# - @label_index_sequence: -1
# - @total_terms: 0.0
def initialize
  # Shared default block for the two integer-valued tables.
  zero_on_miss = proc { |table, key| table[key] = 0 }

  @frequency_table = ::Hash.new { |table, feature| table[feature] = [] }
  @label_instance_count = ::Hash.new(&zero_on_miss)
  @label_index = ::Hash.new(&zero_on_miss)

  # Begins below zero so that 0 is the first value of the sequence.
  @label_index_sequence = -1
  @total_terms = 0.0
end

Instance Attribute Details

#frequency_table ⇒ Object

Returns the value of attribute frequency_table.



4
5
6
# File 'lib/bae/classifier.rb', line 4

# Reader for @frequency_table.
#
# The table is keyed by feature (word); each value is an Array row that
# classify_from_string indexes with the positions stored in label_index.
#
# @return [Hash] the feature => row table
def frequency_table
  @frequency_table
end

#label_index ⇒ Object

Returns the value of attribute label_index.



4
5
6
# File 'lib/bae/classifier.rb', line 4

# Reader for @label_index.
#
# Maps each label to the integer position used to read that label's entry
# out of a frequency_table row (see classify_from_string).
#
# @return [Hash] the label => position table
def label_index
  @label_index
end

#label_index_sequence ⇒ Object

Returns the value of attribute label_index_sequence.



4
5
6
# File 'lib/bae/classifier.rb', line 4

# Reader for @label_index_sequence.
#
# The constructor sets this to -1 so that 0 is the first value of the sequence.
# NOTE(review): presumably the most recently assigned label position; the
# code that advances it is not shown here - confirm against update_label_index.
#
# @return [Integer] the current sequence value
def label_index_sequence
  @label_index_sequence
end

#label_instance_count ⇒ Object

Returns the value of attribute label_instance_count.



4
5
6
# File 'lib/bae/classifier.rb', line 4

# Reader for @label_instance_count.
#
# train_from_string and train_from_hash add one to a label's entry per call,
# so each value is the number of training calls made for that label.
#
# @return [Hash] the label => count table
def label_instance_count
  @label_instance_count
end

#total_terms ⇒ Object

Returns the value of attribute total_terms.



4
5
6
# File 'lib/bae/classifier.rb', line 4

# Reader for @total_terms.
#
# Starts at 0.0 and is increased by one per train_from_string /
# train_from_hash call; despite the name it is not a per-word count.
#
# @return [Numeric] the running total
def total_terms
  @total_terms
end

Instance Method Details

#classify(data) ⇒ Object



50
51
52
53
54
55
56
57
58
# File 'lib/bae/classifier.rb', line 50

# Classifies +data+, dispatching on its type.
#
# @param data [String, Hash] a whitespace-separated document, or a hash of
#   word => frequency
# @return [Object] the result of classify_from_string (reached directly for a
#   String, via classify_from_hash for a Hash)
# @raise [RuntimeError] when +data+ is neither a String nor a Hash
def classify(data)
  case data
  when ::String then classify_from_string(data)
  when ::Hash then classify_from_hash(data)
  else
    # The message names the classification input so that a failure here is
    # not mistaken for one raised by #train.
    fail 'Data to classify must either be a string or hash'
  end
end

#classify_from_hash(frequency_hash) ⇒ Object



60
61
62
63
64
# File 'lib/bae/classifier.rb', line 60

# Classifies a word => frequency hash by expanding it into a document string.
#
# Every word is written out followed by a single space, repeated +frequency+
# times; the pieces are concatenated and handed to classify_from_string.
#
# @param frequency_hash [Hash] word => frequency
# @return [Object] the result of classify_from_string
def classify_from_hash(frequency_hash)
  expanded = frequency_hash.map do |term, count|
    (term + ' ') * count
  end

  classify_from_string(expanded.join)
end

#classify_from_string(document) ⇒ Object



66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# File 'lib/bae/classifier.rb', line 66

# Scores +document+ against every label in label_index.
#
# The document is split on whitespace and de-duplicated. For each label the
# score starts from a copy of that label's entry in @likelihoods and is
# multiplied, for every known word, by p / (1 - p), where p is the
# add-one-smoothed ratio
#   (row[index] + 1) / (label_instance_count[label] + vocabulary size).
# The product is weighted by @priors[label] and the per-label results are
# passed through normalize.
#
# Words without a row in frequency_table are skipped. The lookup never
# modifies frequency_table, so classifying does not change the trained state.
#
# @param document [String] whitespace-separated words
# @return [Object] the result of normalize for the label => score hash
def classify_from_string(document)
  words = document.split.uniq
  likelihoods = @likelihoods.dup
  posterior = {}

  vocab_size = frequency_table.size

  label_index.each do |label, index|
    words.each do |word|
      # fetch bypasses any default proc on the table (which would store an
      # empty row for every unseen word) and also copes with a plain Hash,
      # where a missing key would otherwise yield nil.
      row = frequency_table.fetch(word, nil)
      next if row.nil? || row.empty?

      # A row may have no entry at this label's position; count that as zero.
      count = row[index] || 0
      laplace_word_likelihood = (count + 1.0) / (label_instance_count[label] + vocab_size)
      likelihoods[label] *= laplace_word_likelihood / (1.0 - laplace_word_likelihood)
    end

    posterior[label] = @priors[label] * likelihoods[label]
  end

  normalize(posterior)
end

#finish_training! ⇒ Object



15
16
17
18
# File 'lib/bae/classifier.rb', line 15

# Finalises training by running calculate_likelihoods! and then
# calculate_priors!, in that order. load_state also calls this after
# restoring the saved tables.
#
# NOTE(review): presumably these populate the @likelihoods and @priors that
# classify_from_string reads; their definitions are not shown here - confirm.
#
# @return [Object] the result of calculate_priors!
def finish_training!
  calculate_likelihoods!
  calculate_priors!
end

#load_state(path) ⇒ Object



102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# File 'lib/bae/classifier.rb', line 102

# Restores classifier state from a JSON file written by save_state and
# re-runs finish_training!.
#
# The restored tables get the same missing-key defaults the constructor sets
# up (empty Array rows for frequency_table, 0 for label_instance_count and
# label_index), so training can continue with new words and labels after a
# load.
#
# @param path [String] location of the JSON file; expanded with File.expand_path
# @return [Object] the result of finish_training!
# @raise [RuntimeError] "Missing <key>" when a required key is absent or null
def load_state(path)
  state = ::JSON.parse(::File.read(::File.expand_path(path)))

  %w[frequency_table label_instance_count label_index label_index_sequence total_terms].each do |key|
    fail "Missing #{key}" unless state[key]
  end

  # JSON.parse yields plain Hashes; merge them into default-proc Hashes so
  # lookups of unknown keys behave as they do on a freshly built classifier.
  rows = ::Hash.new { |hash, feature| hash[feature] = [] }
  counts = ::Hash.new { |hash, label| hash[label] = 0 }
  positions = ::Hash.new { |hash, label| hash[label] = 0 }

  @frequency_table = rows.merge!(state['frequency_table'])
  @label_instance_count = counts.merge!(state['label_instance_count'])
  @label_index = positions.merge!(state['label_index'])
  @label_index_sequence = state['label_index_sequence']
  @total_terms = state['total_terms']

  finish_training!
end

#save_state(path) ⇒ Object



89
90
91
92
93
94
95
96
97
98
99
100
# File 'lib/bae/classifier.rb', line 89

# Serialises the classifier's tables to +path+ as a single JSON object.
#
# The object has the keys frequency_table, label_instance_count, label_index,
# label_index_sequence and total_terms, each holding the value of the reader
# of the same name. The file is opened in 'w' mode, replacing existing content.
#
# @param path [String] destination file; expanded with File.expand_path
# @return [Integer] the value returned by the write call
def save_state(path)
  snapshot = {
    'frequency_table' => frequency_table,
    'label_instance_count' => label_instance_count,
    'label_index' => label_index,
    'label_index_sequence' => label_index_sequence,
    'total_terms' => total_terms
  }
  destination = ::File.expand_path(path)

  ::File.open(destination, 'w') { |io| io.write(snapshot.to_json) }
end

#train(label, training_data) ⇒ Object



20
21
22
23
24
25
26
27
28
# File 'lib/bae/classifier.rb', line 20

# Adds one training example for +label+, dispatching on the type of
# +training_data+.
#
# @param label [Object] the label the example belongs to
# @param training_data [String, Hash] a whitespace-separated document, or a
#   hash of word => frequency
# @return [Object] the result of train_from_string or train_from_hash
# @raise [RuntimeError] when +training_data+ is neither a String nor a Hash
def train(label, training_data)
  case training_data
  when ::String
    train_from_string(label, training_data)
  when ::Hash
    train_from_hash(label, training_data)
  else
    fail 'Training data must either be a string or hash'
  end
end

#train_from_hash(label, frequency_hash) ⇒ Object



41
42
43
44
45
46
47
48
# File 'lib/bae/classifier.rb', line 41

# Records one training example for +label+ from a word => frequency hash.
#
# For every pair, update_label_index(label) is called and then
# update_frequency_table(label, word, frequency). Afterwards the label's
# entry in @label_instance_count and @total_terms are each increased by one,
# regardless of how many pairs the hash held.
#
# @param label [Object] the label the example belongs to
# @param frequency_hash [Hash] word => frequency
# @return [Numeric] the new value of @total_terms
def train_from_hash(label, frequency_hash)
  frequency_hash.each do |term, count|
    update_label_index(label)
    update_frequency_table(label, term, count)
  end

  @label_instance_count[label] = @label_instance_count[label] + 1
  @total_terms = @total_terms + 1
end

#train_from_string(label, document) ⇒ Object



30
31
32
33
34
35
36
37
38
39
# File 'lib/bae/classifier.rb', line 30

# Records one training example for +label+ from a whitespace-separated
# document.
#
# For every word occurrence (duplicates included), update_label_index(label)
# is called and then update_frequency_table(label, word, 1). Afterwards the
# label's entry in @label_instance_count and @total_terms are each increased
# by one, regardless of the document's length.
#
# @param label [Object] the label the example belongs to
# @param document [String] whitespace-separated words
# @return [Numeric] the new value of @total_terms
def train_from_string(label, document)
  document.split.each do |token|
    update_label_index(label)
    update_frequency_table(label, token, 1)
  end

  @label_instance_count[label] = @label_instance_count[label] + 1
  @total_terms = @total_terms + 1
end