Class: DocumentCache

Inherits:
Object
  • Object
show all
Defined in:
lib/document-cache.rb

Instance Method Summary collapse

Constructor Details

#initialize ⇒ DocumentCache

Returns a new instance of DocumentCache.



8
9
10
# File 'lib/document-cache.rb', line 8

# Builds a new cache object holding a fresh VocabularyChest in @chest.
# Other methods in this file call @chest.stem and @chest.sanitize.
def initialize
  @chest = VocabularyChest.new
end

Instance Method Details

#add(document) ⇒ Object



12
13
14
15
# File 'lib/document-cache.rb', line 12

# Stores +document+ as a new file inside the cache directory.
#
# The file is named with a freshly generated UUID and placed directly under
# TAUConfig::cache_dir; it is opened in write mode ('w').
#
# @param document the content passed to IO#write
# @return the value of IO#write for the written content
def add(document)
  target = "#{TAUConfig::cache_dir}/#{UUID.new.generate}"
  File.open(target, 'w') do |io|
    io.write(document)
  end
end

#clean(sentence) ⇒ Object



65
66
67
# File 'lib/document-cache.rb', line 65

# Tidies a sentence fragment: leading/trailing whitespace is stripped and a
# "." is appended (unconditionally, even if the text already ends with one).
#
# @param sentence [String]
# @return [String]
def clean(sentence)
  "#{sentence.strip}."
end

#clear ⇒ Object



57
58
59
# File 'lib/document-cache.rb', line 57

# Empties the cache by removing every path reported by #documents with
# FileUtils.rm_rf.
#
# @return the collection returned by #documents (the result of #each)
def clear
  documents.each do |path|
    FileUtils.rm_rf(path)
  end
end

#documents ⇒ Object



53
54
55
# File 'lib/document-cache.rb', line 53

# Lists the paths of the entries directly inside TAUConfig::cache_dir, using
# the glob pattern "<cache_dir>/*".
#
# @return [Array<String>] matching paths (empty when nothing matches)
def documents
  Dir.glob("#{TAUConfig::cache_dir}/*")
end

#extract_matching_words(search, sentence) ⇒ Object



69
70
71
72
73
# File 'lib/document-cache.rb', line 69

# Returns the words in +sentence+ that match +search+.
#
# Stemming is tried first; only when it finds nothing is the plain substring
# matcher consulted.
#
# @param search [String] the term to look for
# @param sentence [String] a single sentence to inspect
# @return [Array, nil] the matched words, or nil when neither matcher finds
#   anything
def extract_matching_words(search, sentence)
  stemmed = find_matches_by_stemming(search, [sentence])
  return stemmed.values.first unless stemmed.empty?

  find_matches_by_grepping(search, [sentence]).values.first
end

#find_examples_for(search, count = 1) ⇒ Object



61
62
63
# File 'lib/document-cache.rb', line 61

# Looks for example sentences containing +search+ across every cached
# document by delegating to #find_matches_in with the #documents list.
#
# @param search [String] the term to look for
# @param count [Integer] how many examples are wanted (defaults to 1)
# @return whatever #find_matches_in returns
def find_examples_for(search, count = 1)
  find_matches_in(documents, search, count)
end

#find_matches_by_grepping(search, sentences) ⇒ Object



27
28
29
30
31
32
# File 'lib/document-cache.rb', line 27

# Finds the sentences that contain +search+ as a literal substring
# (String#include?, so the comparison is case-sensitive).
#
# @param search [String] the text to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash] cleaned sentence (see #clean) => [search] for every
#   sentence that contains the term; empty when none do
def find_matches_by_grepping(search, sentences)
  results = {}
  sentences.each do |sentence|
    next unless sentence.include?(search)

    results[clean(sentence)] = [search]
  end
  results
end

#find_matches_by_stemming(search, sentences) ⇒ Object



17
18
19
20
21
22
23
24
25
# File 'lib/document-cache.rb', line 17

# Finds the sentences containing a word whose stem equals the stem of
# +search+, as computed by @chest.stem.
#
# Each sentence is split on whitespace; the words whose stem matches are
# passed through @chest.sanitize before being reported.
#
# @param search [String] the term to look for
# @param sentences [Array<String>] candidate sentences
# @return [Hash] cleaned sentence (see #clean) => array of sanitized matching
#   words; sentences without a match are omitted
def find_matches_by_stemming(search, sentences)
  target_stem = @chest.stem(search)
  results = {}
  sentences.each do |sentence|
    hits = sentence.split(" ").select { |word| @chest.stem(word) == target_stem }
    next if hits.empty?

    results[clean(sentence)] = hits.map { |hit| @chest.sanitize(hit) }
  end
  results
end

#find_matches_in(filenames, search, count) ⇒ Object



34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# File 'lib/document-cache.rb', line 34

# Collects up to +count+ example sentences containing +search+ from the
# given files.
#
# Two passes are made over +filenames+: first with find_matches_by_stemming
# and then, only if more examples are still needed, with
# find_matches_by_grepping. Each file's text is split into sentences on ".",
# "?", "!" and newlines. Scanning stops as soon as +count+ sentences have
# been gathered.
#
# @param filenames [Array<String>] paths of the files to scan
# @param search [String] the term to look for
# @param count [Integer] maximum number of sentences to return
# @return [Hash] cleaned sentence => array of matched words; holds fewer than
#   +count+ entries when the files do not contain enough matches, and is
#   empty when +count+ is zero or negative
def find_matches_in(filenames, search, count)
  matches = {}
  # A non-positive limit can never be satisfied. Returning here also keeps the
  # trimming loop below from spinning forever on an empty hash when count < 0.
  return matches if count < 1

  # Read and split each file lazily and at most once, so the grepping pass
  # reuses the sentences already loaded by the stemming pass. File.read also
  # closes the handle for us.
  sentences_by_file = Hash.new do |cache, filename|
    cache[filename] = File.read(filename).split(/[\.?!\n]/)
  end

  [:find_matches_by_stemming, :find_matches_by_grepping].each do |matcher|
    filenames.each do |filename|
      found = send(matcher, search, sentences_by_file[filename])
      # When the grepping pass hits a sentence the stemming pass already
      # reported, keep the stemmed words: stemming takes precedence, the same
      # way it does in extract_matching_words.
      matches.merge!(found) { |_sentence, existing, _fresh| existing }

      # Over the limit: drop the oldest entries until exactly +count+ remain.
      matches.shift while matches.size > count
      return matches if matches.size == count
    end
  end

  matches
end

#frequency_list ⇒ Object



75
76
77
78
79
80
81
# File 'lib/document-cache.rb', line 75

# Counts how often each whitespace-separated token occurs across all cached
# documents.
#
# Tokens seen fewer than two times are discarded. The remaining tokens are
# returned with the most frequent first; the relative order of tokens with
# equal counts is not specified.
#
# @return [Array<Array(String, Integer)>] [token, count] pairs, highest
#   count first
def frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    # Tokenise each file on its own so the last word of one file can never
    # fuse with the first word of the next. File.read closes the handle.
    File.read(path).split(" ").each { |word| counts[word] += 1 }
  end
  counts.reject! { |_word, count| count < 2 }
  counts.sort_by { |_word, count| count }.reverse
end

#stemmed_frequency_list ⇒ Object



83
84
85
86
87
88
89
90
# File 'lib/document-cache.rb', line 83

# Counts how often each stem occurs across all cached documents.
#
# Every whitespace-separated token is reduced with @chest.stem before being
# counted. Stems seen fewer than two times are discarded. The remaining stems
# are returned with the most frequent first; the relative order of stems with
# equal counts is not specified.
#
# @return [Array<Array>] [stem, count] pairs, highest count first
def stemmed_frequency_list
  counts = Hash.new(0)
  documents.each do |path|
    # Tokenise each file on its own so the last word of one file can never
    # fuse with the first word of the next. File.read closes the handle.
    File.read(path).split(" ").each { |word| counts[@chest.stem(word)] += 1 }
  end
  counts.reject! { |_stem, count| count < 2 }
  counts.sort_by { |_stem, count| count }.reverse
end