Class: Cass::Analysis

Inherits: Object
Defined in:
lib/cass/analysis.rb

Overview

Various methods used to conduct analyses on one or more Documents. The primary processing stream is run_spec, which is essentially a wrapper around the other methods for conducting one- and two-sample tests.
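
A minimal usage sketch (the spec filename is a placeholder; the require path assumes the standard gem layout implied by lib/cass/analysis.rb):

require 'cass/analysis'
Cass::Analysis.run_spec('my_analysis.spec')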

Instance Attribute Summary

Class Method Summary

Instance Attribute Details

#contexts ⇒ Object

Returns the value of attribute contexts.



# File 'lib/cass/analysis.rb', line 9

def contexts
  @contexts
end

#docs ⇒ Object

Returns the value of attribute docs.



# File 'lib/cass/analysis.rb', line 9

def docs
  @docs
end

#targets ⇒ Object

Returns the value of attribute targets.



# File 'lib/cass/analysis.rb', line 9

def targets
  @targets
end

Class Method Details

.bootstrap_test(doc, contrasts, output_file, n_boot, opts = {}) ⇒ Object

Run a bootstrap test on a single Document, comparing the bootstrapped distribution to zero.

  • doc: The Document object to analyze

  • contrasts: an array of Contrast objects to apply

  • output_file: name of output file

  • n_boot: number of bootstrap iterations to run

  • opts: an optional hash of additional settings. Currently, only ‘verbose’ and ‘normalize_weights’ apply here.



# File 'lib/cass/analysis.rb', line 141

def self.bootstrap_test(doc, contrasts, output_file, n_boot, opts={})

  # Merge options with defaults
  opts = {'verbose'=>true, 'normalize_weights'=>false}.merge(opts)

  outf = File.new(output_file, 'w')
  outf.puts(%w[contrast result_id doc_name pair_1 pair_2 pair_3 pair_4 interaction_term].join("\t"))
  outf.sync = true

  doc.cooccurrence(opts['normalize_weights'])

  contrasts = [contrasts] if contrasts.class == Contrast
  contrasts.each { |c|
    observed = c.apply(doc)
    outf.puts "#{c.words.join(".")}\tobserved\t#{observed}"
  }
  d1 = doc.clone
  n_boot.times { |i|
    puts "Running bootstrap iteration #{i+1}..." if opts['verbose']
    d1.clines = doc.resample(clines=true)
    # d1.context = Context.new(d1)   # Currently uses the same context; can uncomment
    d1.cooccurrence(opts['normalize_weights'])
    contrasts.each { |c|
      res = c.apply(d1)
      outf.puts "#{c.words.join(".")}\tboot_#{i+1}\t#{res}"
    }
  }
end
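
A hypothetical one-sample run (file names are placeholders; assumes Document lives under the Cass namespace and that passing an empty opts hash to Document.new falls back to its defaults, as in run_spec):

contrasts = Cass::Analysis.parse_contrasts('contrasts.txt')
targets   = contrasts.inject([]) { |t, c| t += c.words.flatten }.uniq
doc       = Cass::Document.new('corpus.txt', targets, File.new('corpus.txt').read, {})
Cass::Analysis.bootstrap_test(doc, contrasts, 'corpus_results.txt', 1000)
Cass::Analysis.p_values('corpus_results.txt', 'boot', true)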

.p_values(input_file, mode = 'boot', mean = true) ⇒ Object

Takes the results of a bootstrap or permutation test as input and saves a file summarizing the corresponding p-values.

  • input_file: path to the results of the bootstrapping/permutation analysis

  • mode: indicates the source analysis type. Must be either ‘boot’ or ‘perm’

  • mean: boolean variable indicating whether or not to compute the mean across all contrasts



# File 'lib/cass/analysis.rb', line 188

def self.p_values(input_file, mode='boot', mean=true)
  c = File.new(input_file).readlines
  c.shift
  buffer = ["file\tcontrast\tN_permutations\tvalue\tp-value"]
  tests = {}
  c.each { |l|
    l = l.strip.split(/\t/)
    row = [l[0], l[1], l[-1].to_f]
    fname =  mode == 'boot' ? l[2] : input_file
    tests[fname] = [] if !tests.key?(fname)
    tests[fname] << row
  }

  tests.each { |fname, rows|
    dists, obs, means = {}, {}, []
    rows.each { |row|
      test, iter, val = row
      if iter == 'observed'
        obs[test] = val
      else
        dists[test] = [] if !dists.key?(test)
        dists[test] << val
        if mean
          i = iter[/\d+$/].to_i-1
          means[i] = 0 if means[i].nil?
          means[i] += val
        end
      end
    }
    if mean
      means.map! { |m| m/obs.size }
      dists['mean'] = means
      obs['mean'] = obs.values.inject(0) {|sum, e| sum+e }/obs.size
    end
  
    dists.each { |k,v|
      v, o = v.sort, obs[k]
      gt = v.inject(0) { |sum, e| 
        sum + 
        if mode == 'perm'
          o >= e ? 1 : 0
        else
          e > 0 ? 1 : 0
        end
      }
      p = gt.to_f / v.size
      p = 1 - p if p > 0.5
      line = [fname, k, v.size, o, p*2]
      buffer << line.join("\t")
    }
  
  }
  base = File.basename(input_file, '.txt')
  File.new("#{base}_p_values.txt",'w').puts buffer
end
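
A hypothetical call ('corpus_results.txt' is a placeholder for a file produced by an earlier bootstrap_test run). The summary is written to the working directory with a _p_values suffix:

Cass::Analysis.p_values('corpus_results.txt', 'boot', true)
# => writes corpus_results_p_values.txt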

.parse_contrasts(contrast_file) ⇒ Object

Parse contrast file. Takes a filename as input and returns an array of Contrasts.



# File 'lib/cass/analysis.rb', line 85

def self.parse_contrasts(contrast_file)
  File.new(contrast_file).readlines.reject { |l| l.strip.empty? }.map { |l| Contrast.parse(l) }
end
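
A hypothetical call ('contrasts.txt' is a placeholder path; blank lines in the file are skipped):

contrasts = Cass::Analysis.parse_contrasts('contrasts.txt')   # => array of Contrast objects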

.permutation_test(doc1, doc2, contrasts, output_file, n_perm, opts = {}) ⇒ Object

Run a permutation test comparing two Documents.

  • doc1, doc2: The two Documents to compare

  • contrasts: an array of Contrasts used to compare the documents

  • output_file: name of output file

  • n_perm: number of permutations to run

  • opts: an optional hash of additional settings. Currently, only ‘verbose’ and ‘normalize_weights’ apply here.



# File 'lib/cass/analysis.rb', line 96

def self.permutation_test(doc1, doc2, contrasts, output_file, n_perm, opts={})

  # Merge options with defaults
  opts = {'verbose'=>true, 'normalize_weights'=>false }.merge(opts)
  
  # Merge contexts. Could change this later to allow different contexts for each
  # document, but that would make processing substantially slower.
  context = doc1.context
  context.words = context.words & doc2.context.words
  context.index_words
  doc1.context, doc2.context = context, context

  # Generate cooccurrence matrices and get observed difference.
  doc1.cooccurrence(opts['normalize_weights'])
  doc2.cooccurrence(opts['normalize_weights'])

  outf = File.new(output_file,'w')
  outf.puts "contrast\titeration\t#{doc1.name}\t#{doc2.name}\tdifference"
  outf.sync = true
  # Save observed values
  contrasts.each { |c|
    res1, res2, diff = compare_docs(c, doc1, doc2)
    outf.puts "#{c.words.join(".")}\tobserved\t#{res1}\t#{res2}\t#{diff}"
  }
  # Run permutations and save results
  d1, d2 = doc1.clone, doc2.clone
  n_perm.times { |i|
    puts "Running permutation #{i+1}..." if opts['verbose']
    d1.clines, d2.clines = permute_labels(doc1.clines, doc2.clines)
    d1.cooccurrence(opts['normalize_weights'])
    d2.cooccurrence(opts['normalize_weights'])
    contrasts.each { |c|
      res1, res2, diff = compare_docs(c, d1, d2)
      outf.puts "#{c.words.join(".")}\tperm_#{i+1}\t#{res1}\t#{res2}\t#{diff}"
    }
  }
end
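
A hypothetical two-sample run (file names are placeholders; the same Document-construction assumptions as in the bootstrap_test example apply):

contrasts = Cass::Analysis.parse_contrasts('contrasts.txt')
targets   = contrasts.inject([]) { |t, c| t += c.words.flatten }.uniq
doc1 = Cass::Document.new('group_a.txt', targets, File.new('group_a.txt').read, {})
doc2 = Cass::Document.new('group_b.txt', targets, File.new('group_b.txt').read, {})
Cass::Analysis.permutation_test(doc1, doc2, contrasts, 'ab_results.txt', 1000)
Cass::Analysis.p_values('ab_results.txt', 'perm', true)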

.run_spec(spec_file = 'default.spec') ⇒ Object

Read and parse the specifications for an analysis, then run the analysis. Only does basic error checking for now…



# File 'lib/cass/analysis.rb', line 13

def self.run_spec(spec_file='default.spec')

  # Basic error checking
  abort("Error: can't find spec file (#{spec_file}).") if !File.exist?(spec_file)
  load spec_file
  abort("Error: can't find contrast file (#{CONTRAST_FILE}).") if !File.exist?(CONTRAST_FILE) 
  
  # Create options hash
  opts = {}
  # Ruby 1.9 returns constants as symbols, 1.8.6 uses strings, so standardize
  consts = Module.constants.map { |c| c.to_s }
  %w[PARSE_TEXT N_PERM N_BOOT MAX_LINES RECODE CONTEXT_SIZE MIN_PROP STOP_FILE NORMALIZE_WEIGHTS VERBOSE].each { |c|
    opts[c.downcase] = Module.const_get(c) if consts.include?(c)  
  }
  
  if (defined?(VERBOSE) and VERBOSE)
    puts "\nRunning CASS with the following options:"
    opts.each { |k,v| puts "\t#{k}: #{v}" }
  end
  
  # Load and parse contrasts
  contrasts = parse_contrasts(CONTRAST_FILE)
  puts "\nFound #{contrasts.size} contrasts." if (defined?(VERBOSE) and VERBOSE)
  
  # Set targets
  targets = contrasts.inject([]) { |t, c| t += c.words.flatten }.uniq
  puts "\nFound #{targets.size} target words." if (defined?(VERBOSE) and VERBOSE)
  
  # Read in files and create documents
  docs = []
  FILES.each { |f| 
    abort("Error: can't find input file #{f}.") if !File.exist?(f)
    puts "\nReading in file #{f}..."
    text = File.new(f).read
    docs << Document.new(f.split(/\//)[-1], targets, text, opts)
  }

  # Make sure N_PERM is zero if we don't want stats
  n_perm = STATS ? N_PERM : 0
  
  # One or two-sample test?
  case TEST_TYPE
  when 1
    docs.each { |d|
      base = File.basename(d.name, '.txt')
      puts "\nRunning one-sample analysis on document '#{d.name}'."
      puts "Generating #{n_perm} bootstraps..." if (defined?(VERBOSE) and VERBOSE) and STATS
      bootstrap_test(d, contrasts, "#{OUTPUT_ROOT}_#{base}_results.txt", n_perm, opts)
      p_values("#{OUTPUT_ROOT}_#{base}_results.txt", 'boot', true) if STATS
    }
    
  when 2
    abort("Error: in order to run a permutation test, you need to pass exactly two files as input.") if FILES.size != 2 or docs.size != 2
    puts "Running two-sample comparison between '#{File.basename(FILES[0])}' and '#{File.basename(FILES[1])}'." if (defined?(VERBOSE) and VERBOSE)
    puts "Generating #{n_perm} permutations..." if (defined?(VERBOSE) and VERBOSE) and STATS
    permutation_test(docs[0], docs[1], contrasts, "#{OUTPUT_ROOT}_results.txt", n_perm, opts)
    p_values("#{OUTPUT_ROOT}_results.txt", 'perm', true)
  
  # No other test types implemented for now.
  else
    
  end    
  puts "Done!"

end
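
A hypothetical spec file sketch ('my_analysis.spec' and the input file names are placeholders). The spec file is loaded with Ruby's load, so it is plain Ruby defining the constants referenced above:

# my_analysis.spec
FILES         = ['group_a.txt', 'group_b.txt']  # input documents
CONTRAST_FILE = 'contrasts.txt'                 # one contrast per line
TEST_TYPE     = 2                               # 1 = one-sample bootstrap, 2 = two-sample permutation
STATS         = true                            # compute p-values after the test
N_PERM        = 1000                            # permutations (or bootstrap iterations for TEST_TYPE 1)
OUTPUT_ROOT   = 'my_analysis'                   # prefix for output files
VERBOSE       = true

Cass::Analysis.run_spec('my_analysis.spec')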