Class: Cass::Analysis

Inherits:
Object
  • Object
show all
Defined in:
lib/cass/analysis.rb

Overview

Instantiates an analysis on one or more Documents. Currently, only the default processing stream (run_spec) is implemented. Eventually, direct methods for specific analyses (e.g., two-document permutation tests) will be supported.

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Attribute Details

#contexts ⇒ Object

Returns the value of attribute contexts.



10
11
12
# File 'lib/cass/analysis.rb', line 10

# Attribute reader: returns the current value of the @contexts instance
# variable (nil if it has never been assigned).
def contexts
  @contexts
end

#docs ⇒ Object

Returns the value of attribute docs.



10
11
12
# File 'lib/cass/analysis.rb', line 10

# Attribute reader: returns the current value of the @docs instance
# variable (nil if it has never been assigned).
def docs
  @docs
end

#targets ⇒ Object

Returns the value of attribute targets.



10
11
12
# File 'lib/cass/analysis.rb', line 10

# Attribute reader: returns the current value of the @targets instance
# variable (nil if it has never been assigned).
def targets
  @targets
end

Class Method Details

.bootstrap_test(doc, contrasts, output_file, n_boot) ⇒ Object

Do a bootstrap test comparing the bootstrapped distribution to zero.

  • doc: The Document object to analyze

  • contrasts: an array of Contrast objects to apply

  • output_file: name of output file

  • n_boot: number of bootstrap iterations to run



127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
# File 'lib/cass/analysis.rb', line 127

# Runs a one-sample bootstrap test on a single document and streams the
# results to a tab-delimited text file.
#
# The observed value of every contrast is written first (rows tagged
# "observed"), followed by one row per contrast for each bootstrap
# iteration (rows tagged "boot_<i>", 1-based). Every iteration replaces the
# clines of a clone of +doc+ with doc.resample(true) and recomputes its
# cooccurrence before the contrasts are applied.
#
# doc::         object to analyze; must respond to cooccurrence, clone and
#               resample, and its clone must accept clines=
# contrasts::   enumerable of objects responding to words and apply(doc)
# output_file:: path of the results file (created or truncated)
# n_boot::      number of bootstrap iterations to run
#
# Returns n_boot. The output file is closed when the method exits, including
# when an iteration raises.
#
# NOTE(review): each row written here has three tab-separated fields; the
# remaining header columns presumably come from the string form of the value
# returned by apply -- confirm against Contrast#apply.
def self.bootstrap_test(doc, contrasts, output_file, n_boot)
  File.open(output_file, 'w') do |outf|
    # Flush every row immediately so partial results survive an interrupted run.
    outf.sync = true
    outf.puts(%w[contrast result_id doc_name pair_1 pair_2 pair_3 pair_4 interaction_term].join("\t"))

    # Observed (non-resampled) values.
    doc.cooccurrence(NORMALIZE_WEIGHTS)
    contrasts.each do |c|
      observed = c.apply(doc)
      outf.puts "#{c.words.join(".")}\tobserved\t#{observed}"
    end

    # A single clone is reused for all iterations; only its clines change.
    d1 = doc.clone
    n_boot.times do |i|
      puts "\n\nRunning bootstrap iteration #{i+1}..." if VERBOSE
      d1.clines = doc.resample(true)
      # d1.context = Context.new(d1)   # Currently uses the same context; can uncomment
      d1.cooccurrence(NORMALIZE_WEIGHTS)
      contrasts.each do |c|
        res = c.apply(d1)
        outf.puts "#{c.words.join(".")}\tboot_#{i+1}\t#{res}"
      end
    end
  end
end

.p_values(input_file, mode = 'boot', mean = true) ⇒ Object

Takes the results of a bootstrap or permutation test as input and saves a file summarizing the corresponding p-values.

  • input_file: path to the results of the bootstrapping/permutation analysis

  • mode: indicates the source analysis type. Must be either ‘boot’ or ‘perm’

  • mean: boolean variable indicating whether or not to compute the mean across all contrasts



169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
# File 'lib/cass/analysis.rb', line 169

# Summarizes the results file of a bootstrap or permutation run as two-sided
# p-values and writes them to "<basename of input_file minus .txt>_p_values.txt"
# in the current working directory.
#
# input_file:: path to a tab-delimited results file. The first line is
#              treated as a header and skipped; blank lines are ignored.
#              For every row, field 0 is the contrast name, field 1 is either
#              "observed" or an iteration label ending in its 1-based number,
#              and the last field is the numeric value.
# mode::       'perm' counts iterations whose value is <= the observed value
#              and reports +input_file+ in the "file" column; any other value
#              (default 'boot') counts iterations with a value > 0. Only
#              'boot' groups rows by field 2.
# mean::       when true, adds a "mean" row: per-iteration values are
#              averaged across contrasts (divided by the number of observed
#              entries) and compared against the mean observed value.
#
# The reported p-value is 2 * min(p, 1 - p), where p is the counted fraction.
#
# Returns nil. Both files are closed before the method returns, so the
# summary is fully written to disk at that point.
#
# NOTE(review): with mean enabled and no "observed" rows in a group, the
# averaging divides by zero -- callers are assumed to always supply them.
def self.p_values(input_file, mode='boot', mean=true)
  # Skip the header row, then discard blank lines (e.g. a trailing newline),
  # which would otherwise be parsed as rows with nil fields.
  lines = File.readlines(input_file).drop(1).map { |l| l.strip }.reject { |l| l.empty? }
  buffer = ["file\tcontrast\tN_permutations\tvalue\tp-value"]

  # Group rows by the name reported in the "file" column.
  tests = Hash.new { |h, k| h[k] = [] }
  lines.each do |line|
    fields = line.split(/\t/)
    fname = mode == 'boot' ? fields[2] : input_file
    tests[fname] << [fields[0], fields[1], fields[-1].to_f]
  end

  tests.each do |fname, rows|
    dists = Hash.new { |h, k| h[k] = [] }  # contrast name => iteration values
    obs = {}                               # contrast name => observed value
    means = []                             # iteration index => sum over contrasts

    rows.each do |test, iter, val|
      if iter == 'observed'
        obs[test] = val
        next
      end
      dists[test] << val
      next unless mean
      # The iteration number is the trailing integer of the label (1-based).
      idx = iter[/\d+$/].to_i - 1
      means[idx] = (means[idx] || 0) + val
    end

    if mean
      means.map! { |m| m / obs.size }
      dists['mean'] = means
      obs['mean'] = obs.values.inject(0) { |sum, e| sum + e } / obs.size
    end

    dists.each do |name, values|
      observed = obs[name]
      hits = values.count { |e| mode == 'perm' ? observed >= e : e > 0 }
      p_value = hits.to_f / values.size
      # Fold onto the smaller tail; doubled below for a two-sided value.
      p_value = 1 - p_value if p_value > 0.5
      buffer << [fname, name, values.size, observed, p_value * 2].join("\t")
    end
  end

  base = File.basename(input_file, '.txt')
  File.open("#{base}_p_values.txt", 'w') { |out| out.puts buffer }
end

.parse_contrasts(contrast_file) ⇒ Object

Parse contrast file. Takes a filename as input and returns an array of Contrasts.



78
79
80
# File 'lib/cass/analysis.rb', line 78

# Reads a contrast file and returns an array with the result of
# Contrast.parse for every non-blank line, in file order.
#
# Lines that are empty or contain only whitespace are skipped. Each remaining
# line is handed to Contrast.parse unmodified (including its line ending).
# The file is read in one call, so no handle is left open.
#
# contrast_file:: path to the contrast definitions
def self.parse_contrasts(contrast_file)
  # Lines from readlines keep their newline, so test the stripped form.
  meaningful = File.readlines(contrast_file).reject { |line| line.strip.empty? }
  meaningful.map { |line| Contrast.parse(line) }
end

.permutation_test(doc1, doc2, contrasts, output_file, n_perm) ⇒ Object

Run a permutation test comparing two Documents.

  • doc1, doc2: The two Documents to compare

  • contrasts: an array of Contrasts used to compare the documents

  • output_file: name of output file

  • n_perm: number of permutations to run



87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
# File 'lib/cass/analysis.rb', line 87

# Runs a two-sample permutation test and streams the results to a
# tab-delimited text file.
#
# The context of doc1 is narrowed in place to the words it shares with doc2's
# context, re-indexed, and then assigned to both documents. Observed values
# for every contrast are written first (rows tagged "observed"), followed by
# one row per contrast for each permutation (rows tagged "perm_<i>",
# 1-based). Each permutation assigns the pair returned by permute_labels to
# clones of the two documents and recomputes their cooccurrence.
#
# doc1, doc2::  the two documents to compare (both are modified: their
#               context is replaced by the merged one)
# contrasts::   enumerable of contrasts passed to compare_docs
# output_file:: path of the results file (created or truncated)
# n_perm::      number of permutations to run
#
# Returns n_perm. The output file is closed when the method exits, including
# when a permutation raises.
def self.permutation_test(doc1, doc2, contrasts, output_file, n_perm)
  # Merge contexts. Could change this later to allow different contexts for each
  # document, but that would make processing substantially slower.
  context = doc1.context
  context.words = context.words & doc2.context.words
  context.index_words
  doc1.context = context
  doc2.context = context

  # Generate cooccurrence matrices for the observed data.
  doc1.cooccurrence(NORMALIZE_WEIGHTS)
  doc2.cooccurrence(NORMALIZE_WEIGHTS)

  File.open(output_file, 'w') do |outf|
    outf.puts "contrast\titeration\t#{doc1.name}\t#{doc2.name}\tdifference"
    # Flush every row immediately so partial results survive an interrupted run.
    outf.sync = true

    # Save observed values.
    contrasts.each do |c|
      res1, res2, diff = compare_docs(c, doc1, doc2)
      outf.puts "#{c.words.join(".")}\tobserved\t#{res1}\t#{res2}\t#{diff}"
    end

    # Run permutations and save results. The clones are reused; only their
    # clines are replaced on each pass.
    d1 = doc1.clone
    d2 = doc2.clone
    n_perm.times do |i|
      puts "\n\nRunning permutation #{i+1}..."
      d1.clines, d2.clines = permute_labels(doc1.clines, doc2.clines)
      d1.cooccurrence(NORMALIZE_WEIGHTS)
      d2.cooccurrence(NORMALIZE_WEIGHTS)
      contrasts.each do |c|
        res1, res2, diff = compare_docs(c, d1, d2)
        outf.puts "#{c.words.join(".")}\tperm_#{i+1}\t#{res1}\t#{res2}\t#{diff}"
      end
    end
  end
end

.run_spec(spec_file = 'default.spec') ⇒ Object

Read and parse the specifications for an analysis, then run the analysis. Only does basic error checking for now…



14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# File 'lib/cass/analysis.rb', line 14

# Loads an analysis specification, builds the documents it names and runs the
# requested test.
#
# The spec file is executed with Kernel#load and is expected to define the
# top-level constants read below: CONTRAST_FILE, FILES, VERBOSE, STATS,
# N_PERM, TEST_TYPE and OUTPUT_ROOT. Any of PARSE_TEXT, N_PERM, N_BOOT,
# MAX_LINES, RECODE, CONTEXT_SIZE, MIN_PROP, STOP_FILE and NORMALIZE_WEIGHTS
# that are defined are forwarded to Document.new in an options hash keyed by
# the lower-cased constant name.
#
# TEST_TYPE 1 runs bootstrap_test on every document; TEST_TYPE 2 runs
# permutation_test on exactly two documents; any other value runs nothing.
# p_values is only invoked when STATS is truthy.
#
# spec_file:: path to the spec file (default 'default.spec')
#
# Aborts the process with an error message when the spec file, the contrast
# file or an input file is missing, or when TEST_TYPE is 2 and FILES does not
# hold exactly two entries. Returns nil.
def self.run_spec(spec_file='default.spec')

  # Basic error checking
  abort("Error: can't find spec file (#{spec_file}).") unless File.exist?(spec_file)
  # NOTE(review): the spec is executed as Ruby code; only load trusted files.
  load spec_file
  abort("Error: can't find contrast file (#{CONTRAST_FILE}).") unless File.exist?(CONTRAST_FILE)

  # Create contrasts
  contrasts = parse_contrasts(CONTRAST_FILE)
  puts "Found #{contrasts.size} contrasts." if VERBOSE

  # Set targets: every distinct word mentioned by any contrast
  targets = contrasts.flat_map { |c| c.words.flatten }.uniq
  puts "Found #{targets.size} target words." if VERBOSE

  # Create options hash. Constant names are looked up by string via
  # const_defined?, because Module.constants returns symbols and would never
  # include a String.
  opts = {}
  %w[PARSE_TEXT N_PERM N_BOOT MAX_LINES RECODE CONTEXT_SIZE MIN_PROP STOP_FILE NORMALIZE_WEIGHTS].each do |name|
    opts[name.downcase] = Object.const_get(name) if Object.const_defined?(name)
  end

  # Read in files and create documents
  docs = FILES.map do |f|
    abort("Error: can't find input file #{f}.") unless File.exist?(f)
    puts "Reading in file #{f}..."
    Document.new(f.split(/\//)[-1], targets, File.read(f), opts)
  end

  # Make sure N_PERM is zero if we don't want stats
  n_perm = STATS ? N_PERM : 0

  # One or two-sample test?
  case TEST_TYPE
  when 1
    docs.each do |d|
      results_file = "#{OUTPUT_ROOT}_#{File.basename(d.name, '.txt')}_results.txt"
      puts "\nRunning one-sample analysis on document '#{d.name}'."
      puts "Generating #{n_perm} bootstraps..." if VERBOSE && STATS
      # NOTE(review): the bootstrap count comes from N_PERM, not N_BOOT --
      # confirm this is intended.
      bootstrap_test(d, contrasts, results_file, n_perm)
      p_values(results_file, 'boot', true) if STATS
    end

  when 2
    abort("Error: in order to run a permutation test, you need to pass exactly two files as input.") if FILES.size != 2
    results_file = "#{OUTPUT_ROOT}_results.txt"
    puts "Running two-sample comparison between '#{File.basename(FILES[0])}' and '#{File.basename(FILES[1])}'." if VERBOSE
    puts "Generating #{n_perm} permutations..." if VERBOSE && STATS
    permutation_test(*docs, contrasts, results_file, n_perm)
    # Without STATS there are no permutation rows to summarize, so skip the
    # p-value file just as the one-sample branch does.
    p_values(results_file, 'perm', true) if STATS

  # No other test types implemented for now.
  end
  puts "Done!"

end