Class: Cass::Analysis

Inherits:

Object

Object
Cass::Analysis

show all

Defined in: lib/cass/analysis.rb

Overview

Instantiates an analysis on one or more Documents. Currently, only the default processing stream (run_spec) is implemented. Eventually, direct methods for specific analyses (e.g., two-document permutation tests) will be supported.

Instance Attribute Summary collapse

#contexts ⇒ Object

Returns the value of attribute contexts.
#docs ⇒ Object

Returns the value of attribute docs.
#targets ⇒ Object

Returns the value of attribute targets.

Class Method Summary collapse

.bootstrap_test(doc, contrasts, output_file, n_boot) ⇒ Object

Do a bootstrap test comparing the bootstrapped distribution to zero.
.p_values(input_file, mode = 'boot', mean = true) ⇒ Object

Takes the results of a bootstrap or permutation test as input and saves a file summarizing the corresponding p-values.
.parse_contrasts(contrast_file) ⇒ Object

Parse contrast file.
.permutation_test(doc1, doc2, contrasts, output_file, n_perm) ⇒ Object

Run a permutation test comparing two Documents.
.run_spec(spec_file = 'default.spec') ⇒ Object

Read and parse the specifications for an analysis, then run the analysis.

Instance Attribute Details

#contexts ⇒ `Object`

Returns the value of attribute contexts.



10
11
12

# File 'lib/cass/analysis.rb', line 10

# Reader for the @contexts instance variable.
#
# Returns the value currently stored in @contexts (nil if it was never set).
def contexts
  @contexts
end

#docs ⇒ `Object`

Returns the value of attribute docs.



10
11
12

# File 'lib/cass/analysis.rb', line 10

# Reader for the @docs instance variable.
#
# Returns the value currently stored in @docs (nil if it was never set).
def docs
  @docs
end

#targets ⇒ `Object`

Returns the value of attribute targets.



10
11
12

# File 'lib/cass/analysis.rb', line 10

# Reader for the @targets instance variable.
#
# Returns the value currently stored in @targets (nil if it was never set).
def targets
  @targets
end

Class Method Details

.bootstrap_test(doc, contrasts, output_file, n_boot) ⇒ `Object`

Do a bootstrap test comparing the bootstrapped distribution to zero.

doc: The Document object to analyze
contrasts: an array of Contrast objects to apply
output_file: name of output file
n_boot: number of bootstrap iterations to run

# File 'lib/cass/analysis.rb', line 127

# Do a bootstrap test comparing the bootstrapped distribution to zero.
#
# Writes a tab-delimited results file: a header row, one "observed" row per
# contrast, then one "boot_<i>" row per contrast for every iteration.
#
# doc::         the Document to analyze; must respond to cooccurrence,
#               resample, clone and clines=
# contrasts::   an array of Contrast objects to apply (each responds to
#               words and apply)
# output_file:: name of the output file (overwritten if it exists)
# n_boot::      number of bootstrap iterations to run
#
# Returns n_boot (the value of the iteration loop).
def self.bootstrap_test(doc, contrasts, output_file, n_boot)
  # Block form guarantees the handle is flushed and closed, even if a
  # contrast raises; previously the file was left open and the header could
  # remain buffered when no further rows were written.
  File.open(output_file, 'w') do |outf|
    outf.sync = true
    outf.puts(%w[contrast result_id doc_name pair_1 pair_2 pair_3 pair_4 interaction_term].join("\t"))

    # Observed (non-resampled) values.
    doc.cooccurrence(NORMALIZE_WEIGHTS)
    contrasts.each do |c|
      observed = c.apply(doc)
      outf.puts "#{c.words.join('.')}\tobserved\t#{observed}"
    end

    # Resample into a clone so the original document's clines stay intact.
    d1 = doc.clone
    n_boot.times do |i|
      puts "\n\nRunning bootstrap iteration #{i + 1}..." if VERBOSE
      d1.clines = doc.resample(true)
      # d1.context = Context.new(d1)   # Currently uses the same context; can uncomment
      d1.cooccurrence(NORMALIZE_WEIGHTS)
      contrasts.each do |c|
        res = c.apply(d1)
        outf.puts "#{c.words.join('.')}\tboot_#{i + 1}\t#{res}"
      end
    end
  end
end

.p_values(input_file, mode = 'boot', mean = true) ⇒ `Object`

Takes the results of a bootstrap or permutation test as input and saves a file summarizing the corresponding p-values.

input_file: path to the results of the bootstrapping/permutation analysis
mode: indicates the source analysis type. Must be either ‘boot’ or ‘perm’
mean: boolean variable indicating whether or not to compute the mean across all contrasts

# File 'lib/cass/analysis.rb', line 169

# Takes the results of a bootstrap or permutation test as input and saves a
# file summarizing the corresponding p-values.
#
# input_file:: path to the results of the bootstrapping/permutation analysis.
#              The first line is treated as a header and skipped; for each
#              other line the first field is the contrast, the second the
#              iteration label ("observed" or a label ending in the
#              iteration number) and the last field the numeric value.
# mode::       indicates the source analysis type; either 'boot' or 'perm'.
#              In 'boot' mode rows are grouped by their third field; in any
#              other mode they are grouped under input_file.
# mean::       whether to also compute a 'mean' pseudo-contrast that averages
#              across all contrasts.
#
# The summary is written to "<basename of input_file without .txt>_p_values.txt"
# in the current working directory. Returns nil.
def self.p_values(input_file, mode = 'boot', mean = true)
  lines = File.readlines(input_file) # class-method form closes the handle
  lines.shift                        # drop the header row
  buffer = ["file\tcontrast\tN_permutations\tvalue\tp-value"]

  # Group rows of [contrast, iteration label, value] by source file.
  tests = {}
  lines.each do |line|
    fields = line.strip.split(/\t/)
    next if fields.empty? # tolerate blank lines (e.g. a trailing newline)

    fname = mode == 'boot' ? fields[2] : input_file
    (tests[fname] ||= []) << [fields[0], fields[1], fields[-1].to_f]
  end

  tests.each do |fname, rows|
    dists = {}
    obs = {}
    means = []
    rows.each do |contrast, iter, val|
      if iter == 'observed'
        obs[contrast] = val
        next
      end
      (dists[contrast] ||= []) << val
      next unless mean

      # Accumulate a per-iteration sum across contrasts; the iteration
      # number is taken from the trailing digits of the label.
      i = iter[/\d+$/].to_i - 1
      means[i] = (means[i] || 0) + val
    end

    if mean
      means.map! { |m| m / obs.size }
      dists['mean'] = means
      # Computed before 'mean' itself is stored, so it averages real contrasts only.
      obs['mean'] = obs.values.inject(0) { |sum, e| sum + e } / obs.size
    end

    dists.each do |name, values|
      observed = obs[name]
      # 'perm': count permuted values not exceeding the observed one;
      # otherwise: count resampled values above zero.
      hits = values.count { |e| mode == 'perm' ? observed >= e : e > 0 }
      prop = hits.to_f / values.size
      prop = 1 - prop if prop > 0.5
      # Two-tailed: double the smaller tail proportion.
      buffer << [fname, name, values.size, observed, prop * 2].join("\t")
    end
  end

  base = File.basename(input_file, '.txt')
  # Block form flushes and closes the file so the summary is on disk on return.
  File.open("#{base}_p_values.txt", 'w') { |f| f.puts buffer }
end

.parse_contrasts(contrast_file) ⇒ `Object`

Parse contrast file. Takes a filename as input and returns an array of Contrasts.



78
79
80

# File 'lib/cass/analysis.rb', line 78

# Parse contrast file. Takes a filename as input and returns an array of
# Contrasts, one per non-blank line (each line is handed to Contrast.parse
# unchanged).
#
# Blank and whitespace-only lines are skipped. Lines read from a file keep
# their trailing newline, so a plain empty? check never matches them.
def self.parse_contrasts(contrast_file)
  File.readlines(contrast_file)
      .reject { |line| line.strip.empty? }
      .map { |line| Contrast.parse(line) }
end

.permutation_test(doc1, doc2, contrasts, output_file, n_perm) ⇒ `Object`

Run a permutation test comparing two Documents.

doc1, doc2: The two Documents to compare
contrasts: an array of Contrasts used to compare the documents
output_file: name of output file
n_perm: number of permutations to run

# File 'lib/cass/analysis.rb', line 87

# Run a permutation test comparing two Documents.
#
# Writes a tab-delimited results file: a header row, one "observed" row per
# contrast, then one "perm_<i>" row per contrast for every permutation.
#
# doc1, doc2::  the two Documents to compare. NOTE: doc1's context is
#               narrowed in place to the words shared with doc2's context,
#               and both documents end up pointing at that same context.
# contrasts::   an array of Contrasts used to compare the documents
# output_file:: name of the output file (overwritten if it exists)
# n_perm::      number of permutations to run
#
# Returns n_perm (the value of the permutation loop).
def self.permutation_test(doc1, doc2, contrasts, output_file, n_perm)
  # Merge contexts. Could change this later to allow different contexts for each
  # document, but that would make processing substantially slower.
  context = doc1.context
  context.words = context.words & doc2.context.words
  context.index_words
  doc1.context = context
  doc2.context = context

  # Generate cooccurrence matrices for the observed data.
  doc1.cooccurrence(NORMALIZE_WEIGHTS)
  doc2.cooccurrence(NORMALIZE_WEIGHTS)

  # Block form guarantees the handle is flushed and closed, even on error;
  # previously the file was left open and the header could stay buffered.
  File.open(output_file, 'w') do |outf|
    outf.sync = true
    outf.puts "contrast\titeration\t#{doc1.name}\t#{doc2.name}\tdifference"

    # Save observed values
    contrasts.each do |c|
      res1, res2, diff = compare_docs(c, doc1, doc2)
      outf.puts "#{c.words.join('.')}\tobserved\t#{res1}\t#{res2}\t#{diff}"
    end

    # Run permutations and save results. Clones receive the shuffled lines
    # so the originals remain the source for every permutation.
    d1 = doc1.clone
    d2 = doc2.clone
    n_perm.times do |i|
      puts "\n\nRunning permutation #{i + 1}..."
      d1.clines, d2.clines = permute_labels(doc1.clines, doc2.clines)
      d1.cooccurrence(NORMALIZE_WEIGHTS)
      d2.cooccurrence(NORMALIZE_WEIGHTS)
      contrasts.each do |c|
        res1, res2, diff = compare_docs(c, d1, d2)
        outf.puts "#{c.words.join('.')}\tperm_#{i + 1}\t#{res1}\t#{res2}\t#{diff}"
      end
    end
  end
end

.run_spec(spec_file = 'default.spec') ⇒ `Object`

Read and parse the specifications for an analysis, then run the analysis. Only does basic error checking for now…

# File 'lib/cass/analysis.rb', line 14

# Read and parse the specifications for an analysis, then run the analysis.
# Only does basic error checking for now.
#
# spec_file:: path to a Ruby spec file that is load-ed and is expected to
#             define the constants used below (CONTRAST_FILE, FILES, VERBOSE,
#             STATS, N_PERM, TEST_TYPE, OUTPUT_ROOT, plus any of the optional
#             Document options).
#
# Aborts the process if the spec file, the contrast file or any input file is
# missing, or if a two-sample test is requested without exactly two files.
def self.run_spec(spec_file = 'default.spec')
  # Basic error checking
  abort("Error: can't find spec file (#{spec_file}).") unless File.exist?(spec_file)
  load spec_file
  abort("Error: can't find contrast file (#{CONTRAST_FILE}).") unless File.exist?(CONTRAST_FILE)

  # Create contrasts (parsed once and reused for the tests below)
  contrasts = parse_contrasts(CONTRAST_FILE)
  puts "Found #{contrasts.size} contrasts." if VERBOSE

  # Set targets: every distinct word mentioned by any contrast
  targets = contrasts.flat_map { |c| c.words.flatten }.uniq
  puts "Found #{targets.size} target words." if VERBOSE

  # Create options hash from whichever optional constants the spec defined.
  # Constant names are looked up with const_defined? because
  # Module.constants returns Symbols, so include?(String) never matched and
  # the options were silently dropped.
  opts = {}
  %w[PARSE_TEXT N_PERM N_BOOT MAX_LINES RECODE CONTEXT_SIZE MIN_PROP STOP_FILE NORMALIZE_WEIGHTS].each do |name|
    opts[name.downcase] = Object.const_get(name) if Object.const_defined?(name)
  end

  # Read in files and create documents
  docs = FILES.map do |f|
    abort("Error: can't find input file #{f}.") unless File.exist?(f)
    puts "Reading in file #{f}..."
    text = File.read(f) # class-method form closes the handle
    Document.new(File.basename(f), targets, text, opts)
  end

  # Make sure N_PERM is zero if we don't want stats
  n_perm = STATS ? N_PERM : 0

  # One or two-sample test?
  case TEST_TYPE
  when 1
    docs.each do |d|
      base = File.basename(d.name, '.txt')
      results_file = "#{OUTPUT_ROOT}_#{base}_results.txt"
      puts "\nRunning one-sample analysis on document '#{d.name}'."
      puts "Generating #{n_perm} bootstraps..." if VERBOSE && STATS
      bootstrap_test(d, contrasts, results_file, n_perm)
      p_values(results_file, 'boot', true) if STATS
    end

  when 2
    abort("Error: in order to run a permutation test, you need to pass exactly two files as input.") if FILES.size != 2
    results_file = "#{OUTPUT_ROOT}_results.txt"
    puts "Running two-sample comparison between '#{File.basename(FILES[0])}' and '#{File.basename(FILES[1])}'." if VERBOSE
    puts "Generating #{n_perm} permutations..." if VERBOSE && STATS
    permutation_test(*docs, contrasts, results_file, n_perm)
    # Guarded like the one-sample branch: with zero permutations there is no
    # distribution to summarize.
    p_values(results_file, 'perm', true) if STATS

  # No other test types implemented for now.
  else

  end
  puts "Done!"
end

Class: Cass::Analysis

Overview

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Attribute Details

#contexts ⇒ Object

#docs ⇒ Object

#targets ⇒ Object

Class Method Details

.bootstrap_test(doc, contrasts, output_file, n_boot) ⇒ Object

.p_values(input_file, mode = 'boot', mean = true) ⇒ Object

.parse_contrasts(contrast_file) ⇒ Object

.permutation_test(doc1, doc2, contrasts, output_file, n_perm) ⇒ Object

.run_spec(spec_file = 'default.spec') ⇒ Object