Class: Cheripic::Variants

Inherits:
Object
  • Object
show all
Extended by:
Forwardable
Includes:
Enumerable
Defined in:
lib/cheripic/variants.rb

Overview

A Variants object for each analysis pipeline that stores assembly details and extracts pileups for each contig assembly and pileup details are stored as hashes of Contig and ContigPileups objects

Instance Attribute Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(options) ⇒ Variants

creates a Variants object using user input files

Parameters:

  • options (OpenStruct)

    a hash of required input files as keys and file paths as values



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
# File 'lib/cheripic/variants.rb', line 35

def initialize(options)
  @params = options
  @assembly = {}
  @pileups = {}
  Bio::FastaFormat.open(@params.assembly).each do |entry|
    if entry.seq.length == 0
      logger.error "No sequence found for entry #{entry.entry_id}"
      raise VariantsError
    end
    contig = Contig.new(entry)
    if @assembly.key?(contig.id)
      logger.error "fasta id already found in the file for #{contig.id}"
      logger.error 'make sure there are no duplicate entries in the fasta file'
      raise VariantsError
    end
    @assembly[contig.id] = contig
    @pileups[contig.id] = ContigPileups.new(contig.id)
  end
  @pileups_analyzed = false
  unless @params.repeats_file == ''
    store_repeat_regions
  end
end

Instance Attribute Details

#assemblyHash (readonly)

Returns a hash of contig ids from assembly as keys and respective Contig objects as values.

Returns:

  • (Hash)

    a hash of contig ids from assembly as keys and respective Contig objects as values



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/cheripic/variants.rb', line 26

class Variants

  include Enumerable
  extend Forwardable
  def_delegators :@assembly, :each, :each_key, :each_value, :size, :length, :[]
  attr_reader :assembly, :pileups, :pileups_analyzed

  # creates a Variants object using user input files
  # @param options [OpenStruct] a hash of required input files as keys and file paths as values
  def initialize(options)
    @params = options
    @assembly = {}
    @pileups = {}
    Bio::FastaFormat.open(@params.assembly).each do |entry|
      if entry.seq.length == 0
        logger.error "No sequence found for entry #{entry.entry_id}"
        raise VariantsError
      end
      contig = Contig.new(entry)
      if @assembly.key?(contig.id)
        logger.error "fasta id already found in the file for #{contig.id}"
        logger.error 'make sure there are no duplicate entries in the fasta file'
        raise VariantsError
      end
      @assembly[contig.id] = contig
      @pileups[contig.id] = ContigPileups.new(contig.id)
    end
    @pileups_analyzed = false
    unless @params.repeats_file == ''
      store_repeat_regions
    end
  end

  # reads repeat masker output file and stores masked regions to ignore variants in thos regions
  def store_repeat_regions
    File.foreach(@params.repeats_file) do |line|
      line.strip!
      next if line =~ /^SW/ or line =~ /^score/ or line == ''
      info = line.split("\s")
      pileups_obj = @pileups[info[4]]
      index = pileups_obj.masked_regions.length
      pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i
      pileups_obj.masked_regions[index + 1][:end] = info[6].to_i
    end
  end

  # Reads and store pileup data for each of input bulk and parents pileup files
  # And sets pileups_analyzed to true that pileups files are processed
  def analyse_pileups
    if @params.input_format == 'bam'
      @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf)
    end
    %i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input |
      infile = @params[input]
      if infile != ''
        logger.info "processing #{input} file"
        if @params.input_format == 'pileup'
          extract_pileup(infile, input)
        elsif @params.input_format == 'vcf'
          extract_vcfs(infile, input)
        else
          extract_bam_pileup(infile, input)
        end
      end
    end

    @pileups_analyzed = true
  end

  # Input vcf file is read and positions are selected that pass the thresholds
  # @param vcffile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_vcfs(vcffile, sym)
    # read vcf file and process each variant
    File.foreach(vcffile) do |line|
      next if line =~ /^#/
      v = Bio::DB::Vcf.new(line)
      unless v.alt == '.'
        pileup_string = Vcf.to_pileup(v)
        pileup = Pileup.new(pileup_string)
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input pileup file is read and positions are selected that pass the thresholds
  # @param pileupfile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_pileup(pileupfile, sym)
    # read mpileup file and process each variant
    File.foreach(pileupfile) do |line|
      pileup = Pileup.new(line)
      if pileup.is_var
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input bamfile is read and selected positions pileups are stored
  # @param bamfile [String] path to the bam file to read
  # @param sym [Symbol] Symbol of the bam file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_bam_pileup(bamfile, sym)
    bq = Options.base_quality
    mq = Options.mapping_quality
    bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
    bamobject.index unless bamobject.indexed?

    # or calculate from bamfile
    set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
    # check if user has set max depth or set to zero to ignore
    max_d = Options.maxdepth
    logger.info "max depth used for #{sym} file\t#{max_d}"

    @vcf_hash.each_key do | id |
      positions = @vcf_hash[id][:het].keys
      positions << @vcf_hash[id][:hom].keys
      positions.flatten!
      next if positions.empty?
      positions.each do | pos |
        command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
        stdout = capture_command(command)
        if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
          logger.info "pileup data empty for\t#{id}\t#{pos}"
        else
          pileup = Pileup.new(stdout)
          store_pileup_info(pileup, sym)
        end
      end
    end
  end

  # Bam object is read and each contig mean and std deviation of depth calculated
  # @param bamobject [Bio::DB::Sam]
  # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
  def set_max_depth(bamobject, bamfile)
    logger.info "processing #{bamfile} file for depth"
    all_depths = []
    bq = Options.base_quality
    mq = Options.mapping_quality
    @assembly.each_key do | id |
      contig_obj = @assembly[id]
      len = contig_obj.length
      command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
      data = capture_command(command)
      if data == ''
        logger.info "depth data empty for\t#{id}"
        next
      end
      depths = []
      data.split("\n").each do |line|
        info = line.split("\t")
        depths << info[2].to_i
      end
      variance = 0
      mean_depth = depths.reduce(0, :+) / len.to_f
      depths.each do |value|
        variance += (value.to_f - mean_depth)**2
      end
      all_depths << mean_depth
      contig_obj.sd_depth = Math.sqrt(variance)
      contig_obj.mean_depth = mean_depth
    end
    # setting max depth as 3 times the average depth
    mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
    Options.maxdepth = Options.max_d_multiple * mean_coverage
  end

  def capture_command(command)
    stdout, stderr, status = Open3.capture3(command)
    unless status.success?
      logger.error "resulted in exit code #{status.exitstatus} using #{command}"
      logger.error "stderr output is: #{stderr}"
      raise CheripicError
    end
    stdout.chomp
  end

  # stores pileup information provided to respective contig_pileup object using sym input
  # @param pileup [Pileup] Pileup objects
  # @param sym [Symbol] Symbol of the input file used to write selected variants
  # pileup information stored to respective ContigPileups object
  def store_pileup_info(pileup, sym)
    # discarding variants with higher than max depth only for mut_bulk
    if sym == :mut_bulk
      unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
        logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
        return nil
      end
    end
    contig_obj = @pileups[pileup.ref_name]
    contig_obj.send(sym).store(pileup.pos, pileup)
    nil
  end

  # Once pileup files are analysed and variants are extracted from each bulk;
  # bulks are compared to identify and isolate variants for downstream analysis.
  # If polyploidy set to trye and mut_parent and bg_parent bulks are provided
  # hemisnps in parents are extracted for bulk frequency ratio analysis
  def compare_pileups
    unless @pileups_analyzed
      self.analyse_pileups
    end
    @assembly.each_key do | id |
      contig = @assembly[id]
      # extract parental hemi snps for polyploids before bulks are compared
      if Options.polyploidy
        if @params.mut_parent != '' or @params.bg_parent != ''
          @pileups[id].hemisnps_in_parent
        end
      end
      contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared
    end
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for homozygosity enrichment score (hme_score)
  def hmes_frags
    # calculate every time method gets called
    @hmes_frags = select_contigs(:hme_score)
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for bulk frequency ratio (bfr_score)
  def bfr_frags
    unless defined?(@bfr_frags)
      @bfr_frags = select_contigs(:bfr_score)
    end
    @bfr_frags
  end

  # Applies selection procedure on assembly contigs based on the ratio_type provided.
  # If use_all_contigs is set to false then contigs without any variant are discarded for :hme_score
  # while contigs without any hemisnps are discarded for :bfr_score
  # If include_low_hmes is set to false then contigs are further filtered based on a cut off value of the score
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  def select_contigs(ratio_type)
    selected_contigs ={}
    use_all_contigs = Options.use_all_contigs
    @assembly.each_key do | frag |
      if use_all_contigs
        selected_contigs[frag] = @assembly[frag]
      else
        if ratio_type == :hme_score
          # selecting fragments which have a variant
          if @assembly[frag].hm_num + @assembly[frag].ht_num > 2 * Options.hmes_adjust
            selected_contigs[frag] = @assembly[frag]
          end
        else # ratio_type == :bfr_score
          # in polyploidy scenario selecting fragments with at least one bfr position
          if @assembly[frag].hemi_num > 0
            selected_contigs[frag] = @assembly[frag]
          end
        end
      end
    end
    selected_contigs = filter_contigs(selected_contigs, ratio_type)
    if use_all_contigs
      logger.info "No filtering was applied to fragments\n"
    else
      logger.info "Selected #{selected_contigs.length} out of #{@assembly.length} fragments with #{ratio_type} score\n"
    end
    selected_contigs
  end

  # Filters out contigs below a cutoff for selected ratio_type
  # a cutoff value is calculated based on ratio_type provided
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def filter_contigs(selected_contigs, ratio_type)
    cutoff = get_cutoff(selected_contigs, ratio_type)
    selected_contigs.each_key do | frag |
      if selected_contigs[frag].send(ratio_type) < cutoff
        selected_contigs.delete(frag)
      end
    end
    selected_contigs
  end

  # Cut off value calculation used to filter out low scored contigs.
  #
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def get_cutoff(selected_contigs, ratio_type)
    include_low_hmes = Options.include_low_hmes
    # set minimum cut off hme_score or bfr_score to pick fragments with variants
    # calculate min hme score for back or out crossed data or bfr_score for polypoidy data
    # if no filtering applied set cutoff to 1.1
    if include_low_hmes
      cutoff = 0.0
    else
      if ratio_type == :hme_score
        adjust = Options.hmes_adjust
        if Options.cross_type == 'back'
          cutoff = (1.0/adjust) + 1.0
        else # outcross
          cutoff = (2.0/adjust) + 1.0
        end
      else # ratio_type is bfr_score
        cutoff = bfr_cutoff(selected_contigs)
      end
    end
    cutoff
  end

  # Cut off value calculation for bfr contigs.
  # ratio value at index 0.1% length of an array or at index zero of an array that contains decreasing order of bfr ratios
  # @param selected_contigs [Hash] a hash of contigs with selected bfr score, a subset of assembly hash
  def bfr_cutoff(selected_contigs, prop=0.1)
    ratios = []
    selected_contigs.each_key do | frag |
      ratios << selected_contigs[frag].bfr_score
    end
    ratios.sort!.reverse!
    index = (ratios.length * prop)/100
    # set a minmum index to get at least one contig
    if index < 1
      index = 1
    end
    ratios[index - 1]
  end

  # Method is to discard homozygous variant positions for which background bulk
  # pileup shows a fraction value higher than 0.35 for variant allele/non-reference allele
  # a recessive variant is expected to have 1/3rd frequency in background bulk
  def verify_bg_bulk_pileup
    unless defined?(@hmes_frags)
      self.hmes_frags
    end
    @hmes_frags.each_key do | frag |
      positions = @assembly[frag].hm_pos.keys
      contig_pileup_obj = @pileups[frag]
      positions.each do | pos |
        if contig_pileup_obj.mut_bulk.key?(pos)
          mut_pileup = contig_pileup_obj.mut_bulk[pos]
          if mut_pileup.is_var
            if contig_pileup_obj.bg_bulk.key?(pos)
              bg_pileup = contig_pileup_obj.bg_bulk[pos]
              if bg_pileup.non_ref_ratio > 0.35
                @assembly[frag].hm_pos.delete(pos)
              end
            end
          else
            # this should not happen, may be catch as as an error
            @assembly[frag].hm_pos.delete(pos)
          end
        else
          # this should not happen, may be catch as as an error
          @assembly[frag].hm_pos.delete(pos)
        end
      end
    end
    # recalculate hmes_frags once pileups are verified
    self.hmes_frags
  end

end

#bfr_fragsObject (readonly)

From Assembly contig objects, contigs are selected based on user selected options for bulk frequency ratio (bfr_score)



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/cheripic/variants.rb', line 26

class Variants

  include Enumerable
  extend Forwardable
  def_delegators :@assembly, :each, :each_key, :each_value, :size, :length, :[]
  attr_reader :assembly, :pileups, :pileups_analyzed

  # creates a Variants object using user input files
  # @param options [OpenStruct] a hash of required input files as keys and file paths as values
  def initialize(options)
    @params = options
    @assembly = {}
    @pileups = {}
    Bio::FastaFormat.open(@params.assembly).each do |entry|
      if entry.seq.length == 0
        logger.error "No sequence found for entry #{entry.entry_id}"
        raise VariantsError
      end
      contig = Contig.new(entry)
      if @assembly.key?(contig.id)
        logger.error "fasta id already found in the file for #{contig.id}"
        logger.error 'make sure there are no duplicate entries in the fasta file'
        raise VariantsError
      end
      @assembly[contig.id] = contig
      @pileups[contig.id] = ContigPileups.new(contig.id)
    end
    @pileups_analyzed = false
    unless @params.repeats_file == ''
      store_repeat_regions
    end
  end

  # reads repeat masker output file and stores masked regions to ignore variants in thos regions
  def store_repeat_regions
    File.foreach(@params.repeats_file) do |line|
      line.strip!
      next if line =~ /^SW/ or line =~ /^score/ or line == ''
      info = line.split("\s")
      pileups_obj = @pileups[info[4]]
      index = pileups_obj.masked_regions.length
      pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i
      pileups_obj.masked_regions[index + 1][:end] = info[6].to_i
    end
  end

  # Reads and store pileup data for each of input bulk and parents pileup files
  # And sets pileups_analyzed to true that pileups files are processed
  def analyse_pileups
    if @params.input_format == 'bam'
      @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf)
    end
    %i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input |
      infile = @params[input]
      if infile != ''
        logger.info "processing #{input} file"
        if @params.input_format == 'pileup'
          extract_pileup(infile, input)
        elsif @params.input_format == 'vcf'
          extract_vcfs(infile, input)
        else
          extract_bam_pileup(infile, input)
        end
      end
    end

    @pileups_analyzed = true
  end

  # Input vcf file is read and positions are selected that pass the thresholds
  # @param vcffile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_vcfs(vcffile, sym)
    # read vcf file and process each variant
    File.foreach(vcffile) do |line|
      next if line =~ /^#/
      v = Bio::DB::Vcf.new(line)
      unless v.alt == '.'
        pileup_string = Vcf.to_pileup(v)
        pileup = Pileup.new(pileup_string)
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input pileup file is read and positions are selected that pass the thresholds
  # @param pileupfile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_pileup(pileupfile, sym)
    # read mpileup file and process each variant
    File.foreach(pileupfile) do |line|
      pileup = Pileup.new(line)
      if pileup.is_var
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input bamfile is read and selected positions pileups are stored
  # @param bamfile [String] path to the bam file to read
  # @param sym [Symbol] Symbol of the bam file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_bam_pileup(bamfile, sym)
    bq = Options.base_quality
    mq = Options.mapping_quality
    bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
    bamobject.index unless bamobject.indexed?

    # or calculate from bamfile
    set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
    # check if user has set max depth or set to zero to ignore
    max_d = Options.maxdepth
    logger.info "max depth used for #{sym} file\t#{max_d}"

    @vcf_hash.each_key do | id |
      positions = @vcf_hash[id][:het].keys
      positions << @vcf_hash[id][:hom].keys
      positions.flatten!
      next if positions.empty?
      positions.each do | pos |
        command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
        stdout = capture_command(command)
        if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
          logger.info "pileup data empty for\t#{id}\t#{pos}"
        else
          pileup = Pileup.new(stdout)
          store_pileup_info(pileup, sym)
        end
      end
    end
  end

  # Bam object is read and each contig mean and std deviation of depth calculated
  # @param bamobject [Bio::DB::Sam]
  # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
  def set_max_depth(bamobject, bamfile)
    logger.info "processing #{bamfile} file for depth"
    all_depths = []
    bq = Options.base_quality
    mq = Options.mapping_quality
    @assembly.each_key do | id |
      contig_obj = @assembly[id]
      len = contig_obj.length
      command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
      data = capture_command(command)
      if data == ''
        logger.info "depth data empty for\t#{id}"
        next
      end
      depths = []
      data.split("\n").each do |line|
        info = line.split("\t")
        depths << info[2].to_i
      end
      variance = 0
      mean_depth = depths.reduce(0, :+) / len.to_f
      depths.each do |value|
        variance += (value.to_f - mean_depth)**2
      end
      all_depths << mean_depth
      contig_obj.sd_depth = Math.sqrt(variance)
      contig_obj.mean_depth = mean_depth
    end
    # setting max depth as 3 times the average depth
    mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
    Options.maxdepth = Options.max_d_multiple * mean_coverage
  end

  def capture_command(command)
    stdout, stderr, status = Open3.capture3(command)
    unless status.success?
      logger.error "resulted in exit code #{status.exitstatus} using #{command}"
      logger.error "stderr output is: #{stderr}"
      raise CheripicError
    end
    stdout.chomp
  end

  # stores pileup information provided to respective contig_pileup object using sym input
  # @param pileup [Pileup] Pileup objects
  # @param sym [Symbol] Symbol of the input file used to write selected variants
  # pileup information stored to respective ContigPileups object
  def store_pileup_info(pileup, sym)
    # discarding variants with higher than max depth only for mut_bulk
    if sym == :mut_bulk
      unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
        logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
        return nil
      end
    end
    contig_obj = @pileups[pileup.ref_name]
    contig_obj.send(sym).store(pileup.pos, pileup)
    nil
  end

  # Once pileup files are analysed and variants are extracted from each bulk;
  # bulks are compared to identify and isolate variants for downstream analysis.
  # If polyploidy set to trye and mut_parent and bg_parent bulks are provided
  # hemisnps in parents are extracted for bulk frequency ratio analysis
  def compare_pileups
    unless @pileups_analyzed
      self.analyse_pileups
    end
    @assembly.each_key do | id |
      contig = @assembly[id]
      # extract parental hemi snps for polyploids before bulks are compared
      if Options.polyploidy
        if @params.mut_parent != '' or @params.bg_parent != ''
          @pileups[id].hemisnps_in_parent
        end
      end
      contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared
    end
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for homozygosity enrichment score (hme_score)
  def hmes_frags
    # calculate every time method gets called
    @hmes_frags = select_contigs(:hme_score)
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for bulk frequency ratio (bfr_score)
  def bfr_frags
    unless defined?(@bfr_frags)
      @bfr_frags = select_contigs(:bfr_score)
    end
    @bfr_frags
  end

  # Applies selection procedure on assembly contigs based on the ratio_type provided.
  # If use_all_contigs is set to false then contigs without any variant are discarded for :hme_score
  # while contigs without any hemisnps are discarded for :bfr_score
  # If include_low_hmes is set to false then contigs are further filtered based on a cut off value of the score
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  def select_contigs(ratio_type)
    selected_contigs ={}
    use_all_contigs = Options.use_all_contigs
    @assembly.each_key do | frag |
      if use_all_contigs
        selected_contigs[frag] = @assembly[frag]
      else
        if ratio_type == :hme_score
          # selecting fragments which have a variant
          if @assembly[frag].hm_num + @assembly[frag].ht_num > 2 * Options.hmes_adjust
            selected_contigs[frag] = @assembly[frag]
          end
        else # ratio_type == :bfr_score
          # in polyploidy scenario selecting fragments with at least one bfr position
          if @assembly[frag].hemi_num > 0
            selected_contigs[frag] = @assembly[frag]
          end
        end
      end
    end
    selected_contigs = filter_contigs(selected_contigs, ratio_type)
    if use_all_contigs
      logger.info "No filtering was applied to fragments\n"
    else
      logger.info "Selected #{selected_contigs.length} out of #{@assembly.length} fragments with #{ratio_type} score\n"
    end
    selected_contigs
  end

  # Filters out contigs below a cutoff for selected ratio_type
  # a cutoff value is calculated based on ratio_type provided
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def filter_contigs(selected_contigs, ratio_type)
    cutoff = get_cutoff(selected_contigs, ratio_type)
    selected_contigs.each_key do | frag |
      if selected_contigs[frag].send(ratio_type) < cutoff
        selected_contigs.delete(frag)
      end
    end
    selected_contigs
  end

  # Cut off value calculation used to filter out low scored contigs.
  #
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def get_cutoff(selected_contigs, ratio_type)
    include_low_hmes = Options.include_low_hmes
    # set minimum cut off hme_score or bfr_score to pick fragments with variants
    # calculate min hme score for back or out crossed data or bfr_score for polypoidy data
    # if no filtering applied set cutoff to 1.1
    if include_low_hmes
      cutoff = 0.0
    else
      if ratio_type == :hme_score
        adjust = Options.hmes_adjust
        if Options.cross_type == 'back'
          cutoff = (1.0/adjust) + 1.0
        else # outcross
          cutoff = (2.0/adjust) + 1.0
        end
      else # ratio_type is bfr_score
        cutoff = bfr_cutoff(selected_contigs)
      end
    end
    cutoff
  end

  # Cut off value calculation for bfr contigs.
  # ratio value at index 0.1% length of an array or at index zero of an array that contains decreasing order of bfr ratios
  # @param selected_contigs [Hash] a hash of contigs with selected bfr score, a subset of assembly hash
  def bfr_cutoff(selected_contigs, prop=0.1)
    ratios = []
    selected_contigs.each_key do | frag |
      ratios << selected_contigs[frag].bfr_score
    end
    ratios.sort!.reverse!
    index = (ratios.length * prop)/100
    # set a minmum index to get at least one contig
    if index < 1
      index = 1
    end
    ratios[index - 1]
  end

  # Method is to discard homozygous variant positions for which background bulk
  # pileup shows a fraction value higher than 0.35 for variant allele/non-reference allele
  # a recessive variant is expected to have 1/3rd frequency in background bulk
  def verify_bg_bulk_pileup
    unless defined?(@hmes_frags)
      self.hmes_frags
    end
    @hmes_frags.each_key do | frag |
      positions = @assembly[frag].hm_pos.keys
      contig_pileup_obj = @pileups[frag]
      positions.each do | pos |
        if contig_pileup_obj.mut_bulk.key?(pos)
          mut_pileup = contig_pileup_obj.mut_bulk[pos]
          if mut_pileup.is_var
            if contig_pileup_obj.bg_bulk.key?(pos)
              bg_pileup = contig_pileup_obj.bg_bulk[pos]
              if bg_pileup.non_ref_ratio > 0.35
                @assembly[frag].hm_pos.delete(pos)
              end
            end
          else
            # this should not happen, may be catch as as an error
            @assembly[frag].hm_pos.delete(pos)
          end
        else
          # this should not happen, may be catch as as an error
          @assembly[frag].hm_pos.delete(pos)
        end
      end
    end
    # recalculate hmes_frags once pileups are verified
    self.hmes_frags
  end

end

#hmes_fragsObject (readonly)

From Assembly contig objects, contigs are selected based on user selected options for homozygosity enrichment score (hme_score)



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/cheripic/variants.rb', line 26

class Variants

  include Enumerable
  extend Forwardable
  def_delegators :@assembly, :each, :each_key, :each_value, :size, :length, :[]
  attr_reader :assembly, :pileups, :pileups_analyzed

  # creates a Variants object using user input files
  # @param options [OpenStruct] a hash of required input files as keys and file paths as values
  def initialize(options)
    @params = options
    @assembly = {}
    @pileups = {}
    Bio::FastaFormat.open(@params.assembly).each do |entry|
      if entry.seq.length == 0
        logger.error "No sequence found for entry #{entry.entry_id}"
        raise VariantsError
      end
      contig = Contig.new(entry)
      if @assembly.key?(contig.id)
        logger.error "fasta id already found in the file for #{contig.id}"
        logger.error 'make sure there are no duplicate entries in the fasta file'
        raise VariantsError
      end
      @assembly[contig.id] = contig
      @pileups[contig.id] = ContigPileups.new(contig.id)
    end
    @pileups_analyzed = false
    unless @params.repeats_file == ''
      store_repeat_regions
    end
  end

  # reads repeat masker output file and stores masked regions to ignore variants in thos regions
  def store_repeat_regions
    File.foreach(@params.repeats_file) do |line|
      line.strip!
      next if line =~ /^SW/ or line =~ /^score/ or line == ''
      info = line.split("\s")
      pileups_obj = @pileups[info[4]]
      index = pileups_obj.masked_regions.length
      pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i
      pileups_obj.masked_regions[index + 1][:end] = info[6].to_i
    end
  end

  # Reads and store pileup data for each of input bulk and parents pileup files
  # And sets pileups_analyzed to true that pileups files are processed
  def analyse_pileups
    if @params.input_format == 'bam'
      @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf)
    end
    %i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input |
      infile = @params[input]
      if infile != ''
        logger.info "processing #{input} file"
        if @params.input_format == 'pileup'
          extract_pileup(infile, input)
        elsif @params.input_format == 'vcf'
          extract_vcfs(infile, input)
        else
          extract_bam_pileup(infile, input)
        end
      end
    end

    @pileups_analyzed = true
  end

  # Input vcf file is read and positions are selected that pass the thresholds
  # @param vcffile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_vcfs(vcffile, sym)
    # read vcf file and process each variant
    File.foreach(vcffile) do |line|
      next if line =~ /^#/
      v = Bio::DB::Vcf.new(line)
      unless v.alt == '.'
        pileup_string = Vcf.to_pileup(v)
        pileup = Pileup.new(pileup_string)
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input pileup file is read and positions are selected that pass the thresholds
  # @param pileupfile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_pileup(pileupfile, sym)
    # read mpileup file and process each variant
    File.foreach(pileupfile) do |line|
      pileup = Pileup.new(line)
      if pileup.is_var
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input bamfile is read and selected positions pileups are stored
  # @param bamfile [String] path to the bam file to read
  # @param sym [Symbol] Symbol of the bam file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_bam_pileup(bamfile, sym)
    bq = Options.base_quality
    mq = Options.mapping_quality
    bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
    bamobject.index unless bamobject.indexed?

    # or calculate from bamfile
    set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
    # check if user has set max depth or set to zero to ignore
    max_d = Options.maxdepth
    logger.info "max depth used for #{sym} file\t#{max_d}"

    @vcf_hash.each_key do | id |
      positions = @vcf_hash[id][:het].keys
      positions << @vcf_hash[id][:hom].keys
      positions.flatten!
      next if positions.empty?
      positions.each do | pos |
        command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
        stdout = capture_command(command)
        if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
          logger.info "pileup data empty for\t#{id}\t#{pos}"
        else
          pileup = Pileup.new(stdout)
          store_pileup_info(pileup, sym)
        end
      end
    end
  end

  # Bam object is read and each contig mean and std deviation of depth calculated
  # @param bamobject [Bio::DB::Sam]
  # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
  def set_max_depth(bamobject, bamfile)
    logger.info "processing #{bamfile} file for depth"
    all_depths = []
    bq = Options.base_quality
    mq = Options.mapping_quality
    @assembly.each_key do | id |
      contig_obj = @assembly[id]
      len = contig_obj.length
      command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
      data = capture_command(command)
      if data == ''
        logger.info "depth data empty for\t#{id}"
        next
      end
      depths = []
      data.split("\n").each do |line|
        info = line.split("\t")
        depths << info[2].to_i
      end
      variance = 0
      mean_depth = depths.reduce(0, :+) / len.to_f
      depths.each do |value|
        variance += (value.to_f - mean_depth)**2
      end
      all_depths << mean_depth
      contig_obj.sd_depth = Math.sqrt(variance)
      contig_obj.mean_depth = mean_depth
    end
    # setting max depth as 3 times the average depth
    mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
    Options.maxdepth = Options.max_d_multiple * mean_coverage
  end

  def capture_command(command)
    stdout, stderr, status = Open3.capture3(command)
    unless status.success?
      logger.error "resulted in exit code #{status.exitstatus} using #{command}"
      logger.error "stderr output is: #{stderr}"
      raise CheripicError
    end
    stdout.chomp
  end

  # stores pileup information provided to respective contig_pileup object using sym input
  # @param pileup [Pileup] Pileup objects
  # @param sym [Symbol] Symbol of the input file used to write selected variants
  # pileup information stored to respective ContigPileups object
  def store_pileup_info(pileup, sym)
    # discarding variants with higher than max depth only for mut_bulk
    if sym == :mut_bulk
      unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
        logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
        return nil
      end
    end
    contig_obj = @pileups[pileup.ref_name]
    contig_obj.send(sym).store(pileup.pos, pileup)
    nil
  end

  # Once pileup files are analysed and variants are extracted from each bulk;
  # bulks are compared to identify and isolate variants for downstream analysis.
  # If polyploidy set to trye and mut_parent and bg_parent bulks are provided
  # hemisnps in parents are extracted for bulk frequency ratio analysis
  def compare_pileups
    unless @pileups_analyzed
      self.analyse_pileups
    end
    @assembly.each_key do | id |
      contig = @assembly[id]
      # extract parental hemi snps for polyploids before bulks are compared
      if Options.polyploidy
        if @params.mut_parent != '' or @params.bg_parent != ''
          @pileups[id].hemisnps_in_parent
        end
      end
      contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared
    end
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for homozygosity enrichment score (hme_score)
  def hmes_frags
    # calculate every time method gets called
    @hmes_frags = select_contigs(:hme_score)
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for bulk frequency ratio (bfr_score)
  def bfr_frags
    unless defined?(@bfr_frags)
      @bfr_frags = select_contigs(:bfr_score)
    end
    @bfr_frags
  end

  # Applies selection procedure on assembly contigs based on the ratio_type provided.
  # If use_all_contigs is set to false then contigs without any variant are discarded for :hme_score
  # while contigs without any hemisnps are discarded for :bfr_score
  # If include_low_hmes is set to false then contigs are further filtered based on a cut off value of the score
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  def select_contigs(ratio_type)
    selected_contigs ={}
    use_all_contigs = Options.use_all_contigs
    @assembly.each_key do | frag |
      if use_all_contigs
        selected_contigs[frag] = @assembly[frag]
      else
        if ratio_type == :hme_score
          # selecting fragments which have a variant
          if @assembly[frag].hm_num + @assembly[frag].ht_num > 2 * Options.hmes_adjust
            selected_contigs[frag] = @assembly[frag]
          end
        else # ratio_type == :bfr_score
          # in polyploidy scenario selecting fragments with at least one bfr position
          if @assembly[frag].hemi_num > 0
            selected_contigs[frag] = @assembly[frag]
          end
        end
      end
    end
    selected_contigs = filter_contigs(selected_contigs, ratio_type)
    if use_all_contigs
      logger.info "No filtering was applied to fragments\n"
    else
      logger.info "Selected #{selected_contigs.length} out of #{@assembly.length} fragments with #{ratio_type} score\n"
    end
    selected_contigs
  end

  # Filters out contigs below a cutoff for selected ratio_type
  # a cutoff value is calculated based on ratio_type provided
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def filter_contigs(selected_contigs, ratio_type)
    cutoff = get_cutoff(selected_contigs, ratio_type)
    selected_contigs.each_key do | frag |
      if selected_contigs[frag].send(ratio_type) < cutoff
        selected_contigs.delete(frag)
      end
    end
    selected_contigs
  end

  # Cut off value calculation used to filter out low scored contigs.
  #
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def get_cutoff(selected_contigs, ratio_type)
    include_low_hmes = Options.include_low_hmes
    # set minimum cut off hme_score or bfr_score to pick fragments with variants
    # calculate min hme score for back or out crossed data or bfr_score for polypoidy data
    # if no filtering applied set cutoff to 1.1
    if include_low_hmes
      cutoff = 0.0
    else
      if ratio_type == :hme_score
        adjust = Options.hmes_adjust
        if Options.cross_type == 'back'
          cutoff = (1.0/adjust) + 1.0
        else # outcross
          cutoff = (2.0/adjust) + 1.0
        end
      else # ratio_type is bfr_score
        cutoff = bfr_cutoff(selected_contigs)
      end
    end
    cutoff
  end

  # Cut off value calculation for bfr contigs.
  # ratio value at index 0.1% length of an array or at index zero of an array that contains decreasing order of bfr ratios
  # @param selected_contigs [Hash] a hash of contigs with selected bfr score, a subset of assembly hash
  def bfr_cutoff(selected_contigs, prop=0.1)
    ratios = []
    selected_contigs.each_key do | frag |
      ratios << selected_contigs[frag].bfr_score
    end
    ratios.sort!.reverse!
    index = (ratios.length * prop)/100
    # set a minmum index to get at least one contig
    if index < 1
      index = 1
    end
    ratios[index - 1]
  end

  # Method is to discard homozygous variant positions for which background bulk
  # pileup shows a fraction value higher than 0.35 for variant allele/non-reference allele
  # a recessive variant is expected to have 1/3rd frequency in background bulk
  def verify_bg_bulk_pileup
    unless defined?(@hmes_frags)
      self.hmes_frags
    end
    @hmes_frags.each_key do | frag |
      positions = @assembly[frag].hm_pos.keys
      contig_pileup_obj = @pileups[frag]
      positions.each do | pos |
        if contig_pileup_obj.mut_bulk.key?(pos)
          mut_pileup = contig_pileup_obj.mut_bulk[pos]
          if mut_pileup.is_var
            if contig_pileup_obj.bg_bulk.key?(pos)
              bg_pileup = contig_pileup_obj.bg_bulk[pos]
              if bg_pileup.non_ref_ratio > 0.35
                @assembly[frag].hm_pos.delete(pos)
              end
            end
          else
            # this should not happen, may be catch as as an error
            @assembly[frag].hm_pos.delete(pos)
          end
        else
          # this should not happen, may be catch as as an error
          @assembly[frag].hm_pos.delete(pos)
        end
      end
    end
    # recalculate hmes_frags once pileups are verified
    self.hmes_frags
  end

end

#pileupsHash (readonly)

Returns a hash of contig ids from assembly as keys and respective ContigPileups objects as values.

Returns:

  • (Hash)

    a hash of contig ids from assembly as keys and respective ContigPileups objects as values



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/cheripic/variants.rb', line 26

class Variants

  include Enumerable
  extend Forwardable
  def_delegators :@assembly, :each, :each_key, :each_value, :size, :length, :[]
  attr_reader :assembly, :pileups, :pileups_analyzed

  # creates a Variants object using user input files
  # @param options [OpenStruct] a hash of required input files as keys and file paths as values
  def initialize(options)
    @params = options
    @assembly = {}
    @pileups = {}
    Bio::FastaFormat.open(@params.assembly).each do |entry|
      if entry.seq.length == 0
        logger.error "No sequence found for entry #{entry.entry_id}"
        raise VariantsError
      end
      contig = Contig.new(entry)
      if @assembly.key?(contig.id)
        logger.error "fasta id already found in the file for #{contig.id}"
        logger.error 'make sure there are no duplicate entries in the fasta file'
        raise VariantsError
      end
      @assembly[contig.id] = contig
      @pileups[contig.id] = ContigPileups.new(contig.id)
    end
    @pileups_analyzed = false
    unless @params.repeats_file == ''
      store_repeat_regions
    end
  end

  # reads repeat masker output file and stores masked regions to ignore variants in thos regions
  def store_repeat_regions
    File.foreach(@params.repeats_file) do |line|
      line.strip!
      next if line =~ /^SW/ or line =~ /^score/ or line == ''
      info = line.split("\s")
      pileups_obj = @pileups[info[4]]
      index = pileups_obj.masked_regions.length
      pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i
      pileups_obj.masked_regions[index + 1][:end] = info[6].to_i
    end
  end

  # Reads and store pileup data for each of input bulk and parents pileup files
  # And sets pileups_analyzed to true that pileups files are processed
  def analyse_pileups
    if @params.input_format == 'bam'
      @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf)
    end
    %i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input |
      infile = @params[input]
      if infile != ''
        logger.info "processing #{input} file"
        if @params.input_format == 'pileup'
          extract_pileup(infile, input)
        elsif @params.input_format == 'vcf'
          extract_vcfs(infile, input)
        else
          extract_bam_pileup(infile, input)
        end
      end
    end

    @pileups_analyzed = true
  end

  # Input vcf file is read and positions are selected that pass the thresholds
  # @param vcffile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_vcfs(vcffile, sym)
    # read vcf file and process each variant
    File.foreach(vcffile) do |line|
      next if line =~ /^#/
      v = Bio::DB::Vcf.new(line)
      unless v.alt == '.'
        pileup_string = Vcf.to_pileup(v)
        pileup = Pileup.new(pileup_string)
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input pileup file is read and positions are selected that pass the thresholds
  # @param pileupfile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_pileup(pileupfile, sym)
    # read mpileup file and process each variant
    File.foreach(pileupfile) do |line|
      pileup = Pileup.new(line)
      if pileup.is_var
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input bamfile is read and selected positions pileups are stored
  # @param bamfile [String] path to the bam file to read
  # @param sym [Symbol] Symbol of the bam file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_bam_pileup(bamfile, sym)
    bq = Options.base_quality
    mq = Options.mapping_quality
    bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
    bamobject.index unless bamobject.indexed?

    # or calculate from bamfile
    set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
    # check if user has set max depth or set to zero to ignore
    max_d = Options.maxdepth
    logger.info "max depth used for #{sym} file\t#{max_d}"

    @vcf_hash.each_key do | id |
      positions = @vcf_hash[id][:het].keys
      positions << @vcf_hash[id][:hom].keys
      positions.flatten!
      next if positions.empty?
      positions.each do | pos |
        command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
        stdout = capture_command(command)
        if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
          logger.info "pileup data empty for\t#{id}\t#{pos}"
        else
          pileup = Pileup.new(stdout)
          store_pileup_info(pileup, sym)
        end
      end
    end
  end

  # Bam object is read and each contig mean and std deviation of depth calculated
  # @param bamobject [Bio::DB::Sam]
  # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
  def set_max_depth(bamobject, bamfile)
    logger.info "processing #{bamfile} file for depth"
    all_depths = []
    bq = Options.base_quality
    mq = Options.mapping_quality
    @assembly.each_key do | id |
      contig_obj = @assembly[id]
      len = contig_obj.length
      command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
      data = capture_command(command)
      if data == ''
        logger.info "depth data empty for\t#{id}"
        next
      end
      depths = []
      data.split("\n").each do |line|
        info = line.split("\t")
        depths << info[2].to_i
      end
      variance = 0
      mean_depth = depths.reduce(0, :+) / len.to_f
      depths.each do |value|
        variance += (value.to_f - mean_depth)**2
      end
      all_depths << mean_depth
      contig_obj.sd_depth = Math.sqrt(variance)
      contig_obj.mean_depth = mean_depth
    end
    # setting max depth as 3 times the average depth
    mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
    Options.maxdepth = Options.max_d_multiple * mean_coverage
  end

  def capture_command(command)
    stdout, stderr, status = Open3.capture3(command)
    unless status.success?
      logger.error "resulted in exit code #{status.exitstatus} using #{command}"
      logger.error "stderr output is: #{stderr}"
      raise CheripicError
    end
    stdout.chomp
  end

  # stores pileup information provided to respective contig_pileup object using sym input
  # @param pileup [Pileup] Pileup objects
  # @param sym [Symbol] Symbol of the input file used to write selected variants
  # pileup information stored to respective ContigPileups object
  def store_pileup_info(pileup, sym)
    # discarding variants with higher than max depth only for mut_bulk
    if sym == :mut_bulk
      unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
        logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
        return nil
      end
    end
    contig_obj = @pileups[pileup.ref_name]
    contig_obj.send(sym).store(pileup.pos, pileup)
    nil
  end

  # Once pileup files are analysed and variants are extracted from each bulk;
  # bulks are compared to identify and isolate variants for downstream analysis.
  # If polyploidy set to trye and mut_parent and bg_parent bulks are provided
  # hemisnps in parents are extracted for bulk frequency ratio analysis
  def compare_pileups
    unless @pileups_analyzed
      self.analyse_pileups
    end
    @assembly.each_key do | id |
      contig = @assembly[id]
      # extract parental hemi snps for polyploids before bulks are compared
      if Options.polyploidy
        if @params.mut_parent != '' or @params.bg_parent != ''
          @pileups[id].hemisnps_in_parent
        end
      end
      contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared
    end
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for homozygosity enrichment score (hme_score)
  def hmes_frags
    # calculate every time method gets called
    @hmes_frags = select_contigs(:hme_score)
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for bulk frequency ratio (bfr_score)
  def bfr_frags
    unless defined?(@bfr_frags)
      @bfr_frags = select_contigs(:bfr_score)
    end
    @bfr_frags
  end

  # Applies selection procedure on assembly contigs based on the ratio_type provided.
  # If use_all_contigs is set to false then contigs without any variant are discarded for :hme_score
  # while contigs without any hemisnps are discarded for :bfr_score
  # If include_low_hmes is set to false then contigs are further filtered based on a cut off value of the score
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  def select_contigs(ratio_type)
    selected_contigs ={}
    use_all_contigs = Options.use_all_contigs
    @assembly.each_key do | frag |
      if use_all_contigs
        selected_contigs[frag] = @assembly[frag]
      else
        if ratio_type == :hme_score
          # selecting fragments which have a variant
          if @assembly[frag].hm_num + @assembly[frag].ht_num > 2 * Options.hmes_adjust
            selected_contigs[frag] = @assembly[frag]
          end
        else # ratio_type == :bfr_score
          # in polyploidy scenario selecting fragments with at least one bfr position
          if @assembly[frag].hemi_num > 0
            selected_contigs[frag] = @assembly[frag]
          end
        end
      end
    end
    selected_contigs = filter_contigs(selected_contigs, ratio_type)
    if use_all_contigs
      logger.info "No filtering was applied to fragments\n"
    else
      logger.info "Selected #{selected_contigs.length} out of #{@assembly.length} fragments with #{ratio_type} score\n"
    end
    selected_contigs
  end

  # Filters out contigs below a cutoff for selected ratio_type
  # a cutoff value is calculated based on ratio_type provided
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def filter_contigs(selected_contigs, ratio_type)
    cutoff = get_cutoff(selected_contigs, ratio_type)
    selected_contigs.each_key do | frag |
      if selected_contigs[frag].send(ratio_type) < cutoff
        selected_contigs.delete(frag)
      end
    end
    selected_contigs
  end

  # Cut off value calculation used to filter out low scored contigs.
  #
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def get_cutoff(selected_contigs, ratio_type)
    include_low_hmes = Options.include_low_hmes
    # set minimum cut off hme_score or bfr_score to pick fragments with variants
    # calculate min hme score for back or out crossed data or bfr_score for polypoidy data
    # if no filtering applied set cutoff to 1.1
    if include_low_hmes
      cutoff = 0.0
    else
      if ratio_type == :hme_score
        adjust = Options.hmes_adjust
        if Options.cross_type == 'back'
          cutoff = (1.0/adjust) + 1.0
        else # outcross
          cutoff = (2.0/adjust) + 1.0
        end
      else # ratio_type is bfr_score
        cutoff = bfr_cutoff(selected_contigs)
      end
    end
    cutoff
  end

  # Cut off value calculation for bfr contigs.
  # ratio value at index 0.1% length of an array or at index zero of an array that contains decreasing order of bfr ratios
  # @param selected_contigs [Hash] a hash of contigs with selected bfr score, a subset of assembly hash
  def bfr_cutoff(selected_contigs, prop=0.1)
    ratios = []
    selected_contigs.each_key do | frag |
      ratios << selected_contigs[frag].bfr_score
    end
    ratios.sort!.reverse!
    index = (ratios.length * prop)/100
    # set a minmum index to get at least one contig
    if index < 1
      index = 1
    end
    ratios[index - 1]
  end

  # Method is to discard homozygous variant positions for which background bulk
  # pileup shows a fraction value higher than 0.35 for variant allele/non-reference allele
  # a recessive variant is expected to have 1/3rd frequency in background bulk
  def verify_bg_bulk_pileup
    unless defined?(@hmes_frags)
      self.hmes_frags
    end
    @hmes_frags.each_key do | frag |
      positions = @assembly[frag].hm_pos.keys
      contig_pileup_obj = @pileups[frag]
      positions.each do | pos |
        if contig_pileup_obj.mut_bulk.key?(pos)
          mut_pileup = contig_pileup_obj.mut_bulk[pos]
          if mut_pileup.is_var
            if contig_pileup_obj.bg_bulk.key?(pos)
              bg_pileup = contig_pileup_obj.bg_bulk[pos]
              if bg_pileup.non_ref_ratio > 0.35
                @assembly[frag].hm_pos.delete(pos)
              end
            end
          else
            # this should not happen, may be catch as as an error
            @assembly[frag].hm_pos.delete(pos)
          end
        else
          # this should not happen, may be catch as as an error
          @assembly[frag].hm_pos.delete(pos)
        end
      end
    end
    # recalculate hmes_frags once pileups are verified
    self.hmes_frags
  end

end

#pileups_analyzedBoolean (readonly)

Returns a Boolean option to check if pileups for the assembly are analyzed or not.

Returns:

  • (Boolean)

    a Boolean option to check if pileups for the assembly are analyzed or not



26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
# File 'lib/cheripic/variants.rb', line 26

class Variants

  include Enumerable
  extend Forwardable
  def_delegators :@assembly, :each, :each_key, :each_value, :size, :length, :[]
  attr_reader :assembly, :pileups, :pileups_analyzed

  # creates a Variants object using user input files
  # @param options [OpenStruct] a hash of required input files as keys and file paths as values
  def initialize(options)
    @params = options
    @assembly = {}
    @pileups = {}
    Bio::FastaFormat.open(@params.assembly).each do |entry|
      if entry.seq.length == 0
        logger.error "No sequence found for entry #{entry.entry_id}"
        raise VariantsError
      end
      contig = Contig.new(entry)
      if @assembly.key?(contig.id)
        logger.error "fasta id already found in the file for #{contig.id}"
        logger.error 'make sure there are no duplicate entries in the fasta file'
        raise VariantsError
      end
      @assembly[contig.id] = contig
      @pileups[contig.id] = ContigPileups.new(contig.id)
    end
    @pileups_analyzed = false
    unless @params.repeats_file == ''
      store_repeat_regions
    end
  end

  # reads repeat masker output file and stores masked regions to ignore variants in thos regions
  def store_repeat_regions
    File.foreach(@params.repeats_file) do |line|
      line.strip!
      next if line =~ /^SW/ or line =~ /^score/ or line == ''
      info = line.split("\s")
      pileups_obj = @pileups[info[4]]
      index = pileups_obj.masked_regions.length
      pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i
      pileups_obj.masked_regions[index + 1][:end] = info[6].to_i
    end
  end

  # Reads and store pileup data for each of input bulk and parents pileup files
  # And sets pileups_analyzed to true that pileups files are processed
  def analyse_pileups
    if @params.input_format == 'bam'
      @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf)
    end
    %i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input |
      infile = @params[input]
      if infile != ''
        logger.info "processing #{input} file"
        if @params.input_format == 'pileup'
          extract_pileup(infile, input)
        elsif @params.input_format == 'vcf'
          extract_vcfs(infile, input)
        else
          extract_bam_pileup(infile, input)
        end
      end
    end

    @pileups_analyzed = true
  end

  # Input vcf file is read and positions are selected that pass the thresholds
  # @param vcffile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_vcfs(vcffile, sym)
    # read vcf file and process each variant
    File.foreach(vcffile) do |line|
      next if line =~ /^#/
      v = Bio::DB::Vcf.new(line)
      unless v.alt == '.'
        pileup_string = Vcf.to_pileup(v)
        pileup = Pileup.new(pileup_string)
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input pileup file is read and positions are selected that pass the thresholds
  # @param pileupfile [String] path to the pileup file to read
  # @param sym [Symbol] Symbol of the pileup file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_pileup(pileupfile, sym)
    # read mpileup file and process each variant
    File.foreach(pileupfile) do |line|
      pileup = Pileup.new(line)
      if pileup.is_var
        store_pileup_info(pileup, sym)
      end
    end
  end

  # Input bamfile is read and selected positions pileups are stored
  # @param bamfile [String] path to the bam file to read
  # @param sym [Symbol] Symbol of the bam file used to write selected variants
  # pileup information to respective ContigPileups object
  def extract_bam_pileup(bamfile, sym)
    bq = Options.base_quality
    mq = Options.mapping_quality
    bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
    bamobject.index unless bamobject.indexed?

    # or calculate from bamfile
    set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
    # check if user has set max depth or set to zero to ignore
    max_d = Options.maxdepth
    logger.info "max depth used for #{sym} file\t#{max_d}"

    @vcf_hash.each_key do | id |
      positions = @vcf_hash[id][:het].keys
      positions << @vcf_hash[id][:hom].keys
      positions.flatten!
      next if positions.empty?
      positions.each do | pos |
        command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
        stdout = capture_command(command)
        if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
          logger.info "pileup data empty for\t#{id}\t#{pos}"
        else
          pileup = Pileup.new(stdout)
          store_pileup_info(pileup, sym)
        end
      end
    end
  end

  # Bam object is read and each contig mean and std deviation of depth calculated
  # @param bamobject [Bio::DB::Sam]
  # Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis
  def set_max_depth(bamobject, bamfile)
    logger.info "processing #{bamfile} file for depth"
    all_depths = []
    bq = Options.base_quality
    mq = Options.mapping_quality
    @assembly.each_key do | id |
      contig_obj = @assembly[id]
      len = contig_obj.length
      command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
      data = capture_command(command)
      if data == ''
        logger.info "depth data empty for\t#{id}"
        next
      end
      depths = []
      data.split("\n").each do |line|
        info = line.split("\t")
        depths << info[2].to_i
      end
      variance = 0
      mean_depth = depths.reduce(0, :+) / len.to_f
      depths.each do |value|
        variance += (value.to_f - mean_depth)**2
      end
      all_depths << mean_depth
      contig_obj.sd_depth = Math.sqrt(variance)
      contig_obj.mean_depth = mean_depth
    end
    # setting max depth as 3 times the average depth
    mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
    Options.maxdepth = Options.max_d_multiple * mean_coverage
  end

  def capture_command(command)
    stdout, stderr, status = Open3.capture3(command)
    unless status.success?
      logger.error "resulted in exit code #{status.exitstatus} using #{command}"
      logger.error "stderr output is: #{stderr}"
      raise CheripicError
    end
    stdout.chomp
  end

  # stores pileup information provided to respective contig_pileup object using sym input
  # @param pileup [Pileup] Pileup objects
  # @param sym [Symbol] Symbol of the input file used to write selected variants
  # pileup information stored to respective ContigPileups object
  def store_pileup_info(pileup, sym)
    # discarding variants with higher than max depth only for mut_bulk
    if sym == :mut_bulk
      unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
        logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
        return nil
      end
    end
    contig_obj = @pileups[pileup.ref_name]
    contig_obj.send(sym).store(pileup.pos, pileup)
    nil
  end

  # Once pileup files are analysed and variants are extracted from each bulk;
  # bulks are compared to identify and isolate variants for downstream analysis.
  # If polyploidy set to trye and mut_parent and bg_parent bulks are provided
  # hemisnps in parents are extracted for bulk frequency ratio analysis
  def compare_pileups
    unless @pileups_analyzed
      self.analyse_pileups
    end
    @assembly.each_key do | id |
      contig = @assembly[id]
      # extract parental hemi snps for polyploids before bulks are compared
      if Options.polyploidy
        if @params.mut_parent != '' or @params.bg_parent != ''
          @pileups[id].hemisnps_in_parent
        end
      end
      contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared
    end
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for homozygosity enrichment score (hme_score)
  def hmes_frags
    # calculate every time method gets called
    @hmes_frags = select_contigs(:hme_score)
  end

  # From Assembly contig objects, contigs are selected based on user selected options
  # for bulk frequency ratio (bfr_score)
  def bfr_frags
    unless defined?(@bfr_frags)
      @bfr_frags = select_contigs(:bfr_score)
    end
    @bfr_frags
  end

  # Applies selection procedure on assembly contigs based on the ratio_type provided.
  # If use_all_contigs is set to false then contigs without any variant are discarded for :hme_score
  # while contigs without any hemisnps are discarded for :bfr_score
  # If include_low_hmes is set to false then contigs are further filtered based on a cut off value of the score
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  def select_contigs(ratio_type)
    selected_contigs ={}
    use_all_contigs = Options.use_all_contigs
    @assembly.each_key do | frag |
      if use_all_contigs
        selected_contigs[frag] = @assembly[frag]
      else
        if ratio_type == :hme_score
          # selecting fragments which have a variant
          if @assembly[frag].hm_num + @assembly[frag].ht_num > 2 * Options.hmes_adjust
            selected_contigs[frag] = @assembly[frag]
          end
        else # ratio_type == :bfr_score
          # in polyploidy scenario selecting fragments with at least one bfr position
          if @assembly[frag].hemi_num > 0
            selected_contigs[frag] = @assembly[frag]
          end
        end
      end
    end
    selected_contigs = filter_contigs(selected_contigs, ratio_type)
    if use_all_contigs
      logger.info "No filtering was applied to fragments\n"
    else
      logger.info "Selected #{selected_contigs.length} out of #{@assembly.length} fragments with #{ratio_type} score\n"
    end
    selected_contigs
  end

  # Filters out contigs below a cutoff for selected ratio_type
  # a cutoff value is calculated based on ratio_type provided
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def filter_contigs(selected_contigs, ratio_type)
    cutoff = get_cutoff(selected_contigs, ratio_type)
    selected_contigs.each_key do | frag |
      if selected_contigs[frag].send(ratio_type) < cutoff
        selected_contigs.delete(frag)
      end
    end
    selected_contigs
  end

  # Cut off value calculation used to filter out low scored contigs.
  #
  # @param ratio_type [Symbol] ratio_type is either :hme_score or :bfr_score
  # @param selected_contigs [Hash] a hash of contigs with selected ratio_type, a subset of assembly hash
  def get_cutoff(selected_contigs, ratio_type)
    include_low_hmes = Options.include_low_hmes
    # set minimum cut off hme_score or bfr_score to pick fragments with variants
    # calculate min hme score for back or out crossed data or bfr_score for polypoidy data
    # if no filtering applied set cutoff to 1.1
    if include_low_hmes
      cutoff = 0.0
    else
      if ratio_type == :hme_score
        adjust = Options.hmes_adjust
        if Options.cross_type == 'back'
          cutoff = (1.0/adjust) + 1.0
        else # outcross
          cutoff = (2.0/adjust) + 1.0
        end
      else # ratio_type is bfr_score
        cutoff = bfr_cutoff(selected_contigs)
      end
    end
    cutoff
  end

  # Cut off value calculation for bfr contigs.
  # ratio value at index 0.1% length of an array or at index zero of an array that contains decreasing order of bfr ratios
  # @param selected_contigs [Hash] a hash of contigs with selected bfr score, a subset of assembly hash
  def bfr_cutoff(selected_contigs, prop=0.1)
    ratios = []
    selected_contigs.each_key do | frag |
      ratios << selected_contigs[frag].bfr_score
    end
    ratios.sort!.reverse!
    index = (ratios.length * prop)/100
    # set a minmum index to get at least one contig
    if index < 1
      index = 1
    end
    ratios[index - 1]
  end

  # Method is to discard homozygous variant positions for which background bulk
  # pileup shows a fraction value higher than 0.35 for variant allele/non-reference allele
  # a recessive variant is expected to have 1/3rd frequency in background bulk
  def verify_bg_bulk_pileup
    unless defined?(@hmes_frags)
      self.hmes_frags
    end
    @hmes_frags.each_key do | frag |
      positions = @assembly[frag].hm_pos.keys
      contig_pileup_obj = @pileups[frag]
      positions.each do | pos |
        if contig_pileup_obj.mut_bulk.key?(pos)
          mut_pileup = contig_pileup_obj.mut_bulk[pos]
          if mut_pileup.is_var
            if contig_pileup_obj.bg_bulk.key?(pos)
              bg_pileup = contig_pileup_obj.bg_bulk[pos]
              if bg_pileup.non_ref_ratio > 0.35
                @assembly[frag].hm_pos.delete(pos)
              end
            end
          else
            # this should not happen, may be catch as as an error
            @assembly[frag].hm_pos.delete(pos)
          end
        else
          # this should not happen, may be catch as as an error
          @assembly[frag].hm_pos.delete(pos)
        end
      end
    end
    # recalculate hmes_frags once pileups are verified
    self.hmes_frags
  end

end

Instance Method Details

#analyse_pileupsObject

Reads and store pileup data for each of input bulk and parents pileup files And sets pileups_analyzed to true that pileups files are processed



74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# File 'lib/cheripic/variants.rb', line 74

def analyse_pileups
  if @params.input_format == 'bam'
    @vcf_hash = Vcf.filtering(@params.mut_bulk_vcf, @params.bg_bulk_vcf)
  end
  %i{mut_bulk bg_bulk mut_parent bg_parent}.each do | input |
    infile = @params[input]
    if infile != ''
      logger.info "processing #{input} file"
      if @params.input_format == 'pileup'
        extract_pileup(infile, input)
      elsif @params.input_format == 'vcf'
        extract_vcfs(infile, input)
      else
        extract_bam_pileup(infile, input)
      end
    end
  end

  @pileups_analyzed = true
end

#bfr_cutoff(selected_contigs, prop = 0.1) ⇒ Object

Cut off value calculation for bfr contigs. ratio value at index 0.1% length of an array or at index zero of an array that contains decreasing order of bfr ratios

Parameters:

  • selected_contigs (Hash)

    a hash of contigs with selected bfr score, a subset of assembly hash



336
337
338
339
340
341
342
343
344
345
346
347
348
# File 'lib/cheripic/variants.rb', line 336

def bfr_cutoff(selected_contigs, prop=0.1)
  ratios = []
  selected_contigs.each_key do | frag |
    ratios << selected_contigs[frag].bfr_score
  end
  ratios.sort!.reverse!
  index = (ratios.length * prop)/100
  # set a minmum index to get at least one contig
  if index < 1
    index = 1
  end
  ratios[index - 1]
end

#capture_command(command) ⇒ Object



196
197
198
199
200
201
202
203
204
# File 'lib/cheripic/variants.rb', line 196

def capture_command(command)
  stdout, stderr, status = Open3.capture3(command)
  unless status.success?
    logger.error "resulted in exit code #{status.exitstatus} using #{command}"
    logger.error "stderr output is: #{stderr}"
    raise CheripicError
  end
  stdout.chomp
end

#compare_pileupsObject

Once pileup files are analysed and variants are extracted from each bulk; bulks are compared to identify and isolate variants for downstream analysis. If polyploidy set to trye and mut_parent and bg_parent bulks are provided hemisnps in parents are extracted for bulk frequency ratio analysis



227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
# File 'lib/cheripic/variants.rb', line 227

def compare_pileups
  unless @pileups_analyzed
    self.analyse_pileups
  end
  @assembly.each_key do | id |
    contig = @assembly[id]
    # extract parental hemi snps for polyploids before bulks are compared
    if Options.polyploidy
      if @params.mut_parent != '' or @params.bg_parent != ''
        @pileups[id].hemisnps_in_parent
      end
    end
    contig.hm_pos, contig.ht_pos, contig.hemi_pos = @pileups[id].bulks_compared
  end
end

#extract_bam_pileup(bamfile, sym) ⇒ Object

Input bamfile is read and selected positions pileups are stored pileup information to respective ContigPileups object

Parameters:

  • bamfile (String)

    path to the bam file to read

  • sym (Symbol)

    Symbol of the bam file used to write selected variants



130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/cheripic/variants.rb', line 130

def extract_bam_pileup(bamfile, sym)
  bq = Options.base_quality
  mq = Options.mapping_quality
  bamobject = Bio::DB::Sam.new(:bam=>bamfile, :fasta=>@params.assembly)
  bamobject.index unless bamobject.indexed?

  # or calculate from bamfile
  set_max_depth(bamobject, bamfile) if Options.max_d_multiple > 0 and sym == :mut_bulk
  # check if user has set max depth or set to zero to ignore
  max_d = Options.maxdepth
  logger.info "max depth used for #{sym} file\t#{max_d}"

  @vcf_hash.each_key do | id |
    positions = @vcf_hash[id][:het].keys
    positions << @vcf_hash[id][:hom].keys
    positions.flatten!
    next if positions.empty?
    positions.each do | pos |
      command = "#{bamobject.samtools} mpileup -r #{id}:#{pos}-#{pos} -Q #{bq} -q #{mq} -B -f #{@params.assembly} #{bamfile}"
      stdout = capture_command(command)
      if stdout == '' or stdout.split("\t")[3].to_i == 0 or stdout =~ /^\t0/
        logger.info "pileup data empty for\t#{id}\t#{pos}"
      else
        pileup = Pileup.new(stdout)
        store_pileup_info(pileup, sym)
      end
    end
  end
end

#extract_pileup(pileupfile, sym) ⇒ Object

Input pileup file is read and positions are selected that pass the thresholds pileup information to respective ContigPileups object

Parameters:

  • pileupfile (String)

    path to the pileup file to read

  • sym (Symbol)

    Symbol of the pileup file used to write selected variants



116
117
118
119
120
121
122
123
124
# File 'lib/cheripic/variants.rb', line 116

def extract_pileup(pileupfile, sym)
  # read mpileup file and process each variant
  File.foreach(pileupfile) do |line|
    pileup = Pileup.new(line)
    if pileup.is_var
      store_pileup_info(pileup, sym)
    end
  end
end

#extract_vcfs(vcffile, sym) ⇒ Object

Input vcf file is read and positions are selected that pass the thresholds pileup information to respective ContigPileups object

Parameters:

  • vcffile (String)

    path to the pileup file to read

  • sym (Symbol)

    Symbol of the pileup file used to write selected variants



99
100
101
102
103
104
105
106
107
108
109
110
# File 'lib/cheripic/variants.rb', line 99

def extract_vcfs(vcffile, sym)
  # read vcf file and process each variant
  File.foreach(vcffile) do |line|
    next if line =~ /^#/
    v = Bio::DB::Vcf.new(line)
    unless v.alt == '.'
      pileup_string = Vcf.to_pileup(v)
      pileup = Pileup.new(pileup_string)
      store_pileup_info(pileup, sym)
    end
  end
end

#filter_contigs(selected_contigs, ratio_type) ⇒ Object

Filters out contigs below a cutoff for selected ratio_type a cutoff value is calculated based on ratio_type provided

Parameters:

  • ratio_type (Symbol)

    ratio_type is either :hme_score or :bfr_score

  • selected_contigs (Hash)

    a hash of contigs with selected ratio_type, a subset of assembly hash



297
298
299
300
301
302
303
304
305
# File 'lib/cheripic/variants.rb', line 297

def filter_contigs(selected_contigs, ratio_type)
  cutoff = get_cutoff(selected_contigs, ratio_type)
  selected_contigs.each_key do | frag |
    if selected_contigs[frag].send(ratio_type) < cutoff
      selected_contigs.delete(frag)
    end
  end
  selected_contigs
end

#get_cutoff(selected_contigs, ratio_type) ⇒ Object

Cut off value calculation used to filter out low scored contigs.

Parameters:

  • ratio_type (Symbol)

    ratio_type is either :hme_score or :bfr_score

  • selected_contigs (Hash)

    a hash of contigs with selected ratio_type, a subset of assembly hash



311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
# File 'lib/cheripic/variants.rb', line 311

def get_cutoff(selected_contigs, ratio_type)
  include_low_hmes = Options.include_low_hmes
  # set minimum cut off hme_score or bfr_score to pick fragments with variants
  # calculate min hme score for back or out crossed data or bfr_score for polypoidy data
  # if no filtering applied set cutoff to 1.1
  if include_low_hmes
    cutoff = 0.0
  else
    if ratio_type == :hme_score
      adjust = Options.hmes_adjust
      if Options.cross_type == 'back'
        cutoff = (1.0/adjust) + 1.0
      else # outcross
        cutoff = (2.0/adjust) + 1.0
      end
    else # ratio_type is bfr_score
      cutoff = bfr_cutoff(selected_contigs)
    end
  end
  cutoff
end

#select_contigs(ratio_type) ⇒ Object

Applies selection procedure on assembly contigs based on the ratio_type provided. If use_all_contigs is set to false then contigs without any variant are discarded for :hme_score while contigs without any hemisnps are discarded for :bfr_score If include_low_hmes is set to false then contigs are further filtered based on a cut off value of the score

Parameters:

  • ratio_type (Symbol)

    ratio_type is either :hme_score or :bfr_score



264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
# File 'lib/cheripic/variants.rb', line 264

def select_contigs(ratio_type)
  selected_contigs ={}
  use_all_contigs = Options.use_all_contigs
  @assembly.each_key do | frag |
    if use_all_contigs
      selected_contigs[frag] = @assembly[frag]
    else
      if ratio_type == :hme_score
        # selecting fragments which have a variant
        if @assembly[frag].hm_num + @assembly[frag].ht_num > 2 * Options.hmes_adjust
          selected_contigs[frag] = @assembly[frag]
        end
      else # ratio_type == :bfr_score
        # in polyploidy scenario selecting fragments with at least one bfr position
        if @assembly[frag].hemi_num > 0
          selected_contigs[frag] = @assembly[frag]
        end
      end
    end
  end
  selected_contigs = filter_contigs(selected_contigs, ratio_type)
  if use_all_contigs
    logger.info "No filtering was applied to fragments\n"
  else
    logger.info "Selected #{selected_contigs.length} out of #{@assembly.length} fragments with #{ratio_type} score\n"
  end
  selected_contigs
end

#set_max_depth(bamobject, bamfile) ⇒ Object

Bam object is read and each contig mean and std deviation of depth calculated Open3 capture returns string output, so careful not to give whole genome or big contigs for depth analysis

Parameters:

  • bamobject (Bio::DB::Sam)


163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
# File 'lib/cheripic/variants.rb', line 163

def set_max_depth(bamobject, bamfile)
  logger.info "processing #{bamfile} file for depth"
  all_depths = []
  bq = Options.base_quality
  mq = Options.mapping_quality
  @assembly.each_key do | id |
    contig_obj = @assembly[id]
    len = contig_obj.length
    command = "#{bamobject.samtools} depth -r #{id} -Q #{bq} -q #{mq} #{bamfile}"
    data = capture_command(command)
    if data == ''
      logger.info "depth data empty for\t#{id}"
      next
    end
    depths = []
    data.split("\n").each do |line|
      info = line.split("\t")
      depths << info[2].to_i
    end
    variance = 0
    mean_depth = depths.reduce(0, :+) / len.to_f
    depths.each do |value|
      variance += (value.to_f - mean_depth)**2
    end
    all_depths << mean_depth
    contig_obj.sd_depth = Math.sqrt(variance)
    contig_obj.mean_depth = mean_depth
  end
  # setting max depth as 3 times the average depth
  mean_coverage = all_depths.reduce(0, :+) / @assembly.length.to_f
  Options.maxdepth = Options.max_d_multiple * mean_coverage
end

#store_pileup_info(pileup, sym) ⇒ Object

stores pileup information provided to respective contig_pileup object using sym input pileup information stored to respective ContigPileups object

Parameters:

  • pileup (Pileup)

    Pileup objects

  • sym (Symbol)

    Symbol of the input file used to write selected variants



210
211
212
213
214
215
216
217
218
219
220
221
# File 'lib/cheripic/variants.rb', line 210

def store_pileup_info(pileup, sym)
  # discarding variants with higher than max depth only for mut_bulk
  if sym == :mut_bulk
    unless Options.maxdepth == 0 or pileup.coverage <= Options.maxdepth
      logger.info "pileup coverage is higher than max\t#{pileup.ref_name}\t#{pileup.pos}\t#{pileup.coverage}"
      return nil
    end
  end
  contig_obj = @pileups[pileup.ref_name]
  contig_obj.send(sym).store(pileup.pos, pileup)
  nil
end

#store_repeat_regionsObject

reads repeat masker output file and stores masked regions to ignore variants in thos regions



60
61
62
63
64
65
66
67
68
69
70
# File 'lib/cheripic/variants.rb', line 60

def store_repeat_regions
  File.foreach(@params.repeats_file) do |line|
    line.strip!
    next if line =~ /^SW/ or line =~ /^score/ or line == ''
    info = line.split("\s")
    pileups_obj = @pileups[info[4]]
    index = pileups_obj.masked_regions.length
    pileups_obj.masked_regions[index + 1][:begin] = info[5].to_i
    pileups_obj.masked_regions[index + 1][:end] = info[6].to_i
  end
end

#verify_bg_bulk_pileupObject

Method is to discard homozygous variant positions for which background bulk pileup shows a fraction value higher than 0.35 for variant allele/non-reference allele a recessive variant is expected to have 1/3rd frequency in background bulk



353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
# File 'lib/cheripic/variants.rb', line 353

def verify_bg_bulk_pileup
  unless defined?(@hmes_frags)
    self.hmes_frags
  end
  @hmes_frags.each_key do | frag |
    positions = @assembly[frag].hm_pos.keys
    contig_pileup_obj = @pileups[frag]
    positions.each do | pos |
      if contig_pileup_obj.mut_bulk.key?(pos)
        mut_pileup = contig_pileup_obj.mut_bulk[pos]
        if mut_pileup.is_var
          if contig_pileup_obj.bg_bulk.key?(pos)
            bg_pileup = contig_pileup_obj.bg_bulk[pos]
            if bg_pileup.non_ref_ratio > 0.35
              @assembly[frag].hm_pos.delete(pos)
            end
          end
        else
          # this should not happen, may be catch as as an error
          @assembly[frag].hm_pos.delete(pos)
        end
      else
        # this should not happen, may be catch as as an error
        @assembly[frag].hm_pos.delete(pos)
      end
    end
  end
  # recalculate hmes_frags once pileups are verified
  self.hmes_frags
end