Class: Bio::Ucsc::File::Twobit

Inherits:
Object
  • Object
show all
Defined in:
lib/bio-ucsc/file/twobit.rb

Direct Known Subclasses

Reference

Defined Under Namespace

Classes: TwoBitHeader, TwoBitRecord

Constant Summary collapse

BINCODE =
{0b00 => "T", 0b01 => "C", 0b10 => "A", 0b11 => "G"}

Instance Attribute Summary collapse

Class Method Summary collapse

Instance Method Summary collapse

Constructor Details

#initialize(twobit_queue, filename, header, offsets) ⇒ Twobit

Returns a new instance of Twobit.



21
22
23
24
25
26
27
# File 'lib/bio-ucsc/file/twobit.rb', line 21

def initialize(twobit_queue, filename, header, offsets)
  @tbq      = twobit_queue
  @filename = filename
  @header   = header
  @offsets  = offsets
  @records  = Hash.new
end

Instance Attribute Details

#filenameObject (readonly)

Returns the value of attribute filename.



29
30
31
# File 'lib/bio-ucsc/file/twobit.rb', line 29

def filename
  @filename
end

#headerObject (readonly)

Returns the value of attribute header.



29
30
31
# File 'lib/bio-ucsc/file/twobit.rb', line 29

def header
  @header
end

#offsetsObject (readonly)

Returns the value of attribute offsets.



29
30
31
# File 'lib/bio-ucsc/file/twobit.rb', line 29

def offsets
  @offsets
end

Class Method Details

.load(filename) ⇒ Object



35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
# File 'lib/bio-ucsc/file/twobit.rb', line 35

def self.load(filename)
  two_bit = nil
  ::File.open(filename, 'rb') {|f|two_bit = f.read}
  tbq = Bio::Ucsc::File::ByteQueue.new(two_bit)
  
  twobit_header = TwoBitHeader.new
  twobit_header.signature      = tbq.next(4).unpack('L').first
  twobit_header.version        = tbq.next(4).unpack('L').first
  twobit_header.sequence_count = tbq.next(4).unpack('L').first
  twobit_header.reserved       = tbq.next(4).unpack('L').first
  
  offsets = Hash.new
  twobit_header.sequence_count.times do
    name_length = tbq.next(1).unpack('C').first
    offsets[tbq.next(name_length).unpack('a*').first] =
      tbq.next(4).unpack('L').first
  end
  new(tbq, filename, twobit_header, offsets)
end

.open(filename) ⇒ Object



55
56
57
58
59
60
61
# File 'lib/bio-ucsc/file/twobit.rb', line 55

def self.open(filename)
  if block_given?
    yield self.load(filename)
  else
    return self.load(filename)
  end
end

Instance Method Details

#find_by_interval(interval) ⇒ Object



114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
# File 'lib/bio-ucsc/file/twobit.rb', line 114

def find_by_interval(interval)
  # does not support an interval involving multiple N-blocks

  interval = Bio::Ucsc::Gi.wrap(interval)
  seq = self.find_by_interval_raw(interval)
  @records[interval.chrom].n_block_intervals.map do |nb|
    if interval.overlapped?(nb)
      case interval.compare(nb)
      when :equal,:contains
        seq = 'N' * interval.overlap(nb)
      when :contained_by
        left_len  = nb.chr_start - interval.chr_start           
        right_len = interval.chr_end - nb.chr_end
        case
        when left_len > 0 && right_len > 0
          seq =
            seq[0 .. (left_len-1)] +
            'N' * (seq.length - left_len - right_len) +
            seq[-right_len ..  -1]
        when left_len == 0 && right_len > 0
          seq =
            'N' * (seq.length - left_len - right_len) +
            seq[-right_len ..  -1]
        when left_len > 0 && right_len == 0
          seq =
            seq[0 .. (left_len-1)] +
            'N' * (seq.length - left_len - right_len)
        else
          raise "This should not happen! (Bio::Ucsc::File::Twobit#find_by_interval)"
        end
      when :left_overlapped
        left_len = nb.chr_end - interval.chr_start + 1
        seq[0, left_len] = 'N' * left_len
      when :right_overlapped
        right_len = interval.chr_end - nb.chr_start + 1
        seq[-right_len, right_len] = 'N' * right_len
      when :right_adjacent, :right_off
        # expecting that N-blocks are sorted
        # return Bio::Sequence::NA.new(seq) 
        seq
      end
    end
  end
  #Bio::Sequence::NA.new(seq)
  seq
end

#find_by_interval_raw(interval) ⇒ Object



161
162
163
164
165
166
167
168
169
170
171
172
173
174
# File 'lib/bio-ucsc/file/twobit.rb', line 161

def find_by_interval_raw(interval)
  interval = Bio::Ucsc::Gi.wrap(interval)
  byte_count, byte_mod = interval.zero_start.divmod 4
  chrom_top = self.records(interval.chrom).packed_dna_offset
  div_start, mod_start = interval.zero_start.divmod 4
  div_end, mod_end     = interval.zero_end.divmod 4
  div_len, mod_len     = interval.length.divmod 4
  
  byte_length = div_end - div_start + 1
  @tbq.index = chrom_top + div_start
  bytes = @tbq.next(byte_length).unpack('C*')
  seq = bytes_to_nucleotides(bytes)
  seq[mod_start..(-1-(4-mod_end))]
end

#inspectObject



31
32
33
# File 'lib/bio-ucsc/file/twobit.rb', line 31

def inspect
  "#<#{self.class}:0x#{self.__id__.to_s(16)} @filename=#{@filename} ...>"
end

#records(chrom) ⇒ Object



63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
# File 'lib/bio-ucsc/file/twobit.rb', line 63

def records(chrom)
  return @records[chrom] if @records[chrom]
  
  @tbq.index = @offsets[chrom]
  @records[chrom] = TwoBitRecord.new
  @records[chrom].dna_size = @tbq.next(4).unpack('L').first

  n_block_count = @tbq.next(4).unpack('L').first
  n_block_starts = Array.new
  n_block_count.times do
  n_block_starts << @tbq.next(4).unpack('L').first
  end
  n_block_sizes = Array.new
  n_block_count.times do
    n_block_sizes << @tbq.next(4).unpack('L').first
  end
  @records[chrom].n_block_intervals = Array.new
  n_block_count.times do |idx|
    @records[chrom].n_block_intervals << 
      Bio::GenomicInterval.zero_based(chrom,
                                      n_block_starts[idx],
                                      n_block_starts[idx]+n_block_sizes[idx])
  end

  mask_block_count = @tbq.next(4).unpack('L').first
  mask_block_starts = Array.new
  mask_block_count.times do
    mask_block_starts << @tbq.next(4).unpack('L').first
  end
  mask_block_sizes = Array.new
  mask_block_count.times do
    mask_block_sizes << @tbq.next(4).unpack('L').first
  end
  @records[chrom].mask_block_intervals = Array.new
  mask_block_count.times do |idx|
    @records[chrom].mask_block_intervals << 
      Bio::GenomicInterval.zero_based(chrom,
                                      mask_block_starts[idx],
                                      mask_block_starts[idx]+mask_block_sizes[idx])
  end

  @records[chrom].reserved = @tbq.next(4).unpack('L').first
  @records[chrom].packed_dna_offset = @tbq.index

  @records[chrom]
end

#subseq(interval) ⇒ Object



110
111
112
# File 'lib/bio-ucsc/file/twobit.rb', line 110

def subseq(interval)
  find_by_interval(interval)
end