Class: FastaReader
- Inherits:
-
Object
- Object
- FastaReader
- Defined in:
- lib/bio-table/parsers/fastareader.rb
Instance Method Summary collapse
- #close ⇒ Object
- #digest_tag(tag) ⇒ Object
-
#each ⇒ Object
returns a FastaRecord for every item (invokes parse_each).
- #first ⇒ Object
-
#get(id) ⇒ Object
Return a record by its id, nil when not found.
- #get_by_index(idx) ⇒ Object
- #get_rec(fpos) ⇒ Object
-
#initialize(fn, regex = nil) ⇒ FastaReader
constructor
Initialize the reader of FASTA file fn.
-
#parse_each ⇒ Object
Parse the FASTA file and yield id, descr, sequence.
-
#size ⇒ Object
Returns the size of the dataset - as read.
Constructor Details
#initialize(fn, regex = nil) ⇒ FastaReader
Initialize the reader of FASTA file fn. Options can be :regex and :index (true/false)
19 20 21 22 23 24 25 26 27 |
# File 'lib/bio-table/parsers/fastareader.rb', line 19
# Initialize the reader of the FASTA file +fn+.
#
# fn    - path of the FASTA file; it is opened here and kept in @f.
# regex - optional String pattern used to extract the record ID from a
#         header line. Defaults to '^(\S+)'. A pattern that contains no
#         '(' is wrapped in parentheses so that it has a capture group.
def initialize(fn, regex = nil)
  @logger = Bio::Log::LoggerPlus['bio-table']
  @f = File.open(fn)
  @fread_once = false
  @regex = regex || '^(\S+)'
  # Inspect and wrap the effective pattern, not the raw argument: the
  # argument is nil when the default is used and cannot be concatenated.
  @regex = "(#{@regex})" unless @regex.include?('(')
  @logger.info "Parsing FASTA with ID regex '#{@regex}'"
end
Instance Method Details
#close ⇒ Object
126 127 128 |
# File 'lib/bio-table/parsers/fastareader.rb', line 126
# Close the file handle held in @f.
def close
  @f.close
end
#digest_tag(tag) ⇒ Object
106 107 108 109 110 111 112 113 114 115 116 117 118 |
# File 'lib/bio-table/parsers/fastareader.rb', line 106
# Split a FASTA header line into an ID and a description.
#
# tag - the raw header line; it must start with '>'.
#
# Returns [id, descr]. descr is the text after '>' with surrounding
# whitespace stripped; id is the capture groups of @regex matched against
# descr, joined with tabs.
#
# Raises RuntimeError when tag is not a header line, when @regex does not
# match the description, or when the match yields no capture groups.
def digest_tag(tag)
  if tag =~ /^>/
    descr = $'.strip
    match = /#{@regex}/.match(descr)
    # Regexp#match returns nil when the pattern does not apply; treat that
    # like "no captures" so the descriptive error below is raised instead
    # of a NoMethodError on nil.
    captures = match ? match.captures : []
    return captures.join("\t"), descr unless captures.empty?

    p descr # do not remove these
    p @regex
  end
  raise "Can not digest '#{tag}' using '" + @regex + "'"
end
#each ⇒ Object
returns a FastaRecord for every item (invokes parse_each)
64 65 66 |
# File 'lib/bio-table/parsers/fastareader.rb', line 64
# Yield a FastaRecord for every item produced by parse_each.
#
# Returns an Enumerator over the records when called without a block.
def each
  return enum_for(:each) unless block_given?

  parse_each do |id, descr, seq|
    yield FastaRecord.new(id, descr, seq)
  end
end
#first ⇒ Object
68 69 70 71 72 |
# File 'lib/bio-table/parsers/fastareader.rb', line 68
# Return the first item produced by parse_each as a FastaRecord. The
# method returns from inside the block, so parsing stops at that item.
def first
  parse_each do |id, descr, seq|
    return FastaRecord.new(id, descr, seq)
  end
end
#get(id) ⇒ Object
Return a record by its id, nil when not found.
75 76 77 78 79 80 81 82 |
# File 'lib/bio-table/parsers/fastareader.rb', line 75
# Return the record stored under +id+, or nil when indexer_get has no
# entry for it.
#
# NOTE(review): indexed? is defined outside this view; presumably it
# checks that the index is available - confirm.
def get(id)
  indexed?
  fpos = indexer_get(id)
  return nil unless fpos

  get_rec(fpos)
end
#get_by_index(idx) ⇒ Object
97 98 99 100 101 102 103 104 |
# File 'lib/bio-table/parsers/fastareader.rb', line 97
# Return the record at position +idx+ of the index, or nil when the
# index entry carries no file position.
#
# The file position is read from element [1] of the value returned by
# indexer_get_by_index (defined outside this view).
def get_by_index(idx)
  indexed?
  fpos = indexer_get_by_index(idx)[1]
  fpos ? get_rec(fpos) : nil
end
#get_rec(fpos) ⇒ Object
84 85 86 87 88 89 90 91 92 93 94 95 |
# File 'lib/bio-table/parsers/fastareader.rb', line 84
# Read one record starting at byte offset +fpos+ of the open file.
#
# fpos - offset of the record's header line.
#
# Returns a FastaRecord built from the digested header and the sequence
# lines (each stripped and concatenated) that follow it, up to the next
# header line or the end of the file. A header with no sequence lines
# yields an empty sequence.
def get_rec(fpos)
  @f.seek(fpos)
  tag = @f.gets
  seq = ''.dup
  # gets returns nil at end of file, which ends the loop without touching
  # a nil line (a header-only record at EOF has no sequence lines).
  while (line = @f.gets)
    break if line.start_with?('>')

    seq << line.strip
  end
  id, descr = digest_tag(tag)
  FastaRecord.new(id, descr, seq)
end
#parse_each ⇒ Object
Parse the FASTA file and yield id, descr, sequence. When the indexer is on it will index the records the first time. Note that, with indexing, when you don’t complete parsing there will be an error the second time. This is a trade-off, otherwise one would always have to index the file and read it twice.
34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 |
# File 'lib/bio-table/parsers/fastareader.rb', line 34
# Parse the FASTA file from the start and yield id, descr, sequence for
# every record.
#
# State kept while parsing:
#   @rec_line - header line of the record being processed (nil once the
#               input is exhausted)
#   @rec_fpos - file offset at which @rec_line starts
#   @count    - number of records yielded so far
#
# An empty file yields nothing. A header that is not followed by any
# sequence lines is yielded with an empty sequence.
#
# Sets @fread_once to true after a complete pass and returns true.
def parse_each
  @f.seek(0) # force file rewind
  @rec_fpos = 0
  @rec_line = @f.gets
  @count = 0
  while @rec_line
    # digest id from record description
    id, descr = digest_tag(@rec_line)
    # Indexing hook, disabled in the original source; at this point
    # @rec_fpos is the offset of the record being digested:
    # indexer_set(id, @rec_fpos) if @indexer and not @fread_once

    # parse the sequence, remembering where the next unread line starts
    seq = ''.dup
    next_fpos = @f.tell
    while (line = @f.gets)
      break if line.start_with?('>')

      seq << line.strip
      next_fpos = @f.tell
    end

    # new record: line is the next header, or nil at end of file
    @count += 1
    @rec_fpos = next_fpos
    @rec_line = line
    yield id, descr, seq
  end
  @fread_once = true
end
#size ⇒ Object
Returns the size of the dataset - as read. After the final record the size represents the number of items in the FASTA file
122 123 124 |
# File 'lib/bio-table/parsers/fastareader.rb', line 122
# Return the current value of @count, the number of records counted by
# the most recent parse so far.
def size
  @count
end