Class: Cassiopee::Crawler
- Inherits:
-
Object
- Object
- Cassiopee::Crawler
- Defined in:
- lib/cassiopee.rb
Overview
Base class to index and search through a string
Constant Summary collapse
- METHOD_DIRECT =
0
- METHOD_SUFFIX =
1
- FILE_SUFFIX_EXT =
".sfx"
- FILE_SUFFIX_POS =
".sfp"
- SUFFIXLEN =
'suffix_length'
Instance Attribute Summary collapse
-
#ambiguous ⇒ Object
Ambiguity map (Hash).
-
#comments ⇒ Object
Array of comment characters to skip lines in input sequence file.
-
#file_suffix ⇒ Object
Suffix files name/path.
-
#maxthread ⇒ Object
Max number of threads to use (not yet used).
-
#method ⇒ Object
Method for search: FORCE or SUFFIX. * SUFFIX loads all suffixes and searches through them afterwards; interesting for multiple searches (suffixes are reused). * FORCE checks matches while crossing the suffixes.
-
#use_store ⇒ Object
Use persistent suffix file ?.
-
#useAmbiguity ⇒ Object
Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile.
-
#useCache ⇒ Object
Manage basic cache to store previous match.
Instance Method Summary collapse
-
#clear ⇒ Object
Clear suffixes in memory. If using use_store, clear the store too.
-
#extractSuffix(start, len) ⇒ Object
Extract a suffix from suffix file based on md5 match.
-
#filter(posArray) ⇒ Object
Filter the array of positions with defined position filter.
-
#filter_position(min, max) ⇒ Object
Must be called after index creation or load.
- #filterCost ⇒ Object
- #filterLength ⇒ Object
-
#indexFile(f) ⇒ Object
Index an input file. Clear existing indexes.
-
#indexString(s) ⇒ Object
Index an input string. Clear existing indexes.
-
#initialize ⇒ Crawler
constructor
A new instance of Crawler.
-
#loadAmbiguityFile(f) ⇒ Object
Load ambiguity rules from a file. File format should be: * A=B,C D=E,F …
-
#loadIndex ⇒ Object
Load sequence from a previous index command.
-
#next ⇒ Object
Iterates over matches.
- #searchApproximate(s, edit) ⇒ Object
-
#searchExact(s) ⇒ Object
Search exact match.
-
#setLogLevel(level) ⇒ Object
Set Logger level.
- #to_pos ⇒ Object
- #to_s ⇒ Object
Constructor Details
#initialize ⇒ Crawler
299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 |
# File 'lib/cassiopee.rb', line 299

# Build a crawler with its default settings: search method 0, no ambiguity
# map, cache and persistent store switched off, "crawler" as the suffix file
# base name and "#" as the only comment character.
#
# The position window (@min_position / @max_position) and @pattern are
# initialised here too, with the same values the index methods and clear
# assign, so filter and filter_position never see nil bounds when they are
# called before an index exists.
def initialize
  @useAmbiguity = false
  @ambiguous = nil
  @useCache = false
  @file_suffix = "crawler"
  @method = 0
  # 0/0 is the "no position filter" state tested by filter.
  @min_position = 0
  @max_position = 0
  @prev_min_position = 0
  @prev_max_position = 0
  @suffix = nil
  @suffixmd5 = nil
  @pattern = nil
  @position = 0
  @suffixes = Hash.new
  @matches = Array.new
  @curmatch = 0
  @use_store = false
  @sequence = nil
  @comments = Array["#"]
  @cache = Cassiopee::CrawlerCache.new
end
Instance Attribute Details
#ambiguous ⇒ Object
Ambiguity map (Hash)
264 265 266 |
# File 'lib/cassiopee.rb', line 264

# Reader for the ambiguity map (a Hash once rules have been loaded).
def ambiguous
  @ambiguous
end
#comments ⇒ Object
Array of comment characters to skip lines in input sequence file
258 259 260 |
# File 'lib/cassiopee.rb', line 258

# Reader for the array of comment characters used to skip lines of an
# input sequence file.
def comments
  @comments
end
#file_suffix ⇒ Object
Suffix files name/path
252 253 254 |
# File 'lib/cassiopee.rb', line 252

# Reader for the base name/path of the suffix files; the file extensions
# are appended to it by the methods that open those files.
def file_suffix
  @file_suffix
end
#maxthread ⇒ Object
Max number of threads to use (not yet used)
254 255 256 |
# File 'lib/cassiopee.rb', line 254

# Reader for the maximum number of threads to use (documented as not yet
# used).
def maxthread
  @maxthread
end
#method ⇒ Object
Method for search FORCE or SUFFIX
-
SUFFIX loads all suffixes and searches through them afterwards; interesting for multiple searches (suffixes are reused)
-
FORCE checks matches while crossing the suffixes. Does not keep parsed data for later search. FORCE method does not yet support optimal filters.
270 271 272 |
# File 'lib/cassiopee.rb', line 270

# Reader for the search method selector. The search methods compare it
# with METHOD_SUFFIX to decide whether to walk the stored suffixes.
def method
  @method
end
#use_store ⇒ Object
Use persistent suffix file ?
256 257 258 |
# File 'lib/cassiopee.rb', line 256

# Reader for the flag telling whether a persistent suffix file is used.
def use_store
  @use_store
end
#useAmbiguity ⇒ Object
Use alphabet ambiguity (dna/rna) in search, automatically set with loadAmbiguityFile
250 251 252 |
# File 'lib/cassiopee.rb', line 250

# Reader for the flag enabling alphabet ambiguity (dna/rna) in searches;
# loadAmbiguityFile switches it on.
def useAmbiguity
  @useAmbiguity
end
#useCache ⇒ Object
Manage basic cache to store previous match
261 262 263 |
# File 'lib/cassiopee.rb', line 261

# Reader for the flag controlling the basic cache of previous matches.
def useCache
  @useCache
end
Instance Method Details
#clear ⇒ Object
Clear suffixes in memory. If using use_store, clear the store too
340 341 342 343 344 345 346 347 348 |
# File 'lib/cassiopee.rb', line 340

# Reset the in-memory search state: suffix table, match list, current
# pattern, previous position window and the match cache. The position file
# (@file_suffix + FILE_SUFFIX_POS) is deleted when it exists.
#
# Returns the result of File.delete when a file was removed, nil otherwise.
def clear
  @suffixes = Hash.new
  @matches.clear
  @pattern = nil
  @prev_max_position = 0
  @prev_min_position = 0
  @cache.clearCache()
  position_file = @file_suffix + FILE_SUFFIX_POS
  # File.exist? replaces File.exists?, which was deprecated and then
  # removed in Ruby 3.2.
  File.delete(position_file) if File.exist?(position_file)
end
#extractSuffix(start, len) ⇒ Object
Extract a suffix from suffix file based on md5 match
586 587 588 589 590 591 592 593 594 595 596 597 598 |
# File 'lib/cassiopee.rb', line 586

# Read +len+ bytes starting at byte offset +start+ from the suffix file
# (@file_suffix + FILE_SUFFIX_EXT).
#
# start - offset to seek to before reading.
# len   - number of bytes to read.
#
# Returns the String that was read (nil when reading past the end of the
# file with a positive len). Any StandardError is reported on stdout as
# "Exception: ..." and nil is returned.
def extractSuffix(start,len)
  # Block form closes the handle even when seeking or reading raises.
  File.open(@file_suffix+FILE_SUFFIX_EXT, "r") do |file|
    file.pos = start
    file.read(len)
  end
rescue => err
  puts "Exception: #{err}"
  nil
end
#filter(posArray) ⇒ Object
Filter the array of positions with defined position filter
563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 |
# File 'lib/cassiopee.rb', line 563

# Apply the position window to a position array.
#
# posArray - array whose first element is the match length and whose
#            remaining elements are positions.
#
# Returns posArray itself when both bounds are 0 (no window set);
# otherwise a new array holding the leading length followed by the
# positions lying within [@min_position, @max_position].
def filter(posArray)
  $log.debug("filter the position with #{@min_position} and #{@max_position}")
  return posArray if @min_position == 0 && @max_position == 0

  kept = Array.new
  posArray.each_with_index do |pos, idx|
    if idx.zero?
      # First element of the array is the match length: always kept.
      kept << pos
    elsif pos >= @min_position && pos <= @max_position
      kept << pos
    end
  end
  kept
end
#filter_position(min, max) ⇒ Object
Must be called after index creation or load
435 436 437 438 439 440 441 442 443 |
# File 'lib/cassiopee.rb', line 435

# Set the position window applied to matches. Must be called after index
# creation or load.
#
# min - lower bound of the window.
# max - upper bound of the window.
#
# When no persistent store is in use the in-memory state is cleared first.
# The previous window is remembered in @prev_min_position and
# @prev_max_position. Returns max.
def filter_position(min,max)
  clear() unless use_store
  @prev_min_position, @prev_max_position = @min_position, @max_position
  @min_position = min
  @max_position = max
end
#filterCost ⇒ Object
333 334 335 |
# File 'lib/cassiopee.rb', line 333

# Run filterOptimal with selector 1 and return its result.
# NOTE(review): presumably keeps the matches with the best cost; confirm
# against filterOptimal.
def filterCost
  filterOptimal(1)
end
#filterLength ⇒ Object
329 330 331 |
# File 'lib/cassiopee.rb', line 329

# Run filterOptimal with selector 0 and return its result.
# NOTE(review): presumably keeps the matches with the best length; confirm
# against filterOptimal.
def filterLength
  filterOptimal(0)
end
#indexFile(f) ⇒ Object
Index an input file. Clear existing indexes
359 360 361 362 363 364 365 366 367 368 |
# File 'lib/cassiopee.rb', line 359

# Index an input file, discarding any existing index.
#
# f - path handed to readSequence, whose result becomes the sequence.
#
# Design notes kept from the original source:
# - parse the file and map letters to a reduced alphabet;
# - later on, use a binary map instead of an ascii map;
# - take all suffixes, order them by length, link to a position map kept
#   in another file;
# - store md5 for easier compare? (+ 20 bytes per suffix)
#
# Returns 0, the value the position window is reset to.
def indexFile(f)
  @sequence = readSequence(f)
  clear()
  # 0/0 means no position window.
  @min_position = 0
  @max_position = @min_position
end
#indexString(s) ⇒ Object
Index an input string. Clear existing indexes
373 374 375 376 377 378 379 380 381 |
# File 'lib/cassiopee.rb', line 373

# Index an input string, discarding any existing index.
#
# s - the sequence; it is stored as given and also written (via puts) to
#     the suffix file @file_suffix + FILE_SUFFIX_EXT.
#
# NOTE(review): unlike loadIndex, the string is not downcased here, while
# the search methods downcase their pattern; confirm whether callers are
# expected to pass lower-case input.
#
# Returns 0, the value the position window is reset to.
def indexString(s)
  @sequence = s
  File.open(@file_suffix + FILE_SUFFIX_EXT, 'w') { |out| out.puts(@sequence) }
  clear()
  # 0/0 means no position window.
  @min_position = 0
  @max_position = @min_position
end
#loadAmbiguityFile(f) ⇒ Object
Load ambiguity rules from a file. File format should be:
-
A=B,C D=E,F …
390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 |
# File 'lib/cassiopee.rb', line 390

# Load ambiguity rules from a file and switch ambiguity handling on.
#
# f - path of the rules file. Each line has the form KEY=V1,V2,... and is
#     downcased before parsing; KEY maps to the array [V1, V2, ...].
#     Lines without a key or without a value part (blank lines included)
#     are skipped.
#
# Logs an error and exits with status 1 when the file does not exist.
# Returns nil.
def loadAmbiguityFile(f)
  # File.exist? replaces File.exists?, removed in Ruby 3.2.
  unless File.exist?(f)
    $log.error("File #{f} does not exist")
    exit(1)
  end
  @ambiguous = Hash.new
  # File.foreach opens and closes the file itself, even on error.
  File.foreach(f) do |line|
    symbol, equivalents = line.downcase.chomp.split('=')
    next if symbol.nil? || equivalents.nil?
    @ambiguous[symbol] = equivalents.split(',')
  end
  @useAmbiguity = true
  $log.debug("loaded ambiguity rules: #{@ambiguous.inspect}")
  nil
end
#loadIndex ⇒ Object
Load sequence from a previous index command
411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 |
# File 'lib/cassiopee.rb', line 411

# Load the sequence written by a previous index command.
#
# Reads @file_suffix + FILE_SUFFIX_EXT line by line, downcases each line,
# strips its line ending and concatenates the result into @sequence. The
# in-memory state is then cleared and the position window reset to 0/0.
#
# On any StandardError while reading, the error is logged and the process
# exits. Returns 0.
def loadIndex
  seq = ''
  begin
    # Block form closes the handle even when reading raises.
    File.open(@file_suffix+FILE_SUFFIX_EXT, "r") do |file|
      file.each_line { |line| seq << line.downcase.chomp }
    end
  rescue => err
    $log.error("Exception: #{err}")
    exit()
  end
  @sequence = seq
  clear()
  @min_position = 0
  @max_position = 0
end
#next ⇒ Object
Iterates over matches
602 603 604 605 606 607 608 609 610 |
# File 'lib/cassiopee.rb', line 602

# Iterate over the matches of the last search.
#
# Returns the match at the current cursor and advances the cursor. Once
# every match has been returned, returns nil and rewinds the cursor so the
# following call starts again from the first match.
def next
  if @curmatch >= @matches.length
    @curmatch = 0
    return nil
  end
  current = @matches[@curmatch]
  @curmatch += 1
  current
end
#searchApproximate(s, edit) ⇒ Object
494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 |
# File 'lib/cassiopee.rb', line 494 def searchApproximate(s,edit) if(edit==0 && !@useAmbiguity) return searchExact(s) end allowederrors = edit if(edit>=0) useHamming = true minmatchsize = s.length maxmatchsize = s.length updateCache(1,edit) @matches = @cache.loadCache() else useHamming = false edit = edit * (-1) minmatchsize = s.length - edit maxmatchsize = s.length + edit updateCache(2,edit) @matches = @cache.loadCache() end if(@matches.length>0) return @matches end s = s.downcase #@matches.clear @pattern = Digest::MD5.hexdigest(s) parseSuffixes(@sequence,minmatchsize,maxmatchsize,allowederrors,s) return cache?(@matches) unless(method == METHOD_SUFFIX) @suffixes.each do |md5val,posArray| if(md5val == SUFFIXLEN) next end if (md5val == @pattern) filteredPosArray = filter(posArray) match = Array[md5val, 0, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match else if(posArray[0]>= minmatchsize && posArray[0] <= maxmatchsize) # Get string seq = extractSuffix(posArray[1],posArray[0]) errors = isApproximateEqual?(seq,s,useHamming,edit) if(errors>=0) filteredPosArray = filter(posArray) match = Array[md5val, errors, filteredPosArray] $log.debug "Match: " << match.inspect @matches << match end end end end return cache?(@matches) end |
#searchExact(s) ⇒ Object
Search exact match
447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 |
# File 'lib/cassiopee.rb', line 447

# Search for exact occurrences of +s+ in the indexed sequence.
#
# s - pattern to look for (downcased before digesting).
#
# When an ambiguity map is in use the call is delegated to
# searchApproximate(s, 0). Otherwise cached matches are reused when the
# cache returns any; if not, the suffixes are parsed and, for the suffix
# method only, every stored suffix accepted by isMatchEqual? is recorded
# as Array[md5val, 0, posArray].
#
# Returns @matches directly when the search method is not METHOD_SUFFIX,
# otherwise the result of cache?(@matches).
def searchExact(s)
  return searchApproximate(s,0) if @useAmbiguity

  s = s.downcase
  updateCache(0,0)
  @matches = @cache.loadCache()
  return cache?(@matches) if @matches.length > 0

  #@matches.clear
  @pattern = Digest::MD5.hexdigest(s)
  parseSuffixes(@sequence,s.length,s.length,0,s)
  return @matches unless method == METHOD_SUFFIX

  # Search required length, compare (compare md5?)
  # MD5 = 128 bits, easier to compare for large strings
  @suffixes.each do |md5val,posArray|
    next unless isMatchEqual?(md5val)

    match = Array[md5val, 0, posArray]
    $log.debug "Match: #{match.inspect}"
    @matches << match
  end
  cache?(@matches)
end
#setLogLevel(level) ⇒ Object
Set Logger level
352 353 354 |
# File 'lib/cassiopee.rb', line 352

# Set the level of the global logger ($log).
#
# level - value assigned to $log.level.
#
# Returns level.
def setLogLevel(level)
  $log.level = level
end
#to_pos ⇒ Object
612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 |
# File 'lib/cassiopee.rb', line 612

# Regroup the current matches by position.
#
# Every match is Array[md5val, errors, posArray], where posArray starts
# with the match length followed by the positions.
#
# Returns an array of [position, [[length, errors], ...]] pairs sorted by
# Array#sort on the position-keyed hash.
def to_pos
  by_position = Hash.new
  @matches.each do |match|
    len = 0
    match[2].each_with_index do |pos, idx|
      if idx.zero?
        # Leading element is the match length, not a position.
        len = pos
      else
        (by_position[pos] ||= Array.new) << Array[len, match[1]]
      end
    end
  end
  by_position.sort
end
#to_s ⇒ Object
640 641 642 |
# File 'lib/cassiopee.rb', line 640

# Short textual summary of the crawler: the number of current matches.
#
# The count is interpolated into the string: appending an Integer with
# String#<< would insert the character with that codepoint instead of the
# number. The summary is returned rather than printed, so that puts,
# string interpolation and loggers receive a real String.
#
# Returns a String such as '{ matches: "3" }'.
def to_s
  "{ matches: \"#{@matches.length}\" }"
end