Class: Rebuild

Inherits:
Object
  • Object
show all
Defined in:
lib/gene_assembler/rebuild.rb

Instance Method Summary collapse

Constructor Details

#initialize(dataset, dataset_uni_hsp, path) ⇒ Rebuild

La clase ha de recibr objetos dataset



7
8
9
10
11
12
# File 'lib/gene_assembler/rebuild.rb', line 7

def initialize(dataset,dataset_uni_hsp,path) #La clase ha de recibr objetos dataset
	@dataset=dataset
	@dataset_uni_hsp=dataset_uni_hsp
	@path=path
	@db_seqs=fasta_hash(path[:exonerate_db])
end

Instance Method Details

#add_uni_hsp(model, cluster) ⇒ Object

Compara contigs uni-hsp con contig modelo para determinar pseudogenes



248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
# File 'lib/gene_assembler/rebuild.rb', line 248

def add_uni_hsp(model,cluster)#Compara contigs uni-hsp con contig modelo para determinar pseudogenes
  contigs_uni_hsp=''
  is_contig=0
  pseudogenes=[]
  @clusters_uni_hsp.each do |contigs|
    if contigs.first.first_hit.name==cluster.first.first_hit.name
      contigs_uni_hsp=contigs
      is_contig=1
      break
    end
  end
  
  if is_contig==1 #Si se ha encontrado contigs uni-hsp se realiza la comparacion
    if model.class.to_s!='Array'
      model=[model]
    end
    model.each do |item|
      contigs_uni_hsp.each do |contig|
        start,exons=item.compare(contig)
        if exons>1 && !pseudogenes.include?(contig)
          pseudogenes << contig
        end
      end
    end
  end
	return pseudogenes
end

#array_contigs_to_contig(array_contigs) ⇒ Object



852
853
854
855
856
857
858
859
# File 'lib/gene_assembler/rebuild.rb', line 852

def array_contigs_to_contig(array_contigs)
  contig=Contig.new(array_contigs.first.name)
  array_contigs.each do |cn|
      contig.transfer_contig_hits(cn)
  end
  contig.length=array_contigs.last.length
  return contig
end

#build_gene_array(contigs, reference = nil) ⇒ Object

GEnera un array que representa la posicion relativa de todos los contigs entre si a nivel de los exones y de intrones



276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
# File 'lib/gene_assembler/rebuild.rb', line 276

def build_gene_array(contigs,reference=nil) #GEnera un array que representa la posicion relativa de todos los contigs entre si a nivel de los exones y de intrones
	gene_array=[]
	gene_array_introns=[]
	last_contig=''
	if !reference.nil?
		last_contig=reference
	end
	contigs.each do |contig|
			array_contig=[]
			array_contig_introns=[]
			n_exon=contig.first_hit.hsp_count #Contamos cantidad de hsps en el contig
			#Determinar posiciones vacias
			if !gene_array.empty?||reference 
				first_exon,ex=contig.compare(last_contig) #Comparamos el contig actual con el que se ha estudiado en la iteracion anterior
				if reference && first_exon==-1 # Abortar alineamiento cuando un contig no coincide con la referencia
				  if $verbose
				    puts "\n#{contig.name} alignment step OUT OF RANGE"
				  end
				  gene_array=[]
				  gene_array_introns=[]
				  break
				end
				if first_exon==-1
					gene_array.last.count.times do #Posiciones vacias cuando NO hay overlapping
						array_contig << 0 # Marca ausencia de exon para esa posicion
						array_contig_introns << 0 # Marca ausencia de intron para esa posicion
					end
				else
					if reference # ASignamiento de la posicion del contig respecto a la referencia
						void_positions=first_exon
					else
						void_positions=first_exon+gene_array.last.count(0)
					end
					void_positions.times do #Posiciones vacias cuando HAY overlapping
						array_contig << 0
						array_contig_introns << 0
					end
				end
			end
			#Agregar exones e intrones del contig
			exones=contig.exones_s
			introns=contig.intrones_q
			array_contig << exones # Marca presencia de exon para esa posicion
			array_contig_introns << introns # Marca presencia de exon para esa posicion
			gene_array << array_contig.flatten!
			gene_array_introns << array_contig_introns.flatten!
			if reference.nil?
				last_contig=contig
			end
	end
	return gene_array, gene_array_introns
end

#contig_compact(contigs) ⇒ Object

Toma un conjunto de contigs, busca los q son correlativos, los fusiona, pasa por el exonerate y devuelve un array con los nuevos contig



430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
# File 'lib/gene_assembler/rebuild.rb', line 430

def contig_compact(contigs) # Toma un conjunto de contigs, busca los q son correlativos, los fusiona, pasa por el exonerate y devuelve un array con los nuevos contig
	cn_def=[]
	cn_backup=contigs.dup
	#Determinar contigs a fusionar
	cn_to_merge=[]
	s_end=nil
	last_position_ref=nil
	position_overlap=nil
	last_contig=nil
	fusion=[]
	contigs.length.times do
		fusion << FALSE
	end
	#Marcaje de contigs correlativos no solapantes
	contigs.each_with_index do |contig,i|
		if i>0			
      diference=contig.first_hit.first_hsp.s_beg-s_end
			if diference==0 || diference==1
				fusion[i]=TRUE
			end
		end
		s_end=contig.first_hit.last_hsp.s_end
	end
	
	if fusion.include?(TRUE)
		
 		#Construccion array contigs a fusionar y guardado de los solapantes
 		fusion_contigs=[]
 		count=0	# Marca la posicion de las fusiones	
 		fusion.each_with_index do |cont,i|
 			if cont
 				if !fusion_contigs.include?(contigs[i-1])
 					fusion_contigs << contigs[i-1]
 				end
 				if !fusion_contigs.include?(contigs[i])
 					fusion_contigs << contigs[i]
 				end
 			else
 				if !fusion_contigs.empty?#Marcar fusiones
 					cn_to_merge << fusion_contigs
 					fusion_contigs=[]
 					cn_def << count
 					count+=1
 				end
 				if !fusion[i+1]||fusion[i+1].nil?#Guardar contigs que no participan en las fusiones
 					cn_def << contigs[i]
 				end
 			end
 			if i+1==fusion.length && !fusion_contigs.empty? #Control fin de bucle
 				cn_to_merge << fusion_contigs
 				cn_def << count
 				count+=1
 			end
 		end
 		
 		#Generar fasta de los contig fusionados
 		contigs_merge=contigs_seq_merge(cn_to_merge)
 		if !contigs_merge.empty?
   		temp=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.fasta'),'w')
   		contigs_merge.each_with_index do |seq,i|
   			temp.puts ">Fusion_#{i}\n#{seq}"
   		end
   		temp.close
 
   		temp_db=File.open(File.join(@path[:local],contigs.first.first_hit.name+'.db'),'w')
   		temp_db.puts ">#{contigs.first.first_hit.name}\n#{@db_seqs[contigs.first.first_hit.name]}"
   		temp_db.close
   		
     end
 		
 		#Exonerating
 		cmd="exonerate -q #{File.join(@path[:local],contigs.first.first_hit.name+'.db')} -t #{File.join(@path[:local],contigs.first.first_hit.name+'.fasta')} -Q protein -T dna -m protein2genome --percent 1 --showalignment 0 --useaatla 1 --showvulgar > #{File.join(@path[:local],contigs.first.first_hit.name+'.ex')}" #LINUX command line
 		system(cmd)
 		
 		#Parsing exonerate
 		local = ParserExonerate.new('contig','nucleotide_match', File.join(@path[:local],"#{contigs.first.first_hit.name}.ex"))
 		store_local_ex = local.dataset
 	  #store_local_ex.each_contig {|ite| puts ite.name+' '+ite.first_hit.name; ite.indices} 
 		store_local_ex.score_correction(30)
 		#puts "#{store_local_ex.contig_count}\t#{contigs_merge.length}"
 		if store_local_ex.contig_count==contigs_merge.length
 			#Recuperar atributos en contigs y cargar array con contigs def
 			store_local_ex.each_contig_with_index{|contig,i|
 				contig.seq=contigs_merge[i]
 				contig.length=contigs_merge[i].length
 				contig.first_hit.s_length=contigs.first.first_hit.s_length
 				cn_def.each_with_index do |contig_def,j| # Busqueda de la posicion de la fusion y asignacion en el array de contigs definitivos
 					if contig_def==i
 						cn_def[j]=contig
 					end
 				end
 			}
 		else
 			cn_def=cn_backup
 		end
	else
	  cn_def=contigs
	end

	return cn_def

end

#contigs_seq_merge(contigs) ⇒ Object

def



533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
# File 'lib/gene_assembler/rebuild.rb', line 533

def contigs_seq_merge(contigs) #Devuelve un array con las secuencias fusionadas a partir del array contigs donde se le proporciona los arrays a fusionar
	cn=[]
	seq=''
	contigs.each do |contigs_to_merge|
		contigs_to_merge.each do |contig|
			if seq.empty?
				seq=contig.seq
			else
				seq=seq+'n'*10+contig.seq
			end
		end
		cn << seq
		seq=''
	end
	return cn
end

#correct_model(model, length_model, gff_dataset, sequences_hash) ⇒ Object



954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
# File 'lib/gene_assembler/rebuild.rb', line 954

def correct_model(model, length_model, gff_dataset, sequences_hash)
  correct_add_Ns=0
  if model.class.to_s=='Array'
    model=array_contigs_to_contig(model)
    model.name=model.name.gsub('_0','')
    model.length=length_model
  end
  
  correct_add_Ns=gff_dataset.correct_left_side_contigs(model)
  model.modified_coordenates(correct_add_Ns)
  model.length+=correct_add_Ns
  gff_dataset.align_contigs(model)

	 #Corregir secuencia para que alinee con las features generadas
	 if correct_add_Ns>0
    sequences_hash[model.name.gsub('_model','')]='n'*correct_add_Ns+sequences_hash[model.name.gsub('_model','')]
	 end
  	
 return model
end

#eval_model(local_model, length) ⇒ Object

modifica model asi q se le ha de pasar una copia



171
172
173
174
175
176
177
178
179
180
181
# File 'lib/gene_assembler/rebuild.rb', line 171

def eval_model(local_model, length)#modifica model asi q se le ha de pasar una copia
  recover=0
  overlap=0
  if local_model.class.to_s=='Array'
    local_model=array_contigs_to_contig(local_model)
    local_model.length=length
  end
  recover=recover_test(local_model,FALSE)
  overlap=overlap_test(local_model,FALSE)    
  return recover, overlap
end

#format_model(model) ⇒ Object



944
945
946
947
948
949
950
951
952
# File 'lib/gene_assembler/rebuild.rb', line 944

def format_model(model)
  if model.n_hits?>1
    model.each_hit_with_index{|hit,i|
      hit.name=hit.name+"_gene_#{i}"
    }
  else
    model.first_hit.name=model.first_hit.name+'_gene'
  end
end

#gene_array_and_compact(rebuild, cluster, reference) ⇒ Object



221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
# File 'lib/gene_assembler/rebuild.rb', line 221

def gene_array_and_compact(rebuild,cluster,reference)
  # Contruir array de exones (a partir del cluster) con los hsps de forma que los solapantes se alineen en las mismas columnas
  #---------------------------------------------------------------------------------------------------------------------------
  if rebuild
    gene_array,gene_array_introns=build_gene_array(cluster,reference) #Con referencia
    if $verbose
      gene_exons=gene_stadistics(gene_array)
      gene_stadistics_report(gene_exons,'EXONS')
      gene_introns=gene_stadistics(gene_array_introns)
      gene_stadistics_report(gene_introns,'INTRONS')
    end
  end

  # Seleccion de contigs para modelado de gen
  #-----------------------------------------------------
  if $verbose #Info cluster before compact array contigs
    gene_array_report(gene_array,cluster,rebuild)
  end
  if rebuild
    gene_compact(gene_array,cluster) #Se descartan los contigs redundantes y quedan aquellos que cubren todo el gen para formar el modelo
  end
  if $verbose && rebuild #Info cluster after compact array contigs
    gene_array_report(gene_array,cluster,rebuild)
  end
  return cluster, gene_array
end

#gene_array_report(gene_array, contigs, act_array) ⇒ Object

Muestra el array de la funncion build_gene_array y una representacion de las secuencias



356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
# File 'lib/gene_assembler/rebuild.rb', line 356

def gene_array_report(gene_array,contigs,act_array) #Muestra el array de la funncion build_gene_array y una representacion de las secuencias
	if act_array 
		puts "\nGENE ARRAY"
		gene_array.each_with_index do |fila,c|
			print "#{contigs[c].name.center(24)}\t "
			print "#{contigs[c].completed}\t"
			fila.each do |item|
				print "#{item}\t"
			end
			puts "\n"
		end
	end
	
	puts "\nMAP"
	contigs.each do |contig|
		print "#{contig.name.center(25)}"
		print contig.draw
	end
end

#gene_compact(gene_array, contigs) ⇒ Object

Generacion modelo del gen quitando todas las secuencias redundantes posibles



376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
# File 'lib/gene_assembler/rebuild.rb', line 376

def gene_compact(gene_array, contigs) # Generacion modelo del gen quitando todas las secuencias redundantes posibles
	gene_array.each_with_index do |contig,c1|
		if !contig
			next
		end
		c1_len=contig.length
		n_exons=contig.count{|x| x>0}
		gene_array.each_with_index do |contig2,c2|
			if !contig2 ||c1==c2 #Saltamos contigs a nil o autocomparacion
				next
			end
			c2_len=contig2.length
			
			# IGUAL
			if c1_len==c2_len
				if contig2.count{|x| x>0}==n_exons
					if contigs[c1].first_hit.first_hsp.score>=contigs[c2].first_hit.first_hsp.score
						gene_array[c2]=nil
						contigs[c2]=nil
					else 
						gene_array[c1]=nil
						contigs[c1]=nil
						break
					end
				elsif contig2.count{|x| x>0}>n_exons 
					gene_array[c1]=nil
					contigs[c1]=nil
					break
				else
					gene_array[c2]=nil
					contigs[c2]=nil
				end

			# MAYOR QUE
			elsif c1_len>c2_len
				if contig.count(0)<=contig2.count(0)
					gene_array[c2]=nil
					contigs[c2]=nil
				end

			# MENOR QUE
			elsif c1_len<c2_len
				if contig.count(0)==contig2.count(0)
					gene_array[c1]=nil
					contigs[c1]=nil
					break
				end
			end
		end #end contig2
	end #end contig
	gene_array.compact!
	contigs.compact!
end

#gene_error(e, gene_name, file_error, cluster, model) ⇒ Object

e is a ruby exception object



861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
# File 'lib/gene_assembler/rebuild.rb', line 861

def gene_error(e, gene_name, file_error, cluster, model) #e is a ruby exception object
   puts gene_name+' ERROR'
   file_error.puts "\n"+gene_name+"\n.............................."
   file_error.puts e.message
   e.backtrace.each do |line|
     file_error.puts line
   end
   file_error.puts ',,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,'
   cluster.each do |contig|
     file_error.puts contig.name
     #puts contig.name
     #contig.indices
   end
   file_error.puts '----------------------------------------------------------------------------------------'
end

#gene_model_cut(contigs, reference = nil) ⇒ Object

Genera un modelo por corte y empalme de contigs, genera un gff y devuelve un array con objetos contig



550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
# File 'lib/gene_assembler/rebuild.rb', line 550

def gene_model_cut(contigs, reference=nil) #Genera un modelo por corte y empalme de contigs, genera un gff y devuelve un array con objetos contig
	q_beg=[]
	q_end=[]
	s_beg=[]
	s_end=[]
	seq=[]
	last_contig=nil
	last_score=0
	length_model=0
	multiple_lengths=[]
	add_length=TRUE
	add_last=0
	last_position_ref=nil
	last_position=nil
	lengthy=[]
	out_of_range=FALSE
	contigs.each do |contig|
		score = contig.first_hit.first_hsp.score/contig.length*contig.exon_acumulative
		n_exones = contig.first_hit.hsp_count
		
		# FIRST CONTIG
		#-------------------------------------------------------
		if last_contig.nil?
			q_end_seq=nil #SEQ
			contig.first_hit.each_hsp_with_index{|hsp,i|
				q_beg << hsp.q_beg
				q_end << hsp.q_end
				s_beg << hsp.s_beg
				s_end << hsp.s_end
				#SEQ.................................
				if i==0
					seq << contig.seq[0..contig.first_hit.first_hsp.q_end-1] 
				elsif i+1==n_exones
					seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]
				else
					seq << contig.seq[q_end_seq..hsp.q_end-1]
				end
				q_end_seq=hsp.q_end
				# ...................................
			}
			length_model+=contig.length
			if !reference.nil? #Posicionamiento del primer contig en la referencia
				last_position_ref,ex=contig.compare(reference)
				if last_position==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count #Abortar modelado en caso de qun contig no alinee con la referencia o la sobrepase
				  puts contig.name+' OUT OF RANGE'
				  out_of_range=TRUE
				  break
				end
			end
			
		# OTHER CONTIG 
		#--------------------------------------------------------	
		else
			position_overlap,ex=contig.compare(last_contig)

			#Correccion posicion del contig en base a una referencia
			if !reference.nil?
				last_position_ref,position_overlap=position_reference_guided(contig,last_contig,last_position_ref,reference)
         if last_position_ref==-1 || last_position_ref+contig.first_hit.hsp_count-1 > reference.first_hit.hsp_count
           out_of_range=TRUE
           puts contig.name+' OUT OF RANGE'
           break
         end
			end

			# NOT OVERLAP
			#..........................
			if position_overlap==-1 || contig.first_hit.hsp_count==1
				if contig.first_hit.first_hsp.s_beg-last_contig.first_hit.last_hsp.s_end>1 # Marcar discontinuidad en caso de que el contig no sea correlativo al anterior
					q_beg << 0
					q_end << 0
					s_beg << 0
					s_end << 0
					multiple_lengths << length_model
					length_model=contig.length
					last=length_model
					add_length=FALSE
					seq.last << 'n'*10 #SEQ Indicacion de GAP
				else
					last=length_model #Guardamos longitud anterior para poder desplazar las coordenadas del contig correctamente
					length_model+=contig.length
				end
 
				q_end_seq=nil #SEQ
				contig.first_hit.hsps.each_with_index do |hsp,i|
					add_no=last
					if !add_length
						add_no=0
					end
					q_beg << hsp.q_beg+add_no # Se acumula a las coordenadas la longitud del modelo
					q_end << hsp.q_end+add_no
					s_beg << hsp.s_beg
					s_end << hsp.s_end
					#SEQ.................................
					if i==0
						cn=contig.seq[0..contig.first_hit.first_hsp.q_end-1]
						cs="#{cn[0..1].swapcase!}#{cn[2..-1]}"
						seq << cs					
					elsif i+1==n_exones
						seq << contig.seq[contig.first_hit.hsps[i-1].q_end..contig.length-1]						 
					else
						seq << contig.seq[q_end_seq..hsp.q_end-1]
					end
					q_end_seq=hsp.q_end
					# ...................................		
				end
				
			# OVERLAP
			#..........................
			else
				if last_position==-1
					add_last=length_model-last_contig.length
				end
				overlap=last_contig.first_hit.hsp_count-position_overlap
				if last_contig.first_hit.hsp_count ==1
				  overlap=1
				end
         #puts "#{overlap} = #{last_contig.first_hit.hsp_count} - #{position_overlap}"
 				add=0
				dif=0
				if last_score>=score
					add=last_contig.first_hit.last_hsp.q_end-contig.first_hit.first_hsp.q_end
					dif=add
					if overlap>1 #eliminamos ultimo exon de 'last contig' para reemplazar por el segundo de 'contig' q es mas fiable por ser interno
						add=last_contig.first_hit.hsp_at(last_contig.first_hit.hsp_count-overlap).q_end-contig.first_hit.first_hsp.q_end #Como se dropea el ultimo exon se alinea por el penultimo
						#puts "hsp:#{contig.first_hit.hsp_count}\toverlap:#{overlap}"
						dif=contig.first_hit.hsp_at(overlap-1).q_end	
						q_beg=q_beg.reverse.drop(1).reverse
						q_end=q_end.reverse.drop(1).reverse
						s_beg=s_beg.reverse.drop(1).reverse
						s_end=s_end.reverse.drop(1).reverse
						seq=seq.reverse.drop(1).reverse #SEQ
					end
					if overlap==1
						overlap=2
					end
					(contig.first_hit.hsp_count-(overlap-1)).times do |n| #Añadimos el resto de exones del contig al modelo
						q_beg << contig.first_hit.hsp_at(n+overlap-1).q_beg+add+add_last
						q_end << contig.first_hit.hsp_at(n+overlap-1).q_end+add+add_last
						s_beg << contig.first_hit.hsp_at(n+overlap-1).s_beg
						s_end << contig.first_hit.hsp_at(n+overlap-1).s_end
						#SEQ.......................................
						position_hsp=n+overlap-2
						if position_hsp <0
							position_hsp= 0
						end
						position_next_hsp=n+overlap-1
						if position_next_hsp < 0
							position_next_hsp =0
						end

						if n==0	
							cn=contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
							cs=cn[0..1].swapcase!+cn[2..-1] 
							seq << cs
						elsif position_next_hsp==contig.first_hit.hsp_count-1
							seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.length-1]
						else
							seq << contig.seq[contig.first_hit.hsp_at(position_hsp).q_end..contig.first_hit.hsp_at(position_next_hsp).q_end-1]
						end
						#............................................
 					end
				else
					hsp_position=last_contig.first_hit.hsp_count-2
					if hsp_position<0 #para los casos de los contigs q solo poseen un hsp
						hsp_position=0
					end
					add=last_contig.first_hit.hsp_at(hsp_position).q_end
					dif=last_contig.length-add
					drop=1
					correction=0
					if overlap>1
						drop=overlap-1
						correction=1
						add=last_contig.first_hit.hsp_at(position_overlap).q_end-contig.first_hit.first_hsp.q_end
						dif=length_model-(add+add_last)
					end				
					# Eliminamos exones malos de 'last_contig' (mantenemos el primero del overlap)					
					q_beg=q_beg.reverse.drop(drop).reverse
					q_end=q_end.reverse.drop(drop).reverse
					s_beg=s_beg.reverse.drop(drop).reverse
					s_end=s_end.reverse.drop(drop).reverse
					seq=seq.reverse.drop(drop).reverse #SEQ

					# Añadimos los exones de 'contig' (excepto el primero)
					(contig.first_hit.hsp_count-correction).times do |n| #Añadimos el resto de exones del contig al modelo
						q_beg << contig.first_hit.hsp_at(n+correction).q_beg+add+add_last
						q_end << contig.first_hit.hsp_at(n+correction).q_end+add+add_last
						s_beg << contig.first_hit.hsp_at(n+correction).s_beg
						s_end << contig.first_hit.hsp_at(n+correction).s_end
						#SEQ.............................................................
						if n+1==(contig.first_hit.hsp_count-correction)
							n_correction=n+correction-1
							if n_correction < 0
								n_correction=0
							end
							seq << contig.seq[contig.first_hit.hsp_at(n_correction).q_end..contig.length-1]
						elsif n==0
							if n+correction==0
								cn=contig.seq[0..contig.first_hit.hsp_at(n+correction).q_end-1] # Si n+corr empieza en el primer exon del contig
							else
								cn=contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
							end
							cs=cn[0..1].swapcase!+cn[2..-1] 
							seq << cs
						else
							seq << contig.seq[contig.first_hit.hsp_at(n+correction-1).q_end..contig.first_hit.hsp_at(n+correction).q_end-1]
						end
						#................................................................ 
					end
				end
				length_model+=(contig.length-dif)
				add_length=TRUE
				add_last+=add
			end
		end
		last_position=position_overlap
		last_contig=contig
		last_score=score
		lengthy << length_model
	end
	if !multiple_lengths.empty?
		multiple_lengths << length_model
		length_model=multiple_lengths
	end
	
	model=nil
	if !out_of_range #Generar modelo si todos los contigs han alineado con la referencia
     model=void_contig(contigs.first.first_hit.name+'_model',length_model,contigs.first.first_hit.s_length,q_beg,q_end,s_beg,s_end,'contig','gene','exon')
     #Merge contigs under sequence reference
     model_length=nil
     if model.class.to_s=='Array'
       add=0
       model.each_with_index do |contig,i|
         contig.modified_coordenates(add)
         add+=contig.length
         if i<model.length-1
           add+=10
         end
       end
       model_length=add+model.last.length
     end
   
     model_n_exones=seq.length
   	final_seq=seq.join
 else #No generar modelo si al menos un contig no alinea contra la referencia
      model_length=nil
      final_seq=nil
   end

	return model, model_length, final_seq
end

#gene_stadistics(gene_array) ⇒ Object

Calcula el nº exones diferentes que hay por cada posicion del gene_array



329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
# File 'lib/gene_assembler/rebuild.rb', line 329

def gene_stadistics(gene_array) #Calcula el nº exones diferentes que hay por cada posicion del gene_array
	exons=[]
	length=length2D(gene_array)
	length.times do |column|
		exon=[]
		gene_array.each_with_index.each do |item,row|
			if !exon.include?(gene_array[row][column]) && gene_array[row][column]!=0
				exon << gene_array[row][column]
			end
		end
		exons << exon
	end
	exons_stadistic=[]
	exons.each do |ex|
		exons_stadistic << ex.compact.length
	end
	return exons_stadistic	
end

#gene_stadistics_report(exons_stadistic, tag) ⇒ Object

Muestra estadisticas de intrones o exones



348
349
350
351
352
353
354
# File 'lib/gene_assembler/rebuild.rb', line 348

def gene_stadistics_report(exons_stadistic,tag) #Muestra estadisticas de intrones o exones
	print "\n#{tag}\t"
	exons_stadistic.each do |item|
		print "#{item}\t"
	end
	print "\n"
end

#iterative_modeling_gene_w_reference(cluster, references_hash, options, sequences_hash) ⇒ Object

end main method



93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
# File 'lib/gene_assembler/rebuild.rb', line 93

def iterative_modeling_gene_w_reference(cluster,references_hash,options,sequences_hash)
  # Model atributes
  model=nil
  length=0
  seq=nil
  cluster_length=0
  length_cluster=0
  prot_reference=cluster.first.first_hit.name
  array_references=references_hash[prot_reference]

  # Model parameters
  recover=0
  overlap=0

  #Modelo de gen en ciego
  if $verbose
    puts "\n",'||||||||||  BLIND MODELING  ||||||||||'
  end
  model, length, seq, length_cluster= modeling_gene(cluster.dup,nil,options)

  recover, overlap=eval_model(model.dup, length)

  if $verbose
    puts "\nRecover: #{recover} Overlap: #{overlap}"
  end
  
  guided=FALSE
  #Modelo de gen guiado
  if !array_references.nil?
    array_references.each do |ref|
      if $verbose
        puts "\n",'||||||||||  GUIDED MODELING  ||||||||||'
      end

      guided_model, guided_length, guided_seq, guided_length_cluster = modeling_gene(cluster.dup,ref,options)
      if guided_model.nil? # Si algun modelo sale mal se ignora
        next
      end
      guided_recover, guided_overlap= eval_model(guided_model.dup, guided_length)
      if $verbose
        puts "\nRecover: #{guided_recover} Overlap: #{guided_overlap}"
      end
      
      #Arbol de decisiones
      if guided_overlap <= 15 #Si el overlap es menor del 15 %
        if guided_overlap >= overlap-overlap*0.05 && guided_overlap <= overlap+overlap*0.05 # A mismo overlap
          if guided_recover > recover
            guided=TRUE
          end        
        else # A distinto overlap
          recover_dif=guided_recover-recover
          if recover_dif < 0  # Si el guided_model tiene menos recuperacion q el anterior
            if recover_dif.abs >= overlap-overlap*0.05 && recover_dif.abs <= overlap+overlap*0.05 #Si la reduccion de la recuperacion se debe a la desaparicion del overlap
              guided=TRUE
            end
          elsif recover_dif> guided_overlap+guided_overlap*0.05 # Comprobar que la diferencia de recover no se debe a u aumento del overlap en la misma magnitud
            guided=TRUE      
          end
        end
      elsif guided_overlap < overlap # Quedarnos siempre con los overlap mas bajos aun en situacion de overlap alto
        guided=TRUE
      end
      
      if guided
        model=guided_model
        length=guided_length
        seq=guided_seq
        length_cluster=guided_length_cluster
        recover=guided_recover
        overlap=guided_overlap
      end
    end
  end
  
  sequences_hash[prot_reference]=seq    
  return model, length, length_cluster
end

#modeling_gene(cluster, reference, rebuild) ⇒ Object

Funcion que devuelve un objeto contig con el modelo de gen, los contigs q se han seleccionado y genera un gff del modelo



183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
# File 'lib/gene_assembler/rebuild.rb', line 183

def modeling_gene(cluster, reference, rebuild) #Funcion que devuelve un objeto contig con el modelo de gen, los contigs q se han seleccionado y genera un gff del modelo
	model=nil
	model_length=nil
	seq=nil
	length_cluster=0
	# Reduccion iterativa de los contig para seleccionar los que van a formar parte del modelo de gen, elimina fragmentos menores que se puedan tomar como nuevos exones
	#--------------------------------------------------------------------------------------------------
	gene_array_length_before=nil
	continue=TRUE
	gene_array=[]
		
	while continue
	      cluster,gene_array=gene_array_and_compact(rebuild,cluster,reference)
	      gene_array_length_after=length2D(gene_array)
	      if gene_array_length_after == gene_array_length_before
	        continue=FALSE
	      end
	      gene_array_length_before=gene_array_length_after
	end
   		length_cluster=cluster.length

	# Modelado del gen
	#----------------------------------------------------
	if rebuild && !cluster.empty? && !gene_array.empty?
		if cluster.length >1
			cluster_comp=contig_compact(cluster) #Fusiona contigs contiguos y devuelve el array correspondiente
		else
			cluster_comp=cluster
		end
		if !cluster_comp.nil?
			model, model_length, seq=gene_model_cut(cluster_comp, reference)
		else
			puts cluster.first.first_hit.name+"\tGENE MODEL ABORTED"
		end
	end
	return model, model_length, seq, length_cluster
end

#overlap_test(model, output = TRUE) ⇒ Object



877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
# File 'lib/gene_assembler/rebuild.rb', line 877

def overlap_test(model,output=TRUE)
   perc_overlap=0
   overlap=model.overlap
   total=0
   if !overlap.empty?
     if output
       print 'WARNING: overlap/s '
     end
     overlap.each do |length_overlap|
       if output
         print (length_overlap*-3).to_s+', '
       end
       total+=length_overlap
     end
     perc_overlap=(total*-100.0/model.first_hit.s_length).round(2)
     if output
       puts 'nt. % Total overlap '+perc_overlap.to_s
     end
   end
   return perc_overlap    
end

#position_reference_guided(contig, last_contig, last_position_ref, reference) ⇒ Object

Si no existe overlap devuelve -1



840
841
842
843
844
845
846
847
848
849
850
# File 'lib/gene_assembler/rebuild.rb', line 840

def position_reference_guided(contig,last_contig,last_position_ref,reference)# Si no existe overlap devuelve -1
	position_ref,ex=contig.compare(reference)
	if !last_position_ref.nil?
		if position_ref<=last_position_ref+(last_contig.first_hit.hsp_count-1) #Overlap
			position_overlap=(last_position_ref-position_ref).abs
		else #No overlap
			position_overlap=-1
		end
	end
	return position_ref,position_overlap
end

#rebuild(options) ⇒ Object

MAIN METHOD



17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
# File 'lib/gene_assembler/rebuild.rb', line 17

def rebuild(options) #Genera contigs modelo,gff y busca pseudogenes
	gff_dataset_model=Dataset.new(:mix) #Object for save info of GeneAssembler's output
	gff_dataset=Dataset.new(:mix) #Object for save info of GeneAssembler's output
	file_error=File.open(@path[:error],'w')
	file_web=web_header(options[:web],@path[:html])
	sequences_hash={}
	gene_name=nil
	model=nil
	statistics={:genes => 0, :total_recovered => 0, :total_overlap => 0, :total_fragmentation => 0}
	puts "\nMODELING GENE",'*******************************************'
	@dataset.each_cluster{|cluster|
		begin
			if !cluster.nil?
				gene_name=cluster.first.first_hit.name
			end
			cluster_complete=cluster.dup
			model, length_model, length_cluster = iterative_modeling_gene_w_reference(cluster,@dataset.references_hash,options[:rebuild],sequences_hash) #Realiza la reconstruccion del gen (alineado,descarte y montaje del gen)

  			# GeneAssembler output (gff for Gbrowse)
  			#--------------------------------------------------------
  			if !model.nil?
  				#Format Contigs children of model				
  				gff_dataset.clr_contigs
  				gff_dataset.transfer_contigs(cluster_complete)
  				gff_dataset.transfer_n_contigs_def_hit_type(@dataset_uni_hsp,cluster,'pseudogene',50) #Transferir pseudogenes al report
  				
				# Convertir arrays a contig y ajustar alineamiento añadiendo Ns
  				model=correct_model(model, length_model, gff_dataset, sequences_hash)

  				# Comprobaciones en el modelo
  				exones=model.exones_s.length # N exones
  				puts 'Exones: '+ exones.to_s
  				recovered=recover_test(model)
  				overlap=overlap_test(model)
  				fragmentation=((length_cluster-1.00)/exones).round(2)
  				puts 'Fragmentation: ' + fragmentation.to_s
 				
  				# HTML index
  				if !file_web.nil?
  					gene_link=html_link(model.first_hit.name, @path[:gbrowse_link]+model.first_hit.name)
  					html_row(file_web, [gene_link, cluster.first.first_hit.s_length, exones, recovered, overlap, fragmentation])
				end
 			
  				#Format Model for Gbrowse
  				gff_dataset_model.clr_contigs
  				format_model(model) #Añade la particula _gene al modelo
  				gff_dataset_model.transfer_contigs(model)
 				
  				#Write
  				write_gbrowse_gff(gff_dataset_model, gff_dataset, @path[:gff], model.name)
 				
  				#General statistics
  				statistics[:genes]+=1
  				statistics[:total_recovered]+=recovered
  				statistics[:total_overlap]+=overlap
  				statistics[:total_fragmentation]+=fragmentation
  			end
		rescue Exception => e
			gene_error(e, gene_name, file_error, cluster_complete, model)
		end
		puts '* * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * * *'		
	}
	file_error.close
	web_body(file_web)

	puts	"\nFINAL STATISTICS\n",
		'Recovered genes: '+ statistics[:genes].to_s,
		'Mean recover: ' + (statistics[:total_recovered]/statistics[:genes]).to_s,
		'Mean overlap: ' + (statistics[:total_overlap]/statistics[:genes]).to_s,
		'Mean fragmentation: ' + (statistics[:total_fragmentation]/statistics[:genes]).to_s
	write_model_fasta(sequences_hash,@path[:fasta])
end

#recover_test(model, output = TRUE) ⇒ Object



899
900
901
902
903
904
905
906
907
908
909
# File 'lib/gene_assembler/rebuild.rb', line 899

def recover_test(model,output=TRUE)
	recovered=0
	model.exones_s.each do |exon|
		recovered+=exon
	end		
   recovered=(recovered*100.0/model.first_hit.s_length).round(2)
   if output
	  puts "Recovered\t"+model.first_hit.name+"\t#{recovered}"
	end
	return recovered
end

#void_contig(contig_name, contig_length, s_length, q_beg, q_end, s_beg, s_end, contig_type, hit_type, hsp_type, single = FALSE) ⇒ Object

Genera un objeto contig con los datos proporcionados



803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
# File 'lib/gene_assembler/rebuild.rb', line 803

def void_contig(contig_name,contig_length,s_length,q_beg,q_end,s_beg,s_end,contig_type,hit_type,hsp_type,single=FALSE) #Genera un objeto contig con los datos proporcionados
	contigs=[]
	is_contig=1
	contig=nil
	n=0
	q_beg.each_with_index do |item,ind|
		if item>0 ||single
			if contig==nil
				if contig_length.class.to_s=='Array'
					length=contig_length[n]
					name="#{contig_name}_#{n}"
				else
					length=contig_length
					name=contig_name
				end
				contig=Contig.new(name)
				contig.length=length
				contig.type=contig_type
				hit_v=contig.add_hit(contig_name,s_length,1,:prot)
				hit_v.type=hit_type
			end
			hsp_v=contig.first_hit.add_hsp(q_beg[ind], q_end[ind], s_beg[ind], s_end[ind], 0, 0, 0, 0)
			hsp_v.type=hsp_type
		end
		if item==0 && contig!=nil && !single||q_beg.length-1==ind
			if single ||!q_beg.include?(0)
				contigs=contig
			else
				contigs << contig
			end
			n+=1
			contig=nil
		end
	end
	return contigs
end

#web_body(file_web) ⇒ Object



921
922
923
924
925
926
927
# File 'lib/gene_assembler/rebuild.rb', line 921

def web_body(file_web)
  if !file_web.nil?
    html_table_footer(file_web)
    html_footer(file_web)
    file_web.close
  end
end

#web_header(web, path) ⇒ Object



911
912
913
914
915
916
917
918
919
# File 'lib/gene_assembler/rebuild.rb', line 911

def web_header(web, path)
  file_web=nil
  if web
    file_web=File.open(path,'w')
    html_header(file_web,'Gene index')
    html_table_header(file_web,1,['Gene model name', 'Protein length', 'Num exon', '% recovered protein', '% overlapping sequence', 'Fragmentation'])
  end
  return file_web
end

#write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name) ⇒ Object



937
938
939
940
941
942
# File 'lib/gene_assembler/rebuild.rb', line 937

def write_gbrowse_gff(gff_dataset_model, gff_dataset, path, name)
  gff_model=ReportGff.new(gff_dataset_model,path,'s')
  gff_model.create('a')
  gff=ReportGff.new(gff_dataset,path,'s')
  gff.create('a',name)
end

#write_model_fasta(sequences_hash, path) ⇒ Object



929
930
931
932
933
934
935
# File 'lib/gene_assembler/rebuild.rb', line 929

def write_model_fasta(sequences_hash, path)
  model_file=File.open(path,'w')
  sequences_hash.each do |model|
    model_file.puts '>'+model[0]+"_model\n"+model[1]
  end
  model_file.close
end