From b0ef93bb713c134b66be14fdbe0baed692e8fa61 Mon Sep 17 00:00:00 2001
From: Graham McVicker <mcvicker@sanger.ac.uk>
Date: Fri, 20 Dec 2002 16:47:45 +0000
Subject: [PATCH] removal of no longer needed EMBL loading

---
 .../embl_annotation/extract_curated.pl        | 399 ------------------
 1 file changed, 399 deletions(-)
 delete mode 100755 misc-scripts/embl_annotation/extract_curated.pl

diff --git a/misc-scripts/embl_annotation/extract_curated.pl b/misc-scripts/embl_annotation/extract_curated.pl
deleted file mode 100755
index 99860c25f7..0000000000
--- a/misc-scripts/embl_annotation/extract_curated.pl
+++ /dev/null
@@ -1,399 +0,0 @@
-#!/usr/local/bin/perl
-
-# script to extract annotation from EMBL files and insert it into an
-# ensembl database
-
-use strict;
-
-use Getopt::Long;
-
-use Bio::EnsEMBL::EMBLLOAD::Obj;
-use Bio::EnsEMBL::DBLoader;
-use Bio::SeqIO;
-use Bio::EnsEMBL::DBSQL::DBEntryAdaptor;
-
-# embl genes will be written to this database:
-my $ohost='localhost';
-my $ouser='root';
-my $odbname='embl_test';
-my $pass='';
-
-# list of clones in ensembl for which an embl file should be
-# checked is read from here
-my $ihost='localhost';
-my $iuser='root';
-my $idbname='';
-
-# test file
-my $emblfile='AL109928.embl';
-my $write;
-my $verbose;
-my $list;
-my $delete;
-my $clone_id;
-my $processed_clone_list='extract_curated.processed.lis';
-my $log='extract_curated.log.err';
-my $max_clone;
-my $nodb;
-my $help;
-&GetOptions(
-
-	    'ohost:s'=>\$ohost,
-	    'ouser:s'=>\$ouser,
-	    'odbname:s'=>\$odbname,
-
-	    'ihost:s'=>\$ihost,
-	    'iuser:s'=>\$iuser,
-	    'idbname:s'=>\$idbname,
-
-	    'emblfile:s'=>\$emblfile,
-
-	    'w'=>\$write,
-	    'v'=>\$verbose,
-	    'l'=>\$list,
-	    'd'=>\$delete,
-	    'c:s'=>\$clone_id,
-	    'log:s'=>\$log,
-	    'processed:s'=>\$processed_clone_list,
-	    'max:n'=>\$max_clone,
-	    'nodb'=>\$nodb,
-	    'help|h'=>\$help,
-	     );
-
-if($help){
-    print << "ENDOFTEXT";
-extract_curated.pl
-
-  -v               verbose
-  -w               write gene structures to output database
-
-  -d               delete (used with mode 2 to remove entries from database - bugs)
-  -c clone_id      clone_id override (to replace id read from embl file by id that will be
-                   used to name genes and write to database)
-
-  -log file        log file for stack trace of fatal errors (mode 1) [$log]
-  -processed file  file of all previously processed clones that have not been written to
-                   database - read before and written after processing (mode 1)
-  -max number      number of clones to process (mode 1)
-  -nodb            allow script to run without presence of database (testing only)
-
-Two sources of data to process:
-  -l               select list mode (mode 1)
-
-1. process all entries in 'clone' table of input database that are not
-   present in corresponding table of output database or in 'processed' log file.
-  -ihost hostname  hostname of input database [$ihost]
-  -iuser username  username of input database [$iuser]
-  -idbname dbname  name of input database [$idbname]
-
-2. read a single file:
-  -emblfile file   specify file for processing [$emblfile]
-
-
-  -ohost hostname  hostname of input database [$ohost]
-  -ouser username  username of input database [$ouser]
-  -odbname dbname  name of input database [$odbname]
-
-  -h               help
-ENDOFTEXT
-    exit;
-}
-
-# use -d -c clone to force a delete
-
-my $nunfin=0;
-my $nnoembl=0;
-my $nwrongsv=0;
-my $nclonefail=0;
-my $nclonenogene=0;
-my $nclonegene=0;
-my $ngeneok=0;
-
-my %processed;
-my $logging;
-
-my $gene_map_file=">txt/gene_stable_id.txt";
-my $transcript_map_file=">txt/transcript_stable_id.txt";
-my $exon_map_file=">txt/exon_stable_id.txt";
-my $translation_map_file=">txt/translation_stable_id.txt";
-
-system("mkdir txt");
-open(GENE_MAP,$gene_map_file) || die "cant open $gene_map_file";
-open(TRANSCRIPT_MAP,$transcript_map_file) || die "cant open $transcript_map_file";
-open(EXON_MAP,$exon_map_file) || die "cant open $exon_map_file";
-open(TRANSLATION_MAP,$translation_map_file) || die "cant open $translation_map_file";
-
-# connect to db for writing
-my $db;
-my $adx;
-unless($nodb){
-    $db = Bio::EnsEMBL::DBLoader->new(
-       "Bio::EnsEMBL::DBSQL::DBAdaptor/host=$ohost;user=$ouser;dbname=$odbname;pass=$pass");
-    # adaptor for xrefs
-    $adx = Bio::EnsEMBL::DBSQL::DBEntryAdaptor->new($db);
-}
-
-my $dbi;
-if($list){
-
-    # open log
-    if($write){
-	open(LOG,">$log") || die "cannot open $log";
-	$logging=1;
-    }
-
-    # load current list of processed clones
-    if(open(IN,$processed_clone_list)){
-	my $n=0;
-	while(<IN>){
-	    if(/^(\S+)\s+(\S+)/){
-		$processed{$1}=$2;
-		$n++;
-	    }
-	}
-	close(IN);
-	print "$n processed clones read from $processed_clone_list\n";
-    }else{
-	print "WARN: no $processed_clone_list file found\n";
-    }
-
-    $dbi=Bio::EnsEMBL::DBLoader->new(
-    "Bio::EnsEMBL::DBSQL::DBAdaptor/host=$ihost;user=$iuser;dbname=$idbname");
-
-    my $nclone=0;
-    foreach my $clone_id ($dbi->get_all_Clone_id){
-
-	# has clone already been processed?
-	if($processed{$clone_id}){
-	    next;
-	}
-
-	$nclone++;
-	last if($max_clone && $nclone>$max_clone);
-
-	# is clone in one contig (finished)?
-	my @contig;
-	my $clone;
-	eval {
-	$clone=$dbi->get_Clone($clone_id);
-    };
-	eval{
-	    (@contig)=$clone->get_all_Contigs;
-	};
-	if($@){
-	    print "Contig error in Database for $clone_id:\n$@\n\n";
-	    next;
-	}elsif(scalar(@contig)>1){
-	    print "$clone_id is in ".scalar(@contig)." contigs - skip\n";
-	    $processed{$clone_id}='unfinished';
-	    $nunfin++;
-	    next;
-	}
-
-	my $version=$clone->embl_version;
-
-	# is clone in output database?
-#	eval{
-#	    my $clone=$db->get_Clone($clone_id);
-#	};
-	#if($@){
-	    print "[$nclone] $clone_id not yet parsed ".
-		"[$nunfin; $nnoembl; $nwrongsv; $nclonefail; ".
-		    "$nclonenogene; $nclonegene; $ngeneok]\n";
-	    my $seqio;
-	    eval{
-		$seqio = Bio::SeqIO->new(-format => 'EMBL', -file => 
-		    "/usr/local/pubseq/bin/efetch emnew:$clone_id |");
-	    };
-	    if($@){
-		print "$clone_id could not be fetched from EMBL - warn\n";
-		$processed{$clone_id}='embl_failed';
-		$nnoembl++;
-	    }
-	    &_process_file($db,$seqio,$verbose,$clone_id,$version);
-	#}
-	
-	#	else{
-	#	    print "$clone_id already parsed - skip\n";
-	# 	}
-
-    }
-
-    # write current list of checked, empty clones
-    if($write){
-	rename($processed_clone_list,"$processed_clone_list.old");
-	if(open(OUT,">$processed_clone_list")){
-	    foreach my $clone_id (keys %processed){
-		print OUT "$clone_id ".$processed{$clone_id}."\n";
-	    }
-	    close(OUT);
-	}else{
-	    print "ERROR: failed writing to $processed_clone_list file\n";
-	}
-	close(LOG);
-    }
-
-}else{
-
-    # manual loading from EMBL file - for testing only
-
-    if(!$clone_id && $emblfile=~/^(\w+)/){
-	$clone_id=$1;
-    }
-
-    # delete this clone
-    if($delete){
-
-	# remove all genes
-	foreach my $gene_id ($db->gene_Obj->get_all_Gene_id()){
-	    $db->gene_Obj->delete($gene_id);
-	}
-	
-	my $clone=$db->get_Clone($clone_id);
-
-	# remove clone
-	$clone->delete_by_dbID;
-	exit 0;
-    }
-
-    # link to file to be read
-    my $seqio = Bio::SeqIO->new(-format => 'EMBL',-file => $emblfile);
-
-    # version passed as 0 so it is not checked
-    &_process_file($db,$seqio,$verbose,$clone_id,0);
-
-}
-
-# loop over entries in file
-sub _process_file{
-    my($db,$seqio,$verbose,$clone_id,$version)=@_;
-
-    for(;;){
-
-	my $seq;
-	eval{
-	    $seq = $seqio->next_seq;
-	};
-	if($@){
-	    print "Problem accessing EMBL file for $clone_id\n$@\n\n";
-	    next;
-	}
-	if(!$seq){
-	    last;
-	}
-
-	# check SV
-	my $sv=$seq->seq_version;
-	if($sv=~/^(\d+)$/){
-	    $sv=$1;
-	    if($version && $1!=$version){
-		print "Sequence versions are different: Ensembl:$version; EMBL:$1\n";
-		$processed{$clone_id}="embl_version:ENSEMBL:$version:EMBL:$1";
-		$nwrongsv++;
-		next;
-	    }else{
-		print "Sequence versions are same: $version\n";
-	    }
-	}else{
-	    print "SV could not be parsed |$sv|\n";
-	    next;
-	}
-
-	# get sequence from embl entry
-	my $obj = Bio::EnsEMBL::EMBLLOAD::Obj->new(-seq => $seq);
-
-	# HACK HACK HACK
-	# there is only one clone to get, so don't need an id, but
-	# need to set the id afterwards since it is wrong (embl_id not accession)
-	my $clone = $obj->get_Clone();
-	$clone->id($clone_id);
-	
-	# only for finished sequence, so only one contig
-	my $contig;
-	($contig) = $clone->get_all_Contigs();
-
-	my @genes;
-	eval{
-	    @genes=($contig->get_all_Genes);
-	};
-	if($@){
-	    print "Problems processing clone $clone_id\n\n$@\n\n";
-	    if($logging){
-		print LOG "$clone_id\n$@\n\n";
-	    }else{
-		print "$@\n\n";
-	    }
-	    $nclonefail++;
-	    next;
-	}
-
-	# loop over entries to get genes and insert them into database
-	my $ngene=scalar(@genes);
-	if($ngene){
-	    $nclonegene++;
-
-	    # only write clone entry into database if there are genes
-	    if($write){
-		eval{
-		    $db->write_Clone($clone);
-		};
-		if($@){
-		    print "Failed to write clone $clone_id\n";
-		}
-	    }
-
-	}else{
-	    # processing ok, but no genes
-	    $processed{$clone_id}='no_genes';
-	    $nclonenogene++;
-	}
-	$ngeneok+=$ngene;
-	print "parsed $ngene ok [$ngeneok]\n";
-
-	foreach my $gene ( @genes ) {
-	    
-	    # writeout information about gene
-	    if($verbose){
-		my $gene_id=$gene->stable_id;
-		
-		foreach my $transcript ($gene->each_Transcript){
-		    my $transcript_id=$transcript->stable_id;
-		   
-		}
-	    }
-	    
-	    # write gene to database
-	    if($write){
-		eval{
-		    $db->write_Gene($gene);
-
-		    print STDERR "DB ID >........................................".$gene->dbID,"\t",$gene->stable_id,"\n";
-
-		    print GENE_MAP $gene->dbID,"\t",$gene->stable_id,"\n";
-		    foreach  my $transcript ($gene->each_Transcript){
-			print TRANSCRIPT_MAP $transcript->dbID,"\t",$transcript->stable_id,"\n";
-                        print TRANSLATION_MAP $transcript->translation->dbID,"\t",$transcript->translation->stable_id,"\n";
-
-			foreach my $exon ($transcript->get_all_Exons){
-			    print EXON_MAP $exon->dbID,"\t",$exon->stable_id,"\n";
-			}
-
-			foreach my $dbentry ($transcript->each_DBLink){
-			    # attach adapter
-			    $dbentry->adaptor($adx);
-			    print STDERR "Storing ",$transcript->stable_id," ",$dbentry->primary_id,"\n";
-			    $adx->store($dbentry,$transcript->stable_id,'Transcript');
-			}
-		    }
-		};
-		if($@){
-		    print "Failed to write gene ".$gene->stable_id."\n\n".$@."\n";
-		}
-	    }
-	}
-    }
-}
-
-
-
-
-- 
GitLab