Skip to content
Snippets Groups Projects
Commit ff2bc08a authored by Ian Longden's avatar Ian Longden
Browse files

get data directly from biomart

parent f1016409
No related branches found
No related tags found
No related merge requests found
package XrefParser::IKMCParser;
use strict;
use LWP::UserAgent;
use base qw( XrefParser::BaseParser );
# This parser will read Direct Xrefs from a simple tab-delimited file.
# The columns of the file should be the following:
#
# 1) Accession ID
# 2) label
# 3) source type
# 4) stable_id
#
sub new {
my $proto = shift;
......@@ -22,79 +15,181 @@ sub new {
return $self;
}
# NOTE(review): this span reads like a rendered commit diff in which the
# removed lines (an older, file-based `run`) and the added lines (a newer,
# BioMart-based `run_script`) are interleaved without +/- markers. It is not
# one coherent subroutine as shown; the comments below describe what each
# visible fragment does and which variant it appears to belong to.
#
# BioMart variant, as far as the visible code shows: POST an XML query to the
# i-dcc.org martservice, consume the TSV answer chunk by chunk, remember for
# every MGI accession its symbol, Ensembl gene id and highest product status,
# then add one xref per accession plus a direct xref when an Ensembl gene id
# is present. The final statement returns 0.
sub run {
my $self = shift;
sub run_script {
# NOTE(review): perlsyn documents `my ... if ...` (a `my` declaration with a
# statement modifier) as undefined behaviour; confirm the intent here.
my $self = shift if (defined(caller(1)));
# Positional arguments of what appears to be the older `run` variant.
my $source_id = shift;
my $species_id = shift;
my $files_ref = shift;
my $rel_file = shift;
my $verbose = shift;
# Only the first entry of the file list is used.
my $filename = @{$files_ref}[0];
my $file_io = $self->get_filehandle($filename);
# Return 1 when no filehandle could be obtained.
if ( !defined($file_io) ) {
return 1;
}
# Positional arguments of what appears to be the newer `run_script` variant.
# $source_id, $species_id and $verbose are declared a second time here.
my $file = shift;
my $source_id = shift;
my $species_id = shift;
my $verbose = shift;
my $parsed_count = 0;
# $file is split on ':' into a type and an argument string; neither part is
# used again in the visible code.
my ($type, $my_args) = split(/:/,$file);
printf( STDERR "source = %d\t species = %d, file is %s\n",
$source_id, $species_id, $filename );
# Maps a human-readable product status to the id of the source named
# "IKMC_<status with spaces replaced by underscores>".
my %type2id;
my %type2id;
# Two loop headers are visible (same four statuses, different order); only
# one of them can belong to any single version of the file.
foreach my $t ("ES cells available", "Vector available", "No products available yet", "Mice available"){
foreach my $t ("No products available yet", "Vector available", "ES cells available", "Mice available"){
my $ikmc = "IKMC_".$t;
$ikmc =~ s/ /_/g;
$type2id{$t} = XrefParser::BaseParser->get_source_id_for_source_name($ikmc);
print $ikmc."\t".$type2id{$t}."\n";
# print $ikmc."\t".$type2id{$t}."\n";
# A status without a source id is fatal.
if(!defined( $type2id{$t})){
die "Could not get source id for $ikmc\n";
}
}
# BioMart query: TSV formatter, header = "0", uniqueRows = "1", six attributes
# of the "dcc" dataset. The field indices used further down (0..5) match the
# attribute order listed here -- presumably the service returns the columns
# in that order; TODO confirm.
my $xml = (<<XXML);
<?xml version="1.0" encoding="UTF-8"?>
<!DOCTYPE Query>
<Query virtualSchemaName = "default" formatter = "TSV" header = "0" uniqueRows = "1" count = "" datasetConfigVersion = "0.6" >
<Dataset name = "dcc" interface = "default" >
<Attribute name = "mgi_accession_id" />
<Attribute name = "marker_symbol" />
<Attribute name = "vector_available" />
<Attribute name = "escell_available" />
<Attribute name = "mouse_available" />
<Attribute name = "ensembl_gene_id" />
</Dataset>
</Query>
XXML
# print $xml."\nYO\n";
# Per-accession results collected while streaming, keyed by MGI accession.
my %symbols;
my %ensembl_ids;
my %status;
my $path="http://www.i-dcc.org/biomart/martservice?";
# The request body is the literal text "query=" followed by the XML, sent
# with an empty header set.
# NOTE(review): the XML is not URL-encoded and no Content-Type is set --
# confirm the service accepts the body in this form.
my $request = HTTP::Request->new("POST",$path,HTTP::Headers->new(),'query='.$xml."\n");
my $ua = LWP::UserAgent->new;
# This outer $response is never assigned in the visible code; the callback
# below declares its own $response.
my $response;
# print "getting data from url\n";
my $line_count=0;
# Text carried over from the previous chunk (its last, possibly partial, line).
my $old_data="";
# Number of chunks handled so far.
my $chunks = 0;
my $before;
# The anonymous sub is passed as the content callback and 1000 as the third
# argument (a read-size hint according to the LWP::UserAgent documentation).
# The return value of request() is discarded.
$ua->request($request,
sub{
my($data, $response) = @_;
if ($response->is_success) {
chomp $data;
# If this is not the first chunk and it begins with "MGI:", a newline is put
# back between the carried-over text and the new data -- presumably because
# the carried-over text was then a complete line; verify.
if($data =~ /^MGI:/ and $chunks){
$old_data .= "\n";
}
my $data_line= $old_data.$data;
my @lines = split(/\n/,$data_line);
# Discard an empty final element, if there is one.
if(length($lines[-1]) == 0){
pop @lines;
}
$old_data = "";
my $count=0;
$chunks++;
my $max= scalar(@lines);
foreach my $entry (@lines){
$count++;
my @fields = split(/\t/,$entry);
next if (!length($entry));
# The last line of every chunk is held back instead of being processed; it
# is prepended to the next chunk, or handled after the request finishes.
if($count == $max){ # possible incomplete line
$old_data = $entry;
next;
}
# Defensive branch: $count is incremented once per element of @lines, so it
# does not exceed $max inside this loop.
elsif($count > $max){
die "What the celery is going on here";
}
else{
$line_count++;
my $mgi_id = $fields[0];
# A first field without "MGI:" is only reported; the row is still stored.
if(!($mgi_id =~ /MGI:/)){
print "PROB1:$data_line\n";
print "PROB2:".join(', ',@fields)."\n";
}
$symbols{$mgi_id}=$fields[1];
$ensembl_ids{$mgi_id}=$fields[5];
# Status levels: 1 = no products, 2 = vector, 3 = ES cells, 4 = mice (see the
# mapping to %type2id in the final loop). A new accession starts at 1.
# NOTE(review): `eq ''` is applied to an undefined value for a new accession.
$status{$mgi_id} = 1 if ($status{$mgi_id} eq '');
# Raise the stored status; the highest applicable level is tested first and
# at most one branch runs per row.
if ($status{$mgi_id} < 4 && $fields[4] == 1){
$status{$mgi_id} = 4;
}
elsif ($status{$mgi_id} < 3 && $fields[3] == 1){
$status{$mgi_id} = 3;
}
elsif ($status{$mgi_id} < 2 && $fields[2] == 1){
$status{$mgi_id} = 2;# print "$data";
}
}
}
}
else {
# This `return 1` leaves the anonymous callback only, not the enclosing sub.
warn ("Problems with the web server: ".$response->status_line);
return 1;
}
},1000);
# print "Number of chunks is $chunks\n";
# Process the line still held back after the last chunk. This repeats the
# per-row logic of the callback, with PROB3/PROB4 as diagnostic labels.
if($old_data){
my @fields = split(/\t/,$old_data);
$line_count++;
# chop $line[5];
my $mgi_id = $fields[0];
if(!($mgi_id =~ /MGI:/)){
print "PROB3:$old_data\n";
print "PROB4:".join(', ',@fields)."\n";
}
$symbols{$mgi_id}=$fields[1];
$ensembl_ids{$mgi_id}=$fields[5];
$status{$mgi_id} = 1 if ($status{$mgi_id} eq '');
if ($status{$mgi_id} < 4 && $fields[4] == 1){
$status{$mgi_id} = 4;
}
elsif ($status{$mgi_id} < 3 && $fields[3] == 1){
$status{$mgi_id} = 3;
}
elsif ($status{$mgi_id} < 2 && $fields[2] == 1){
$status{$mgi_id} = 2;# print "$data";
}
}
# print "obtained $line_count lines\n";
# $parsed_count is declared again here (it was also declared near the top).
my $parsed_count = 0;
my $direct_count = 0;
# One iteration per MGI accession collected above.
foreach my $acc (keys %symbols){
# Loop-local $source_id, chosen from the accession's status level; it hides
# the $source_id taken from the argument list.
my $source_id;
$source_id = $type2id{'No products available yet'} if $status{$acc} == 1;
$source_id = $type2id{'Vector available'} if $status{$acc} == 2;
$source_id = $type2id{'ES cells available'} if $status{$acc} == 3;
$source_id = $type2id{'Mice available'} if $status{$acc} == 4;
# Fall back to the accession when the symbol is empty or missing.
my $label = $symbols{$acc} || $acc;
my $ensembl_id = $ensembl_ids{$acc};
# print OUT "$acc\t$symbols{$acc}\t$description\t$ensembl_ids{$acc}\n";
my $type = 'gene';
# NOTE(review): from here to the first `return 0;` below, the lines appear to
# be the older file-based loop: read tab-separated lines (accession, label,
# source type, Ensembl id) from $file_io and add an xref plus a direct xref
# for each line.
while ( defined( my $line = $file_io->getline() ) ) {
chomp $line;
my ( $accession, $label, $source_type, $ensembl_id)
= split( /\t/, $line );
# A line without even an accession aborts parsing with return value 1.
if ( !defined($accession)) {
printf( "Line %d contains has less than one column.\n",
1 + $parsed_count );
print("The parsing failed\n");
return 1;
}
my $type = 'gene';
$label ||= $accession;
my $source_id = $type2id{$source_type};
++$parsed_count;
# Reuse an existing xref id if get_xref returns one, otherwise add the xref.
my $xref_id =
XrefParser::BaseParser->get_xref( $accession, $source_id, $species_id );
if ( !defined($xref_id) || $xref_id eq '' ) {
$xref_id =
XrefParser::BaseParser->add_xref(
$accession, undef, $label,
'', $source_id, $species_id, "DIRECT"
);
}
XrefParser::BaseParser->add_direct_xref( $xref_id, $ensembl_id,
$type, $accession );
} ## end while ( defined( my $line...
printf( "%d direct xrefs succesfully parsed\n", $parsed_count );
$file_io->close();
print "Done\n";
return 0;
# NOTE(review): the lines below appear to be the body of the newer
# per-accession loop (they use $acc and $direct_count).
++$parsed_count;
# Reuse an existing xref id if get_xref returns one, otherwise add the xref.
my $xref_id =
XrefParser::BaseParser->get_xref( $acc, $source_id, $species_id );
if ( !defined($xref_id) || $xref_id eq '' ) {
$xref_id =
XrefParser::BaseParser->add_xref(
$acc, undef, $label,
'', $source_id, $species_id, "DIRECT"
);
}
# No direct xref is added when no Ensembl gene id was stored for the
# accession; the xref itself has already been handled above.
next if(!defined($ensembl_ids{$acc}));
$direct_count++;
XrefParser::BaseParser->add_direct_xref( $xref_id, $ensembl_id,
$type, $acc );
}
printf( "%d xrefs succesfully parsed and %d direct xrefs added\n", $parsed_count, $direct_count );
return 0;
} ## end sub run
1;
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment