Newer
Older
Glenn Proctor
committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
package XrefParser::WilsonAffyParser;
use strict;
use XrefParser::BaseParser;
use vars qw(@ISA);
@ISA = qw(XrefParser::BaseParser);
# File-scoped lexicals. None of the subs visible in this file reads or writes
# them.
# NOTE(review): the "_sth" suffix suggests these were meant to hold prepared
# statement handles (xref / dependent xref / synonym) - presumably leftovers;
# confirm nothing else relies on them before removing.
my $xref_sth ;
my $dep_sth;
my $syn_sth;
# Parse $file into xref objects and hand them to the uploader.
#
# Arguments: $self, $file (path of the mapping file), $source_id, $species_id.
# Returns:   whatever XrefParser::BaseParser->upload_xref_object_graphs returns.
sub run {

  my ($self, $file, $source_id, $species_id) = @_;

  # create_xrefs hands back one array reference; it is passed on unchanged as
  # the sole argument of the upload call.
  my $xref_list = $self->create_xrefs($source_id, $species_id, $file);

  # upload
  return XrefParser::BaseParser->upload_xref_object_graphs($xref_list);

}
# Parse a tab-separated probe-set mapping file and build xrefs from it.
#
# The first line of $file is skipped.  For every other line, field 0 is the
# probe-set accession and field 2 is the mapping target; double quotes are
# stripped from both.
#
#   * Targets containing "ENSGALT" are stored immediately as direct transcript
#     xrefs through $self->add_xref and $self->add_direct_xref, after any
#     ".version" suffix has been removed.
#   * Every other target is looked up with the external "pfetch" program; when
#     a sequence comes back, a primary-xref hash is queued for the caller.
#
# Progress and summary counts are printed to the selected output handle.
#
# Arguments: $self, $source_id, $species_id, $file (path of the mapping file).
# Returns:   a reference to the array of primary-xref hashes.
# Dies:      if $file cannot be opened.
sub create_xrefs {

  my ($self, $source_id, $species_id, $file) = @_;

  my ($count, $noseq, $direct) = (0, 0, 0);

  local $| = 1; # don't buffer progress output; previous setting restored on return

  my @xrefs;

  open(my $in, '<', $file) or die "Could not open $file: $!\n";

  <$in>; # skip first line

  while (my $line = <$in>) {

    #last if ($count > 200);

    chomp $line;
    my @fields = split /\t/, $line;

    # first field (probe_set) is accession
    my $acc = defined $fields[0] ? $fields[0] : '';
    $acc =~ s/\"//g;

    # get linked accession (may be RefSeq or EMBL or ensembl); short lines
    # simply have no target
    my $target = defined $fields[2] ? $fields[2] : '';
    $target =~ s/\"//g;

    # Create direct xrefs for mappings to Ensembl transcripts
    if ($target =~ /ENSGALT/) {

      # Remove the version if present.  A substitution leaves unversioned IDs
      # untouched, whereas capturing around a mandatory dot would leave
      # $target undefined for them.
      $target =~ s/\..*\z//s;

      # add xref - note we're assuming it doesn't already exist;
      # may need to check like in CCDS parser
      my $xref_id = $self->add_xref($acc, 0, $acc, "", $source_id, $species_id);
      $self->add_direct_xref($xref_id, $target, "transcript", "");
      $direct++;

    } else {

      # fetch sequence for others (EMBL ESTs and RefSeqs - pfetch will handle these)
      my $seq = _pfetch_first_line($target);

      if ($seq && $seq !~ /no match/) {

        push @xrefs, {
          ACCESSION     => $acc,
          SEQUENCE      => $seq,
          LABEL         => $acc,
          SOURCE_ID     => $source_id,
          SPECIES_ID    => $species_id,
          SEQUENCE_TYPE => 'dna',
          STATUS        => 'experimental',
          # Add description noting where the mapping came from
          DESCRIPTION   => $target . " used as mapping target",
        };

        $count++;
        print "$count " if ($count % 100 == 0);

      } else {

        print "Couldn't get sequence for $target\n";
        $noseq++;

      }
    }
  }

  close($in);

  print "\n\nParsed $count primary xrefs.\n";
  print "Couldn't get sequence for $noseq primary_xrefs\n" if ($noseq);
  print "Added $direct direct xrefs.\n";

  return \@xrefs;

}

# Run "pfetch -q $target" and return the first line of its output without the
# trailing newline.  Returns undef when there is no target, when pfetch cannot
# be started, or when it prints nothing.
sub _pfetch_first_line {

  my ($target) = @_;

  return unless defined $target && length $target;

  # List-form pipe open: no shell is involved, so nothing in $target can be
  # interpreted as shell syntax, and no scratch file is written.
  open(my $pfetch, '-|', 'pfetch', '-q', $target) or return;

  my $first = <$pfetch>;

  # Only the output matters here; the exit status is deliberately ignored.
  close($pfetch);

  chomp $first if defined $first;

  return $first;

}
# Constructor: returns a new, empty parser object.
#
# The object is blessed into the class it was invoked on (or the class of the
# invoking object), so subclasses inherit a working constructor.  When called
# without an invocant it falls back to XrefParser::WilsonAffyParser.
sub new {

  my ($proto) = @_;

  my $class = ref($proto) || $proto || "XrefParser::WilsonAffyParser";

  return bless {}, $class;

}
1;