Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
db252504
Commit
db252504
authored
13 years ago
by
Andy Yates
Browse files
Options
Downloads
Patches
Plain Diff
Removed as Bio::EnsEMBL::DBSQL::Obj no longer exists
parent
61f934a1
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
misc-scripts/golden_path/nt_crossmatch.pl
+0
-316
0 additions, 316 deletions
misc-scripts/golden_path/nt_crossmatch.pl
with
0 additions
and
316 deletions
misc-scripts/golden_path/nt_crossmatch.pl
deleted
100755 → 0
+
0
−
316
View file @
61f934a1
#!/usr/local/ensembl/bin/perl
#This script reads a file which has nt contig and clone coordinates
#then reads a fasta file of all nt contigs
#then runs crossmatches between NT contigs and each raw contig mapped to be within that NT
use
strict
;
use
Bio::EnsEMBL::DBSQL::
Obj
;
use
Bio::
SeqIO
;
use
Bio::EnsEMBL::Pipeline::Runnable::
CrossMatch
;
$|
=
1
;
#This is the NT-clone agp file
my
$ctg_coords
=
shift
(
@ARGV
);
#This is the NT fasta file
my
$ntfile
=
shift
(
@ARGV
);
$ntfile
=~
/(\S+)\.fa/
;
my
$nt
=
$
1
;
my
$gp
=
"
/work2/elia/oct07/data/
$nt
.agp
";
my
$seqin
=
Bio::
SeqIO
->
new
(
-
file
=>
"
/work2/elia/oct07/data/
$ntfile
"
,
'
-format
'
=>
'
Fasta
');
print
STDERR
"
Reading NT fasta file...
\n
";
my
$ntseq
=
$seqin
->
next_seq
();
my
%nt_clone
;
open
(
CTG
,"
<
$ctg_coords
");
print
STDERR
"
Reading ctg coordinate file...
\n
";
while
(
<
CTG
>
)
{
my
@tabs
=
split
(
/\t/
);
push
(
@
{
$nt_clone
{
$tabs
[
0
]}},
$tabs
[
6
]);
}
close
(
CTG
);
#Hash holding the seq object for each NT contig
my
%nt_hsp
;
my
%nt_fasta
;
$ntseq
->
id
=~
/ref\|(\S+)\|(\S+)/
;
my
$id
=
$
1
;
my
$hsp
=
$
2
;
$nt_hsp
{
$id
}
=
$hsp
;
$ntseq
->
display_id
(
$id
);
my
$db
=
Bio::EnsEMBL::DBSQL::
Obj
->
new
(
-
dbname
=>
'
arne_oct07_tim
',
-
host
=>
'
ecs1b
',
-
user
=>
'
lijnzaad
');
print
STDERR
"
Starting crossmatches...
\n
";
open
(
GP
,"
>
$gp
");
print
STDERR
"
Going to crossmatch
$nt
:
\n
";
my
@matches
;
foreach
my
$cl
(
@
{
$nt_clone
{
$nt
}})
{
print
STDERR
"
Getting
$cl
from db
\n
";
$cl
=~
/(\S+)\.(\d+)/
;
my
$clone
=
$db
->
get_Clone
(
$
1
);
if
((
$clone
->
embl_version
!=
$
2
)
&&
(
$clone
->
embl_version
!=
0
))
{
print
STDERR
"
BUG:
$cl
has version $2 in the flatfile and
"
.
$clone
->
embl_version
.
"
in the database
\n
";
next
;
}
print
STDERR
"
Going to crossmatch
$cl
with
$nt
...
\n
";
my
$c
=
0
;
my
@fp
;
my
%offset
;
print
STDERR
"
Getting here...
\n
";
foreach
my
$contig
(
$clone
->
get_all_Contigs
)
{
$c
++
;
$offset
{
$contig
->
id
}
=
$contig
->
embl_offset
;
my
$seq
=
$contig
->
primary_seq
->
seq
();
$seq
=
uc
(
$seq
);
$seq
=~
s/[^ATCG]/N/g
;
my
$raw_seq
=
Bio::
PrimarySeq
->
new
(
-
display_id
=>
$contig
->
primary_seq
->
id
,
-
seq
=>
$seq
);
my
$nts
=
$ntseq
->
seq
;
$nts
=~
s/[^ATCG]/N/g
;
$nts
=
uc
(
$nts
);
$ntseq
->
seq
(
$nts
);
print
STDERR
"
Crossmatching
"
.
$contig
->
id
.
"
with
$nt
\n
";
my
$score
=
200
;
my
$cross
=
Bio::EnsEMBL::Pipeline::Runnable::
CrossMatch
->
new
(
-
nocopy
=>
1
,
-
seq1
=>
$raw_seq
,
-
seq2
=>
$ntseq
,
-
score
=>
$score
);
print
STDERR
"
Using score:
$score
\n
";
$cross
->
run
();
print
STDERR
"
Finished crossmatch
\n
";
push
(
@fp
,
$cross
->
output
);
}
if
(
$c
>
1
)
{
print
STDERR
"
BUG:
$cl
has
$c
contigs even though it is finished
\n
";
}
my
$size
=
scalar
(
@fp
);
if
(
$size
==
0
)
{
print
STDERR
"
BUG:
$cl
-
$nt
crossmatch did not return any feature pairs, need to rerun with lowers threshold
";
}
elsif
(
$size
==
1
)
{
#Easy case, one feature pair between the two
print
STDERR
"
One feature pair for
$nt
-
$cl
\n
";
my
$f
=
$fp
[
0
];
my
$cv
=
$clone
->
id
.
"
.
"
.
$clone
->
embl_version
;
my
$match
;
$match
->
{
clone
}
=
$cv
;
$match
->
{
NT
}
=
$nt
;
$match
->
{
ntstart
}
=
$f
->
hstart
;
$match
->
{
ntend
}
=
$f
->
hend
;
my
$cl_start
=
$f
->
start
+
$offset
{
$f
->
seqname
}
-
1
;
#Adjust for offset!
$match
->
{
clstart
}
=
$cl_start
;
my
$cl_end
=
$f
->
end
+
$offset
{
$f
->
seqname
}
-
1
;
#Adjust for offset!
$match
->
{
clend
}
=
$cl_end
;
if
(
$f
->
hstrand
==
1
)
{
#print STDERR "...forward...\n";
$match
->
{
strand
}
=
'
+1
';
}
else
{
#print STDERR "...reverse...\n";
$match
->
{
strand
}
=
'
-1
';
}
push
(
@matches
,
$match
);
}
else
{
print
STDERR
"
More than one feature pair (
$size
) for
$cl
-
$nt
, getting longest
\n
";
my
$cv
=
$clone
->
id
.
"
.
"
.
$clone
->
embl_version
;
#Take longest crossmatch!
my
$length
=
0
;
#This loop finds longest match
foreach
my
$f
(
@fp
)
{
my
$match
;
$match
->
{
clone
}
=
$cv
;
$match
->
{
NT
}
=
$nt
;
print
STDERR
"
Checking feature on nt
"
.
$f
->
hstart
.
"
"
.
$f
->
hend
.
"
and CLONE
"
.
$f
->
start
.
"
"
.
$f
->
end
.
"
\n
";
#if (($f->hend-$f->hstart)> $length) {
$match
->
{
ntstart
}
=
$f
->
hstart
;
$match
->
{
ntend
}
=
$f
->
hend
;
my
$cl_start
=
$f
->
start
+
$offset
{
$f
->
seqname
}
-
1
;
$match
->
{
clstart
}
=
$cl_start
;
my
$cl_end
=
$f
->
end
+
$offset
{
$f
->
seqname
}
-
1
;
$match
->
{
clend
}
=
$cl_end
;
if
(
$f
->
hstrand
==
1
)
{
$match
->
{
strand
}
=
'
+1
';
}
else
{
$match
->
{
strand
}
=
'
-1
';
}
#$length=$f->hend-$f->hstart;
#}
push
(
@matches
,
$match
);
}
}
}
my
$size
=
scalar
(
@matches
);
print
STDERR
"
Got
$size
matches for this NT contig
\n
";
my
@validated
;
my
%internal
;
if
(
$size
>
1
)
{
#Start by sorting matches by start in nt
my
@sorted
=
sort
{
$a
->
{
ntstart
}
<=>
$b
->
{
ntstart
}}
@matches
;
#This loop takes care of removin internal matches (i.e. matches within matches
#We shift out one by one the matches from the array, and compare
#them to all other subsequenet matches (ordered by ntstart)
while
(
my
$match
=
shift
@sorted
)
{
my
$key1
=
$match
->
{
clone
}
.
"
-
"
.
$match
->
{
clstart
};
if
(
exists
$internal
{
$match
->
{
$key1
}})
{
#print STDERR "Skipping match with clone ".$match->{clone}.", because already detected as internal ".$match->{clstart}."-".$match->{clend}."\n";
next
;
}
#print STDERR "Checking ".$match->{NT}."\t".$match->{ntstart}."\t".$match->{ntend}."\t".$match->{clone}."\t".$match->{clstart}."\t".$match->{clend}."\t".$match->{strand}."\n";
#Check if subsequent matches are contained in this match
foreach
my
$other
(
@sorted
)
{
my
$key2
=
$other
->
{
clone
}
.
"
-
"
.
$other
->
{
clstart
};
#Check if contaied and not set to internal already
if
((
$other
->
{
ntend
}
<
$match
->
{
ntend
})
&&
(
!
exists
$internal
{
$key2
}))
{
#Store internal matches in %internal
$internal
{
$key2
}
=
1
;
#print STDERR "Skipping ".$other->{NT}."\t".$other->{ntstart}."\t".$other->{ntend}."\t".$other->{clone}."\t".$other->{clstart}."\t".$other->{clend}."\t".$other->{strand}."\n";
}
}
#All matches that are not internal, go into validated arrau
if
(
!
exists
$internal
{
$key1
})
{
push
@validated
,
$match
;
}
}
}
else
{
push
(
@validated
,
@matches
);
}
#Now we have a set of non-redundant matches sorted by nt_start
#Now we have to go through them to assign golden path switch points
#We write the determined golden path to a file, and try to detect gaps if any
print
STDERR
"
\n
Final set:
\n
";
foreach
my
$match
(
@validated
)
{
print
STDERR
$match
->
{
NT
}
.
"
\t
"
.
$match
->
{
ntstart
}
.
"
\t
"
.
$match
->
{
ntend
}
.
"
\t
"
.
$match
->
{
clone
}
.
"
\t
"
.
$match
->
{
clstart
}
.
"
\t
"
.
$match
->
{
clend
}
.
"
\t
"
.
$match
->
{
strand
}
.
"
\n
";
}
my
$size
=
scalar
(
@validated
);
my
$hsp
=
$nt_hsp
{
$nt
};
print
STDERR
"
The golden path will have
$size
elements for NT contig
$nt
:
\n
";
my
(
$ntstart
,
$ntend
,
$cl
,
$clstart
,
$clend
,
$strand
);
if
(
$size
==
1
)
{
#Write golden path directly from match results
my
$match
=
$validated
[
0
];
$ntstart
=
$match
->
{
ntstart
};
$ntend
=
$match
->
{
ntend
};
$cl
=
$match
->
{
clone
};
$clstart
=
$match
->
{
clstart
};
$clend
=
$match
->
{
clend
};
$strand
=
$match
->
{
strand
};
print
STDERR
"
$nt
\t
$hsp
\t
$ntstart
\t
$ntend
\t
$strand
\t
F
\t
$cl
\t
$clstart
\t
$clend
\n
";
print
GP
"
$nt
\t
$hsp
\t
$ntstart
\t
$ntend
\t
$strand
\t
F
\t
$cl
\t
$clstart
\t
$clend
\n
";
}
else
{
#Loop through this final validated set to deduce ("walk trough") golden path
for
(
my
$i
=
0
;
$i
<
$size
;
$i
++
)
{
#print STDERR "looking at $i size...\n";
#Variables to be written to golden path file
my
$match
=
$validated
[
$i
];
my
$next
=
$validated
[
$i
+
1
];
#Three different cases, first, internal, and last
#First match (starts from start, does not look at previous
if
(
$i
==
0
)
{
$ntstart
=
$match
->
{
ntstart
};
$ntend
=
$next
->
{
ntstart
};
$cl
=
$match
->
{
clone
};
if
(
$match
->
{
strand
}
==
'
+1
')
{
$clstart
=
$match
->
{
clstart
};
$clend
=
$match
->
{
clend
}
-
(
$match
->
{
ntend
}
-
$next
->
{
ntstart
});
}
else
{
$clstart
=
$match
->
{
clstart
}
+
(
$match
->
{
ntend
}
-
$next
->
{
ntstart
});
$clend
=
$match
->
{
clend
};
}
$strand
=
$match
->
{
strand
};
if
(
$ntstart
!=
1
)
{
print
STDERR
"
BUG: ntstart of first match for
$nt
-
$cl
is not 1!
\n
";
}
}
#Last
elsif
(
$i
==
(
$size
-
1
))
{
$ntstart
=
$match
->
{
ntstart
}
+
1
;
$ntend
=
$match
->
{
ntend
};
$cl
=
$match
->
{
clone
};
if
(
$match
->
{
strand
}
==
'
+1
')
{
$clstart
=
$match
->
{
clstart
}
+
1
;
$clend
=
$match
->
{
clend
};
}
else
{
$clstart
=
$match
->
{
clstart
};
$clend
=
$match
->
{
clend
}
-
1
;
}
$strand
=
$match
->
{
strand
};
#Check if the last match extends until the end
#(bug fix for ambiguity characters in the end)
if
(
$ntseq
->
length
>
$ntend
)
{
#print STDERR "Detected missing tail\n";
my
$diff
=
(
$ntseq
->
length
-
$match
->
{
ntend
});
print
STDERR
"
Missing
$diff
bases from the end in
$nt
-
$cl
match, adding them
\n
";
#Check if extension on clone feasible?
$ntend
+=
$diff
;
$clend
+=
$diff
;
}
}
#All others (internal)
else
{
$ntstart
=
$match
->
{
ntstart
}
+
1
;
$ntend
=
$next
->
{
ntstart
};
$cl
=
$match
->
{
clone
};
if
(
$match
->
{
strand
}
==
'
+1
')
{
$clstart
=
$match
->
{
clstart
}
+
1
;
$clend
=
$match
->
{
clend
}
-
(
$match
->
{
ntend
}
-
$next
->
{
ntstart
});
}
else
{
$clstart
=
$match
->
{
clstart
}
+
(
$match
->
{
ntend
}
-
$next
->
{
ntstart
});
$clend
=
$match
->
{
clend
}
-
1
;
}
$strand
=
$match
->
{
strand
};
}
print
STDERR
"
$nt
\t
$hsp
\t
$ntstart
\t
$ntend
\t
$strand
\t
F
\t
$cl
\t
$clstart
\t
$clend
\n
";
print
GP
"
$nt
\t
$hsp
\t
$ntstart
\t
$ntend
\t
$strand
\t
F
\t
$cl
\t
$clstart
\t
$clend
\n
";
}
}
close
(
GP
);
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment