Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
1dfcadaa
Commit
1dfcadaa
authored
20 years ago
by
Arne Stabenau
Browse files
Options
Downloads
Patches
Plain Diff
trf condensing script
parent
2e9b9ca7
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
misc-scripts/repeats/shrink_trfs.pl
+190
-0
190 additions, 0 deletions
misc-scripts/repeats/shrink_trfs.pl
with
190 additions
and
0 deletions
misc-scripts/repeats/shrink_trfs.pl
0 → 100644
+
190
−
0
View file @
1dfcadaa
# this script tries to build repeat consensus sequences
# for TRF repeats that are unique and minimal.This should reduce the amount
#of TRF consensus objects.
# further it will replace all repeat consensi longer than 8 characters with
# just one for the length.
use
strict
;
use
DBI
;
use
Getopt::
Long
;
my
(
$host
,
$user
,
$pass
,
$port
,
$dbname
);
GetOptions
(
"
host=s
",
\
$host
,
"
user=s
",
\
$user
,
"
pass=s
",
\
$pass
,
"
port=i
",
\
$port
,
"
dbname=s
",
\
$dbname
);
if
(
!
$host
||
!
$dbname
)
{
usage
();
}
my
$dsn
=
"
DBI:mysql:host=
$host
;dbname=
$dbname
";
if
(
$port
)
{
$dsn
.=
"
;port=
$port
";
}
# retrive all consensi of length 1..8
# build the normal consensus ( rotate, revcomp and build alphabetical minimum )
my
$db
=
DBI
->
connect
(
$dsn
,
$user
,
$pass
);
# check if the trfs have the uncompressed format
my
$res
=
$db
->
selectall_arrayref
(
"
select count(*) from repeat_consensus where repeat_consensus rlike
\"\\\\
(
\"
"
);
if
(
$res
->
[
0
]
->
[
0
]
>
0
)
{
print
STDERR
"
Database might alread be converted, contains
\"
(
\"
in repeat_consensus.
\n
";
exit
;
}
my
$filename
=
"
tmp_old_new_rcid.txt
";
open
(
FH
,
"
>
$filename
"
)
or
die
(
"
Couldnt write
$filename
"
);
for
my
$i
(
1
..
8
)
{
my
$remap_count
=
0
;
my
$all_cons_count
=
0
;
my
$sth
=
$db
->
prepare
(
"
select repeat_consensus_id, repeat_consensus
"
.
"
from repeat_consensus where repeat_class =
\"
trf
\"
"
.
"
AND length(repeat_consensus) =
$i
"
);
$sth
->
execute
();
my
(
%rc_2_norm
,
%rc_2_seq
,
%norm_2_rc
);
while
(
my
$arr
=
$sth
->
fetchrow_arrayref
()
)
{
my
(
$rc_id
,
$rc_seq
)
=
@$arr
;
$all_cons_count
++
;
my
$norm_seq
=
norm_seq
(
$rc_seq
);
$rc_2_norm
{
$rc_id
}
=
$norm_seq
;
$rc_2_seq
{
$rc_id
}
=
$rc_seq
;
if
(
$norm_seq
eq
$rc_seq
)
{
$norm_2_rc
{
$norm_seq
}
=
$rc_id
;
}
}
# iterate through the rc_ids and write old tab new lines out
for
my
$rc_id
(
keys
%rc_2_norm
)
{
my
$norm
=
$rc_2_norm
{
$rc_id
};
if
(
exists
$norm_2_rc
{
$norm
}
)
{
my
$new_rc
=
$norm_2_rc
{
$norm
};
if
(
$rc_id
!=
$new_rc
)
{
print
FH
"
$rc_id
\t
$new_rc
\n
";
}
}
else
{
$norm_2_rc
{
$norm
}
=
$rc_id
;
}
}
}
my
$sth
=
$db
->
prepare
(
"
SELECT repeat_consensus_id, length( repeat_consensus )
"
.
"
FROM repeat_consensus WHERE repeat_class =
\"
trf
\"
"
.
"
AND length(repeat_consensus) > 8
"
);
$sth
->
execute
();
my
%len_2_rc
;
my
$length_removal_count
=
0
;
while
(
my
$arr
=
$sth
->
fetchrow_arrayref
()
)
{
my
(
$rc_id
,
$length
)
=
@$arr
;
if
(
exists
$len_2_rc
{
$length
}
)
{
# map this rc to the first one of that length
print
FH
"
$rc_id
\t
"
.
$len_2_rc
{
$length
}
.
"
\n
";
}
else
{
$len_2_rc
{
$length
}
=
$rc_id
;
}
}
print
STDERR
"
File written
";
close
(
FH
);
$db
->
do
(
"
create table tmp_old_new_rcid ( old_id int not null, new_id int not null, key old_idx( old_id))
"
);
$db
->
do
(
"
load data local infile '
$filename
' into table tmp_old_new_rcid
"
);
print
STDERR
"
and uploaded.
\n
";
$db
->
do
(
"
update repeat_feature rf, tmp_old_new_rcid tonr
"
.
"
set rf.repeat_consensus_id = tonr.new_id
"
.
"
where rf.repeat_consensus_id = tonr.old_id
"
);
print
STDERR
"
Repeat_features updated.
\n
";
$db
->
do
(
"
delete repeat_consensus
"
.
"
from repeat_consensus rc, tmp_old_new_rcid tonr
"
.
"
where rc.repeat_consensus_id = tonr.old_id
"
);
$db
->
do
(
"
update repeat_consensus
"
.
"
set repeat_consensus = concat( length( repeat_consensus ),
\"
(N)
\"
)
"
.
"
where repeat_class =
\"
trf
\"
"
.
"
and length( repeat_consensus ) > 8
"
);
$db
->
do
(
"
drop table tmp_old_new_rcid
"
);
unlink
(
$filename
);
exit
;
# find a representation of the input sequence that is canonical
# rotate and revcomp all possibilities and take alphabetical lowest
sub
norm_seq
{
my
$seq
=
shift
;
my
$test_seq
=
$seq
;
my
@all_seq
=
();
push
(
@all_seq
,
$seq
);
for
(
my
$i
=
1
;
$i
<
2
*length
(
$seq
);
$i
++
)
{
if
(
$i
!=
length
(
$seq
))
{
$test_seq
=
rotate
(
$test_seq
);
}
else
{
$test_seq
=
rev_comp
(
$test_seq
);
}
my
$new_seq
=
$test_seq
;
push
(
@all_seq
,
$new_seq
);
}
# print "old: ",join( " ", @all_seq),"\n";
@all_seq
=
sort
{
$a
cmp
$b
}
@all_seq
;
# print "new: ",join( " ", @all_seq),"\n";
return
$all_seq
[
0
];
}
sub
rotate
{
my
$string
=
shift
;
return
substr
(
$string
,
1
)
.
substr
(
$string
,
0
,
1
);
}
sub
rev_comp
{
my
$string
=
shift
;
$string
=
reverse
(
$string
);
$string
=~
tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/
;
return
$string
;
}
sub
usage
{
print
STDERR
<<EOC
Usage: perl shrink_trfs.pl -user .. -port .. -pass .. -host .. -dbname ..
Tries to minimize the number of repeat_consensi by only having one
trf consensus for each length of repeat_consensus longer than 8.
Shorter repeat consensi are reduced to the alphabetical minimum
of all rotate / revcomp equivalent repeats.
EOC
;
exit
;
}
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment