Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
be71ab17
Commit
be71ab17
authored
14 years ago
by
Andreas Kusalananda Kähäri
Browse files
Options
Downloads
Patches
Plain Diff
Forgot patching this with Dan's patch...
parent
f4be3985
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
misc-scripts/id_mapping/dump_cache.pl
+106
-31
106 additions, 31 deletions
misc-scripts/id_mapping/dump_cache.pl
with
106 additions
and
31 deletions
misc-scripts/id_mapping/dump_cache.pl
+
106
−
31
View file @
be71ab17
...
...
@@ -137,18 +137,26 @@ sub build_cache_auto {
my
$cache_impl
=
'
Bio::EnsEMBL::IdMapping::Cache
';
inject
(
$cache_impl
);
my
$cache
=
$cache_impl
->
new
(
-
LOGGER
=>
$logger
,
-
CONF
=>
$conf
,
);
# EG - populate cache for each species in turn
$logger
->
debug
("
\n
Checking number of toplevel seq_regions...
\n
");
my
$max
=
0
;
my
$max
=
0
;
my
@species_ids
=
@
{
get_species_ids
("
target
")
};
foreach
my
$dbtype
(
qw(source target)
)
{
my
$num
=
scalar
(
@
{
$cache
->
slice_names
(
$dbtype
)
});
$max
=
$num
if
(
$num
>
$max
);
$logger
->
debug
("
$dbtype
:
$num
.
\n
",
1
);
# populate the cache for each species in turn
for
my
$species
(
@species_ids
)
{
$conf
->
param
(
'
species_id
',
$$species
[
1
]
);
$conf
->
param
(
'
species_name
',
$$species
[
0
]
);
my
$cache
=
$cache_impl
->
new
(
-
LOGGER
=>
$logger
,
-
CONF
=>
$conf
,
);
my
$num
=
scalar
(
@
{
$cache
->
slice_names
(
$dbtype
,
@$species
)
}
);
$max
=
$num
if
(
$num
>
$max
);
$logger
->
debug
(
"
$dbtype
:
$num
.
\n
",
1
);
}
}
my
$threshold
=
$conf
->
param
('
build_cache_auto_threshold
')
||
100
;
...
...
@@ -177,19 +185,22 @@ sub build_cache_by_seq_region {
system
("
mkdir -p
$logpath
")
==
0
or
$logger
->
error
("
Can't create lsf log dir
$logpath
: $!
\n
");
# EG get the list of species IDs for sources and targets
my
@source_species_ids
=
@
{
get_species_ids
("
source
")
};
my
@species_ids
=
@
{
get_species_ids
("
target
")
};
# load the cache implementation
my
$cache_impl
=
'
Bio::EnsEMBL::IdMapping::Cache
';
inject
(
$cache_impl
);
my
$cache
=
$cache_impl
->
new
(
-
LOGGER
=>
$logger
,
-
CONF
=>
$conf
,
);
# EG store the base directory onto which the species ID will be added
my
$basedir
=
$conf
->
param
('
basedir
');
# submit jobs to lsf
foreach
my
$dbtype
(
qw(source target)
)
{
$logger
->
info
("
\n
"
.
ucfirst
(
$dbtype
)
.
"
db...
\n
",
0
,
'
stamped
');
# EG iterate over individual species for source and target
# determine which slices need to be done
my
$filename
=
"
$dbtype
.dump_cache.slices.txt
";
...
...
@@ -197,21 +208,47 @@ sub build_cache_by_seq_region {
throw
("
Unable to open
$logpath
/
$filename
for writing: $!
");
my
$num_jobs
=
0
;
foreach
my
$slice_name
(
@
{
$cache
->
slice_names
(
$dbtype
)
})
{
my
$type
=
"
$dbtype
.
$slice_name
";
unless
(
$cache
->
cache_file_exists
(
$type
))
{
print
$fh
"
$slice_name
\n
";
$num_jobs
++
;
for
my
$species
(
@species_ids
)
{
# EG set config based on species ID in turn
$conf
->
param
(
'
basedir
',
path_append
(
$basedir
,
$$species
[
1
]
)
);
$conf
->
param
(
'
species_id
',
$$species
[
1
]
);
$conf
->
param
(
'
species_name
',
$$species
[
0
]
);
# EG load cache for current species ID
my
$cache
=
$cache_impl
->
new
(
-
LOGGER
=>
$logger
,
-
CONF
=>
$conf
,
);
foreach
my
$slice_name
(
@
{
$cache
->
slice_names
(
$dbtype
,
@$species
)
}
)
{
my
$type
=
"
$dbtype
.
$slice_name
";
my
$src_species_id
;
for
my
$src_id
(
@source_species_ids
)
{
if
(
$$species
[
1
]
==
$$src_id
[
1
]
)
{
$src_species_id
=
$$src_id
[
1
];
last
;
}
}
$logger
->
info
(
"
\n
"
.
ucfirst
(
$dbtype
)
.
"
db...
\n
",
0
,
'
stamped
'
);
foreach
my
$slice_name
(
@
{
$cache
->
slice_names
(
$dbtype
)
}
)
{
my
$type
=
"
$dbtype
.
$slice_name
";
unless
(
$cache
->
cache_file_exists
(
$type
)
)
{
print
$fh
"
$slice_name
\n
";
print
$fh
"
$slice_name
,$
$species
[0],$
$species
[1],
"
.
$src_species_id
.
"
\n
";
$num_jobs
++
;
}
}
}
}
unless
(
$num_jobs
)
{
$logger
->
info
("
All cache files for
$dbtype
exist.
\n
");
next
;
}
}
## end for my $species (@species_ids)
close
(
$fh
);
unless
(
$num_jobs
)
{
$logger
->
info
("
All cache files for
$dbtype
exist.
\n
");
next
;
}
# EG reset original basedir
$conf
->
param
(
'
basedir
',
$basedir
);
# build lsf command
my
$lsf_name
=
'
dump_by_seq_region_
'
.
time
;
...
...
@@ -226,10 +263,17 @@ sub build_cache_by_seq_region {
cache_impl
=>
$cache_impl
,
);
my
$cmd
=
qq{./dump_by_seq_region.pl $options --index \$LSB_JOBINDEX}
;
# EG invoke perl with correct path rather than relying on shebang
my
$cmd
=
qq{perl -I ./modules }
.
qq{./misc-scripts/id_mapping/dump_by_seq_region.pl }
.
qq{$options --index \$LSB_JOBINDEX}
;
my
$pipe
=
qq{|bsub -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
'
|bsub
'
.
$conf
->
param
('
lsf_opt_run
')
.
qq{ -J '$lsf_name\[1-$num_jobs\]\%$concurrent' }
.
qq{-o $logpath/dump_by_seq_region.$dbtype.\%I.out }
.
qq{-e $logpath/dump_by_seq_region.$dbtype.\%I.err }
.
$conf
->
param
('
lsf_opt_dump_cache
');
...
...
@@ -251,8 +295,10 @@ sub build_cache_by_seq_region {
# submit dependent job to monitor finishing of jobs
$logger
->
info
("
Waiting for jobs to finish...
\n
",
0
,
'
stamped
');
my
$dependent_job
=
qq{bsub -K -w "ended($lsf_name)" -q small }
.
qq{-o $logpath/dump_cache.$dbtype.depend.out /bin/true}
;
my
$dependent_job
=
qq{bsub -K -w "ended($lsf_name)" }
.
$conf
->
param
('
lsf_opt_run_small
')
.
qq{ -o $logpath/dump_cache.$dbtype.depend.out /bin/true}
;
system
(
$dependent_job
)
==
0
or
$logger
->
error
("
Error submitting dependent job: $!
\n
");
...
...
@@ -306,3 +352,32 @@ sub build_cache_all {
}
# EG new method for getting species IDs
sub
get_species_ids
{
my
(
$prefix
)
=
@_
;
my
@speciesIds
;
my
$dsn
=
"
DBI:mysql:database=
"
.
$conf
->
param
("
${prefix}
dbname
")
.
"
;host=
"
.
$conf
->
param
("
${prefix}
host
")
.
"
;port=
"
.
$conf
->
param
("
${prefix}
port
");
my
$ensemblCoreDbh
=
DBI
->
connect
(
$dsn
,
$conf
->
param
("
${prefix}
user
"),
$conf
->
param
("
${prefix}
pass
")
)
||
die
"
Cannot connect to server:
$DBI
::errstr
\n
";
my
$query
=
"
SELECT DISTINCT meta_value, species_id
FROM meta WHERE meta_key = 'species.production_name'
";
my
$psmt
=
$ensemblCoreDbh
->
prepare
(
$query
);
$psmt
->
execute
();
while
(
my
(
@results
)
=
$psmt
->
fetchrow
()
)
{
push
@speciesIds
,
[
$results
[
0
],
$results
[
1
]
];
}
return
\
@speciesIds
;
}
## end sub get_species_ids
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment