Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
E
ensembl
Manage
Activity
Members
Labels
Plan
Issues
0
Issue boards
Milestones
Iterations
Wiki
Requirements
Jira
Code
Merge requests
1
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Locked files
Build
Pipelines
Jobs
Pipeline schedules
Test cases
Artifacts
Deploy
Releases
Package Registry
Container Registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Code review analytics
Issue analytics
Insights
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Terms and privacy
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
ensembl-gh-mirror
ensembl
Commits
134468c2
Commit
134468c2
authored
13 years ago
by
Andy Yates
Browse files
Options
Downloads
Patches
Plain Diff
Alternative script which does processing based upon taxonomy databases
parent
ef905951
No related branches found
Branches containing commit
No related tags found
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
misc-scripts/production_database/scripts/import_species_names_from_taxonomy.pl
+350
-0
350 additions, 0 deletions
...on_database/scripts/import_species_names_from_taxonomy.pl
with
350 additions
and
0 deletions
misc-scripts/production_database/scripts/import_species_names_from_taxonomy.pl
0 → 100644
+
350
−
0
View file @
134468c2
#!/usr/bin/env perl
use
strict
;
use
warnings
;
use
Getopt::
Long
qw( :config no_ignore_case )
;
use
Pod::
Usage
;
use
Bio::EnsEMBL::DBSQL::
DBAdaptor
;
use
Bio::EnsEMBL::DBSQL::
DBConnection
;
sub
run
{
my
(
$class
)
=
@_
;
my
$self
=
bless
({},
$class
);
$self
->
args
();
$self
->
defaults
();
my
$taxons
=
$self
->
_get_taxons
();
foreach
my
$taxon
(
@
{
$taxons
})
{
$self
->
process_taxon
(
$taxon
);
}
return
;
}
sub
args
{
my
(
$self
)
=
@_
;
my
$opts
=
{};
my
@cmd_opts
=
qw/
mhost|mh=s
mport|mP=i
muser|mu=s
mpass|mp=s
mdatabase|md=s
host|h=s
port|P=i
user|u=s
pass|p=s
database|d=s
taxon=i@
verbose|v!
help
man
/
;
GetOptions
(
$opts
,
@cmd_opts
)
or
pod2usage
(
-
verbose
=>
1
,
-
exitval
=>
1
);
pod2usage
(
-
verbose
=>
1
,
-
exitval
=>
0
)
if
$opts
->
{
help
};
pod2usage
(
-
verbose
=>
2
,
-
exitval
=>
0
)
if
$opts
->
{
man
};
$self
->
{
opts
}
=
$opts
;
return
;
}
sub
defaults
{
my
(
$self
)
=
@_
;
my
$o
=
$self
->
{
opts
};
# Master database location:
$o
->
{
mhost
}
=
'
ens-staging1
'
unless
$o
->
{
mhost
};
$o
->
{
mport
}
=
3306
unless
$o
->
{
mport
};
# User database location (default values):
$o
->
{
port
}
=
3306
unless
$o
->
{
port
};
foreach
my
$required
(
qw/host user muser database/
)
{
my
$msg
=
"
Required parameter --
${required}
was not given
";
pod2usage
(
-
msg
=>
$msg
,
-
verbose
=>
1
,
-
exitval
=>
1
)
if
!
$o
->
{
$required
};
}
return
;
}
sub
_get_taxons
{
my
(
$self
)
=
@_
;
my
$taxons
;
if
(
defined
$self
->
{
opts
}
->
{
taxon
}
&&
@
{
$self
->
{
opts
}
->
{
taxon
}})
{
$taxons
=
$self
->
{
opts
}
->
{
taxon
};
}
else
{
$taxons
=
$self
->
_production_dbc
()
->
sql_helper
()
->
execute_simple
(
-
SQL
=>
'
select taxon from species where taxon is not null
');
}
$self
->
v
('
Found %d taxon(s) to process
',
scalar
(
@
{
$taxons
}));
return
$taxons
;
}
sub
process_taxon
{
my
(
$self
,
$taxon
)
=
@_
;
$self
->
v
('
Processing %s
',
$taxon
);
$self
->
v
('
Getting results from taxonomy DB
');
my
$taxonomy
=
$self
->
_get_taxonomy
(
$taxon
);
$self
->
v
('
Getting results from production DB
');
my
$production
=
$self
->
_get_production
(
$taxon
);
if
(
!
$production
->
{
species_id
})
{
$self
->
v
("
ERROR: %d taxon ID does not exist in the production database
",
$taxon
);
return
;
}
$self
->
v
('
Updating attributes
');
$self
->
_update_attributes
(
$taxonomy
,
$production
);
$self
->
v
('
Updating species aliases
');
$self
->
_update_species_aliases
(
$taxonomy
,
$production
);
$self
->
v
('
Done
');
return
;
}
sub
_get_taxonomy
{
my
(
$self
,
$taxon
)
=
@_
;
my
$dbc
=
$self
->
_taxonomy_dbc
();
my
%aliases
;
my
$query_hash
=
$dbc
->
sql_helper
()
->
execute_into_hash
(
-
SQL
=>
'
select name_class, name from ncbi_taxa_name where taxon_id =?
',
-
PARAMS
=>
[
$taxon
],
-
CALLBACK
=>
sub
{
my
(
$row
,
$value
)
=
@_
;
my
(
$class
,
$name
)
=
@
{
$row
};
return
{}
if
$class
eq
'
authority
';
return
{}
if
$class
eq
'
merged_taxon_id
';
$aliases
{
lc
(
$name
)}
=
1
;
if
(
defined
$value
)
{
push
(
@
{
$value
},
$name
);
}
return
[
$name
];
}
);
my
%return_hash
;
$return_hash
{
active_aliases
}
=
[
sort
keys
%aliases
];
$return_hash
{
inactive_aliases
}
=
[]
;
$return_hash
{
scientific_name
}
=
$query_hash
->
{'
scientific name
'}
->
[
0
]
if
$query_hash
->
{'
scientific name
'};
$return_hash
{
common_name
}
=
$query_hash
->
{'
genbank common name
'}
->
[
0
]
if
$query_hash
->
{'
genbank common name
'};
$return_hash
{
web_name
}
=
$query_hash
->
{'
common name
'}
->
[
0
]
if
$query_hash
->
{'
common name
'};
return
\
%return_hash
;
}
sub
_get_production
{
my
(
$self
,
$taxon_id
)
=
@_
;
my
$dbc
=
$self
->
_production_dbc
();
my
$hash
;
$dbc
->
sql_helper
()
->
execute_no_return
(
-
SQL
=>
'
select species_id, common_name, web_name, scientific_name, taxon from species where taxon =?
',
-
PARAMS
=>
[
$taxon_id
],
-
CALLBACK
=>
sub
{
my
(
$r
)
=
@_
;
my
$i
=
0
;
$hash
=
{
species_id
=>
$r
->
[
$i
++
],
common_name
=>
$r
->
[
$i
++
],
web_name
=>
$r
->
[
$i
++
],
scientific_name
=>
$r
->
[
$i
++
],
taxon
=>
$r
->
[
$i
++
]
};
}
);
my
$sql
=
'
select alias from species_alias where species_id =? and is_current = ?
';
$hash
->
{
active_aliases
}
=
$dbc
->
sql_helper
()
->
execute_simple
(
-
SQL
=>
$sql
,
-
PARAMS
=>
[
$hash
->
{
species_id
},
1
]);
$hash
->
{
inactive_aliases
}
=
$dbc
->
sql_helper
()
->
execute_simple
(
-
SQL
=>
$sql
,
-
PARAMS
=>
[
$hash
->
{
species_id
},
1
]);
return
$hash
;
}
sub
_decipher_aliases
{
my
(
$self
,
$source
,
$production
)
=
@_
;
my
%source_active_hash
=
map
{
$_
=>
1
}
@
{
$source
->
{
active_aliases
}};
my
%prod_active_hash
=
map
{
$_
=>
1
}
@
{
$production
->
{
active_aliases
}};
my
%prod_inactive_hash
=
map
{
$_
=>
1
}
@
{
$production
->
{
inactive_aliases
}};
my
@new_aliases
;
my
@reactivated_aliases
;
my
@retired_aliases
;
foreach
my
$source
(
keys
%source_active_hash
)
{
if
(
!
$prod_active_hash
{
$source
})
{
if
(
!
$prod_inactive_hash
{
$source
})
{
push
(
@new_aliases
,
$source
);
}
else
{
push
(
@reactivated_aliases
,
$source
);
}
}
}
foreach
my
$target
(
keys
%prod_active_hash
)
{
if
(
!
$source_active_hash
{
$target
})
{
push
(
@retired_aliases
,
$target
);
}
}
return
{
new
=>
\
@new_aliases
,
retired
=>
\
@retired_aliases
,
reactivated
=>
\
@reactivated_aliases
};
}
sub
_decipher_attributes
{
my
(
$self
,
$source
,
$target
)
=
@_
;
my
%final
;
my
@keys
=
qw/common_name web_name scientific_name/
;
foreach
my
$key
(
@keys
)
{
if
(
$target
->
{
$key
})
{
$final
{
$key
}
=
$target
->
{
$key
};
}
else
{
$final
{
$key
}
=
$source
->
{
$key
};
}
}
return
\
%final
;
}
sub
_update_attributes
{
my
(
$self
,
$source
,
$production
)
=
@_
;
my
$dbc
=
$self
->
_production_dbc
();
my
$h
=
$dbc
->
sql_helper
();
my
$attributes
=
$self
->
_decipher_attributes
(
$source
,
$production
);
my
@keys
=
keys
%
{
$attributes
};
my
$set_string
=
join
(
q{,}
,
map
{
"
$_
=?
"
}
@keys
);
my
@params
=
map
{
$attributes
->
{
$_
}
}
@keys
;
push
(
@params
,
$production
->
{
species_id
});
my
$sql
=
sprintf
('
update species set %s where species_id =?
',
$set_string
);
$h
->
execute_update
(
-
SQL
=>
$sql
,
-
PARAMS
=>
\
@params
);
return
;
}
sub
_update_species_aliases
{
my
(
$self
,
$source
,
$production
)
=
@_
;
my
$dbc
=
$self
->
_production_dbc
();
my
$h
=
$dbc
->
sql_helper
();
my
$aliases
=
$self
->
_decipher_aliases
(
$source
,
$production
);
my
$species_id
=
$production
->
{
species_id
};
foreach
my
$alias
(
@
{
$aliases
->
{
new
}})
{
my
$sql
=
'
insert into species_alias (species_id, alias, is_current, created_by, created_at) values (?,?,?,?,now())
';
$h
->
execute_update
(
-
SQL
=>
$sql
,
-
PARAMS
=>
[
$species_id
,
$alias
,
1
,
undef
]);
}
foreach
my
$alias
(
@
{
$aliases
->
{
reactivated
}})
{
my
$sql
=
'
update species_alias set is_current =?, modified_by =?, modified_at =now() where species_id =? and alias =?
';
$h
->
execute_update
(
-
SQL
=>
$sql
,
-
PARAMS
=>
[
1
,
undef
,
$species_id
,
$alias
]);
}
foreach
my
$alias
(
@
{
$aliases
->
{
retired
}})
{
my
$sql
=
'
update species_alias set is_current =?, modified_by =?, modified_at =now() where species_id =? and alias =?
';
$h
->
execute_update
(
-
SQL
=>
$sql
,
-
PARAMS
=>
[
0
,
undef
,
$species_id
,
$alias
]);
}
return
;
}
sub
_taxonomy_dbc
{
my
(
$self
)
=
@_
;
my
$o
=
$self
->
{
opts
};
my
%args
=
(
-
HOST
=>
$o
->
{
host
},
-
PORT
=>
$o
->
{
port
},
-
USER
=>
$o
->
{
user
},
-
DBNAME
=>
$o
->
{
database
}
);
$args
{
-
PASS
}
=
$o
->
{
pass
}
if
$o
->
{
pass
};
return
Bio::EnsEMBL::DBSQL::
DBConnection
->
new
(
%args
);
}
sub
_production_dbc
{
my
(
$self
)
=
@_
;
my
$o
=
$self
->
{
opts
};
my
%args
=
(
-
HOST
=>
$o
->
{
mhost
},
-
PORT
=>
$o
->
{
mport
},
-
DBNAME
=>
$o
->
{
mdatabase
},
-
USER
=>
$o
->
{
muser
}
);
$args
{
-
PASS
}
=
$o
->
{
mpass
}
if
$o
->
{
mpass
};
return
Bio::EnsEMBL::DBSQL::
DBConnection
->
new
(
%args
);
}
sub
v
{
my
(
$self
,
$msg
,
@args
)
=
@_
;
return
unless
$self
->
{
opts
}
->
{
verbose
};
my
$s_msg
=
sprintf
(
$msg
,
@args
);
my
(
$sec
,
$min
,
$hour
,
$mday
,
$mon
,
$year
,
$wday
,
$yday
,
$isdst
)
=
localtime
(
time
());
print
sprintf
("
[%02d-%02d-%04d %02d:%02d:%02d] %s
\n
",
$mday
,
$mon
,
$year
+
1900
,
$hour
,
$min
,
$sec
,
$s_msg
);
return
;
}
__PACKAGE__
->
run
();
__END__
=pod
=head1 NAME
import_species_names_from_taxonomy.pl
=head1 SYNOPSIS
./import_species_names_from_taxonomy.pl -h host [-P port] \\
-u user [-p password]
-d database \\
[-mh host] [-mP port] \\
[-mu user] [-mp password] [-md database] \\
[-taxon id]
[-v]
=head1 DESCRIPTION
TODO
=head1 OPTIONS
=over 8
=item B<-h|--host>
Host of the taxonomy database
=item B<-P|--port>
Port of the taxonomy database
=item B<-u|--user>
Username for the taxonomy database
=item B<-p|--pass>
Password for the taxonomy database
=item B<-mh|--mhost>
Host for the production database
=item B<-mP|--mport>
Port for the production database
=item B<-mu|--muser>
User for the production database
=item B<-mp|--mpass>
Pass for the production database
=item B<--taxon>
Allows for the update of a single species by specifying its taxonomy id. This
is useful for new species updates.
=item B<--verbose>
Make the script chatty
=item B<--help>
Help message
=item B<--man>
Man page
=back
=cut
\ No newline at end of file
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment