"""
Copyright [2009-2017] EMBL-European Bioinformatics Institute
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
     http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
"""

import unittest

from django.test import SimpleTestCase

from django_performance_testing.queries import QueryBatchLimit
from django_performance_testing.timing import TimeLimit

from portal.models import Rna

from portal.utils.descriptions import rule_method as rm

# Usage notes for this module. Assigned explicitly because the license text
# is the first string literal in the file and so takes the docstring slot.
__doc__ = """
To run these tests you can simply do:

$ python manage.py test portal.tests.description_tests
"""


class GenericDescriptionTest(SimpleTestCase):
    """Shared assertion helpers for the description test cases below.

    Each helper loads an ``Rna`` row by UPI, recomputes its description and
    compares it against an expected value.
    """

    # The helpers query ``Rna.objects``, so this SimpleTestCase opts in to
    # database access.
    allow_database_queries = True

    def _compute_description(self, upi, taxid, time_limit):
        """Return the freshly recomputed description for ``upi``.

        The ``Rna`` lookup runs outside the performance limits; only the
        ``get_description`` call is wrapped in
        ``QueryBatchLimit(write=0, read=10)`` and
        ``TimeLimit(total=time_limit)``.
        """
        seq = Rna.objects.get(upi=upi)
        with QueryBatchLimit(write=0, read=10), TimeLimit(total=time_limit):
            return seq.get_description(taxid=taxid, recompute=True)

    def assertDescriptionIs(self, description, upi, taxid=None):
        """Assert the recomputed description of ``upi`` equals ``description``.

        ``taxid`` is forwarded unchanged to ``Rna.get_description``.
        """
        # NOTE(review): this helper allows total=50 while
        # assertDescriptionContains allows total=5 -- confirm the difference
        # is intentional.
        computed = self._compute_description(upi, taxid, 50)
        # assertEquals is a deprecated alias of assertEqual.
        self.assertEqual(description, computed)

    def assertDescriptionContains(self, short, upi, taxid=None):
        """Assert ``short`` occurs in the recomputed description of ``upi``.

        ``taxid`` is forwarded unchanged to ``Rna.get_description``.
        """
        description = self._compute_description(upi, taxid, 5)
        self.assertIn(short, description)


class SimpleDescriptionTests(GenericDescriptionTest):
    """Description checks for a few individual sequences."""

    def test_handles_bantam(self):
        """
        Descriptions should select a name with 'dme-bantam' in it for the
        precursor and both products.
        """
        cases = (
            ('dme-bantam', 'URS00002F21DA'),
            ('dme-bantam-3p', 'URS00004E9E38'),
            ('dme-bantam-5p', 'URS000055786A'),
        )
        for short, upi in cases:
            self.assertDescriptionContains(short, upi, taxid=7227)

    # The body has no assertions yet, so report it as skipped rather than as
    # a passing test.
    @unittest.skip("Placeholder: no assertions written yet")
    def test_handles_ribozymes(self):
        """
        We should pick a good, not repetitive name for ribozymes.
        """
        pass

    def test_can_handle_trnas(self):
        self.assertDescriptionIs(
            'Escherichia coli tRNA-Pro (CGG)',
            'URS00001DEEBE',
            taxid=562)

    def test_does_not_truncate_rna_types_incorrectly(self):
        self.assertDescriptionIs(
            'Danio rerio tRNA',
            'URS0000661037',
            taxid=7955)


class HumanDescriptionTests(GenericDescriptionTest):
    """Expected descriptions for sequences looked up with taxid=9606."""

    def test_likes_hgnc_for_human(self):
        self.assertDescriptionIs(
            'Homo sapiens DiGeorge syndrome critical region gene 9 (DGCR9)',
            'URS0000759BEC',
            taxid=9606)

        self.assertDescriptionIs(
            'Homo sapiens STARD4 antisense RNA 1 (STARD4-AS1)',
            'URS00003CE153',
            taxid=9606)

        # NOTE: This is a bit questionable, there are other names which may
        # possibly be better
        self.assertDescriptionIs(
            'Homo sapiens small Cajal body-specific RNA 10 (SCARNA10)',
            'URS0000569A4A',
            taxid=9606)

    def test_uses_ensembl_names(self):
        self.assertDescriptionIs(
            'Homo sapiens long intergenic non-protein coding RNA 1729 (LINC01729)',
            'URS00003BECAC',
            taxid=9606)

    def test_prefers_mirbase_for_mirna_precursor(self):
        self.assertDescriptionIs(
            'Homo sapiens (human) microRNA precursor (hsa-mir-3648-1, hsa-mir-3648-2)',
            'URS000075A546',
            taxid=9606)

        # The commented-out string below appears to be an alternative, fully
        # expanded rendering of the same gene list; kept for reference.
        self.assertDescriptionIs(
            'Homo sapiens (human) microRNA precursor (hsa-mir-1302-2, hsa-mir-1302 9 to 11)',
            # 'Homo sapiens (human) microRNA precursor (hsa-mir-1302-2, hsa-mir-1302-9, hsa-mir-1302-10, hsa-mir-1302-11)',
            'URS000075CC93',
            taxid=9606)

    def test_includes_gene_name_for_hgnc(self):
        self.assertDescriptionIs(
            'Homo sapiens HOX transcript antisense RNA (HOTAIR)',
            'URS000075C808',
            taxid=9606
        )

    def test_will_indicate_several_genes_for_hgnc(self):
        # The commented-out string appears to be an alternative expected
        # value; kept for reference.
        self.assertDescriptionIs(
            # 'Homo sapiens RNA, 5S ribosomal (RNA5S1-8, RNA5S 10-17)',
            'Homo sapiens RNA, 5S ribosomal 1 (RNA5S1-8, RNA5S10-17)',
            'URS00000F9D45',
            taxid=9606
        )

        self.assertDescriptionIs(
            'Homo sapiens RNA, 45S pre-ribosomal 4 (RNA45S4, RNA45SN1)',
            'URS0000ABD87F',
            taxid=9606
        )

    def test_it_will_use_hgnc_over_gencode(self):
        self.assertDescriptionIs(
            'Homo sapiens HELLP associated long non-coding RNA (HELLPAR)',
            'URS000019E0CD',
            taxid=9606)

    # This was ``@pytest.mark.skip()``, but ``pytest`` is never imported in
    # this module, so evaluating the decorator raised NameError while the
    # class body was executed. ``unittest.skip`` is already in scope and is
    # what the other skipped tests in this file use.
    @unittest.skip("Disabled; originally skipped without a stated reason")
    def test_uses_snopy_descriptions(self):
        self.assertDescriptionIs(
            'Homo sapiens (human) small nucleolar RNA SNORD118L8',
            'URS00006CE02F',
            taxid=9606)


class ArabidopisDescriptionTests(GenericDescriptionTest):
    """Expected descriptions for sequences looked up with taxid=3702."""

    def test_likes_lncrnadb_over_ena(self):
        expected = 'Arabidopsis thaliana (thale cress) Long non-coding antisense RNA COOLAIR'
        self.assertDescriptionIs(expected, 'URS000018EB2E', taxid=3702)

    def test_uses_ena_if_only_source(self):
        expected = 'Arabidopsis thaliana (thale cress) tRNA-Met(CAT)'
        self.assertDescriptionIs(expected, 'URS00005F4CAF', taxid=3702)

    def test_it_uses_most_common_ena_name(self):
        expected = 'Arabidopsis thaliana (thale cress) partial tRNA-Leu'
        self.assertDescriptionIs(expected, 'URS000048B30C', taxid=3702)

    def test_uses_rfam_if_only_source(self):
        expected = 'Arabidopsis thaliana tRNA'
        self.assertDescriptionIs(expected, 'URS00006A2469', taxid=3702)

    def test_it_uses_tair_over_refseq(self):
        expected = 'Arabidopsis thaliana (thale cress) TAS3/TASIR-ARF (TRANS-ACTING SIRNA3); other RNA'
        self.assertDescriptionIs(expected, 'URS00003AC4AA', taxid=3702)


class MouseDescriptionTests(GenericDescriptionTest):
    """Expected descriptions for sequences looked up with taxid=10090."""

    @unittest.expectedFailure
    def test_likes_refseq_over_noncode(self):
        """Currently fails because Rfam and RefSeq disagree on the rna_type.

        Rfam's rna_type is used because it is generally right, but here that
        leads to a worse name. That said, after examining the data for the
        RefSeq annotation, the author is inclined to believe Rfam over RefSeq
        in this case.
        """
        expected = 'Mus musculus small Cajal body-specific RNA 1 (Scarna13), guide RNA'
        self.assertDescriptionIs(expected, 'URS00006550DA', taxid=10090)

    def test_likes_refseq_over_ena(self):
        cases = (
            ('Mus musculus small nucleolar RNA, C/D box 17 (Snord17), small nucleolar RNA',
             'URS000051DCEC'),
            ('Mus musculus small nucleolar RNA, H/ACA box 3 (Snora3), small nucleolar RNA',
             'URS000060B496'),
        )
        for expected, upi in cases:
            self.assertDescriptionIs(expected, upi, taxid=10090)

    def test_will_not_build_using_rfam_if_disagree_rna_type(self):
        # A good candidate for using Rfam hits as a source of RNA types: if
        # that were done, this would be a snoRNA and have a different name.
        expected = 'Mus musculus predicted gene 12238 (Gm12238)'
        self.assertDescriptionIs(expected, 'URS00004E52D3', taxid=10090)

    def test_it_will_strip_trailing_terms_from_description(self):
        expected = 'Mus musculus predicted gene 11532 (Gm11532), long non-coding RNA'
        self.assertDescriptionIs(expected, 'URS00008E3A1B', taxid=10090)

    def test_it_likes_rfam_over_noncode(self):
        expected = 'Mus musculus small Cajal body-specific RNA 2 (Scarna2)'
        self.assertDescriptionIs(expected, 'URS00006B3271', taxid=10090)

        # Disabled second case, kept for reference:
        # self.assertDescriptionIs(
        #     'Mus musculus Small Cajal body-specific RNA 6',
        #     'URS0000653D5F',
        #     taxid=10090)

    def test_does_not_add_duplicate_gene_names(self):
        expected = 'Mus musculus predicted gene 29254 (Gm29254)'
        self.assertDescriptionIs(expected, 'URS0000A86584', taxid=10090)


class CattleDescriptionTests(GenericDescriptionTest):
    """Expected descriptions for sequences looked up with taxid=9913."""

    def test_likes_name_with_precursor(self):
        expected = 'Bos taurus (cattle) microRNA 431 precursor'
        self.assertDescriptionIs(expected, 'URS00007150F8', taxid=9913)

    @unittest.skip("No examples yet")
    def test_will_use_gtRNAdb(self):
        # Placeholder until an example sequence is available.
        pass

    def test_use_pdb_over_rfam(self):
        expected = 'transfer RNA-Trp from Bos taurus (PDB 2AKE, chain B)'
        self.assertDescriptionIs(expected, 'URS0000669B12', taxid=9913)

    def test_will_use_mirbase_for_precusors(self):
        cases = (
            ('Bos taurus (cattle) microRNA bta-mir-497 precursor', 'URS00006D80BC'),
            ('Bos taurus (cattle) microRNA bta-mir-10a precursor', 'URS000075CF25'),
        )
        for expected, upi in cases:
            self.assertDescriptionIs(expected, upi, taxid=9913)

    def test_likes_refseq_over_ena(self):
        expected = 'Bos taurus telomerase RNA component (TERC), telomerase RNA'
        self.assertDescriptionIs(expected, 'URS00003EBD9A', taxid=9913)


class WormTests(GenericDescriptionTest):
    """Expected descriptions for sequences looked up with taxid=6239."""

    def test_likes_wormbase(self):
        expected = 'Caenorhabditis elegans long non-coding RNA linc-125'
        self.assertDescriptionIs(expected, 'URS00005511ED', taxid=6239)

    def test_likes_uses_wormbase_over_good_silva(self):
        expected = 'Caenorhabditis elegans 26s rRNA'
        self.assertDescriptionIs(expected, 'URS00004FB44B', taxid=6239)

    # NOTE: this choice may be questionable. WormBase has a fairly specific
    # looking name, but the miRBase one looks better to the author. Need to
    # see what people in the worm/miRNA field prefer.
    def test_it_likes_mirbase_over_wormbase(self):
        expected = 'Caenorhabditis elegans microRNA cel-miR-229-5p'
        self.assertDescriptionIs(expected, 'URS0000466DE6', taxid=6239)

    def test_likes_wormbase_over_rfam(self):
        cases = (
            ('Caenorhabditis elegans tRNA-Undet', 'URS00006DC8B9'),
            ('Caenorhabditis elegans tRNA-His', 'URS000069D7FA'),
        )
        for expected, upi in cases:
            self.assertDescriptionIs(expected, upi, taxid=6239)


class LargeDataTests(GenericDescriptionTest):
    """Stress-style checks for sequences with very many cross references."""

    @unittest.skip("No examples yet")
    def test_can_compute_when_many_cross_ref(self):
        """Stress test: check this still performs quickly for a sequence
        with many (~10k) xrefs.
        """
        expected = 'tRNA from 3413 species'
        upi = 'URS0000181AEC'
        self.assertDescriptionIs(expected, upi)


def test_can_strip_trailing_dots():
    """A trailing '.' is removed by remove_extra_description_terms."""
    raw = 'Mus musculus predicted gene 11532 (Gm11532), long non-coding RNA.'
    expected = 'Mus musculus predicted gene 11532 (Gm11532), long non-coding RNA'
    cleaned = rm.remove_extra_description_terms(raw)
    assert cleaned == expected


def test_can_strip_extra_lnc_rna():
    """With rna_type 'lncRNA', the trailing ', long non-coding RNA' is trimmed."""
    raw = 'Mus musculus predicted gene 11532 (Gm11532), long non-coding RNA'
    expected = 'Mus musculus predicted gene 11532 (Gm11532)'
    trimmed = rm.trim_trailing_rna_type('lncRNA', raw)
    assert trimmed == expected


def test_can_strip_extra_lnc_rna_when_antisense():
    """With rna_type 'antisense', the trailing ', long non-coding RNA' is trimmed."""
    raw = 'Mus musculus predicted gene 11532 (Gm11532), long non-coding RNA'
    expected = 'Mus musculus predicted gene 11532 (Gm11532)'
    trimmed = rm.trim_trailing_rna_type('antisense', raw)
    assert trimmed == expected


def test_will_not_strip_if_rna_type_differs():
    """With rna_type 'SRP_RNA', the lncRNA suffix is left untouched."""
    raw = 'Mus musculus predicted gene 11532 (Gm11532), long non-coding RNA'
    trimmed = rm.trim_trailing_rna_type('SRP_RNA', raw)
    assert trimmed == raw


def test_will_strip_non_coding_term_with_whitespace():
    """'(non- protein coding)', with the stray space, is removed."""
    raw = 'Mus musculus myocardial infarction associated transcript (non- protein coding)'
    expected = 'Mus musculus myocardial infarction associated transcript'
    cleaned = rm.remove_extra_description_terms(raw)
    assert cleaned == expected


def test_will_strip_non_coding_term():
    """'(non-protein coding)' is removed from the end of the description."""
    raw = 'Mus musculus myocardial infarction associated transcript (non-protein coding)'
    expected = 'Mus musculus myocardial infarction associated transcript'
    cleaned = rm.remove_extra_description_terms(raw)
    assert cleaned == expected


def test_will_correct_tmrna():
    """'transfer-messenger mRNA' is rewritten to 'transfer-messenger RNA'."""
    raw = 'Homo sapiens (human) transfer-messenger mRNA Esche_coli_K12'
    expected = 'Homo sapiens (human) transfer-messenger RNA Esche_coli_K12'
    cleaned = rm.remove_extra_description_terms(raw)
    assert cleaned == expected