Unverified Commit 6eb971c5 authored by Andrey Azov's avatar Andrey Azov Committed by GitHub
Browse files

Download sequences in plain text and transform them to FASTA in web worker (#450)

parent bccc4446
......@@ -8122,6 +8122,12 @@
"delayed-stream": "~1.0.0"
}
},
"comlink": {
"version": "4.3.0",
"resolved": "https://registry.npmjs.org/comlink/-/comlink-4.3.0.tgz",
"integrity": "sha512-mu4KKKNuW8TvkfpW/H88HBPeILubBS6T94BdD1VWBXNXfiyqVtwUCVNO1GeNOBTsIswzsMjWlycYr+77F5b84g==",
"dev": true
},
"comma-separated-tokens": {
"version": "1.0.8",
"resolved": "https://registry.npmjs.org/comma-separated-tokens/-/comma-separated-tokens-1.0.8.tgz",
......@@ -28511,6 +28517,37 @@
"errno": "~0.1.7"
}
},
"worker-plugin": {
"version": "5.0.0",
"resolved": "https://registry.npmjs.org/worker-plugin/-/worker-plugin-5.0.0.tgz",
"integrity": "sha512-AXMUstURCxDD6yGam2r4E34aJg6kW85IiaeX72hi+I1cxyaMUtrvVY6sbfpGKAj5e7f68Acl62BjQF5aOOx2IQ==",
"dev": true,
"requires": {
"loader-utils": "^1.1.0"
},
"dependencies": {
"json5": {
"version": "1.0.1",
"resolved": "https://registry.npmjs.org/json5/-/json5-1.0.1.tgz",
"integrity": "sha512-aKS4WQjPenRxiQsC93MNfjx+nbF4PAdYzmd/1JIj8HYzqfbu86beTuNgXDzPknWk0n0uARlyewZo4s++ES36Ow==",
"dev": true,
"requires": {
"minimist": "^1.2.0"
}
},
"loader-utils": {
"version": "1.4.0",
"resolved": "https://registry.npmjs.org/loader-utils/-/loader-utils-1.4.0.tgz",
"integrity": "sha512-qH0WSMBtn/oHuwjy/NucEgbx5dbxxnxup9s4PVXJUDHZBQY+s0NWA9rJf53RBnQZxfch7euUui7hpoAPvALZdA==",
"dev": true,
"requires": {
"big.js": "^5.2.2",
"emojis-list": "^3.0.0",
"json5": "^1.0.1"
}
}
}
},
"worker-rpc": {
"version": "0.1.1",
"resolved": "https://registry.npmjs.org/worker-rpc/-/worker-rpc-0.1.1.tgz",
......@@ -117,6 +117,7 @@
"babel-loader": "8.1.0",
"babel-plugin-react-remove-properties": "0.3.0",
"brotli-webpack-plugin": "1.1.0",
"comlink": "4.3.0",
"compression-webpack-plugin": "5.0.2",
"connect-history-api-fallback": "1.6.0",
"copy-webpack-plugin": "6.1.0",
......@@ -170,7 +171,8 @@
"webpack-cli": "3.3.12",
"webpack-dev-server": "3.11.0",
"webpack-merge": "5.1.4",
"workbox-webpack-plugin": "5.1.4"
"workbox-webpack-plugin": "5.1.4",
"worker-plugin": "5.0.0"
},
"browserslist": [
"> 1% and last 2 versions",
......
......@@ -14,6 +14,8 @@
* limitations under the License.
*/
import { wrap } from 'comlink';
import downloadAsFile from 'src/shared/helpers/downloadAsFile';
import {
ProteinOptions,
......@@ -21,10 +23,15 @@ import {
proteinOptionsOrder
} from 'src/shared/components/instant-download/instant-download-protein/InstantDownloadProtein';
import {
fetchTranscriptChecksums,
TranscriptChecksums
fetchTranscriptSequenceMetadata,
TranscriptSequenceMetadata
} from './fetchSequenceChecksums';
import {
WorkerApi,
SingleSequenceFetchParams
} from 'src/shared/workers/sequenceFetcher.worker';
type FetchPayload = {
genomeId: string;
transcriptId: string;
......@@ -33,47 +40,60 @@ type FetchPayload = {
export const fetchForProtein = async (payload: FetchPayload) => {
const { genomeId, transcriptId, options } = payload;
const productGeneratingContext = await fetchTranscriptChecksums({
const transcriptSequenceData = await fetchTranscriptSequenceMetadata({
genomeId,
transcriptId
});
const urls = buildUrlsForProtein(productGeneratingContext, options);
const sequencePromises = urls.map((url) =>
fetch(url).then((response) => response.text())
);
const sequenceDownloadParams = prepareDownloadParameters({
transcriptSequenceData,
options
});
const worker = new Worker('src/shared/workers/sequenceFetcher.worker', {
type: 'module'
});
const sequences = await Promise.all(sequencePromises);
const combinedFasta = sequences.join('\n\n');
const service = wrap<WorkerApi>(worker);
downloadAsFile(combinedFasta, `${transcriptId}.fasta`, {
const sequences = await service.downloadSequences(sequenceDownloadParams);
worker.terminate();
downloadAsFile(sequences, `${transcriptId}.fasta`, {
type: 'text/x-fasta'
});
};
const buildUrlsForProtein = (
productGeneratingContext: TranscriptChecksums,
options: ProteinOptions
) => {
return options
? proteinOptionsOrder
.filter((option) => options[option])
.map((option) => buildFetchUrl(productGeneratingContext, option))
: [];
// Input of prepareDownloadParameters
type PrepareDownloadParametersParams = {
// per-sequence-type checksum and label, as resolved by fetchTranscriptSequenceMetadata
transcriptSequenceData: TranscriptSequenceMetadata;
// options received from the instant download component; truthy entries are the sequences to download
options: ProteinOptions;
};
const buildFetchUrl = (
productGeneratingContext: TranscriptChecksums,
sequenceType: ProteinOption
) => {
const sequenceTypeToContextType: Record<ProteinOption, string> = {
proteinSequence: 'product',
cds: 'cds'
};
const contextType = sequenceTypeToContextType[
sequenceType
] as keyof TranscriptChecksums;
const checksum = productGeneratingContext[contextType]?.sequence_checksum;
/**
 * Builds the list of { label, url } pairs describing the sequences to download.
 *
 * Walks proteinOptionsOrder so that the output keeps that order, skips options that are not
 * enabled, and skips sequence types for which no metadata is available.
 */
const prepareDownloadParameters = (params: PrepareDownloadParametersParams) => {
  const { transcriptSequenceData, options } = params;
  const downloadParameters: SingleSequenceFetchParams[] = [];

  for (const selectedOption of proteinOptionsOrder) {
    if (!options[selectedOption]) {
      continue;
    }

    // translate the component's option name into the metadata field name: 'protein', 'cds'
    const sequenceType = labelTypeToSequenceType[selectedOption];
    const sequenceMetadata = transcriptSequenceData[sequenceType];

    if (!sequenceMetadata) {
      // not expected to happen; the guard exists because metadata fields are optional
      continue;
    }

    downloadParameters.push({
      label: sequenceMetadata.label,
      url: `/api/refget/sequence/${sequenceMetadata.checksum}?accept=text/plain`
    });
  }

  return downloadParameters;
};
return `/api/refget/sequence/${checksum}?accept=text/x-fasta`;
// Lookup table translating the option names received from the component (keys, ProteinOption)
// into the field names of TranscriptSequenceMetadata (values), i.e. the shape returned when fetching checksums.
// prepareDownloadParameters uses it to pick the metadata entry for every enabled option.
const labelTypeToSequenceType: Record<
ProteinOption,
keyof TranscriptSequenceMetadata
> = {
proteinSequence: 'protein',
cds: 'cds'
};
......@@ -14,6 +14,8 @@
* limitations under the License.
*/
import { wrap } from 'comlink';
import downloadAsFile from 'src/shared/helpers/downloadAsFile';
import {
......@@ -22,10 +24,15 @@ import {
transcriptOptionsOrder
} from 'src/shared/components/instant-download/instant-download-transcript/InstantDownloadTranscript';
import {
fetchTranscriptChecksums,
TranscriptChecksums
fetchTranscriptSequenceMetadata,
TranscriptSequenceMetadata
} from './fetchSequenceChecksums';
import {
WorkerApi,
SingleSequenceFetchParams
} from 'src/shared/workers/sequenceFetcher.worker';
type Options = {
transcript: Partial<TranscriptOptions>;
gene: {
......@@ -47,64 +54,77 @@ export const fetchForTranscript = async (payload: FetchPayload) => {
transcriptId,
options: { transcript: transcriptOptions, gene: geneOptions }
} = payload;
const checksums = await fetchTranscriptChecksums({
const transcriptSequenceData = await fetchTranscriptSequenceMetadata({
genomeId,
transcriptId
});
const urls = buildUrlsForTranscript({ geneId, checksums }, transcriptOptions);
const sequenceDownloadParams = prepareDownloadParameters({
transcriptId,
transcriptSequenceData,
options: transcriptOptions
});
if (geneOptions.genomicSequence) {
urls.push(buildFetchUrl({ geneId }, 'genomicSequence'));
sequenceDownloadParams.push(getGenomicSequenceData(geneId));
}
const sequencePromises = urls.map((url) =>
fetch(url).then((response) => response.text())
);
const sequences = await Promise.all(sequencePromises);
const combinedFasta = sequences.join('\n\n');
const worker = new Worker('src/shared/workers/sequenceFetcher.worker', {
type: 'module'
});
const service = wrap<WorkerApi>(worker);
const sequences = await service.downloadSequences(sequenceDownloadParams);
downloadAsFile(combinedFasta, `${transcriptId}.fasta`, {
worker.terminate();
downloadAsFile(sequences, `${transcriptId}.fasta`, {
type: 'text/x-fasta'
});
};
const buildUrlsForTranscript = (
data: {
geneId: string;
checksums: TranscriptChecksums;
},
options: Partial<TranscriptOptions>
) => {
return options
? transcriptOptionsOrder
.filter((option) => options[option])
.map((option) => buildFetchUrl(data, option))
: [];
// Input of prepareDownloadParameters
type PrepareDownloadParametersParams = {
// id passed to getGenomicSequenceData when the genomic sequence option is enabled
transcriptId: string;
// per-sequence-type checksum and label, as resolved by fetchTranscriptSequenceMetadata
transcriptSequenceData: TranscriptSequenceMetadata;
// transcript options received from the component; truthy entries are the sequences to download
options: Partial<TranscriptOptions>;
};
const buildFetchUrl = (
data: {
geneId: string;
checksums?: TranscriptChecksums;
},
sequenceType: TranscriptOption
) => {
const sequenceTypeToContextType: Record<TranscriptOption, string> = {
genomicSequence: 'genomic',
proteinSequence: 'product',
cdna: 'cdna',
cds: 'cds'
};
// Lookup table translating the option names received from the component (keys, TranscriptOption)
// into the field names of TranscriptSequenceMetadata (values), i.e. the shape returned when fetching checksums.
// 'genomic' is not a field of TranscriptSequenceMetadata, hence the widened value type;
// prepareDownloadParameters handles it separately through getGenomicSequenceData.
const labelTypeToSequenceType: Record<
TranscriptOption,
keyof TranscriptSequenceMetadata | 'genomic'
> = {
genomicSequence: 'genomic',
proteinSequence: 'protein',
cdna: 'cdna',
cds: 'cds'
};
if (sequenceType === 'genomicSequence') {
return `https://rest.ensembl.org/sequence/id/${data.geneId}?content-type=text/x-fasta&type=${sequenceTypeToContextType.genomicSequence}`;
} else {
const contextType = sequenceTypeToContextType[
sequenceType
] as keyof TranscriptChecksums;
const checksum =
data.checksums && data.checksums[contextType]?.sequence_checksum;
/**
 * Builds the list of { label, url } pairs describing the sequences to download.
 *
 * Walks transcriptOptionsOrder so that the output keeps that order and skips options that are
 * not enabled. The genomic sequence is described via getGenomicSequenceData; every other sequence
 * type is addressed by its checksum, and is skipped if no metadata is available for it.
 */
const prepareDownloadParameters = (params: PrepareDownloadParametersParams) => {
  const { transcriptId, transcriptSequenceData, options } = params;
  const downloadParameters: SingleSequenceFetchParams[] = [];

  for (const selectedOption of transcriptOptionsOrder) {
    if (!options[selectedOption]) {
      continue;
    }

    // translate the component's option name: 'genomic', 'protein', 'cdna', 'cds'
    const sequenceType = labelTypeToSequenceType[selectedOption];

    if (sequenceType === 'genomic') {
      downloadParameters.push(getGenomicSequenceData(transcriptId));
      continue;
    }

    const sequenceMetadata = transcriptSequenceData[sequenceType];

    if (!sequenceMetadata) {
      // not expected to happen; the guard exists because metadata fields are optional
      continue;
    }

    downloadParameters.push({
      label: sequenceMetadata.label,
      url: `/api/refget/sequence/${sequenceMetadata.checksum}?accept=text/plain`
    });
  }

  return downloadParameters;
};
return `/api/refget/sequence/${checksum}?accept=text/x-fasta`;
}
/**
 * Describes the download of a genomic sequence for the given id:
 * a label of the form "<id> genomic" plus the rest.ensembl.org url requesting the sequence as plain text.
 */
const getGenomicSequenceData = (id: string) => ({
  label: `${id} genomic`,
  url: `https://rest.ensembl.org/sequence/id/${id}?content-type=text/plain&type=genomic`
});
......@@ -18,21 +18,35 @@ import { gql } from '@apollo/client';
import { client } from 'src/gql-client';
export type TranscriptChecksums = {
cdna: {
sequence_checksum: string;
export type TranscriptSequenceMetadata = {
cdna?: {
checksum: string;
label: string;
};
cds: {
sequence_checksum: string;
cds?: {
checksum: string;
label: string;
};
product: {
sequence_checksum: string;
protein?: {
checksum: string;
label: string;
};
};
type GeneFragment = {
type TranscriptQueryResult = {
transcript: {
product_generating_contexts: TranscriptChecksums[];
product_generating_contexts: Array<{
cdna: {
sequence_checksum: string;
};
cds: {
sequence_checksum: string;
};
product: {
stable_id: string;
sequence_checksum: string;
};
}>;
};
};
......@@ -47,6 +61,7 @@ const transcriptChecksumsQuery = gql`
sequence_checksum
}
product {
stable_id
sequence_checksum
}
}
......@@ -59,10 +74,37 @@ type Variables = {
transcriptId: string;
};
export const fetchTranscriptChecksums = (variables: Variables) =>
client
.query<GeneFragment>({
export const fetchTranscriptSequenceMetadata = (
variables: Variables
): Promise<TranscriptSequenceMetadata> => {
const { transcriptId } = variables;
return client
.query<TranscriptQueryResult>({
query: transcriptChecksumsQuery,
variables
})
.then(({ data }) => data.transcript.product_generating_contexts[0]);
.then(({ data }) => {
// TODO: expect to fetch genomic sequence here as well when checksum becomes available
const productGeneratingContext =
data.transcript.product_generating_contexts[0];
if (!productGeneratingContext) {
return {};
}
return {
cdna: {
checksum: productGeneratingContext.cdna.sequence_checksum,
label: `${transcriptId} cdna`
},
cds: {
checksum: productGeneratingContext.cds.sequence_checksum,
label: `${transcriptId} cds`
},
protein: {
checksum: productGeneratingContext.product.sequence_checksum,
label: `${productGeneratingContext.product.stable_id} pep`
}
};
});
};
import { toFasta, LINE_LENGTH } from './fastaFormatter';
import random from 'lodash/random';
/**
 * Builds a random nucleotide sequence to serve as test input.
 *
 * @param length - number of characters the generated sequence should contain
 * @returns a string of exactly `length` characters drawn from the alphabet 'AGCT'
 */
const generateSequence = (length: number) => {
  const alphabet = 'AGCT';
  let sequence = '';

  // strict `<`: a `<=` bound would produce one character more than requested
  for (let i = 0; i < length; i++) {
    const characterIndex = Math.floor(Math.random() * alphabet.length);
    sequence += alphabet[characterIndex];
  }

  return sequence;
};
describe('fasta formatter', () => {
  it('formats raw sequence in the fasta format', () => {
    const sequenceLength = random(1, 600);
    const sequenceLabel = 'label for the sequence';
    const rawSequence = generateSequence(sequenceLength);

    const fastaFormattedSequence = toFasta(sequenceLabel, rawSequence);
    const [firstLine, ...sequenceLines] = fastaFormattedSequence.split('\n');

    // the first line is the header carrying the label
    expect(firstLine).toBe(`>${sequenceLabel}`);

    // expect(...) without a matcher verifies nothing; the matcher is what makes
    // this line-length check able to fail
    expect(sequenceLines.every((line) => line.length <= LINE_LENGTH)).toBe(
      true
    );

    // wrapping the sequence into lines must neither drop nor add characters
    expect(sequenceLines.join('')).toBe(rawSequence);
  });
});
export const LINE_LENGTH = 60; // line length in Ensembl refget implementations

/**
 * Renders a raw sequence as a FASTA record.
 *
 * @param sequenceLabel - text placed after the `>` on the header line
 * @param sequence - raw sequence without line breaks
 * @returns the header line followed by the sequence cut into rows of at most
 *   LINE_LENGTH characters, joined with newlines (no trailing newline)
 */
export const toFasta = (sequenceLabel: string, sequence: string) => {
  const lines = [`>${sequenceLabel}`];

  // cut the sequence into consecutive rows; the last one may be shorter
  for (let start = 0; start < sequence.length; start += LINE_LENGTH) {
    lines.push(sequence.slice(start, start + LINE_LENGTH));
  }

  return lines.join('\n');
};
/**
* See the NOTICE file distributed with this work for additional information
* regarding copyright ownership.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import { expose } from 'comlink';
import { toFasta } from 'src/shared/helpers/formatters/fastaFormatter';
// Description of one sequence to download
export type SingleSequenceFetchParams = {
// passed to toFasta as the label of the sequence
label: string;
// address the sequence text is fetched from
url: string;
};
// Argument of downloadSequences: one entry per sequence to download
export type SequenceFetcherParams = Array<SingleSequenceFetchParams>;
/**
 * Fetches every requested sequence as text, formats each one as a FASTA record,
 * and combines the records into a single string.
 *
 * All requests are started together; records appear in the output in the order of `params`.
 *
 * @param params - label and url of every sequence to download
 * @returns the FASTA records joined with single newlines
 * @throws Error if any of the requests completes with a non-ok (non-2xx) status;
 *   network failures reject with whatever error `fetch` produces
 */
const downloadSequences = async (params: SequenceFetcherParams) => {
  const sequencePromises = params.map(async ({ label, url }) => {
    const response = await fetch(url);

    if (!response.ok) {
      // the body of an error response is not sequence data and must not be formatted as FASTA
      throw new Error(
        `Failed to fetch sequence "${label}" from ${url}: ${response.status} ${response.statusText}`
      );
    }

    // strip surrounding whitespace (e.g. a trailing newline in the response body),
    // which would otherwise end up as an empty line in the output
    const sequence = (await response.text()).trim();

    return toFasta(label, sequence);
  });

  const sequences = await Promise.all(sequencePromises);

  // start new sequence on a new line; no empty lines allowed in FASTA files
  return sequences.join('\n');
};
// Functions offered by this worker; the object is handed to comlink's `expose` below
const workerApi = {
downloadSequences
};
// Type of the api object, exported so that code wrapping this worker with comlink can type the wrapper
export type WorkerApi = typeof workerApi;
expose(workerApi);
const path = require('path');
const postcssPresetEnv = require('postcss-preset-env');
const HtmlPlugin = require('html-webpack-plugin');
const MiniCssExtractPlugin = require('mini-css-extract-plugin');
const ForkTsCheckerPlugin = require('fork-ts-checker-webpack-plugin');
const WorkerPlugin = require("worker-plugin");
const { getPaths } = require('../paths');
const { isDevelopment } = require('./environment-detector');
......@@ -97,7 +97,9 @@ module.exports = (env) => {
filename: isDev ? 'index.html' : '../index.html',
template: paths.htmlTemplatePath,
publicPath: '/'
})
}),
new WorkerPlugin()
],
// configuration that allows us to not to use file extensions and shorten import paths (using aliases)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment