Skip to content
Snippets Groups Projects
Unverified Commit 6c62aa54 authored by Benjamin Wingfield's avatar Benjamin Wingfield Committed by GitHub
Browse files

Use FID and IID when working with sample IDs (#17)

* check IIDs are unique

* add duplicated samples exception test

* fix reference prefix in duplicate test

* fix FID handling

* fix testing for duplicates

* update --stu_filt_iid argument description

* add FID column to pcs file output

* update fam mask generation

* mixed up the generator arguments o_o

* major version bump
parent bc4aeeb4
No related branches found
Tags v1.0.0
No related merge requests found
[tool.poetry]
name = "fraposa-pgsc"
version = "0.1.1"
version = "1.0.0"
description = "Tools to perform ancestry projection to a reference dataset within the calculator pipeline (pgsc_calc)"
homepage = "https://github.com/PGScatalog/fraposa_pgsc"
authors = ["smlmbrt <sam.a.lambert@gmail.com>"]
......
......@@ -22,6 +22,8 @@ import logging
from sklearn.utils.extmath import randomized_svd
from typing import Union
from .sampleid import SampleID
def create_logger(out_filepref='fraposa'):
log = logging.getLogger()
log.handlers = [] # Avoid duplicated logs in interactive modes
......@@ -136,13 +138,17 @@ def read_bed(bed_filepref, dtype=np.int8, filt_iid=None):
p = len(bim)
n = len(fam)
if type(filt_iid) is list:
matched_ids = set(filt_iid).intersection(fam['iid'])
if filt_iid:
fam_ids = set(SampleID(x, y) for x, y in zip(fam['fid'], fam['iid'], strict=True))
matched_ids = filt_iid.intersection(fam_ids)
if len(matched_ids) == 0:
logging.error('ERROR: 0 / {} ids in filter list match the study dataset'.format(len(filt_iid)))
sys.exit(1)
raise ValueError(f"ERROR: 0 / {len(filt_iid)} ids in filter list match the study dataset")
elif len(fam_ids) != len(fam):
raise ValueError("Samples with duplicated FID + IID detected, please remove and retry")
bed = np.zeros(shape=(p, len(matched_ids)), dtype=dtype)
fam_mask = fam.iid.isin(matched_ids)
# in will call SampleID's __hash__ method which uses (fid, iid)
fam_mask = pd.Series((x in matched_ids for x in fam_ids), dtype=bool)
i_extract = np.where(fam_mask == True)
for (i, (snp, genotypes)) in enumerate(pyp):
bed[i,:] = genotypes[i_extract]
......@@ -338,9 +344,16 @@ def pca_stu(W, X_mean, X_std, method,
def _write_pcs(df_pcs, df_fam, colnames, filepref, output_fmt, stage='REFERENCE'):
pcs_ref = pd.DataFrame(data=df_pcs, index=df_fam['iid'], columns=colnames)
pcs_ref.index.name = 'IID'
pcs_ref.to_csv(filepref + '.pcs', sep='\t', header=True, index=True, float_format=output_fmt)
pcs_ref = pd.DataFrame(data=df_pcs, index=df_fam[["fid", "iid"]], columns=colnames)
pcs_ref.index = pd.MultiIndex.from_tuples(pcs_ref.index, names=['FID', 'IID'])
pcs_ref = pcs_ref.reset_index() # index to normal columns
# FID is always a string
if all(pcs_ref["FID"] == "0"):
# column is present but missing data
pcs_ref["FID"] = pcs_ref["IID"]
pcs_ref.to_csv(filepref + '.pcs', sep='\t', header=True, index=False, float_format=output_fmt)
logging.info('{} PC scores saved to {}.pcs'.format(stage, filepref))
......
#! /usr/bin/env python
import csv
import fraposa_pgsc.fraposa as fp
from fraposa_pgsc.sampleid import SampleID
import argparse
......@@ -8,7 +10,7 @@ def main():
parser = argparse.ArgumentParser()
parser.add_argument('ref_filepref', help='Prefix of the binary PLINK file for the reference samples.')
parser.add_argument('--stu_filepref', help='Prefix of the binary PLINK file for the study samples.')
parser.add_argument('--stu_filt_iid', help='File with list of IIDs to extract from the study file')
parser.add_argument('--stu_filt_iid', help='File with list of FIDs and IIDs to extract from the study file (bim format)')
parser.add_argument('--method', help='The method for PCA prediction. oadp: most accurate. adp: accurate but slow. sp: fast but inaccurate. Default is odap.')
parser.add_argument('--dim_ref', help='Number of PCs you need.')
parser.add_argument('--dim_stu', help='Number of PCs predicted for the study samples before doing the Procrustes transformation. Only needed for the oadp and adp methods. Default is 2*dim_ref.')
......@@ -29,13 +31,20 @@ def main():
dim_rand = None
dim_spikes = None
dim_spikes_max = None
stu_filt_iid = None
if args.stu_filepref:
stu_filepref = args.stu_filepref
out_filepref = stu_filepref
if args.stu_filt_iid:
stu_filt_iid = open(args.stu_filt_iid, 'r').read().strip().split('\n')
try:
with open(args.stu_filt_iid) as f:
reader = csv.reader(f, delimiter="\t")
stu_filt_iid = set(SampleID(x[0], x[1]) for x in list(reader))
except TypeError:
stu_filt_iid = None
except IndexError:
raise ValueError("Can't parse --stu_filt_iid file (it should be a plink fam file)")
if args.out:
out_filepref = args.out
if args.method:
......
class SampleID:
    """Sample identifier from a plink fam file: a (FID, IID) pair.

    A stored FID of "0" is treated as missing, in which case the IID is
    reported as the FID. Equality, hashing and repr all work on the
    resolved FID, not the raw stored value.
    """

    def __init__(self, fid, iid):
        self._fid = fid
        self._iid = iid

    @property
    def iid(self):
        """Individual ID, exactly as given to the constructor."""
        return self._iid

    @property
    def fid(self):
        """Family ID, falling back to the IID when the stored FID is "0"."""
        # "0" in the FID column means the family ID is missing
        return self._iid if self._fid == "0" else self._fid

    def __eq__(self, other):
        # Only SampleID instances are comparable; defer to the other operand otherwise
        if isinstance(other, SampleID):
            return self.fid == other.fid and self.iid == other.iid
        return NotImplemented

    def __hash__(self):
        # Hash the same (fid, iid) pair that __eq__ compares
        resolved = (self.fid, self.iid)
        return hash(resolved)

    def __repr__(self):
        cls_name = self.__class__.__name__
        return f"{cls_name}(fid={self.fid!r}, iid={self.iid!r})"
File added
This diff is collapsed.
samp001 samp001 0 0 0 -9
samp002 samp001 0 0 0 -9
samp003 samp001 0 0 0 -9
samp004 samp001 0 0 0 -9
samp005 samp001 0 0 0 -9
samp006 samp001 0 0 0 -9
samp007 samp001 0 0 0 -9
samp008 samp001 0 0 0 -9
samp009 samp001 0 0 0 -9
samp010 samp010 0 0 0 -9
samp011 samp011 0 0 0 -9
samp012 samp012 0 0 0 -9
samp013 samp013 0 0 0 -9
samp014 samp014 0 0 0 -9
samp015 samp015 0 0 0 -9
samp016 samp016 0 0 0 -9
samp017 samp017 0 0 0 -9
samp018 samp018 0 0 0 -9
samp019 samp019 0 0 0 -9
samp020 samp020 0 0 0 -9
samp021 samp021 0 0 0 -9
samp022 samp022 0 0 0 -9
samp023 samp023 0 0 0 -9
samp024 samp024 0 0 0 -9
samp025 samp025 0 0 0 -9
samp026 samp026 0 0 0 -9
samp027 samp027 0 0 0 -9
samp028 samp028 0 0 0 -9
samp029 samp029 0 0 0 -9
samp030 samp030 0 0 0 -9
samp031 samp031 0 0 0 -9
samp032 samp032 0 0 0 -9
samp033 samp033 0 0 0 -9
samp034 samp034 0 0 0 -9
samp035 samp035 0 0 0 -9
samp036 samp036 0 0 0 -9
samp037 samp037 0 0 0 -9
samp038 samp038 0 0 0 -9
samp039 samp039 0 0 0 -9
samp040 samp040 0 0 0 -9
samp041 samp041 0 0 0 -9
samp042 samp042 0 0 0 -9
samp043 samp043 0 0 0 -9
samp044 samp044 0 0 0 -9
samp045 samp045 0 0 0 -9
samp046 samp046 0 0 0 -9
samp047 samp047 0 0 0 -9
samp048 samp048 0 0 0 -9
samp049 samp049 0 0 0 -9
samp050 samp050 0 0 0 -9
samp051 samp051 0 0 0 -9
samp052 samp052 0 0 0 -9
samp053 samp053 0 0 0 -9
samp054 samp054 0 0 0 -9
samp055 samp055 0 0 0 -9
samp056 samp056 0 0 0 -9
samp057 samp057 0 0 0 -9
samp058 samp058 0 0 0 -9
samp059 samp059 0 0 0 -9
samp060 samp060 0 0 0 -9
samp061 samp061 0 0 0 -9
samp062 samp062 0 0 0 -9
samp063 samp063 0 0 0 -9
samp064 samp064 0 0 0 -9
samp065 samp065 0 0 0 -9
samp066 samp066 0 0 0 -9
samp067 samp067 0 0 0 -9
samp068 samp068 0 0 0 -9
samp069 samp069 0 0 0 -9
samp070 samp070 0 0 0 -9
samp071 samp071 0 0 0 -9
samp072 samp072 0 0 0 -9
samp073 samp073 0 0 0 -9
samp074 samp074 0 0 0 -9
samp075 samp075 0 0 0 -9
samp076 samp076 0 0 0 -9
samp077 samp077 0 0 0 -9
samp078 samp078 0 0 0 -9
samp079 samp079 0 0 0 -9
samp080 samp080 0 0 0 -9
samp081 samp081 0 0 0 -9
samp082 samp082 0 0 0 -9
samp083 samp083 0 0 0 -9
samp084 samp084 0 0 0 -9
samp085 samp085 0 0 0 -9
samp086 samp086 0 0 0 -9
samp087 samp087 0 0 0 -9
samp088 samp088 0 0 0 -9
samp089 samp089 0 0 0 -9
samp090 samp090 0 0 0 -9
samp091 samp091 0 0 0 -9
samp092 samp092 0 0 0 -9
samp093 samp093 0 0 0 -9
samp094 samp094 0 0 0 -9
samp095 samp095 0 0 0 -9
samp096 samp096 0 0 0 -9
samp097 samp097 0 0 0 -9
samp098 samp098 0 0 0 -9
samp099 samp099 0 0 0 -9
samp100 samp100 0 0 0 -9
samp101 samp101 0 0 0 -9
samp102 samp102 0 0 0 -9
samp103 samp103 0 0 0 -9
samp104 samp104 0 0 0 -9
samp105 samp105 0 0 0 -9
samp106 samp106 0 0 0 -9
samp107 samp107 0 0 0 -9
samp108 samp108 0 0 0 -9
samp109 samp109 0 0 0 -9
samp110 samp110 0 0 0 -9
samp111 samp111 0 0 0 -9
samp112 samp112 0 0 0 -9
samp113 samp113 0 0 0 -9
samp114 samp114 0 0 0 -9
samp115 samp115 0 0 0 -9
samp116 samp116 0 0 0 -9
samp117 samp117 0 0 0 -9
samp118 samp118 0 0 0 -9
samp119 samp119 0 0 0 -9
samp120 samp120 0 0 0 -9
samp121 samp121 0 0 0 -9
samp122 samp122 0 0 0 -9
samp123 samp123 0 0 0 -9
samp124 samp124 0 0 0 -9
samp125 samp125 0 0 0 -9
samp126 samp126 0 0 0 -9
samp127 samp127 0 0 0 -9
samp128 samp128 0 0 0 -9
samp129 samp129 0 0 0 -9
samp130 samp130 0 0 0 -9
samp131 samp131 0 0 0 -9
samp132 samp132 0 0 0 -9
samp133 samp133 0 0 0 -9
samp134 samp134 0 0 0 -9
samp135 samp135 0 0 0 -9
samp136 samp136 0 0 0 -9
samp137 samp137 0 0 0 -9
samp138 samp138 0 0 0 -9
samp139 samp139 0 0 0 -9
samp140 samp140 0 0 0 -9
samp141 samp141 0 0 0 -9
samp142 samp142 0 0 0 -9
samp143 samp143 0 0 0 -9
samp144 samp144 0 0 0 -9
samp145 samp145 0 0 0 -9
samp146 samp146 0 0 0 -9
samp147 samp147 0 0 0 -9
samp148 samp148 0 0 0 -9
samp149 samp149 0 0 0 -9
samp150 samp150 0 0 0 -9
samp151 samp151 0 0 0 -9
samp152 samp152 0 0 0 -9
samp153 samp153 0 0 0 -9
samp154 samp154 0 0 0 -9
samp155 samp155 0 0 0 -9
samp156 samp156 0 0 0 -9
samp157 samp157 0 0 0 -9
samp158 samp158 0 0 0 -9
samp159 samp159 0 0 0 -9
samp160 samp160 0 0 0 -9
samp161 samp161 0 0 0 -9
samp162 samp162 0 0 0 -9
samp163 samp163 0 0 0 -9
samp164 samp164 0 0 0 -9
samp165 samp165 0 0 0 -9
samp166 samp166 0 0 0 -9
samp167 samp167 0 0 0 -9
samp168 samp168 0 0 0 -9
samp169 samp169 0 0 0 -9
samp170 samp170 0 0 0 -9
samp171 samp171 0 0 0 -9
samp172 samp172 0 0 0 -9
samp173 samp173 0 0 0 -9
samp174 samp174 0 0 0 -9
samp175 samp175 0 0 0 -9
samp176 samp176 0 0 0 -9
samp177 samp177 0 0 0 -9
samp178 samp178 0 0 0 -9
samp179 samp179 0 0 0 -9
samp180 samp180 0 0 0 -9
samp181 samp181 0 0 0 -9
samp182 samp182 0 0 0 -9
samp183 samp183 0 0 0 -9
samp184 samp184 0 0 0 -9
samp185 samp185 0 0 0 -9
samp186 samp186 0 0 0 -9
samp187 samp187 0 0 0 -9
samp188 samp188 0 0 0 -9
samp189 samp189 0 0 0 -9
samp190 samp190 0 0 0 -9
samp191 samp191 0 0 0 -9
samp192 samp192 0 0 0 -9
samp193 samp193 0 0 0 -9
samp194 samp194 0 0 0 -9
samp195 samp195 0 0 0 -9
samp196 samp196 0 0 0 -9
samp197 samp197 0 0 0 -9
samp198 samp198 0 0 0 -9
samp199 samp199 0 0 0 -9
samp200 samp200 0 0 0 -9
samp201 samp201 0 0 0 -9
samp202 samp202 0 0 0 -9
samp203 samp203 0 0 0 -9
samp204 samp204 0 0 0 -9
samp205 samp205 0 0 0 -9
samp206 samp206 0 0 0 -9
samp207 samp207 0 0 0 -9
samp208 samp208 0 0 0 -9
samp209 samp209 0 0 0 -9
samp210 samp210 0 0 0 -9
samp211 samp211 0 0 0 -9
samp212 samp212 0 0 0 -9
samp213 samp213 0 0 0 -9
samp214 samp214 0 0 0 -9
samp215 samp215 0 0 0 -9
samp216 samp216 0 0 0 -9
samp217 samp217 0 0 0 -9
samp218 samp218 0 0 0 -9
samp219 samp219 0 0 0 -9
samp220 samp220 0 0 0 -9
samp221 samp221 0 0 0 -9
samp222 samp222 0 0 0 -9
samp223 samp223 0 0 0 -9
samp224 samp224 0 0 0 -9
samp225 samp225 0 0 0 -9
samp226 samp226 0 0 0 -9
samp227 samp227 0 0 0 -9
samp228 samp228 0 0 0 -9
samp229 samp229 0 0 0 -9
samp230 samp230 0 0 0 -9
samp231 samp231 0 0 0 -9
samp232 samp232 0 0 0 -9
samp233 samp233 0 0 0 -9
samp234 samp234 0 0 0 -9
samp235 samp235 0 0 0 -9
samp236 samp236 0 0 0 -9
samp237 samp237 0 0 0 -9
samp238 samp238 0 0 0 -9
samp239 samp239 0 0 0 -9
samp240 samp240 0 0 0 -9
samp241 samp241 0 0 0 -9
samp242 samp242 0 0 0 -9
samp243 samp243 0 0 0 -9
samp244 samp244 0 0 0 -9
samp245 samp245 0 0 0 -9
samp246 samp246 0 0 0 -9
samp247 samp247 0 0 0 -9
samp248 samp248 0 0 0 -9
samp249 samp249 0 0 0 -9
samp250 samp250 0 0 0 -9
samp251 samp251 0 0 0 -9
samp252 samp252 0 0 0 -9
samp253 samp253 0 0 0 -9
samp254 samp254 0 0 0 -9
samp255 samp255 0 0 0 -9
samp256 samp256 0 0 0 -9
samp257 samp257 0 0 0 -9
samp258 samp258 0 0 0 -9
samp259 samp259 0 0 0 -9
samp260 samp260 0 0 0 -9
samp261 samp261 0 0 0 -9
samp262 samp262 0 0 0 -9
samp263 samp263 0 0 0 -9
samp264 samp264 0 0 0 -9
samp265 samp265 0 0 0 -9
samp266 samp266 0 0 0 -9
samp267 samp267 0 0 0 -9
samp268 samp268 0 0 0 -9
samp269 samp269 0 0 0 -9
samp270 samp270 0 0 0 -9
samp271 samp271 0 0 0 -9
samp272 samp272 0 0 0 -9
samp273 samp273 0 0 0 -9
samp274 samp274 0 0 0 -9
samp275 samp275 0 0 0 -9
samp276 samp276 0 0 0 -9
samp277 samp277 0 0 0 -9
samp278 samp278 0 0 0 -9
samp279 samp279 0 0 0 -9
samp280 samp280 0 0 0 -9
samp281 samp281 0 0 0 -9
samp282 samp282 0 0 0 -9
samp283 samp283 0 0 0 -9
samp284 samp284 0 0 0 -9
samp285 samp285 0 0 0 -9
samp286 samp286 0 0 0 -9
samp287 samp287 0 0 0 -9
samp288 samp288 0 0 0 -9
samp289 samp289 0 0 0 -9
samp290 samp290 0 0 0 -9
samp291 samp291 0 0 0 -9
samp292 samp292 0 0 0 -9
samp293 samp293 0 0 0 -9
samp294 samp294 0 0 0 -9
samp295 samp295 0 0 0 -9
samp296 samp296 0 0 0 -9
samp297 samp297 0 0 0 -9
samp298 samp298 0 0 0 -9
samp299 samp299 0 0 0 -9
samp300 samp300 0 0 0 -9
samp301 samp301 0 0 0 -9
samp302 samp302 0 0 0 -9
samp303 samp303 0 0 0 -9
samp304 samp304 0 0 0 -9
samp305 samp305 0 0 0 -9
samp306 samp306 0 0 0 -9
samp307 samp307 0 0 0 -9
samp308 samp308 0 0 0 -9
samp309 samp309 0 0 0 -9
samp310 samp310 0 0 0 -9
samp311 samp311 0 0 0 -9
samp312 samp312 0 0 0 -9
samp313 samp313 0 0 0 -9
samp314 samp314 0 0 0 -9
samp315 samp315 0 0 0 -9
samp316 samp316 0 0 0 -9
samp317 samp317 0 0 0 -9
samp318 samp318 0 0 0 -9
samp319 samp319 0 0 0 -9
samp320 samp320 0 0 0 -9
samp321 samp321 0 0 0 -9
samp322 samp322 0 0 0 -9
samp323 samp323 0 0 0 -9
samp324 samp324 0 0 0 -9
samp325 samp325 0 0 0 -9
samp326 samp326 0 0 0 -9
samp327 samp327 0 0 0 -9
samp328 samp328 0 0 0 -9
samp329 samp329 0 0 0 -9
samp330 samp330 0 0 0 -9
samp331 samp331 0 0 0 -9
samp332 samp332 0 0 0 -9
samp333 samp333 0 0 0 -9
samp334 samp334 0 0 0 -9
samp335 samp335 0 0 0 -9
samp336 samp336 0 0 0 -9
samp337 samp337 0 0 0 -9
samp338 samp338 0 0 0 -9
samp339 samp339 0 0 0 -9
samp340 samp340 0 0 0 -9
samp341 samp341 0 0 0 -9
samp342 samp342 0 0 0 -9
samp343 samp343 0 0 0 -9
samp344 samp344 0 0 0 -9
samp345 samp345 0 0 0 -9
samp346 samp346 0 0 0 -9
samp347 samp347 0 0 0 -9
samp348 samp348 0 0 0 -9
samp349 samp349 0 0 0 -9
samp350 samp350 0 0 0 -9
samp351 samp351 0 0 0 -9
samp352 samp352 0 0 0 -9
samp353 samp353 0 0 0 -9
samp354 samp354 0 0 0 -9
samp355 samp355 0 0 0 -9
samp356 samp356 0 0 0 -9
samp357 samp357 0 0 0 -9
samp358 samp358 0 0 0 -9
samp359 samp359 0 0 0 -9
samp360 samp360 0 0 0 -9
samp361 samp361 0 0 0 -9
samp362 samp362 0 0 0 -9
samp363 samp363 0 0 0 -9
samp364 samp364 0 0 0 -9
samp365 samp365 0 0 0 -9
samp366 samp366 0 0 0 -9
samp367 samp367 0 0 0 -9
samp368 samp368 0 0 0 -9
samp369 samp369 0 0 0 -9
samp370 samp370 0 0 0 -9
samp371 samp371 0 0 0 -9
samp372 samp372 0 0 0 -9
samp373 samp373 0 0 0 -9
samp374 samp374 0 0 0 -9
samp375 samp375 0 0 0 -9
samp376 samp376 0 0 0 -9
samp377 samp377 0 0 0 -9
samp378 samp378 0 0 0 -9
samp379 samp379 0 0 0 -9
samp380 samp380 0 0 0 -9
samp381 samp381 0 0 0 -9
samp382 samp382 0 0 0 -9
samp383 samp383 0 0 0 -9
samp384 samp384 0 0 0 -9
samp385 samp385 0 0 0 -9
samp386 samp386 0 0 0 -9
samp387 samp387 0 0 0 -9
samp388 samp388 0 0 0 -9
samp389 samp389 0 0 0 -9
samp390 samp390 0 0 0 -9
samp391 samp391 0 0 0 -9
samp392 samp392 0 0 0 -9
samp393 samp393 0 0 0 -9
samp394 samp394 0 0 0 -9
samp395 samp395 0 0 0 -9
samp396 samp396 0 0 0 -9
samp397 samp397 0 0 0 -9
samp398 samp398 0 0 0 -9
samp399 samp399 0 0 0 -9
samp400 samp400 0 0 0 -9
samp401 samp401 0 0 0 -9
samp402 samp402 0 0 0 -9
samp403 samp403 0 0 0 -9
samp404 samp404 0 0 0 -9
samp405 samp405 0 0 0 -9
samp406 samp406 0 0 0 -9
samp407 samp407 0 0 0 -9
samp408 samp408 0 0 0 -9
samp409 samp409 0 0 0 -9
samp410 samp410 0 0 0 -9
samp411 samp411 0 0 0 -9
samp412 samp412 0 0 0 -9
samp413 samp413 0 0 0 -9
samp414 samp414 0 0 0 -9
samp415 samp415 0 0 0 -9
samp416 samp416 0 0 0 -9
samp417 samp417 0 0 0 -9
samp418 samp418 0 0 0 -9
samp419 samp419 0 0 0 -9
samp420 samp420 0 0 0 -9
samp421 samp421 0 0 0 -9
samp422 samp422 0 0 0 -9
samp423 samp423 0 0 0 -9
samp424 samp424 0 0 0 -9
samp425 samp425 0 0 0 -9
samp426 samp426 0 0 0 -9
samp427 samp427 0 0 0 -9
samp428 samp428 0 0 0 -9
samp429 samp429 0 0 0 -9
samp430 samp430 0 0 0 -9
samp431 samp431 0 0 0 -9
samp432 samp432 0 0 0 -9
samp433 samp433 0 0 0 -9
samp434 samp434 0 0 0 -9
samp435 samp435 0 0 0 -9
samp436 samp436 0 0 0 -9
samp437 samp437 0 0 0 -9
samp438 samp438 0 0 0 -9
samp439 samp439 0 0 0 -9
samp440 samp440 0 0 0 -9
samp441 samp441 0 0 0 -9
samp442 samp442 0 0 0 -9
samp443 samp443 0 0 0 -9
samp444 samp444 0 0 0 -9
samp445 samp445 0 0 0 -9
samp446 samp446 0 0 0 -9
samp447 samp447 0 0 0 -9
samp448 samp448 0 0 0 -9
samp449 samp449 0 0 0 -9
samp450 samp450 0 0 0 -9
samp451 samp451 0 0 0 -9
samp452 samp452 0 0 0 -9
samp453 samp453 0 0 0 -9
samp454 samp454 0 0 0 -9
samp455 samp455 0 0 0 -9
samp456 samp456 0 0 0 -9
samp457 samp457 0 0 0 -9
samp458 samp458 0 0 0 -9
samp459 samp459 0 0 0 -9
samp460 samp460 0 0 0 -9
samp461 samp461 0 0 0 -9
samp462 samp462 0 0 0 -9
samp463 samp463 0 0 0 -9
samp464 samp464 0 0 0 -9
samp465 samp465 0 0 0 -9
samp466 samp466 0 0 0 -9
samp467 samp467 0 0 0 -9
samp468 samp468 0 0 0 -9
samp469 samp469 0 0 0 -9
samp470 samp470 0 0 0 -9
samp471 samp471 0 0 0 -9
samp472 samp472 0 0 0 -9
samp473 samp473 0 0 0 -9
samp474 samp474 0 0 0 -9
samp475 samp475 0 0 0 -9
samp476 samp476 0 0 0 -9
samp477 samp477 0 0 0 -9
samp478 samp478 0 0 0 -9
samp479 samp479 0 0 0 -9
samp480 samp480 0 0 0 -9
samp481 samp481 0 0 0 -9
samp482 samp482 0 0 0 -9
samp483 samp483 0 0 0 -9
samp484 samp484 0 0 0 -9
samp485 samp485 0 0 0 -9
samp486 samp486 0 0 0 -9
samp487 samp487 0 0 0 -9
samp488 samp488 0 0 0 -9
samp489 samp489 0 0 0 -9
samp490 samp490 0 0 0 -9
samp491 samp491 0 0 0 -9
samp492 samp492 0 0 0 -9
samp493 samp493 0 0 0 -9
samp494 samp494 0 0 0 -9
samp495 samp495 0 0 0 -9
samp496 samp496 0 0 0 -9
samp497 samp497 0 0 0 -9
samp498 samp498 0 0 0 -9
samp499 samp499 0 0 0 -9
samp500 samp500 0 0 0 -9
import csv
import os
from distutils.dir_util import copy_tree
import shutil
from unittest.mock import patch
import pandas as pd
......@@ -11,7 +12,7 @@ from fraposa_pgsc.fraposa_runner import main
@pytest.fixture(scope="session")
def ref_data(tmp_path_factory):
fn = tmp_path_factory.mktemp("data")
copy_tree("tests/data/", str(fn))
shutil.copytree("tests/data/", str(fn), dirs_exist_ok=True)
return fn.resolve()
......@@ -20,11 +21,55 @@ def filt_id(ref_data):
df = pd.read_table(os.path.join(ref_data, "example_comm.fam"), header=None)
subset: list[str] = df[0].to_list()[:100]
with open(os.path.join(ref_data, "filt.txt"), "w") as filt:
[filt.write(x + "\n") for x in subset]
# fake fam format: FID \t IID \t ...
[filt.write(x + "\t" + x + "\n") for x in subset]
return os.path.join(ref_data, "filt.txt")
@pytest.fixture
def duplicate_fid(ref_data, tmp_path_factory):
    """Return a copy of the test data in which dup_test.fam has one shared FID.

    The test data directory is copied into a fresh temporary directory and
    the first column (FID) of every row of dup_test.fam is overwritten with
    "samp001", so rows that already share an IID end up with an identical
    FID + IID pair.

    ``ref_data`` is not used in the body; it is only requested as a fixture
    dependency.
    """
    fn = tmp_path_factory.mktemp("baddata")
    shutil.copytree("tests/data", str(fn), dirs_exist_ok=True)
    fam_path = fn / "dup_test.fam"

    # The csv module expects files opened with newline="" so that it, and not
    # the text layer, controls line endings (avoids "\r\r\n" on platforms
    # that translate "\n" on write).
    with open(fam_path, "rt", newline="") as f:
        lines = list(csv.reader(f, delimiter="\t"))

    for row in lines:
        # reset FIDs so they share duplicated IDs
        row[0] = "samp001"

    with open(fam_path, "wt", newline="") as f:
        csv.writer(f, delimiter="\t").writerows(lines)

    return fn.resolve()
def test_bad_duplicate_fraposa(duplicate_fid, filt_id):
    """Duplicate IIDs must fail when they are duplicated within an FID too.

    Runs the fraposa entry point from inside the ``duplicate_fid`` directory
    and expects a ValueError whose message mentions "duplicated FID + IID".
    """
    args = ['fraposa', "--stu_filepref", "dup_test", "thousand_comm", "--stu_filt_iid", filt_id]
    cwd = os.getcwd()
    with pytest.raises(ValueError) as excinfo:
        with patch('sys.argv', args):
            os.chdir(duplicate_fid)
            try:
                main()
            finally:
                # main() is expected to raise here, so the working directory
                # must be restored unconditionally or it leaks into later tests
                os.chdir(cwd)
    assert "duplicated FID + IID" in str(excinfo.value)
def test_duplicate_fraposa(ref_data, filt_id):
    """Duplicate IIDs must pass when their FIDs are distinct.

    Runs the fraposa entry point from inside ``ref_data`` with the
    ``dup_test`` study prefix, then checks the log and the expected output
    files for that prefix.
    """
    args = ['fraposa', "--stu_filepref", "dup_test", "thousand_comm", "--stu_filt_iid", filt_id]
    cwd = os.getcwd()
    with patch('sys.argv', args):
        os.chdir(ref_data)
        try:
            main()
        finally:
            # restore the working directory even if main() fails, so a failure
            # here does not change the cwd seen by later tests
            os.chdir(cwd)
    assert _fraposa_finished(ref_data, stu_prefix="dup_test"), "FRAPOSA did not finish in log"
    assert _output_exists(ref_data, stu_prefix="dup_test"), "Missing output files"
@pytest.mark.parametrize("args", [
['fraposa', "--stu_filepref", "example_comm", "thousand_comm"],
['fraposa', "--stu_filepref", "example_comm", "thousand_comm", "--stu_filt_iid", "filt.txt"]
......@@ -40,8 +85,8 @@ def test_fraposa(ref_data, filt_id, args):
assert _output_exists(ref_data), "Missing output files"
def _fraposa_finished(ref_data):
fn = ref_data / "example_comm.log"
def _fraposa_finished(ref_data, stu_prefix="example_comm"):
fn = ref_data / f"{stu_prefix}.log"
with open(fn, 'r') as f:
if 'FRAPOSA finished' in f.read():
return True
......@@ -49,17 +94,17 @@ def _fraposa_finished(ref_data):
return False
def _expected_outputs(ref_data):
def _expected_outputs(ref_data, stu_prefix="example_comm"):
outputs = ["thousand_comm_U.dat",
"thousand_comm_V.dat",
"thousand_comm_mnsd.dat",
"thousand_comm_s.dat",
"thousand_comm_vars.dat",
"thousand_comm.pcs",
"example_comm.pcs"]
f"{stu_prefix}.pcs"]
return [ref_data / x for x in outputs]
def _output_exists(ref_data):
fns = _expected_outputs(ref_data)
def _output_exists(ref_data, stu_prefix="example_comm"):
fns = _expected_outputs(ref_data, stu_prefix)
return all([x.exists() for x in fns])
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment