From b5c1017cd94e8a85eb0060257bc0efcbb6fceb73 Mon Sep 17 00:00:00 2001 From: Chris Miller Date: Fri, 12 Jun 2026 10:09:38 -0500 Subject: [PATCH] adding gzip support for annotation files --- tests/test_data/indels.bam_readcount.gz | Bin 0 -> 190 bytes tests/test_data/info.tsv.gz | Bin 0 -> 52 bytes tests/test_data/snvs.bam_readcount.gz | Bin 0 -> 220 bytes tests/test_vcf_info_annotator.py | 14 ++++++++++++++ tests/test_vcf_readcount_annotator.py | 12 ++++++++++++ vatools/transform_split_values.py | 3 ++- vatools/utils.py | 11 +++++++++++ vatools/vcf_info_annotator.py | 3 ++- vatools/vcf_readcount_annotator.py | 3 ++- vatools/vep_annotation_reporter.py | 5 +++-- 10 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 tests/test_data/indels.bam_readcount.gz create mode 100644 tests/test_data/info.tsv.gz create mode 100644 tests/test_data/snvs.bam_readcount.gz create mode 100644 vatools/utils.py diff --git a/tests/test_data/indels.bam_readcount.gz b/tests/test_data/indels.bam_readcount.gz new file mode 100644 index 0000000000000000000000000000000000000000..2ccce8b8decf0f273c3efe33f1f037e2c22988c5 GIT binary patch literal 190 zcmV;v073sBiwFqpHY#cW18Ht#Wo&aUVqtAxa%Ev;V{dhCbO4=@I|>6a3`A??DgmR9 zKWo7j!Nqo|eE+jVvXg8!Nwpw^W~AYPGDIpVrXtXwWLOmy10LF6>tMj`T&yXx=Aos$ zS0q|(;>9H?VMx_Y46-Q2=+Zod?Hsi$J)!&oop(H$b75YnhDH_z3$ZOknDt@~ literal 0 HcmV?d00001 diff --git a/tests/test_data/snvs.bam_readcount.gz b/tests/test_data/snvs.bam_readcount.gz new file mode 100644 index 0000000000000000000000000000000000000000..c3f86df9f1de626de38afe9992523cf2a218fad2 GIT binary patch literal 220 zcmV<203-h&iwFqpHY#cW19NV6b1q_GZC`R_VPs=(b#8P3osv5W12GIlYv(XRf7^l- zVT*LBa{seLT0_D@DmTI!d60Qy=0?uQIK3TtZ1=!Gz)SfnU4-@s4|4DxN{xh4j=iYnYd$!k>?Z!!YxT<$4 z%GmcYZ`q147h=_rxu9}F%{XRZ)CHLsWR#k*BxO~Ym89A}WeY!BvF}8J>sO4mQ@nz$ WmBzVpz&L~DUDO|r@*MHe0ssJ-<7Dyx literal 0 HcmV?d00001 diff --git a/tests/test_vcf_info_annotator.py b/tests/test_vcf_info_annotator.py index 7a0ee0d..8990d7a 100644 --- a/tests/test_vcf_info_annotator.py +++ b/tests/test_vcf_info_annotator.py @@ -68,6 +68,20 @@ def test_simple_caseq(self): self.assertTrue(cmp(os.path.join(self.test_data_dir, 'info_annotation.vcf'), os.path.join(temp_path.name, 'info_annotation.vcf'))) temp_path.cleanup() + def test_gzipped_values_file(self): + temp_path = tempfile.TemporaryDirectory() + command = [ + os.path.join(self.test_data_dir, 'input.vcf'), + os.path.join(self.test_data_dir, 'info.tsv.gz'), + 'TEST', + '-d', "test", + '-f', 'Integer', + '-o', os.path.join(temp_path.name, 'info_annotation.vcf') + ] + vcf_info_annotator.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'info_annotation.vcf'), os.path.join(temp_path.name, 'info_annotation.vcf'))) + temp_path.cleanup() + def test_simple_string(self): temp_path = tempfile.TemporaryDirectory() print(temp_path) diff --git a/tests/test_vcf_readcount_annotator.py b/tests/test_vcf_readcount_annotator.py index cfa4bd3..0b58111 100644 --- a/tests/test_vcf_readcount_annotator.py +++ b/tests/test_vcf_readcount_annotator.py @@ -52,6 +52,18 @@ def test_single_sample_vcf_without_readcounts_annotations_dna_mode(self): self.assertTrue(cmp(os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf'), os.path.join(temp_path.name, 'input.readcount.vcf'))) temp_path.cleanup() + def test_gzipped_bam_readcount_file(self): + temp_path = tempfile.TemporaryDirectory() + os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), os.path.join(temp_path.name, 'input.vcf')) + command = [ + os.path.join(temp_path.name, 'input.vcf'), + os.path.join(self.test_data_dir, 'snvs.bam_readcount.gz'), + 'DNA', + ] + vcf_readcount_annotator.main(command) + self.assertTrue(cmp(os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf'), os.path.join(temp_path.name, 'input.readcount.vcf'))) + temp_path.cleanup() + def test_single_sample_vcf_without_readcounts_annotations_rna_mode(self): temp_path = tempfile.TemporaryDirectory() os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), os.path.join(temp_path.name, 'input.vcf')) diff --git a/vatools/transform_split_values.py b/vatools/transform_split_values.py index 55b86b8..0153df2 100644 --- a/vatools/transform_split_values.py +++ b/vatools/transform_split_values.py @@ -10,6 +10,7 @@ import binascii from statistics import mean, median, stdev import logging +from vatools.utils import open_maybe_gz def define_parser(): parser = argparse.ArgumentParser( @@ -176,7 +177,7 @@ def main(args_input = sys.argv[1:]): output_file = "{}.tsv".format(head) if args.input_tsv: - with open(args.input_tsv, 'r') as input_filehandle: + with open_maybe_gz(args.input_tsv) as input_filehandle: tsv_reader = create_tsv_reader(input_filehandle) output_filehandle = open(output_file, 'w') writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + field_names(args, sample_name), delimiter = "\t") diff --git a/vatools/utils.py b/vatools/utils.py new file mode 100644 index 0000000..5de3d53 --- /dev/null +++ b/vatools/utils.py @@ -0,0 +1,11 @@ +import gzip + +# handle opening files that may or may not be gzipped +# check the magic bytes at the beginning of the file to determine gzip status +# which is more reliable than looking at the file extension +def open_maybe_gz(path, mode='r'): + with open(path, 'rb') as f: + magic = f.read(2) + if magic == b'\x1f\x8b': + return gzip.open(path, mode + 't' if 'b' not in mode else mode) + return open(path, mode) diff --git a/vatools/vcf_info_annotator.py b/vatools/vcf_info_annotator.py index 0e9cba7..00efe13 100644 --- a/vatools/vcf_info_annotator.py +++ b/vatools/vcf_info_annotator.py @@ -3,6 +3,7 @@ import vcfpy import csv from collections import OrderedDict +from vatools.utils import open_maybe_gz def to_array(dictionary): array = [] @@ -12,7 +13,7 @@ def to_array(dictionary): def parse_tsv_file(args): values={} - with open(args.values_file,'r') as tsvin: + with open_maybe_gz(args.values_file) as tsvin: tsvin = csv.reader(tsvin, delimiter='\t') for row in tsvin: if any(x.strip() for x in row): #skip blank lines diff --git a/vatools/vcf_readcount_annotator.py b/vatools/vcf_readcount_annotator.py index d7f0473..0ff90a4 100644 --- a/vatools/vcf_readcount_annotator.py +++ b/vatools/vcf_readcount_annotator.py @@ -7,6 +7,7 @@ import csv from collections import OrderedDict import logging +from vatools.utils import open_maybe_gz def define_parser(): parser = argparse.ArgumentParser( @@ -58,7 +59,7 @@ def parse_brct_field(brcts): def parse_bam_readcount_file(args): coverage = {} - with open(args.bam_readcount_file, 'r') as reader: + with open_maybe_gz(args.bam_readcount_file) as reader: coverage_tsv_reader = csv.reader(reader, delimiter='\t') for row in coverage_tsv_reader: chromosome = row[0] diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py index 54a5614..88f86b6 100644 --- a/vatools/vep_annotation_reporter.py +++ b/vatools/vep_annotation_reporter.py @@ -9,6 +9,7 @@ import csv import binascii import logging +from vatools.utils import open_maybe_gz def define_parser(): parser = argparse.ArgumentParser( @@ -61,7 +62,7 @@ def create_tsv_reader(input_filehandle): def parse_preferred_transcripts_tsv(preferred_transcripts_tsv): if preferred_transcripts_tsv is None: return None - with open(preferred_transcripts_tsv, 'r') as fh: + with open_maybe_gz(preferred_transcripts_tsv) as fh: tsv_reader = csv.DictReader(fh, delimiter="\t") if 'transcript_id' not in tsv_reader.fieldnames: raise Exception("ERROR preferred transcripts TSV {} doesn't contain required column 'transcript_id'.".format(preferred_transcripts_tsv)) @@ -244,7 +245,7 @@ def main(args_input = sys.argv[1:]): output_file = "{}.tsv".format(head) if args.input_tsv: - with open(args.input_tsv, 'r') as input_filehandle: + with open_maybe_gz(args.input_tsv) as input_filehandle: tsv_reader = create_tsv_reader(input_filehandle) output_filehandle = open(output_file, 'w') writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + args.vep_fields, delimiter = "\t")