From b5c1017cd94e8a85eb0060257bc0efcbb6fceb73 Mon Sep 17 00:00:00 2001
From: Chris Miller <chrisamiller@gmail.com>
Date: Fri, 12 Jun 2026 10:09:38 -0500
Subject: [PATCH] adding gzip support for annotation files

---
 tests/test_data/indels.bam_readcount.gz | Bin 0 -> 190 bytes
 tests/test_data/info.tsv.gz             | Bin 0 -> 52 bytes
 tests/test_data/snvs.bam_readcount.gz   | Bin 0 -> 220 bytes
 tests/test_vcf_info_annotator.py        |  14 ++++++++++++++
 tests/test_vcf_readcount_annotator.py   |  12 ++++++++++++
 vatools/transform_split_values.py       |   3 ++-
 vatools/utils.py                        |  11 +++++++++++
 vatools/vcf_info_annotator.py           |   3 ++-
 vatools/vcf_readcount_annotator.py      |   3 ++-
 vatools/vep_annotation_reporter.py      |   5 +++--
 10 files changed, 46 insertions(+), 5 deletions(-)
 create mode 100644 tests/test_data/indels.bam_readcount.gz
 create mode 100644 tests/test_data/info.tsv.gz
 create mode 100644 tests/test_data/snvs.bam_readcount.gz
 create mode 100644 vatools/utils.py
diff --git a/tests/test_data/indels.bam_readcount.gz b/tests/test_data/indels.bam_readcount.gz
new file mode 100644
index 0000000000000000000000000000000000000000..2ccce8b8decf0f273c3efe33f1f037e2c22988c5
GIT binary patch
literal 190
zcmV;v073sBiwFqpHY#cW18Ht#Wo&aUVqtAxa%Ev;V{dhCbO4=@I|>6a3`A??DgmR9
zKWo7j!Nqo|eE+jVvXg8!Nwpw^W~AYPGDIpVrXtXwWLOmy10LF6>tMj`T&yXx=Aos$
zS0q|(;>9H?VMx_Y46-Q2=+Zod?Hsi$J)!&oop(H$b75YnhD<ci>H_z3$ZOknDt<xB
sP)Vb_pdvd|#L~TM;-5HYFMEY&&@=v_X_<;8)G_1b0UdNdo#z1n0Ppiu?f?J)

literal 0
HcmV?d00001

diff --git a/tests/test_data/info.tsv.gz b/tests/test_data/info.tsv.gz
new file mode 100644
index 0000000000000000000000000000000000000000..2fb2b6c7157c950b7c4effcd2326d73bf98691e2
GIT binary patch
literal 52
zcmb2|=HPf^rj^COoSB!FuUArB#$asnz{JGhfyo0SqemuN42%s8jm@8#JYr(l?3^AW
I&cMI`0K`fVsQ>@~

literal 0
HcmV?d00001

diff --git a/tests/test_data/snvs.bam_readcount.gz b/tests/test_data/snvs.bam_readcount.gz
new file mode 100644
index 0000000000000000000000000000000000000000..c3f86df9f1de626de38afe9992523cf2a218fad2
GIT binary patch
literal 220
zcmV<203-h&iwFqpHY#cW19NV6b1q_GZC`R_VPs=(b#8P3osv5W12GIlYv(XRf7^l-
zVT*LBa{seLT0_D@DmTI!d60Qy=0?uQIK3TtZ1=!Gz)SfnU4-@s4|4DxN<e*0EGQ3N
z1u2|hNNNa%K}O4~vmHTsw^y&{@HBg9UA1La1}!a>{xh4j=iYnYd$!k>?Z!!YxT<$4
z%GmcYZ`q147h=_rxu9}F%{XRZ)CHLsWR#k*BxO~Ym89A}WeY!BvF}8J>sO4mQ@nz$
WmBzVpz&L~DUDO|r@*MHe0ssJ-<7Dyx

literal 0
HcmV?d00001

diff --git a/tests/test_vcf_info_annotator.py b/tests/test_vcf_info_annotator.py
index 7a0ee0d..8990d7a 100644
--- a/tests/test_vcf_info_annotator.py
+++ b/tests/test_vcf_info_annotator.py
@@ -68,6 +68,20 @@ def test_simple_caseq(self):
         self.assertTrue(cmp(os.path.join(self.test_data_dir, 'info_annotation.vcf'), os.path.join(temp_path.name, 'info_annotation.vcf')))
         temp_path.cleanup()
 
+    def test_gzipped_values_file(self):
+        # Same invocation shape as the other tests, but the values file is gzipped.
+        temp_path = tempfile.TemporaryDirectory()
+        expected_vcf = os.path.join(self.test_data_dir, 'info_annotation.vcf')
+        actual_vcf = os.path.join(temp_path.name, 'info_annotation.vcf')
+        vcf_info_annotator.main([
+            os.path.join(self.test_data_dir, 'input.vcf'),
+            os.path.join(self.test_data_dir, 'info.tsv.gz'),
+            'TEST',
+            '-d', "test", '-f', 'Integer', '-o', actual_vcf,
+        ])
+        self.assertTrue(cmp(expected_vcf, actual_vcf))
+        temp_path.cleanup()
+
     def test_simple_string(self):
         temp_path = tempfile.TemporaryDirectory()
         print(temp_path)
diff --git a/tests/test_vcf_readcount_annotator.py b/tests/test_vcf_readcount_annotator.py
index cfa4bd3..0b58111 100644
--- a/tests/test_vcf_readcount_annotator.py
+++ b/tests/test_vcf_readcount_annotator.py
@@ -52,6 +52,18 @@ def test_single_sample_vcf_without_readcounts_annotations_dna_mode(self):
         self.assertTrue(cmp(os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf'), os.path.join(temp_path.name, 'input.readcount.vcf')))
         temp_path.cleanup()
 
+    def test_gzipped_bam_readcount_file(self):
+        # No -o is passed; the result is expected at input.readcount.vcf beside the symlinked VCF.
+        temp_path = tempfile.TemporaryDirectory()
+        linked_vcf = os.path.join(temp_path.name, 'input.vcf')
+        os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), linked_vcf)
+        readcounts_gz = os.path.join(self.test_data_dir, 'snvs.bam_readcount.gz')
+        vcf_readcount_annotator.main([linked_vcf, readcounts_gz, 'DNA'])
+        expected_vcf = os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf')
+        actual_vcf = os.path.join(temp_path.name, 'input.readcount.vcf')
+        self.assertTrue(cmp(expected_vcf, actual_vcf))
+        temp_path.cleanup()
+
     def test_single_sample_vcf_without_readcounts_annotations_rna_mode(self):
         temp_path = tempfile.TemporaryDirectory()
         os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), os.path.join(temp_path.name, 'input.vcf'))
diff --git a/vatools/transform_split_values.py b/vatools/transform_split_values.py
index 55b86b8..0153df2 100644
--- a/vatools/transform_split_values.py
+++ b/vatools/transform_split_values.py
@@ -10,6 +10,7 @@
 import binascii
 from statistics import mean, median, stdev
 import logging
+from vatools.utils import open_maybe_gz
 
 def define_parser():
     parser = argparse.ArgumentParser(
@@ -176,7 +177,7 @@ def main(args_input = sys.argv[1:]):
         output_file = "{}.tsv".format(head)
 
     if args.input_tsv:
-        with open(args.input_tsv, 'r') as input_filehandle:
+        with open_maybe_gz(args.input_tsv) as input_filehandle:
             tsv_reader = create_tsv_reader(input_filehandle)
             output_filehandle = open(output_file, 'w')
             writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + field_names(args, sample_name), delimiter = "\t")
diff --git a/vatools/utils.py b/vatools/utils.py
new file mode 100644
index 0000000..5de3d53
--- /dev/null
+++ b/vatools/utils.py
@@ -0,0 +1,11 @@
+import gzip
+
+def open_maybe_gz(path, mode='r'):
+    """Open ``path``, transparently handling files that may be gzip-compressed.
+
+    Gzip is detected from the two leading magic bytes, which is more reliable
+    than the file extension; non-binary modes are opened as text."""
+    with open(path, 'rb') as probe:
+        is_gzip = probe.read(2) == b'\x1f\x8b'
+    gz_mode = mode if 'b' in mode else mode + 't'
+    return gzip.open(path, gz_mode) if is_gzip else open(path, mode)
diff --git a/vatools/vcf_info_annotator.py b/vatools/vcf_info_annotator.py
index 0e9cba7..00efe13 100644
--- a/vatools/vcf_info_annotator.py
+++ b/vatools/vcf_info_annotator.py
@@ -3,6 +3,7 @@
 import vcfpy
 import csv
 from collections import OrderedDict
+from vatools.utils import open_maybe_gz
 
 def to_array(dictionary):
     array = []
@@ -12,7 +13,7 @@ def to_array(dictionary):
 
 def parse_tsv_file(args):
     values={}
-    with open(args.values_file,'r') as tsvin:
+    with open_maybe_gz(args.values_file) as tsvin:
         tsvin = csv.reader(tsvin, delimiter='\t')
         for row in tsvin:
             if any(x.strip() for x in row): #skip blank lines
diff --git a/vatools/vcf_readcount_annotator.py b/vatools/vcf_readcount_annotator.py
index d7f0473..0ff90a4 100644
--- a/vatools/vcf_readcount_annotator.py
+++ b/vatools/vcf_readcount_annotator.py
@@ -7,6 +7,7 @@
 import csv
 from collections import OrderedDict
 import logging
+from vatools.utils import open_maybe_gz
 
 def define_parser():
     parser = argparse.ArgumentParser(
@@ -58,7 +59,7 @@ def parse_brct_field(brcts):
 
 def parse_bam_readcount_file(args):
     coverage = {}
-    with open(args.bam_readcount_file, 'r') as reader:
+    with open_maybe_gz(args.bam_readcount_file) as reader:
         coverage_tsv_reader = csv.reader(reader, delimiter='\t')
         for row in coverage_tsv_reader:
             chromosome     = row[0]
diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py
index 54a5614..88f86b6 100644
--- a/vatools/vep_annotation_reporter.py
+++ b/vatools/vep_annotation_reporter.py
@@ -9,6 +9,7 @@
 import csv
 import binascii
 import logging
+from vatools.utils import open_maybe_gz
 
 def define_parser():
     parser = argparse.ArgumentParser(
@@ -61,7 +62,7 @@ def create_tsv_reader(input_filehandle):
 def parse_preferred_transcripts_tsv(preferred_transcripts_tsv):
     if preferred_transcripts_tsv is None:
         return None
-    with open(preferred_transcripts_tsv, 'r') as fh:
+    with open_maybe_gz(preferred_transcripts_tsv) as fh:
         tsv_reader = csv.DictReader(fh, delimiter="\t")
         if 'transcript_id' not in tsv_reader.fieldnames:
             raise Exception("ERROR preferred transcripts TSV {} doesn't contain required column 'transcript_id'.".format(preferred_transcripts_tsv))
@@ -244,7 +245,7 @@ def main(args_input = sys.argv[1:]):
         output_file = "{}.tsv".format(head)
 
     if args.input_tsv:
-        with open(args.input_tsv, 'r') as input_filehandle:
+        with open_maybe_gz(args.input_tsv) as input_filehandle:
             tsv_reader = create_tsv_reader(input_filehandle)
             output_filehandle = open(output_file, 'w')
             writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + args.vep_fields, delimiter = "\t")