griffithlab · chrisamiller · Jun 12, 2026
diff --git a/tests/test_data/indels.bam_readcount.gz b/tests/test_data/indels.bam_readcount.gz
diff --git a/tests/test_data/info.tsv.gz b/tests/test_data/info.tsv.gz
diff --git a/tests/test_data/snvs.bam_readcount.gz b/tests/test_data/snvs.bam_readcount.gz
diff --git a/tests/test_vcf_info_annotator.py b/tests/test_vcf_info_annotator.py
@@ -68,6 +68,20 @@ def test_simple_caseq(self):
         self.assertTrue(cmp(os.path.join(self.test_data_dir, 'info_annotation.vcf'), os.path.join(temp_path.name, 'info_annotation.vcf')))
         temp_path.cleanup()
 
+    def test_gzipped_values_file(self):  # values file is passed gzip-compressed (info.tsv.gz)
+        temp_path = tempfile.TemporaryDirectory()
+        command = [
+            os.path.join(self.test_data_dir, 'input.vcf'),
+            os.path.join(self.test_data_dir, 'info.tsv.gz'),
+            'TEST',
+            '-d', "test",
+            '-f', 'Integer',
+            '-o', os.path.join(temp_path.name, 'info_annotation.vcf')
+        ]
+        vcf_info_annotator.main(command)  # NOTE(review): presumably writes the annotated VCF to the '-o' path checked below
+        self.assertTrue(cmp(os.path.join(self.test_data_dir, 'info_annotation.vcf'), os.path.join(temp_path.name, 'info_annotation.vcf')))  # result must match the info_annotation.vcf fixture
+        temp_path.cleanup()
+
     def test_simple_string(self):
         temp_path = tempfile.TemporaryDirectory()
         print(temp_path)

diff --git a/tests/test_vcf_readcount_annotator.py b/tests/test_vcf_readcount_annotator.py
@@ -52,6 +52,18 @@ def test_single_sample_vcf_without_readcounts_annotations_dna_mode(self):
         self.assertTrue(cmp(os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf'), os.path.join(temp_path.name, 'input.readcount.vcf')))
         temp_path.cleanup()
 
+    def test_gzipped_bam_readcount_file(self):  # readcount file is passed gzip-compressed (snvs.bam_readcount.gz)
+        temp_path = tempfile.TemporaryDirectory()
+        os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), os.path.join(temp_path.name, 'input.vcf'))  # no output path is given; the result is looked up beside this link as input.readcount.vcf
+        command = [
+            os.path.join(temp_path.name, 'input.vcf'),
+            os.path.join(self.test_data_dir, 'snvs.bam_readcount.gz'),
+            'DNA',
+        ]
+        vcf_readcount_annotator.main(command)
+        self.assertTrue(cmp(os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf'), os.path.join(temp_path.name, 'input.readcount.vcf')))  # result must match the single_sample.dna.readcount.vcf fixture
+        temp_path.cleanup()
+
     def test_single_sample_vcf_without_readcounts_annotations_rna_mode(self):
         temp_path = tempfile.TemporaryDirectory()
         os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), os.path.join(temp_path.name, 'input.vcf'))

diff --git a/vatools/transform_split_values.py b/vatools/transform_split_values.py
@@ -10,6 +10,7 @@
 import binascii
 from statistics import mean, median, stdev
 import logging
+from vatools.utils import open_maybe_gz
 
 def define_parser():
     parser = argparse.ArgumentParser(
@@ -176,7 +177,7 @@ def main(args_input = sys.argv[1:]):
         output_file = "{}.tsv".format(head)
 
     if args.input_tsv:
-        with open(args.input_tsv, 'r') as input_filehandle:
+        with open_maybe_gz(args.input_tsv) as input_filehandle:
             tsv_reader = create_tsv_reader(input_filehandle)
             output_filehandle = open(output_file, 'w')
             writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + field_names(args, sample_name), delimiter = "\t")

diff --git a/vatools/utils.py b/vatools/utils.py
@@ -0,0 +1,11 @@
+import gzip
+
+# Open `path` with `mode` and return the file handle, transparently handling gzipped files.
+# Gzip status is decided by the two magic bytes (0x1f 0x8b) at the start of the file,
+# which is more reliable than the file extension; `path` is always read first to check them.
+def open_maybe_gz(path, mode='r'):
+    with open(path, 'rb') as f:  # sniffing handle only; closed before the real open below
+        magic = f.read(2)
+    if magic == b'\x1f\x8b':
+        return gzip.open(path, mode + 't' if 'b' not in mode else mode)  # append 't' (text mode) unless the caller asked for binary
+    return open(path, mode)  # not gzip: plain open with the mode unchanged
diff --git a/vatools/vcf_info_annotator.py b/vatools/vcf_info_annotator.py
@@ -3,6 +3,7 @@
 import vcfpy
 import csv
 from collections import OrderedDict
+from vatools.utils import open_maybe_gz
 
 def to_array(dictionary):
     array = []
@@ -12,7 +13,7 @@ def to_array(dictionary):
 
 def parse_tsv_file(args):
     values={}
-    with open(args.values_file,'r') as tsvin:
+    with open_maybe_gz(args.values_file) as tsvin:
         tsvin = csv.reader(tsvin, delimiter='\t')
         for row in tsvin:
             if any(x.strip() for x in row): #skip blank lines

diff --git a/vatools/vcf_readcount_annotator.py b/vatools/vcf_readcount_annotator.py
@@ -7,6 +7,7 @@
 import csv
 from collections import OrderedDict
 import logging
+from vatools.utils import open_maybe_gz
 
 def define_parser():
     parser = argparse.ArgumentParser(
@@ -58,7 +59,7 @@ def parse_brct_field(brcts):
 
 def parse_bam_readcount_file(args):
     coverage = {}
-    with open(args.bam_readcount_file, 'r') as reader:
+    with open_maybe_gz(args.bam_readcount_file) as reader:
         coverage_tsv_reader = csv.reader(reader, delimiter='\t')
         for row in coverage_tsv_reader:
             chromosome     = row[0]

diff --git a/vatools/vep_annotation_reporter.py b/vatools/vep_annotation_reporter.py
@@ -9,6 +9,7 @@
 import csv
 import binascii
 import logging
+from vatools.utils import open_maybe_gz
 
 def define_parser():
     parser = argparse.ArgumentParser(
@@ -61,7 +62,7 @@ def create_tsv_reader(input_filehandle):
 def parse_preferred_transcripts_tsv(preferred_transcripts_tsv):
     if preferred_transcripts_tsv is None:
         return None
-    with open(preferred_transcripts_tsv, 'r') as fh:
+    with open_maybe_gz(preferred_transcripts_tsv) as fh:
         tsv_reader = csv.DictReader(fh, delimiter="\t")
         if 'transcript_id' not in tsv_reader.fieldnames:
             raise Exception("ERROR preferred transcripts TSV {} doesn't contain required column 'transcript_id'.".format(preferred_transcripts_tsv))
@@ -244,7 +245,7 @@ def main(args_input = sys.argv[1:]):
         output_file = "{}.tsv".format(head)
 
     if args.input_tsv:
-        with open(args.input_tsv, 'r') as input_filehandle:
+        with open_maybe_gz(args.input_tsv) as input_filehandle:
             tsv_reader = create_tsv_reader(input_filehandle)
             output_filehandle = open(output_file, 'w')
             writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + args.vep_fields, delimiter = "\t")