Skip to content
Open

Gzip #86

Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added tests/test_data/indels.bam_readcount.gz
Binary file not shown.
Binary file added tests/test_data/info.tsv.gz
Binary file not shown.
Binary file added tests/test_data/snvs.bam_readcount.gz
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_vcf_info_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,20 @@ def test_simple_caseq(self):
self.assertTrue(cmp(os.path.join(self.test_data_dir, 'info_annotation.vcf'), os.path.join(temp_path.name, 'info_annotation.vcf')))
temp_path.cleanup()

def test_gzipped_values_file(self):
    """Run vcf_info_annotator with a gzipped values file (info.tsv.gz).

    The annotated VCF written via ``-o`` must compare equal to the
    ``info_annotation.vcf`` fixture in the test data directory.
    """
    expected_vcf = os.path.join(self.test_data_dir, 'info_annotation.vcf')
    # The context manager removes the directory even when main() raises or
    # the assertion fails; a trailing cleanup() call would be skipped then.
    with tempfile.TemporaryDirectory() as temp_dir:
        output_vcf = os.path.join(temp_dir, 'info_annotation.vcf')
        command = [
            os.path.join(self.test_data_dir, 'input.vcf'),
            os.path.join(self.test_data_dir, 'info.tsv.gz'),
            'TEST',
            '-d', "test",
            '-f', 'Integer',
            '-o', output_vcf,
        ]
        vcf_info_annotator.main(command)
        self.assertTrue(cmp(expected_vcf, output_vcf))

def test_simple_string(self):
temp_path = tempfile.TemporaryDirectory()
print(temp_path)
Expand Down
12 changes: 12 additions & 0 deletions tests/test_vcf_readcount_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,18 @@ def test_single_sample_vcf_without_readcounts_annotations_dna_mode(self):
self.assertTrue(cmp(os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf'), os.path.join(temp_path.name, 'input.readcount.vcf')))
temp_path.cleanup()

def test_gzipped_bam_readcount_file(self):
    """Run vcf_readcount_annotator in DNA mode with a gzipped bam-readcount file.

    The result must compare equal to the ``single_sample.dna.readcount.vcf``
    fixture in the test data directory.
    """
    expected_vcf = os.path.join(self.test_data_dir, 'single_sample.dna.readcount.vcf')
    # The context manager removes the directory even when main() raises or
    # the assertion fails; a trailing cleanup() call would be skipped then.
    with tempfile.TemporaryDirectory() as temp_dir:
        input_vcf = os.path.join(temp_dir, 'input.vcf')
        # No output path is passed; the test expects input.readcount.vcf to
        # appear next to the input VCF, so the input is linked into temp_dir.
        os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), input_vcf)
        command = [
            input_vcf,
            os.path.join(self.test_data_dir, 'snvs.bam_readcount.gz'),
            'DNA',
        ]
        vcf_readcount_annotator.main(command)
        self.assertTrue(cmp(expected_vcf, os.path.join(temp_dir, 'input.readcount.vcf')))

def test_single_sample_vcf_without_readcounts_annotations_rna_mode(self):
temp_path = tempfile.TemporaryDirectory()
os.symlink(os.path.join(self.test_data_dir, 'input.vcf'), os.path.join(temp_path.name, 'input.vcf'))
Expand Down
3 changes: 2 additions & 1 deletion vatools/transform_split_values.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
import binascii
from statistics import mean, median, stdev
import logging
from vatools.utils import open_maybe_gz

def define_parser():
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -176,7 +177,7 @@ def main(args_input = sys.argv[1:]):
output_file = "{}.tsv".format(head)

if args.input_tsv:
with open(args.input_tsv, 'r') as input_filehandle:
with open_maybe_gz(args.input_tsv) as input_filehandle:
tsv_reader = create_tsv_reader(input_filehandle)
output_filehandle = open(output_file, 'w')
writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + field_names(args, sample_name), delimiter = "\t")
Expand Down
11 changes: 11 additions & 0 deletions vatools/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import gzip

# handle opening files that may or may not be gzipped
# check the magic bytes at the beginning of the file to determine gzip status
# which is more reliable than looking at the file extension
def open_maybe_gz(path, mode='r'):
    """Open ``path`` with ``open`` or ``gzip.open`` depending on its content.

    Args:
        path: Path of the file to open.
        mode: Mode string as accepted by the builtin ``open``. For gzipped
            files a mode without ``'b'`` is opened as text, so callers get
            ``str`` lines just as they would from a plain file.

    Returns:
        An open file object. The caller is responsible for closing it,
        e.g. by using it in a ``with`` statement.

    Raises:
        FileNotFoundError: If ``path`` does not exist and ``mode`` is a
            read mode.
    """
    try:
        with open(path, 'rb') as probe:
            magic = probe.read(2)
    except FileNotFoundError:
        # A file that is about to be created ('w', 'a', 'x') has nothing to
        # sniff; only reading a missing file is an error.
        if 'r' in mode:
            raise
        magic = b''

    # Anything that does not start with the gzip magic number (including an
    # empty file) is treated as a plain file.
    if magic != b'\x1f\x8b':
        return open(path, mode)

    # gzip.open defaults to binary, so request text explicitly unless the
    # caller already chose binary or text.
    if 'b' not in mode and 't' not in mode:
        mode += 't'
    return gzip.open(path, mode)
3 changes: 2 additions & 1 deletion vatools/vcf_info_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import vcfpy
import csv
from collections import OrderedDict
from vatools.utils import open_maybe_gz

def to_array(dictionary):
array = []
Expand All @@ -12,7 +13,7 @@ def to_array(dictionary):

def parse_tsv_file(args):
values={}
with open(args.values_file,'r') as tsvin:
with open_maybe_gz(args.values_file) as tsvin:
tsvin = csv.reader(tsvin, delimiter='\t')
for row in tsvin:
if any(x.strip() for x in row): #skip blank lines
Expand Down
3 changes: 2 additions & 1 deletion vatools/vcf_readcount_annotator.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
import csv
from collections import OrderedDict
import logging
from vatools.utils import open_maybe_gz

def define_parser():
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -58,7 +59,7 @@ def parse_brct_field(brcts):

def parse_bam_readcount_file(args):
coverage = {}
with open(args.bam_readcount_file, 'r') as reader:
with open_maybe_gz(args.bam_readcount_file) as reader:
coverage_tsv_reader = csv.reader(reader, delimiter='\t')
for row in coverage_tsv_reader:
chromosome = row[0]
Expand Down
5 changes: 3 additions & 2 deletions vatools/vep_annotation_reporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import csv
import binascii
import logging
from vatools.utils import open_maybe_gz

def define_parser():
parser = argparse.ArgumentParser(
Expand Down Expand Up @@ -61,7 +62,7 @@ def create_tsv_reader(input_filehandle):
def parse_preferred_transcripts_tsv(preferred_transcripts_tsv):
if preferred_transcripts_tsv is None:
return None
with open(preferred_transcripts_tsv, 'r') as fh:
with open_maybe_gz(preferred_transcripts_tsv) as fh:
tsv_reader = csv.DictReader(fh, delimiter="\t")
if 'transcript_id' not in tsv_reader.fieldnames:
raise Exception("ERROR preferred transcripts TSV {} doesn't contain required column 'transcript_id'.".format(preferred_transcripts_tsv))
Expand Down Expand Up @@ -244,7 +245,7 @@ def main(args_input = sys.argv[1:]):
output_file = "{}.tsv".format(head)

if args.input_tsv:
with open(args.input_tsv, 'r') as input_filehandle:
with open_maybe_gz(args.input_tsv) as input_filehandle:
tsv_reader = create_tsv_reader(input_filehandle)
output_filehandle = open(output_file, 'w')
writer = csv.DictWriter(output_filehandle, fieldnames = tsv_reader.fieldnames + args.vep_fields, delimiter = "\t")
Expand Down
Loading