From 0a9e0afa3ea48c4aef7274efcc4e0bba0ce45717 Mon Sep 17 00:00:00 2001 From: Michael Rightmire Date: Thu, 13 Aug 2020 15:39:26 +0200 Subject: [PATCH 1/4] Changes to parser.py and model.py to allow for nonstandard VCF file headers. --- vcf/model.py | 55 +++++++++++----- vcf/parser.py | 149 +++++++++++++++++++++++++++++-------------- vcf/sample_filter.py | 6 +- vcf/utils.py | 6 +- 4 files changed, 145 insertions(+), 71 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 34a4d17a..19d7a2d2 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -1,4 +1,5 @@ from abc import ABCMeta, abstractmethod +import ast import collections import sys import re @@ -172,18 +173,41 @@ class _Record(object): Neither the upstream nor downstream flanking bases are included in the region. """ - def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, - sample_indexes, samples=None): - self.CHROM = CHROM - #: the one-based coordinate of the first nucleotide in ``REF`` - self.POS = POS - self.ID = ID - self.REF = REF - self.ALT = ALT - self.QUAL = QUAL - self.FILTER = FILTER - self.INFO = INFO - self.FORMAT = FORMAT + def __init__(self, ROWDICT, + #CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT, + sample_indexes, samples=None): + + for varname, value in ROWDICT.items(): + # "." pretty always means no value (NOne) + if value == ".": + value = None + + else: + try: + # Convert to either int or float + # WIll error if value is not a string or a number + value = ast.literal_eval(value) + except Exception as e: + # Any error means value is a string, so just leave it + # Silence this error, or print it for debugging + pass + #print("ERROR occurred converting value '{}' with AST!({})".format(value, e)) + + setattr(self, varname, value) #equivalent to: self.varname= 'something' + + #======================================================================= + # self.CHROM = CHROM + # #: the one-based coordinate of the first nucleotide in ``REF`` + # self.POS = POS + # self.ID = ID + # self.REF = REF + # self.ALT = ALT + # self.QUAL = QUAL + # self.FILTER = FILTER + # self.INFO = INFO + # self.FORMAT = FORMAT + #======================================================================= + #: zero-based, half-open start coordinate of ``REF`` self.start = self.POS - 1 #: zero-based, half-open end coordinate of ``REF`` @@ -362,7 +386,7 @@ def heterozygosity(self): If there are i alleles with frequency p_i, H=1-sum_i(p_i^2) """ allele_freqs = [1-sum(self.aaf)] + self.aaf - return 1 - sum(map(lambda x: x**2, allele_freqs)) + return 1 - sum([x**2 for x in allele_freqs]) def get_hom_refs(self): """ The list of hom ref genotypes""" @@ -558,9 +582,8 @@ def is_filtered(self): return True -class _AltRecord(object): +class _AltRecord(object, metaclass=ABCMeta): '''An alternative allele record: either replacement string, SV placeholder, or breakend''' - __metaclass__ = ABCMeta def __init__(self, type, **kwargs): super(_AltRecord, self).__init__(**kwargs) @@ -596,7 +619,7 @@ def __len__(self): return len(self.sequence) def __eq__(self, other): - if isinstance(other, basestring): + if isinstance(other, str): return self.sequence == other elif not isinstance(other, self.__class__): return False diff --git a/vcf/parser.py b/vcf/parser.py index c3c3d082..09a3a579 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -1,3 +1,4 @@ +import ast import codecs import collections import csv @@ -22,8 +23,8 @@ except ImportError: cparse = None -from model import _Call, _Record, make_calldata_tuple -from model import _Substitution, _Breakend, _SingleBreakend, _SV +from .model import _Call, _Record, make_calldata_tuple +from .model import _Substitution, _Breakend, _SingleBreakend, _SV # Metadata parsers/constants @@ -315,6 +316,7 @@ def _parse_metainfo(self): parser = _vcf_metadata_parser() line = next(self.reader) + while line.startswith('##'): self._header_lines.append(line) @@ -350,7 +352,8 @@ def _parse_metainfo(self): line = next(self.reader) fields = self._row_pattern.split(line[1:]) - self._column_headers = fields[:9] +# self._column_headers = fields[:9] + self._column_headers = fields self.samples = fields[9:] self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) @@ -468,7 +471,7 @@ def _parse_samples(self, samples, samp_fmt, site): nfields = len(samp_fmt._fields) - for name, sample in itertools.izip(self.samples, samples): + for name, sample in zip(self.samples, samples): # parse the data for this sample sampdat = [None] * nfields @@ -548,47 +551,95 @@ def _parse_alt(self, str): else: return _Substitution(str) - def next(self): + def __next__(self): '''Return the next record in the file.''' line = next(self.reader) - row = self._row_pattern.split(line.rstrip()) - chrom = row[0] + row = self._row_pattern.split(line.rstrip()) + rowdict = collections.OrderedDict((zip(self._column_headers, row))) + if self._prepend_chr: - chrom = 'chr' + chrom - pos = int(row[1]) - - if row[2] != '.': - ID = row[2] - else: - ID = None - - ref = row[3] - alt = self._map(self._parse_alt, row[4].split(',')) - - try: - qual = int(row[5]) - except ValueError: - try: - qual = float(row[5]) - except ValueError: - qual = None - - filt = self._parse_filter(row[6]) - info = self._parse_info(row[7]) + rowdict["CHROM"] = 'chr' + rowdict["CHROM"] +#=============================================================================== +# chrom = rowdict["CHROM"] +# if self._prepend_chr: +# chrom = 'chr' + chrom +# # chrom = row[0] +#=============================================================================== + +#=============================================================================== +# pos = int(rowdict["POS"]) +# # pos = int(row[1]) +#=============================================================================== + + rowdict["ID"] = rowdict["ID"] if rowdict["ID"] != '.' else None + #======================================================================= + # if row[2] != '.': + # ID = row[2] + # else: + # ID = None + #======================================================================= + + #======================================================================= + # ref = rowdict["REF"] + # alt = self._map(self._parse_alt, rowdict["ALT"].split(',')) + #======================================================================= + rowdict["ALT"] = self._map(self._parse_alt, rowdict["ALT"].split(',')) + + try: + rowdict["QUAL"] = ast.literal_eval(rowdict["QUAL"]) + except: # Any error means qual = None + rowdict["QUAL"] = None + #======================================================================= + # try: + # qual = int(row[5]) + # except ValueError: + # try: + # qual = float(row[5]) + # except ValueError: + # qual = None + #======================================================================= + + rowdict["FILTER"] = self._parse_filter(rowdict["FILTER"]) + rowdict["INFO"] = self._parse_info(rowdict["INFO"]) + #======================================================================= + # filt = self._parse_filter(rowdict["FILTER"]) + # info = self._parse_info(rowdict["INFO"]) + #======================================================================= try: - fmt = row[8] - except IndexError: - fmt = None - else: - if fmt == '.': - fmt = None - - record = _Record(chrom, pos, ID, ref, alt, qual, filt, - info, fmt, self._sample_indexes) - - if fmt is not None: - samples = self._parse_samples(row[9:], fmt, record) + if rowdict["FORMAT"] == '.': + rowdict["FORMAT"] = None + except KeyError: + rowdict["FORMAT"] = None + #======================================================================= + # try: + # fmt = row[8] + # except IndexError: + # fmt = None + # else: + # if fmt == '.': + # fmt = None + #======================================================================= + + #======================================================================= + # record = _Record(chrom, pos, ID, ref, alt, qual, filt, + # info, fmt, self._sample_indexes) + #======================================================================= + + record = _Record(rowdict, self._sample_indexes) + + # Genotype fields can be followed by other data columns, + # so we have to detect that + if rowdict["FORMAT"] is not None: + items = list(rowdict.items()) + #=================================================================== + # We will assume that the next string that follows the FORMAT + # declaration is the sample data. HOWEVER, a FORMAT can have + # multiple sample lines (I.e. TUMOR and CONTROL.) This will only set + # the first sample data. The class "_Record" needs to be + # modified in order to handle multiple sets of sample data + #=================================================================== + samples = self._parse_samples(items[9][1], rowdict["FORMAT"], record) record.samples = samples return record @@ -641,7 +692,7 @@ class Writer(object): """VCF Writer. On Windows Python 2, open stream with 'wb'.""" # Reverse keys and values in header field count dictionary - counts = dict((v,k) for k,v in field_counts.iteritems()) + counts = dict((v,k) for k,v in field_counts.items()) def __init__(self, stream, template, lineterminator="\n"): self.writer = csv.writer(stream, delimiter="\t", @@ -654,30 +705,30 @@ def __init__(self, stream, template, lineterminator="\n"): # get a maximum key). self.info_order = collections.defaultdict( lambda: len(template.infos), - dict(zip(template.infos.iterkeys(), itertools.count()))) + dict(list(zip(iter(template.infos.keys()), itertools.count())))) two = '##{key}=\n' four = '##{key}=\n' _num = self._fix_field_count - for (key, vals) in template.metadata.iteritems(): + for (key, vals) in template.metadata.items(): if key in SINGULAR_METADATA: vals = [vals] for val in vals: if isinstance(val, dict): values = ','.join('{0}={1}'.format(key, value) - for key, value in val.items()) + for key, value in list(val.items())) stream.write('##{0}=<{1}>\n'.format(key, values)) else: stream.write('##{0}={1}\n'.format(key, val)) - for line in template.infos.itervalues(): + for line in template.infos.values(): stream.write(four.format(key="INFO", *line, num=_num(line.num))) - for line in template.formats.itervalues(): + for line in template.formats.values(): stream.write(four.format(key="FORMAT", *line, num=_num(line.num))) - for line in template.filters.itervalues(): + for line in template.filters.values(): stream.write(two.format(key="FILTER", *line)) - for line in template.alts.itervalues(): + for line in template.alts.values(): stream.write(two.format(key="ALT", *line)) - for line in template.contigs.itervalues(): + for line in template.contigs.values(): if line.length: stream.write('##contig=\n'.format(*line)) else: diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py index b156b453..0f015430 100644 --- a/vcf/sample_filter.py +++ b/vcf/sample_filter.py @@ -7,7 +7,7 @@ import warnings -from parser import Reader, Writer +from .parser import Reader, Writer class SampleFilter(object): @@ -81,13 +81,13 @@ def filt2idx(item): # is int, check if it's an idx if item < len(self.samples): return item - filters = set(filter(lambda x: x is not None, map(filt2idx, filt_s))) + filters = set([x for x in map(filt2idx, filt_s) if x is not None]) if len(filters) < len(filt_s): # TODO print the filters that were ignored warnings.warn("Invalid filters, ignoring", RuntimeWarning) if self.invert: - filters = set(xrange(len(self.samples))).difference(filters) + filters = set(range(len(self.samples))).difference(filters) # `sample_filter` setter updates `samples` self.parser.sample_filter = filters diff --git a/vcf/utils.py b/vcf/utils.py index 2881dc24..4b51c199 100644 --- a/vcf/utils.py +++ b/vcf/utils.py @@ -37,19 +37,19 @@ def walk_together(*readers, **kwargs): next_idx_to_k = dict( (i, get_key(r)) for i, r in enumerate(nexts) if r is not None) keys_with_prev_contig = [ - k for k in next_idx_to_k.values() if k[0] == min_k[0]] + k for k in list(next_idx_to_k.values()) if k[0] == min_k[0]] if any(keys_with_prev_contig): min_k = min(keys_with_prev_contig) # finish previous contig else: min_k = min(next_idx_to_k.values()) # move on to next contig - min_k_idxs = set([i for i, k in next_idx_to_k.items() if k == min_k]) + min_k_idxs = set([i for i, k in list(next_idx_to_k.items()) if k == min_k]) yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))] for i in min_k_idxs: try: - nexts[i] = readers[i].next() + nexts[i] = next(readers[i]) except StopIteration: nexts[i] = None From 600a429888f72a4ffdb8c68cf3d582d2a05d219b Mon Sep 17 00:00:00 2001 From: root Date: Thu, 13 Aug 2020 17:08:37 +0200 Subject: [PATCH 2/4] Adjusted model.py _record.str() to respond with all columns, not just CHROM, POS, REF; and ALT.) --- vcf/model.py | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/vcf/model.py b/vcf/model.py index 19d7a2d2..d9098eb8 100644 --- a/vcf/model.py +++ b/vcf/model.py @@ -195,19 +195,6 @@ def __init__(self, ROWDICT, setattr(self, varname, value) #equivalent to: self.varname= 'something' - #======================================================================= - # self.CHROM = CHROM - # #: the one-based coordinate of the first nucleotide in ``REF`` - # self.POS = POS - # self.ID = ID - # self.REF = REF - # self.ALT = ALT - # self.QUAL = QUAL - # self.FILTER = FILTER - # self.INFO = INFO - # self.FORMAT = FORMAT - #======================================================================= - #: zero-based, half-open start coordinate of ``REF`` self.start = self.POS - 1 #: zero-based, half-open end coordinate of ``REF`` @@ -294,8 +281,20 @@ def __iter__(self): return iter(self.samples) def __str__(self): - return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__ - +# return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__ + return ''.join(["Record(", + "CHROM=%(CHROM)s, ", + "POS=%(POS)s, ", + "ID=%(ID)s, ", + "REF=%(REF)s, ", + "ALT=%(ALT)s), " + "QUAL=%(QUAL)s), " + "FILTER=%(FILTER)s), " + "INFO=%(INFO)s), " + "FORMAT=%(FORMAT)s)" + ]) % self.__dict__ + +# CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT def add_format(self, fmt): self.FORMAT = self.FORMAT + ':' + fmt From 7a1c849647af4be1b29b2fa99faf86435dd9c3e8 Mon Sep 17 00:00:00 2001 From: Michael Rightmire Date: Fri, 14 Aug 2020 15:24:00 +0200 Subject: [PATCH 3/4] Moved the enumeration of self.samples and self._samples_indexes to after the first row of data is read. This way, non-standard columns of data can be inced or excluded as appropriate. --- vcf/parser.py | 34 +++++++++++++++++++++++++++------- 1 file changed, 27 insertions(+), 7 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 09a3a579..7d1797d4 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -353,9 +353,13 @@ def _parse_metainfo(self): fields = self._row_pattern.split(line[1:]) # self._column_headers = fields[:9] - self._column_headers = fields - self.samples = fields[9:] - self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) + self._column_headers = fields + self._non_standard_column_headers = ( + set(self._column_headers) - + set(["CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER", + "INFO", "FORMAT"])) +# self.samples = fields[9:] #removed in favor of running in __next__ +# self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) def _map(self, func, iterable, bad=['.', '']): '''``map``, but make bad values None.''' @@ -625,20 +629,36 @@ def __next__(self): # record = _Record(chrom, pos, ID, ref, alt, qual, filt, # info, fmt, self._sample_indexes) #======================================================================= - - record = _Record(rowdict, self._sample_indexes) # Genotype fields can be followed by other data columns, # so we have to detect that if rowdict["FORMAT"] is not None: items = list(rowdict.items()) - #=================================================================== + # We run this code here, because we have to wait for the first line + # of actual data. + # It will use the number of colons found in the FORMAT field, to + # create a regex pattern. It will match that regex pattern against + # the data in the remaining non-standard colums and, if they appear + # to actually be sample data, adds them to self.samples + # self.samples remains set to None in init. + if self.samples is None: + self.samples = [] + self._num_format_colons = rowdict["FORMAT"].count(":") + self._samples_pattern = "^" + for i in range(0,self._num_format_colons+1, 1): + self._samples_pattern += r"[0-9.,/-]{1,}:" + self._samples_pattern = self._samples_pattern.rstrip(":") + "$" + for header in self._column_headers: + if re.match(self._samples_pattern, str(rowdict[header])): + self.samples.append(header) + self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) + record = _Record(rowdict, self._sample_indexes) + # We will assume that the next string that follows the FORMAT # declaration is the sample data. HOWEVER, a FORMAT can have # multiple sample lines (I.e. TUMOR and CONTROL.) This will only set # the first sample data. The class "_Record" needs to be # modified in order to handle multiple sets of sample data - #=================================================================== samples = self._parse_samples(items[9][1], rowdict["FORMAT"], record) record.samples = samples From 91f3bd2987e3cc383e8a2d4cf5188f65fb8d8cac Mon Sep 17 00:00:00 2001 From: Michael Rightmire Date: Fri, 14 Aug 2020 17:30:43 +0200 Subject: [PATCH 4/4] Corrected little bump I caused in _parse_samples when I modified things to use rowdict. --- vcf/parser.py | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/vcf/parser.py b/vcf/parser.py index 7d1797d4..dbcaa9c8 100644 --- a/vcf/parser.py +++ b/vcf/parser.py @@ -454,6 +454,10 @@ def _parse_sample_format(self, samp_fmt): return samp_fmt def _parse_samples(self, samples, samp_fmt, site): + print("-----------------------------")#333 + print("samples =", samples) #3333 + print("samp_fmt =", samp_fmt)#3333 + print("site =", site) #333 '''Parse a sample entry according to the format specified in the FORMAT column. @@ -466,6 +470,7 @@ def _parse_samples(self, samples, samp_fmt, site): self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt) samp_fmt = self._format_cache[samp_fmt] + print("cparse=", cparse)#333 if cparse: return cparse.parse_samples( self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site) @@ -474,13 +479,16 @@ def _parse_samples(self, samples, samp_fmt, site): _map = self._map nfields = len(samp_fmt._fields) - + + print("self.samples", self.samples) #3333 + print("samples =", samples)#3333 for name, sample in zip(self.samples, samples): - + print("Zipped: ", name, sample) #333 # parse the data for this sample sampdat = [None] * nfields for i, vals in enumerate(sample.split(':')): + print("sample split = ", i, ":", vals)#333 # short circuit the most common if samp_fmt._fields[i] == 'GT': @@ -633,7 +641,6 @@ def __next__(self): # Genotype fields can be followed by other data columns, # so we have to detect that if rowdict["FORMAT"] is not None: - items = list(rowdict.items()) # We run this code here, because we have to wait for the first line # of actual data. # It will use the number of colons found in the FORMAT field, to @@ -654,12 +661,14 @@ def __next__(self): self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)]) record = _Record(rowdict, self._sample_indexes) - # We will assume that the next string that follows the FORMAT - # declaration is the sample data. HOWEVER, a FORMAT can have - # multiple sample lines (I.e. TUMOR and CONTROL.) This will only set - # the first sample data. The class "_Record" needs to be - # modified in order to handle multiple sets of sample data - samples = self._parse_samples(items[9][1], rowdict["FORMAT"], record) + # Generate an "items" list (of sample data) based on the columns + # identified to be containing sample data. This is passed to + # _parse_samples + items = [] + for col in self.samples: + items.append(rowdict[col]) + + samples = self._parse_samples(items, rowdict["FORMAT"], record) record.samples = samples return record