From 0a9e0afa3ea48c4aef7274efcc4e0bba0ce45717 Mon Sep 17 00:00:00 2001
From: Michael Rightmire <m.rightmire@dkfz-heidelberg.de>
Date: Thu, 13 Aug 2020 15:39:26 +0200
Subject: [PATCH 1/4] Changes to parser.py and model.py to allow for
 nonstandard VCF file headers.

---
 vcf/model.py         |  55 +++++++++++-----
 vcf/parser.py        | 149 +++++++++++++++++++++++++++++--------------
 vcf/sample_filter.py |   6 +-
 vcf/utils.py         |   6 +-
 4 files changed, 145 insertions(+), 71 deletions(-)

diff --git a/vcf/model.py b/vcf/model.py
index 34a4d17a..19d7a2d2 100644
--- a/vcf/model.py
+++ b/vcf/model.py
@@ -1,4 +1,5 @@
 from abc import ABCMeta, abstractmethod
+import ast
 import collections
 import sys
 import re
@@ -172,18 +173,41 @@ class _Record(object):
             Neither the upstream nor downstream flanking bases are
             included in the region.
     """
-    def __init__(self, CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
-            sample_indexes, samples=None):
-        self.CHROM = CHROM
-        #: the one-based coordinate of the first nucleotide in ``REF``
-        self.POS = POS
-        self.ID = ID
-        self.REF = REF
-        self.ALT = ALT
-        self.QUAL = QUAL
-        self.FILTER = FILTER
-        self.INFO = INFO
-        self.FORMAT = FORMAT
+    def __init__(self, ROWDICT,
+                 #CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT,
+                    sample_indexes, samples=None):
+
+        for varname, value in ROWDICT.items():
+            # "." pretty always means no value (NOne)
+            if value == ".": 
+                value = None
+                
+            else: 
+                try: 
+                    # Convert to either int or float
+                    # WIll error if value is not a string or a number
+                    value = ast.literal_eval(value)
+                except Exception as e: 
+                    # Any error means value is a string, so just leave it
+                    # Silence this error, or print it for debugging
+                    pass
+                    #print("ERROR occurred converting value '{}' with AST!({})".format(value, e))
+
+            setattr(self, varname, value) #equivalent to: self.varname= 'something'
+
+        #=======================================================================
+        # self.CHROM = CHROM
+        # #: the one-based coordinate of the first nucleotide in ``REF``
+        # self.POS = POS
+        # self.ID = ID
+        # self.REF = REF
+        # self.ALT = ALT
+        # self.QUAL = QUAL
+        # self.FILTER = FILTER
+        # self.INFO = INFO
+        # self.FORMAT = FORMAT
+        #=======================================================================
+        
         #: zero-based, half-open start coordinate of ``REF``
         self.start = self.POS - 1
         #: zero-based, half-open end coordinate of ``REF``
@@ -362,7 +386,7 @@ def heterozygosity(self):
         If there are i alleles with frequency p_i, H=1-sum_i(p_i^2)
         """
         allele_freqs = [1-sum(self.aaf)] + self.aaf
-        return 1 - sum(map(lambda x: x**2, allele_freqs))
+        return 1 - sum([x**2 for x in allele_freqs])
 
     def get_hom_refs(self):
         """ The list of hom ref genotypes"""
@@ -558,9 +582,8 @@ def is_filtered(self):
             return True
 
 
-class _AltRecord(object):
+class _AltRecord(object, metaclass=ABCMeta):
     '''An alternative allele record: either replacement string, SV placeholder, or breakend'''
-    __metaclass__ = ABCMeta
 
     def __init__(self, type, **kwargs):
         super(_AltRecord, self).__init__(**kwargs)
@@ -596,7 +619,7 @@ def __len__(self):
         return len(self.sequence)
 
     def __eq__(self, other):
-        if isinstance(other, basestring):
+        if isinstance(other, str):
             return self.sequence == other
         elif not isinstance(other, self.__class__):
             return False
diff --git a/vcf/parser.py b/vcf/parser.py
index c3c3d082..09a3a579 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -1,3 +1,4 @@
+import ast
 import codecs
 import collections
 import csv
@@ -22,8 +23,8 @@
 except ImportError:
     cparse = None
 
-from model import _Call, _Record, make_calldata_tuple
-from model import _Substitution, _Breakend, _SingleBreakend, _SV
+from .model import _Call, _Record, make_calldata_tuple
+from .model import _Substitution, _Breakend, _SingleBreakend, _SV
 
 
 # Metadata parsers/constants
@@ -315,6 +316,7 @@ def _parse_metainfo(self):
         parser = _vcf_metadata_parser()
 
         line = next(self.reader)
+        
         while line.startswith('##'):
             self._header_lines.append(line)
 
@@ -350,7 +352,8 @@ def _parse_metainfo(self):
             line = next(self.reader)
 
         fields = self._row_pattern.split(line[1:])
-        self._column_headers = fields[:9]
+#        self._column_headers = fields[:9]
+        self._column_headers = fields                    
         self.samples = fields[9:]
         self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
 
@@ -468,7 +471,7 @@ def _parse_samples(self, samples, samp_fmt, site):
 
         nfields = len(samp_fmt._fields)
 
-        for name, sample in itertools.izip(self.samples, samples):
+        for name, sample in zip(self.samples, samples):
 
             # parse the data for this sample
             sampdat = [None] * nfields
@@ -548,47 +551,95 @@ def _parse_alt(self, str):
         else:
             return _Substitution(str)
 
-    def next(self):
+    def __next__(self):
         '''Return the next record in the file.'''
         line = next(self.reader)
-        row = self._row_pattern.split(line.rstrip())
-        chrom = row[0]
+        row = self._row_pattern.split(line.rstrip())        
+        rowdict = collections.OrderedDict((zip(self._column_headers, row)))
+        
         if self._prepend_chr:
-            chrom = 'chr' + chrom
-        pos = int(row[1])
-
-        if row[2] != '.':
-            ID = row[2]
-        else:
-            ID = None
-
-        ref = row[3]
-        alt = self._map(self._parse_alt, row[4].split(','))
-
-        try:
-            qual = int(row[5])
-        except ValueError:
-            try:
-                qual = float(row[5])
-            except ValueError:
-                qual = None
-
-        filt = self._parse_filter(row[6])
-        info = self._parse_info(row[7])
+            rowdict["CHROM"] = 'chr' + rowdict["CHROM"]
+#===============================================================================
+#         chrom = rowdict["CHROM"]
+#         if self._prepend_chr:
+#             chrom = 'chr' + chrom
+# #        chrom = row[0]
+#===============================================================================
+
+#===============================================================================
+#         pos = int(rowdict["POS"])
+# #        pos = int(row[1])
+#===============================================================================
+
+        rowdict["ID"] = rowdict["ID"] if rowdict["ID"] != '.' else None
+        #=======================================================================
+        # if row[2] != '.':
+        #     ID = row[2]
+        # else:
+        #     ID = None
+        #=======================================================================
+
+        #=======================================================================
+        # ref = rowdict["REF"]
+        # alt = self._map(self._parse_alt, rowdict["ALT"].split(','))
+        #=======================================================================
+        rowdict["ALT"] = self._map(self._parse_alt, rowdict["ALT"].split(','))
+        
+        try: 
+            rowdict["QUAL"] = ast.literal_eval(rowdict["QUAL"])
+        except: # Any error means qual = None 
+              rowdict["QUAL"] = None
+        #=======================================================================
+        # try:
+        #     qual = int(row[5])
+        # except ValueError:
+        #     try:
+        #         qual = float(row[5])
+        #     except ValueError:
+        #         qual = None
+        #=======================================================================
+
+        rowdict["FILTER"] = self._parse_filter(rowdict["FILTER"])
+        rowdict["INFO"]   = self._parse_info(rowdict["INFO"])
+        #=======================================================================
+        # filt = self._parse_filter(rowdict["FILTER"])
+        # info = self._parse_info(rowdict["INFO"])
+        #=======================================================================
 
         try:
-            fmt = row[8]
-        except IndexError:
-            fmt = None
-        else:
-            if fmt == '.':
-                fmt = None
-
-        record = _Record(chrom, pos, ID, ref, alt, qual, filt,
-                info, fmt, self._sample_indexes)
-
-        if fmt is not None:
-            samples = self._parse_samples(row[9:], fmt, record)
+            if rowdict["FORMAT"] == '.': 
+                rowdict["FORMAT"] = None
+        except KeyError:
+            rowdict["FORMAT"] = None
+        #=======================================================================
+        # try:
+        #     fmt = row[8]
+        # except IndexError:
+        #     fmt = None
+        # else:
+        #     if fmt == '.':
+        #         fmt = None
+        #=======================================================================
+
+        #=======================================================================
+        # record = _Record(chrom, pos, ID, ref, alt, qual, filt,
+        #                  info, fmt, self._sample_indexes)
+        #=======================================================================
+
+        record = _Record(rowdict, self._sample_indexes)
+            
+        # Genotype fields can be followed by other data columns, 
+        # so we have to detect that
+        if rowdict["FORMAT"] is not None:
+            items = list(rowdict.items())
+            #===================================================================
+            # We will assume that the next string that follows the FORMAT 
+            # declaration is the sample data. HOWEVER, a FORMAT can have 
+            # multiple sample lines (I.e. TUMOR and CONTROL.) This will only set 
+            # the first sample data. The class "_Record" needs to be 
+            # modified in order to handle multiple sets of sample data
+            #===================================================================
+            samples = self._parse_samples(items[9][1], rowdict["FORMAT"], record)
             record.samples = samples
 
         return record
@@ -641,7 +692,7 @@ class Writer(object):
     """VCF Writer. On Windows Python 2, open stream with 'wb'."""
 
     # Reverse keys and values in header field count dictionary
-    counts = dict((v,k) for k,v in field_counts.iteritems())
+    counts = dict((v,k) for k,v in field_counts.items())
 
     def __init__(self, stream, template, lineterminator="\n"):
         self.writer = csv.writer(stream, delimiter="\t",
@@ -654,30 +705,30 @@ def __init__(self, stream, template, lineterminator="\n"):
         # get a maximum key).
         self.info_order = collections.defaultdict(
             lambda: len(template.infos),
-            dict(zip(template.infos.iterkeys(), itertools.count())))
+            dict(list(zip(iter(template.infos.keys()), itertools.count()))))
 
         two = '##{key}=<ID={0},Description="{1}">\n'
         four = '##{key}=<ID={0},Number={num},Type={2},Description="{3}">\n'
         _num = self._fix_field_count
-        for (key, vals) in template.metadata.iteritems():
+        for (key, vals) in template.metadata.items():
             if key in SINGULAR_METADATA:
                 vals = [vals]
             for val in vals:
                 if isinstance(val, dict):
                     values = ','.join('{0}={1}'.format(key, value)
-                                      for key, value in val.items())
+                                      for key, value in list(val.items()))
                     stream.write('##{0}=<{1}>\n'.format(key, values))
                 else:
                     stream.write('##{0}={1}\n'.format(key, val))
-        for line in template.infos.itervalues():
+        for line in template.infos.values():
             stream.write(four.format(key="INFO", *line, num=_num(line.num)))
-        for line in template.formats.itervalues():
+        for line in template.formats.values():
             stream.write(four.format(key="FORMAT", *line, num=_num(line.num)))
-        for line in template.filters.itervalues():
+        for line in template.filters.values():
             stream.write(two.format(key="FILTER", *line))
-        for line in template.alts.itervalues():
+        for line in template.alts.values():
             stream.write(two.format(key="ALT", *line))
-        for line in template.contigs.itervalues():
+        for line in template.contigs.values():
             if line.length:
                 stream.write('##contig=<ID={0},length={1}>\n'.format(*line))
             else:
diff --git a/vcf/sample_filter.py b/vcf/sample_filter.py
index b156b453..0f015430 100644
--- a/vcf/sample_filter.py
+++ b/vcf/sample_filter.py
@@ -7,7 +7,7 @@
 import warnings
 
 
-from parser import Reader, Writer
+from .parser import Reader, Writer
 
 
 class SampleFilter(object):
@@ -81,13 +81,13 @@ def filt2idx(item):
                 # is int, check if it's an idx
                 if item < len(self.samples):
                     return item
-        filters = set(filter(lambda x: x is not None, map(filt2idx, filt_s)))
+        filters = set([x for x in map(filt2idx, filt_s) if x is not None])
         if len(filters) < len(filt_s):
             # TODO print the filters that were ignored
             warnings.warn("Invalid filters, ignoring", RuntimeWarning)
 
         if self.invert:
-            filters = set(xrange(len(self.samples))).difference(filters)
+            filters = set(range(len(self.samples))).difference(filters)
 
         # `sample_filter` setter updates `samples`
         self.parser.sample_filter = filters
diff --git a/vcf/utils.py b/vcf/utils.py
index 2881dc24..4b51c199 100644
--- a/vcf/utils.py
+++ b/vcf/utils.py
@@ -37,19 +37,19 @@ def walk_together(*readers, **kwargs):
         next_idx_to_k = dict(
             (i, get_key(r)) for i, r in enumerate(nexts) if r is not None)
         keys_with_prev_contig = [
-            k for k in next_idx_to_k.values() if k[0] == min_k[0]]
+            k for k in list(next_idx_to_k.values()) if k[0] == min_k[0]]
 
         if any(keys_with_prev_contig):
             min_k = min(keys_with_prev_contig)   # finish previous contig
         else:
             min_k = min(next_idx_to_k.values())   # move on to next contig
 
-        min_k_idxs = set([i for i, k in next_idx_to_k.items() if k == min_k])
+        min_k_idxs = set([i for i, k in list(next_idx_to_k.items()) if k == min_k])
         yield [nexts[i] if i in min_k_idxs else None for i in range(len(nexts))]
 
         for i in min_k_idxs:
             try:
-                nexts[i] = readers[i].next()
+                nexts[i] = next(readers[i])
             except StopIteration:
                 nexts[i] = None
 

From 600a429888f72a4ffdb8c68cf3d582d2a05d219b Mon Sep 17 00:00:00 2001
From: root <m.rightmire@dkfz-heidelberg.de>
Date: Thu, 13 Aug 2020 17:08:37 +0200
Subject: [PATCH 2/4] Adjusted model.py _record.str() to respond with all
 columns, not just CHROM, POS, REF; and ALT.)

---
 vcf/model.py | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/vcf/model.py b/vcf/model.py
index 19d7a2d2..d9098eb8 100644
--- a/vcf/model.py
+++ b/vcf/model.py
@@ -195,19 +195,6 @@ def __init__(self, ROWDICT,
 
             setattr(self, varname, value) #equivalent to: self.varname= 'something'
 
-        #=======================================================================
-        # self.CHROM = CHROM
-        # #: the one-based coordinate of the first nucleotide in ``REF``
-        # self.POS = POS
-        # self.ID = ID
-        # self.REF = REF
-        # self.ALT = ALT
-        # self.QUAL = QUAL
-        # self.FILTER = FILTER
-        # self.INFO = INFO
-        # self.FORMAT = FORMAT
-        #=======================================================================
-        
         #: zero-based, half-open start coordinate of ``REF``
         self.start = self.POS - 1
         #: zero-based, half-open end coordinate of ``REF``
@@ -294,8 +281,20 @@ def __iter__(self):
         return iter(self.samples)
 
     def __str__(self):
-        return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__
-
+#        return "Record(CHROM=%(CHROM)s, POS=%(POS)s, REF=%(REF)s, ALT=%(ALT)s)" % self.__dict__
+        return ''.join(["Record(",
+                        "CHROM=%(CHROM)s, ",
+                        "POS=%(POS)s, ",
+                        "ID=%(ID)s, ",
+                        "REF=%(REF)s, ",
+                        "ALT=%(ALT)s), "
+                        "QUAL=%(QUAL)s), "
+                        "FILTER=%(FILTER)s), "
+                        "INFO=%(INFO)s), "
+                        "FORMAT=%(FORMAT)s)"
+                        ]) % self.__dict__
+                        
+#    CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT
     def add_format(self, fmt):
         self.FORMAT = self.FORMAT + ':' + fmt
 

From 7a1c849647af4be1b29b2fa99faf86435dd9c3e8 Mon Sep 17 00:00:00 2001
From: Michael Rightmire <m.rightmire@dkfz-heidelberg.de>
Date: Fri, 14 Aug 2020 15:24:00 +0200
Subject: [PATCH 3/4] Moved the enumeration of self.samples and
 self._samples_indexes to after the first row of data is read. This way,
 non-standard columns of data can be inced or excluded as appropriate.

---
 vcf/parser.py | 34 +++++++++++++++++++++++++++-------
 1 file changed, 27 insertions(+), 7 deletions(-)

diff --git a/vcf/parser.py b/vcf/parser.py
index 09a3a579..7d1797d4 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -353,9 +353,13 @@ def _parse_metainfo(self):
 
         fields = self._row_pattern.split(line[1:])
 #        self._column_headers = fields[:9]
-        self._column_headers = fields                    
-        self.samples = fields[9:]
-        self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
+        self._column_headers = fields
+        self._non_standard_column_headers = (
+            set(self._column_headers) - 
+            set(["CHROM", "POS", "ID", "REF", "ALT","QUAL", "FILTER", 
+                 "INFO", "FORMAT"]))
+#         self.samples = fields[9:] #removed in favor of running in __next__
+#         self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
 
     def _map(self, func, iterable, bad=['.', '']):
         '''``map``, but make bad values None.'''
@@ -625,20 +629,36 @@ def __next__(self):
         # record = _Record(chrom, pos, ID, ref, alt, qual, filt,
         #                  info, fmt, self._sample_indexes)
         #=======================================================================
-
-        record = _Record(rowdict, self._sample_indexes)
             
         # Genotype fields can be followed by other data columns, 
         # so we have to detect that
         if rowdict["FORMAT"] is not None:
             items = list(rowdict.items())
-            #===================================================================
+            # We run this code here, because we have to wait for the first line 
+            # of actual data. 
+            # It will use the number of colons found in the FORMAT field, to 
+            # create a regex pattern. It will match that regex pattern against 
+            # the data in the remaining non-standard colums and, if they appear 
+            # to actually be sample data, adds them to self.samples
+            # self.samples remains set to None in init. 
+            if self.samples is None: 
+                self.samples = []
+                self._num_format_colons = rowdict["FORMAT"].count(":")
+                self._samples_pattern = "^"
+                for i in range(0,self._num_format_colons+1, 1):
+                    self._samples_pattern += r"[0-9.,/-]{1,}:"
+                self._samples_pattern = self._samples_pattern.rstrip(":") + "$"
+                for header in self._column_headers:
+                        if re.match(self._samples_pattern, str(rowdict[header])):
+                            self.samples.append(header) 
+                self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
+            record = _Record(rowdict, self._sample_indexes)
+                     
             # We will assume that the next string that follows the FORMAT 
             # declaration is the sample data. HOWEVER, a FORMAT can have 
             # multiple sample lines (I.e. TUMOR and CONTROL.) This will only set 
             # the first sample data. The class "_Record" needs to be 
             # modified in order to handle multiple sets of sample data
-            #===================================================================
             samples = self._parse_samples(items[9][1], rowdict["FORMAT"], record)
             record.samples = samples
 

From 91f3bd2987e3cc383e8a2d4cf5188f65fb8d8cac Mon Sep 17 00:00:00 2001
From: Michael Rightmire <m.rightmire@dkfz-heidelberg.de>
Date: Fri, 14 Aug 2020 17:30:43 +0200
Subject: [PATCH 4/4] Corrected little bump I caused in _parse_samples when I
 modified things to use rowdict.

---
 vcf/parser.py | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

diff --git a/vcf/parser.py b/vcf/parser.py
index 7d1797d4..dbcaa9c8 100644
--- a/vcf/parser.py
+++ b/vcf/parser.py
@@ -454,6 +454,10 @@ def _parse_sample_format(self, samp_fmt):
         return samp_fmt
 
     def _parse_samples(self, samples, samp_fmt, site):
+        print("-----------------------------")#333
+        print("samples =", samples) #3333
+        print("samp_fmt =", samp_fmt)#3333
+        print("site =", site) #333
         '''Parse a sample entry according to the format specified in the FORMAT
         column.
 
@@ -466,6 +470,7 @@ def _parse_samples(self, samples, samp_fmt, site):
             self._format_cache[samp_fmt] = self._parse_sample_format(samp_fmt)
         samp_fmt = self._format_cache[samp_fmt]
 
+        print("cparse=", cparse)#333
         if cparse:
             return cparse.parse_samples(
                 self.samples, samples, samp_fmt, samp_fmt._types, samp_fmt._nums, site)
@@ -474,13 +479,16 @@ def _parse_samples(self, samples, samp_fmt, site):
         _map = self._map
 
         nfields = len(samp_fmt._fields)
-
+        
+        print("self.samples", self.samples) #3333
+        print("samples =", samples)#3333
         for name, sample in zip(self.samples, samples):
-
+            print("Zipped: ", name, sample) #333
             # parse the data for this sample
             sampdat = [None] * nfields
 
             for i, vals in enumerate(sample.split(':')):
+                print("sample split = ", i, ":", vals)#333
 
                 # short circuit the most common
                 if samp_fmt._fields[i] == 'GT':
@@ -633,7 +641,6 @@ def __next__(self):
         # Genotype fields can be followed by other data columns, 
         # so we have to detect that
         if rowdict["FORMAT"] is not None:
-            items = list(rowdict.items())
             # We run this code here, because we have to wait for the first line 
             # of actual data. 
             # It will use the number of colons found in the FORMAT field, to 
@@ -654,12 +661,14 @@ def __next__(self):
                 self._sample_indexes = dict([(x,i) for (i,x) in enumerate(self.samples)])
             record = _Record(rowdict, self._sample_indexes)
                      
-            # We will assume that the next string that follows the FORMAT 
-            # declaration is the sample data. HOWEVER, a FORMAT can have 
-            # multiple sample lines (I.e. TUMOR and CONTROL.) This will only set 
-            # the first sample data. The class "_Record" needs to be 
-            # modified in order to handle multiple sets of sample data
-            samples = self._parse_samples(items[9][1], rowdict["FORMAT"], record)
+            # Generate an "items" list (of sample data) based on the columns
+            # identified to be containing sample data. This is passed to 
+            # _parse_samples
+            items = []
+            for col in self.samples:
+                items.append(rowdict[col])
+                
+            samples = self._parse_samples(items, rowdict["FORMAT"], record)
             record.samples = samples
 
         return record