ariba/mlst_profile.py - ariba (debian/2.14.4+ds-2.1)

Tree @debian/2.14.4+ds-2.1 (Download .tar.gz)

mlst_profile.py @debian/2.14.4+ds-2.1 — raw · history · blame

import sys
import csv
import os

class Error (Exception): pass

class MlstProfile:
    def __init__(self, infile, duplicate_warnings=True):
        self.infile = infile
        self.duplicate_warnings = duplicate_warnings
        self.columns_to_ignore = ['clonal_complex', 'CC', 'Lineage', 'mlst_clade', 'species']

        if not os.path.exists(self.infile):
            raise Error('Error! Input file "' + self.infile + '" not found.')
        self._load_input_file()


    def _load_input_file(self):
        self.profile_to_type = {}

        with open(self.infile) as f:
            reader = csv.DictReader(f, delimiter='\t')
            if reader.fieldnames[0] != 'ST':
                raise Error('Error. Expected first column of profile file "' + self.infile + '" to be "ST"')

            self.genes_list = [column_name for column_name in reader.fieldnames[1:] if column_name not in self.columns_to_ignore]

            for row in reader:
                type_tuple = tuple(int(row[x]) for x in self.genes_list)
                if type_tuple in self.profile_to_type:
                    previous_st = self.profile_to_type[type_tuple]
                    this_st = int(row['ST'])
                    if previous_st != this_st:
                        new_st = min(previous_st, this_st)
                        if self.duplicate_warnings:
                            print('WARNING: Same profile found twice in input file, but two different STs. Going to use the ST with the smaller number (', new_st, ')', sep='', file=sys.stderr)
                            print(' ... STs are', previous_st, this_st, 'and alleles are', ', '.join([x + ':' + row[x] for x in self.genes_list]), file=sys.stderr)
                        self.profile_to_type[type_tuple] = new_st
                else:
                    self.profile_to_type[type_tuple] = int(row['ST'])

        self.genes_set = set(self.genes_list)


    def has_gene(self, gene):
        return gene in self.genes_set


    def get_sequence_type(self, type_dict):
        key = tuple(type_dict.get(x, 'ND') for x in self.genes_list)
        if 'ND' in key:
            return 'ND'
        else:
            return self.profile_to_type.get(key, 'Novel')