Codebase list centrifuge / lintian-fixes/main taxonomy.h
lintian-fixes/main

Tree @lintian-fixes/main (Download .tar.gz)

taxonomy.h @lintian-fixes/mainraw · history · blame

/*
 * taxonomy.h
 *
 *  Created on: Feb 10, 2016
 *      Author: fbreitwieser
 */

#ifndef TAXONOMY_H_
#define TAXONOMY_H_

#include<map>
#include<utility>
#include<string>

enum {
    RANK_UNKNOWN = 0,
    RANK_STRAIN,
    RANK_SPECIES,
    RANK_GENUS,
    RANK_FAMILY,
    RANK_ORDER,
    RANK_CLASS,
    RANK_PHYLUM,
    RANK_KINGDOM,
    RANK_DOMAIN,
    RANK_FORMA,
    RANK_INFRA_CLASS,
    RANK_INFRA_ORDER,
    RANK_PARV_ORDER,
    RANK_SUB_CLASS,
    RANK_SUB_FAMILY,
    RANK_SUB_GENUS,
    RANK_SUB_KINGDOM,
    RANK_SUB_ORDER,
    RANK_SUB_PHYLUM,
    RANK_SUB_SPECIES,
    RANK_SUB_TRIBE,
    RANK_SUPER_CLASS,
    RANK_SUPER_FAMILY,
    RANK_SUPER_KINGDOM,
    RANK_SUPER_ORDER,
    RANK_SUPER_PHYLUM,
    RANK_TRIBE,
    RANK_VARIETAS,
    RANK_LIFE,
    RANK_MAX
};

extern uint8_t tax_rank_num[RANK_MAX];

struct TaxonomyNode {
    uint64_t parent_tid;
    uint8_t  rank;
    uint8_t  leaf;

    TaxonomyNode(uint64_t _parent_tid, uint8_t  _rank, uint8_t _leaf):
    	parent_tid(_parent_tid), rank(_rank), leaf(_leaf) {};

    TaxonomyNode(): parent_tid(0), rank(RANK_UNKNOWN), leaf(false) {};
};

struct TaxonomyPathTable {
    static const size_t nranks = 10;

    map<uint64_t, uint32_t> tid_to_pid;  // from taxonomic ID to path ID
    ELList<uint64_t> paths;

    static uint8_t rank_to_pathID(uint8_t rank) {
        switch(rank) {
            case RANK_STRAIN:
            case RANK_SUB_SPECIES:
                return 0;
            case RANK_SPECIES:
                return 1;
            case RANK_GENUS:
                return 2;
            case RANK_FAMILY:
                return 3;
            case RANK_ORDER:
                return 4;
            case RANK_CLASS:
                return 5;
            case RANK_PHYLUM:
                return 6;
            case RANK_KINGDOM:
                return 7;
            case RANK_SUPER_KINGDOM:
                return 8;
            case RANK_DOMAIN:
                return 9;
            default:
                return std::numeric_limits<uint8_t>::max();
        }
    }

    void buildPaths(const EList<pair<string, uint64_t> >& uid_to_tid,
                    const std::map<uint64_t, TaxonomyNode>& tree)
    {
        map<uint32_t, uint32_t> rank_map;
        rank_map[RANK_STRAIN]        = 0;
        rank_map[RANK_SUB_SPECIES]   = 0;
        rank_map[RANK_SPECIES]       = 1;
        rank_map[RANK_GENUS]         = 2;
        rank_map[RANK_FAMILY]        = 3;
        rank_map[RANK_ORDER]         = 4;
        rank_map[RANK_CLASS]         = 5;
        rank_map[RANK_PHYLUM]        = 6;
        rank_map[RANK_KINGDOM]       = 7;
        rank_map[RANK_SUPER_KINGDOM] = 8;
        rank_map[RANK_DOMAIN]        = 9;

        tid_to_pid.clear();
        paths.clear();
        for(size_t i = 0; i < uid_to_tid.size(); i++) {
            uint64_t tid = uid_to_tid[i].second;
            if(tid_to_pid.find(tid) != tid_to_pid.end())
                continue;
            if(tree.find(tid) == tree.end())
                continue;
            tid_to_pid[tid] = (uint32_t)paths.size();
            paths.expand();
            EList<uint64_t>& path = paths.back();
            path.resizeExact(nranks);
            path.fillZero();
            bool first = true;
            while(true) {
                std::map<uint64_t, TaxonomyNode>::const_iterator itr = tree.find(tid);
                if(itr == tree.end()) {
                    break;
                }
                const TaxonomyNode& node = itr->second;
                uint32_t rank = std::numeric_limits<uint32_t>::max();
                if(first && node.rank == RANK_UNKNOWN) {
                    rank = rank_map[RANK_STRAIN];
                } else if(rank_map.find(node.rank) != rank_map.end()) {
                    rank = rank_map[node.rank];
                }
                if(rank < path.size() && path[rank] == 0) {
                    path[rank] = tid;
                }

                first = false;
                if(node.parent_tid == tid) {
                    break;
                }
                tid = node.parent_tid;
            }
        }
    }

    void getPath(uint64_t tid, EList<uint64_t>& path) const {
        map<uint64_t, uint32_t>::const_iterator itr = tid_to_pid.find(tid);
        if(itr != tid_to_pid.end()) {
            uint32_t pid = itr->second;
            assert_lt(pid, paths.size());
            path = paths[pid];
        } else {
            path.clear();
        }
    }
};

typedef std::map<uint64_t, TaxonomyNode> TaxonomyTree;

inline static void initial_tax_rank_num() {
    uint8_t rank = 0;
    
    tax_rank_num[RANK_SUB_SPECIES] = rank;
    tax_rank_num[RANK_STRAIN] = rank++;
    
    tax_rank_num[RANK_SPECIES] = rank++;
    
    tax_rank_num[RANK_SUB_GENUS] = rank;
    tax_rank_num[RANK_GENUS] = rank++;
    
    tax_rank_num[RANK_SUB_FAMILY] = rank;
    tax_rank_num[RANK_FAMILY] = rank;
    tax_rank_num[RANK_SUPER_FAMILY] = rank++;
    
    tax_rank_num[RANK_SUB_ORDER] = rank;
    tax_rank_num[RANK_INFRA_ORDER] = rank;
    tax_rank_num[RANK_PARV_ORDER] = rank;
    tax_rank_num[RANK_ORDER] = rank;
    tax_rank_num[RANK_SUPER_ORDER] = rank++;
    
    tax_rank_num[RANK_INFRA_CLASS] = rank;
    tax_rank_num[RANK_SUB_CLASS] = rank;
    tax_rank_num[RANK_CLASS] = rank;
    tax_rank_num[RANK_SUPER_CLASS] = rank++;
    
    tax_rank_num[RANK_SUB_PHYLUM] = rank;
    tax_rank_num[RANK_PHYLUM] = rank;
    tax_rank_num[RANK_SUPER_PHYLUM] = rank++;
    
    tax_rank_num[RANK_SUB_KINGDOM] = rank;
    tax_rank_num[RANK_KINGDOM] = rank;
    tax_rank_num[RANK_SUPER_KINGDOM] = rank++;
    
    tax_rank_num[RANK_DOMAIN] = rank;
    tax_rank_num[RANK_FORMA] = rank;
    tax_rank_num[RANK_SUB_TRIBE] = rank;
    tax_rank_num[RANK_TRIBE] = rank;
    tax_rank_num[RANK_VARIETAS] = rank;
    tax_rank_num[RANK_UNKNOWN] = rank;
}

inline static const char* get_tax_rank_string(uint8_t rank) {
    switch(rank) {
        case RANK_STRAIN:        return "strain";
        case RANK_SPECIES:       return "species";
        case RANK_GENUS:         return "genus";
        case RANK_FAMILY:        return "family";
        case RANK_ORDER:         return "order";
        case RANK_CLASS:         return "class";
        case RANK_PHYLUM:        return "phylum";
        case RANK_KINGDOM:       return "kingdom";
        case RANK_FORMA:         return "forma";
        case RANK_INFRA_CLASS:   return "infraclass";
        case RANK_INFRA_ORDER:   return "infraorder";
        case RANK_PARV_ORDER:    return "parvorder";
        case RANK_SUB_CLASS:     return "subclass";
        case RANK_SUB_FAMILY:    return "subfamily";
        case RANK_SUB_GENUS:     return "subgenus";
        case RANK_SUB_KINGDOM:   return "subkingdom";
        case RANK_SUB_ORDER:     return "suborder";
        case RANK_SUB_PHYLUM:    return "subphylum";
        case RANK_SUB_SPECIES:   return "subspecies";
        case RANK_SUB_TRIBE:     return "subtribe";
        case RANK_SUPER_CLASS:   return "superclass";
        case RANK_SUPER_FAMILY:  return "superfamily";
        case RANK_SUPER_KINGDOM: return "superkingdom";
        case RANK_SUPER_ORDER:   return "superorder";
        case RANK_SUPER_PHYLUM:  return "superphylum";
        case RANK_TRIBE:         return "tribe";
        case RANK_VARIETAS:      return "varietas";
        case RANK_LIFE:          return "life";
        default:                 return "no rank";
    };
}

inline static uint8_t get_tax_rank_id(const char* rank) {
    if(strcmp(rank, "strain") == 0) {
        return RANK_STRAIN;
    } else if(strcmp(rank, "species") == 0) {
        return RANK_SPECIES;
    } else if(strcmp(rank, "genus") == 0) {
        return RANK_GENUS;
    } else if(strcmp(rank, "family") == 0) {
        return RANK_FAMILY;
    } else if(strcmp(rank, "order") == 0) {
        return RANK_ORDER;
    } else if(strcmp(rank, "class") == 0) {
        return RANK_CLASS;
    } else if(strcmp(rank, "phylum") == 0) {
        return RANK_PHYLUM;
    } else if(strcmp(rank, "kingdom") == 0) {
        return RANK_KINGDOM;
    } else if(strcmp(rank, "forma") == 0) {
        return RANK_FORMA;
    } else if(strcmp(rank, "infraclass") == 0) {
        return RANK_INFRA_CLASS;
    } else if(strcmp(rank, "infraorder") == 0) {
        return RANK_INFRA_ORDER;
    } else if(strcmp(rank, "parvorder") == 0) {
        return RANK_PARV_ORDER;
    } else if(strcmp(rank, "subclass") == 0) {
        return RANK_SUB_CLASS;
    } else if(strcmp(rank, "subfamily") == 0) {
        return RANK_SUB_FAMILY;
    } else if(strcmp(rank, "subgenus") == 0) {
        return RANK_SUB_GENUS;
    } else if(strcmp(rank, "subkingdom") == 0) {
        return RANK_SUB_KINGDOM;
    } else if(strcmp(rank, "suborder") == 0) {
        return RANK_SUB_ORDER;
    } else if(strcmp(rank, "subphylum") == 0) {
        return RANK_SUB_PHYLUM;
    } else if(strcmp(rank, "subspecies") == 0) {
        return RANK_SUB_SPECIES;
    } else if(strcmp(rank, "subtribe") == 0) {
        return RANK_SUB_TRIBE;
    } else if(strcmp(rank, "superclass") == 0) {
        return RANK_SUPER_CLASS;
    } else if(strcmp(rank, "superfamily") == 0) {
        return RANK_SUPER_FAMILY;
    } else if(strcmp(rank, "superkingdom") == 0) {
        return RANK_SUPER_KINGDOM;
    } else if(strcmp(rank, "superorder") == 0) {
        return RANK_SUPER_ORDER;
    } else if(strcmp(rank, "superphylum") == 0) {
        return RANK_SUPER_PHYLUM;
    } else if(strcmp(rank, "tribe") == 0) {
        return RANK_TRIBE;
    } else if(strcmp(rank, "varietas") == 0) {
        return RANK_VARIETAS;
    } else if(strcmp(rank, "life") == 0) {
        return RANK_LIFE;
    } else {
        return RANK_UNKNOWN;
    }
}

inline static uint64_t get_taxid_at_parent_rank(const TaxonomyTree& tree, uint64_t taxid, uint8_t at_rank) {
	while (true) {
		TaxonomyTree::const_iterator itr = tree.find(taxid);
		if(itr == tree.end()) {
			break;
		}
		const TaxonomyNode& node = itr->second;

		if (node.rank == at_rank) {
			return taxid;
		} else if (node.rank > at_rank || node.parent_tid == taxid) {
			return 0;
		}

		taxid = node.parent_tid;
	}
	return 0;
}

inline static TaxonomyTree read_taxonomy_tree(string taxonomy_fname) {
	TaxonomyTree tree;
	ifstream taxonomy_file(taxonomy_fname.c_str(), ios::in);
	if(taxonomy_file.is_open()) {
		char line[1024];
		while(!taxonomy_file.eof()) {
			line[0] = 0;
			taxonomy_file.getline(line, sizeof(line));
			if(line[0] == 0 || line[0] == '#') continue;
			istringstream cline(line);
			uint64_t tid, parent_tid;
			char dummy; string rank_string;
			cline >> tid >> dummy >> parent_tid >> dummy >> rank_string;
			if(tree.find(tid) != tree.end()) {
				cerr << "Warning: " << tid << " already has a parent!" << endl;
				continue;
			}

			tree[tid] = TaxonomyNode(parent_tid, get_tax_rank_id(rank_string.c_str()), false);
		}
		taxonomy_file.close();
	} else {
		cerr << "Error: " << taxonomy_fname << " doesn't exist!" << endl;
		throw 1;
	}
	return tree;
}


#endif /* TAXONOMY_H_ */