Codebase list kmc / 60095252-c551-4abb-89a7-ca6cd934eeb0/upstream kmc_tools / kff_info_reader.cpp
60095252-c551-4abb-89a7-ca6cd934eeb0/upstream

Tree @60095252-c551-4abb-89a7-ca6cd934eeb0/upstream (Download .tar.gz)

kff_info_reader.cpp @60095252-c551-4abb-89a7-ca6cd934eeb0/upstreamraw · history · blame

#include "kff_info_reader.h"
#include "defs.h"
#include <stdexcept>
#include <vector>
#include <array>
//#include <iostream>
#include <algorithm>
#include <cstring>

// Reads a zero-terminated variable name starting at the current position of `file`.
// The terminating zero byte is consumed but is not part of the returned string.
// Throws std::runtime_error("unexpected EOF") when the file ends before the terminator.
std::string CKFFInfoReader::ReadVarName()
{
	std::string name;
	for (int ch = fgetc(file); ; ch = fgetc(file))
	{
		if (ch == EOF)
			throw std::runtime_error("unexpected EOF");

		const char as_char = ch;
		if (as_char == 0)
			return name; // terminator reached, name is complete

		name.push_back(as_char);
	}
}

// Opens the KFF file `path`, validates its start/end markers and header, reads the
// optional footer, collects every index section reachable from the first one, checks
// that each index entry points at a section of the announced type and finally visits
// every indexed section to fill `kff_file_struct`.
//
// Throws std::runtime_error when the file cannot be opened, is truncated, or is
// structurally inconsistent. The file handle is closed before such an exception leaves
// the constructor, because the destructor is not run for an object whose constructor
// threw.
CKFFInfoReader::CKFFInfoReader(const std::string& path)
{
	file = my_fopen(path.c_str(), "rb");
	if (!file)
		throw std::runtime_error("Error: cannot open file " + path);

	// Closes the file if the constructor is left by an exception; disarmed at the very end.
	struct CloseOnFailure
	{
		FILE*& handle;
		bool armed;
		~CloseOnFailure()
		{
			if (armed && handle)
			{
				fclose(handle);
				handle = nullptr;
			}
		}
	} close_on_failure{ file, true };

	// Reads exactly `count` bytes; a short read means the file is truncated, and parsing
	// the (then uninitialised) buffer would yield garbage.
	auto read_exact = [this, &path](void* dst, size_t count)
	{
		if (fread(dst, 1, count, file) != count)
			throw std::runtime_error("Error: unexpected end of file while reading " + path);
	};

	// Check markers
	char marker[3];
	if (fread(marker, 1, sizeof(marker), file) != sizeof(marker) || std::memcmp(marker, "KFF", sizeof(marker)) != 0)
		throw std::runtime_error("Error: missing KFF marker at the begining of file " + path);

	my_fseek(file, -3, SEEK_END);
	if (fread(marker, 1, sizeof(marker), file) != sizeof(marker) || std::memcmp(marker, "KFF", sizeof(marker)) != 0)
		throw std::runtime_error("Error: missing KFF marker at the end of file " + path);

	// Probe for the "footer_size" variable: 12 bytes of name (including the terminating
	// zero) + 8 bytes of value + 3 bytes of end marker = 23 bytes from the end of file.
	// memcmp over the whole buffer is used because the bytes read from the file are not
	// guaranteed to contain a terminating zero. A short read simply means "no footer".
	my_fseek(file, -23, SEEK_END);
	char footer_size_str[12];
	static const char footer_size_name[12] = "footer_size";
	const bool footer_present =
		fread(footer_size_str, 1, sizeof(footer_size_str), file) == sizeof(footer_size_str) &&
		std::memcmp(footer_size_str, footer_size_name, sizeof(footer_size_str)) == 0;

	// max() is used as the "first index position not known yet" sentinel
	uint64_t first_index = std::numeric_limits<uint64_t>::max();
	std::array<uint8_t, 8> tmp;
	uint64_t nb_vars;
	char t;
	if (footer_present)
	{
		uint64_t footer_size;
		read_exact(tmp.data(), 8);
		LoadBigEndian(tmp.data(), footer_size);

		// the footer is located directly before the 3-byte end marker
		my_fseek(file, -(static_cast<int64_t>(footer_size) + 3), SEEK_END);

		read_exact(tmp.data(), 1);
		LoadBigEndian(tmp.data(), t);

		if (t != 'v')
			throw std::runtime_error("Error: footer should start as 'v' section, file " + path);

		read_exact(tmp.data(), 8);
		LoadBigEndian(tmp.data(), nb_vars);

		for (uint64_t i = 0; i < nb_vars; ++i)
		{
			auto name = ReadVarName();
			uint64_t val;
			read_exact(tmp.data(), 8);
			LoadBigEndian(tmp.data(), val);

			kff_file_struct.footer[name] = val;

			if (name == "first_index")
				first_index = val;
		}
	}

	// Header, located right after the 3-byte start marker
	my_fseek(file, 3, SEEK_SET);
	uint8_t ver_minor, ver_major; // read to advance through the header; not used further here
	read_exact(tmp.data(), 1);
	LoadBigEndian(tmp.data(), ver_major);

	read_exact(tmp.data(), 1);
	LoadBigEndian(tmp.data(), ver_minor);

	read_exact(tmp.data(), 1);
	LoadBigEndian(tmp.data(), kff_file_struct.encoding);

	read_exact(tmp.data(), 1);
	LoadBigEndian(tmp.data(), kff_file_struct.all_unique);
	if (kff_file_struct.all_unique == 0)
		throw std::runtime_error("Error: only unique k-mers in KFF file are supported, file " + path);

	uint8_t canonical;
	read_exact(tmp.data(), 1);
	LoadBigEndian(tmp.data(), canonical);

	kff_file_struct.both_strands = canonical;

	uint32_t free_size;
	read_exact(tmp.data(), sizeof(free_size));
	LoadBigEndian(tmp.data(), free_size);

	my_fseek(file, free_size, SEEK_CUR); //skip free block

	// Type byte of the first section following the header
	read_exact(tmp.data(), 1);
	LoadBigEndian(tmp.data(), t);
	if (t == 'i')
	{
		// The type byte has already been consumed, so the section itself starts one byte
		// earlier. The index loop below seeks to `first_index` and expects to read 'i' there.
		uint64_t p = my_ftell(file) - 1;
		if (first_index != std::numeric_limits<uint64_t>::max() && first_index != p) //first index was defined in footer and is different than the real first index
			throw std::runtime_error("Error: footer defines 'first_index' but there is also an index as first section and the positions are inconsistent, file " + path);
		first_index = p;
	}

	if (first_index == std::numeric_limits<uint64_t>::max())
		throw std::runtime_error("Error: no first_index in the footer and first section is not an index, file: " + path);

	// Follow the chain of index sections; a stored next position of 0 ends the chain
	while (first_index)
	{
		my_fseek(file, first_index, SEEK_SET);

		read_exact(tmp.data(), 1);
		LoadBigEndian(tmp.data(), t);
		if (t != 'i')
			throw std::runtime_error("Error: missing index");

		read_exact(tmp.data(), 8);
		LoadBigEndian(tmp.data(), nb_vars);

		// Position just past this index section: nb_vars entries of (1 type byte + 8 position
		// bytes) followed by the 8-byte position of the next index. Entry positions are
		// stored relative to this point.
		int64_t this_index_end = my_ftell(file) + nb_vars * (sizeof(uint64_t) + 1) + sizeof(uint64_t);
		for (uint64_t i = 0; i < nb_vars; ++i)
		{
			read_exact(tmp.data(), 1);
			LoadBigEndian(tmp.data(), t);
			int64_t rel_pos;

			read_exact(tmp.data(), 8);
			LoadBigEndian(tmp.data(), rel_pos);

			index.emplace_back(t, this_index_end + rel_pos);
		}

		read_exact(tmp.data(), 8);
		LoadBigEndian(tmp.data(), first_index);
	}

	std::sort(index.begin(), index.end(), [](const auto& lhs, const auto& rhs) {return lhs.section_pos < rhs.section_pos; });

	// First pass: verify every index entry before any section is interpreted
	for (const auto& e : index)
	{
		my_fseek(file, e.section_pos, SEEK_SET);
		read_exact(tmp.data(), 1);
		LoadBigEndian(tmp.data(), t);
		if (t != e.section_type)
			throw std::runtime_error("Error: KFF index is inconsistent with file content");
	}

	// Second pass: visit the sections in file order (index was sorted by position above)
	for (const auto& e : index)
	{
		my_fseek(file, e.section_pos, SEEK_SET);
		read_exact(tmp.data(), 1);
		LoadBigEndian(tmp.data(), t);
		if (t == 'i')
			continue; //skip index sections
		else if (t == 'v') //variable section
			ReadVariableSection();
		else if (t == 'r') //raw section
			ReadRawSection();
		else if (t == 'm') //minimizer section
			ReadMinimizerSection();
		else
			throw std::runtime_error(std::string("Error: unsupported section type (") + t + "), file " + path);
	}

	if (kff_file_struct.scopes.size())
	{
		if (kff_file_struct.scopes.back().data_sections.empty()) // remove prev variable section if empty
			kff_file_struct.scopes.pop_back();
	}

	// Construction succeeded: from now on the destructor owns the file handle
	close_on_failure.armed = false;
}

void CKFFInfoReader::ReadVariableSection()
{
	uint64_t nb_vars;	
	std::array<uint8_t, 8> tmp;
	fread(tmp.data(), 1, sizeof(uint64_t), file);
	LoadBigEndian(tmp.data(), nb_vars);
	CKFFVariables section;
	for (uint64_t i = 0; i < nb_vars; ++i)
	{
		auto var_name = ReadVarName();
		uint64_t val;
		fread(tmp.data(), 1, sizeof(uint64_t), file);
		LoadBigEndian(tmp.data(), val);
		if (var_name == "k")
			section.kmer_size = val;
		else if (var_name == "max")
			section.max_in_block = val;
		else if (var_name == "data_size")
			section.data_size = val;
		else if (var_name == "m")
			section.minimizer_size = val;
		else if (var_name == "ordered")
			section.ordered = val;
	}
	if (kff_file_struct.scopes.size())
	{
		if (kff_file_struct.scopes.back().data_sections.empty()) // remove prev variable section if empty
			kff_file_struct.scopes.pop_back();
	}
	kff_file_struct.scopes.push_back(section);
}

// Registers a raw ('r') data section in the most recently opened scope. Only the block
// count is read here; the position right after it is stored as `data_start_pos`.
// Throws std::runtime_error when no variable section precedes this one, when one of
// the variables `k`, `max` or `data_size` was not defined in the current scope, or
// when the file ends before the block count could be read.
void CKFFInfoReader::ReadRawSection()
{
	if (kff_file_struct.scopes.empty())
		throw std::runtime_error("Error: raw section declared without variable section");
	auto& scope = kff_file_struct.scopes.back();

	// max() marks a variable that has not been set in this scope
	const uint64_t undefined = std::numeric_limits<uint64_t>::max();

	if (scope.kmer_size == undefined)
		throw std::runtime_error("Error: `k` variable was not defined for raw section");

	if (scope.max_in_block == undefined)
		throw std::runtime_error("Error: `max` variable was not defined for raw section");

	if (scope.data_size == undefined)
		throw std::runtime_error("Error: `data_size` variable was not defined for raw section");

	std::array<uint8_t, 8> tmp;

	// A short read would leave `tmp` uninitialised and produce a garbage block count
	if (fread(tmp.data(), 1, tmp.size(), file) != tmp.size())
		throw std::runtime_error("Error: unexpected end of file while reading raw section");

	uint64_t nb_blocks;
	LoadBigEndian(tmp.data(), nb_blocks);

	CKFFDataSection data_section;
	data_section.type = KFFDataSectionType::RAW;
	data_section.nb_blocks = nb_blocks;
	data_section.data_start_pos = my_ftell(file);
	scope.data_sections.push_back(data_section);
}

// Registers a minimizer ('m') data section in the most recently opened scope. The
// packed minimizer and the block count are read; the position right after them is
// stored as `data_start_pos`.
// Throws std::runtime_error when no variable section precedes this one, when one of
// the variables `k`, `max`, `data_size` or `m` was not defined in the current scope,
// or when the file ends inside the section header.
void CKFFInfoReader::ReadMinimizerSection()
{
	if (kff_file_struct.scopes.empty())
		throw std::runtime_error("Error: minimizer section declared without variable section");
	auto& scope = kff_file_struct.scopes.back();

	// max() marks a variable that has not been set in this scope
	const uint64_t undefined = std::numeric_limits<uint64_t>::max();

	if (scope.kmer_size == undefined)
		throw std::runtime_error("Error: `k` variable was not defined for minimizer section");

	if (scope.max_in_block == undefined)
		throw std::runtime_error("Error: `max` variable was not defined for minimizer section");

	if (scope.data_size == undefined)
		throw std::runtime_error("Error: `data_size` variable was not defined for minimizer section");

	if (scope.minimizer_size == undefined)
		throw std::runtime_error("Error: `m` variable was not defined for minimizer section");

	// minimizer is stored packed, four symbols per byte, rounded up to whole bytes
	const uint64_t minimizer_bytes = (scope.minimizer_size + 3) / 4;

	CKFFDataSection data_section;
	data_section.minimizer.resize(minimizer_bytes);
	if (fread(data_section.minimizer.data(), 1, minimizer_bytes, file) != minimizer_bytes)
		throw std::runtime_error("Error: unexpected end of file while reading minimizer section");

	std::array<uint8_t, 8> tmp;
	// A short read would leave `tmp` uninitialised and produce a garbage block count
	if (fread(tmp.data(), 1, tmp.size(), file) != tmp.size())
		throw std::runtime_error("Error: unexpected end of file while reading minimizer section");

	uint64_t nb_blocks;
	LoadBigEndian(tmp.data(), nb_blocks);

	// NOTE(review): this used to be tagged RAW, which makes minimizer sections
	// indistinguishable from raw ones; assumes KFFDataSectionType declares MINIMIZER - confirm.
	data_section.type = KFFDataSectionType::MINIMIZER;
	data_section.nb_blocks = nb_blocks;
	data_section.data_start_pos = my_ftell(file);
	scope.data_sections.push_back(data_section);
}

// Closes the underlying file. Calling fclose on a null pointer is undefined
// behaviour, so the handle is checked first and cleared afterwards.
CKFFInfoReader::~CKFFInfoReader()
{
	if (file)
	{
		fclose(file);
		file = nullptr;
	}
}