hyperloglogplus.h - centrifuge (debian/1.0.3-1)

Tree @debian/1.0.3-1 (Download .tar.gz)

hyperloglogplus.h @debian/1.0.3-1 — raw · history · blame

/*
 * hyperloglogplus.h
 *
 * Implementation of HyperLogLog++ algorithm described by Stefan Heule et al.
 *
 *  Created on: Apr 25, 2015
 *      Author: fbreitwieser
 */

#ifndef HYPERLOGLOGPLUS_H_
#define HYPERLOGLOGPLUS_H_

#include<set>
#include<vector>
#include<stdexcept>
#include<iostream>
#include<fstream>
#include<math.h>    //log
#include<algorithm> //vector.count
#include<bitset>

#include "hyperloglogbias.h"
#include "third_party/MurmurHash3.cpp"
#include "assert_helpers.h"

using namespace std;

//#define HLL_DEBUG
//#define NDEBUG
//#define NDEBUG2
#define arr_len(a) (a + sizeof a / sizeof a[0])

// experimentally determined threshold values for  p - 4
static const uint32_t threshold[] = {10, 20, 40, 80, 220, 400, 900, 1800, 3100,
							  6500, 11500, 20000, 50000, 120000, 350000};


///////////////////////

//
/**
 * gives the estimated cardinality for m bins, v of which are non-zero
 * @param m number of bins in the matrix
 * @param v number of non-zero bins
 * @return
 */
double linearCounting(uint32_t m, uint32_t v) {
	if (v > m) {
	    throw std::invalid_argument("number of v should not be greater than m");
	}
	double fm = double(m);
	return fm * log(fm/double(v));
}

/**
  * from Numerical Recipes, 3rd Edition, p 352
  * Returns hash of u as a 64-bit integer.
  *
*/
inline uint64_t ranhash (uint64_t u) {
  uint64_t v = u * 3935559000370003845 + 2691343689449507681;

  v ^= v >> 21; v ^= v << 37; v ^= v >>  4;

  v *= 4768777513237032717;

  v ^= v << 20; v ^= v >> 41; v ^= v <<  5;

  return v;
}

inline uint64_t murmurhash3_finalizer (uint64_t key)  {
	key += 1; // murmurhash returns a hash value of 0 for the key 0 - avoid that.
	key ^= key >> 33;
	key *= 0xff51afd7ed558ccd;
	key ^= key >> 33;
	key *= 0xc4ceb9fe1a85ec53;
	key ^= key >> 33;
	return key;
}

/**
 * Bias correction factors for specific m's
 * @param m
 * @return
 */
double alpha(uint32_t m)  {
	switch (m) {
	case 16: return 0.673;
	case 32: return 0.697;
	case 64: return 0.709;
	}

	// m >= 128
	return 0.7213 / (1 + 1.079/double(m));
}

/**
 * calculate the raw estimate as harmonic mean of the ranks in the register
 * @param array
 * @return
 */
double calculateEstimate(vector<uint8_t> array) {
	double inverseSum = 0.0;
	for (size_t i = 0; i < array.size(); ++i) {
		// TODO: pre-calculate the power calculation
		inverseSum += pow(2,-array[i]);
	}
	return alpha(array.size()) * double(array.size() * array.size()) * 1 / inverseSum;
}

uint32_t countZeros(vector<uint8_t> s) {
	return (uint32_t)count(s.begin(), s.end(), 0);
}

/**
 * Extract bits (from uint32_t or uint64_t) using LSB 0 numbering from hi to lo, including lo
 * @param bits
 * @param hi
 * @param lo
 * @return
 */
template<typename T>
T extractBits(T value, uint8_t hi, uint8_t lo, bool shift_left = false) {

    // create a bitmask:
    //            (T(1) << (hi - lo)                 a 1 at the position (hi - lo)
    //           ((T(1) << (hi - lo) - 1)              1's from position 0 to position (hi-lo-1)
    //          (((T(1) << (hi - lo)) - 1) << lo)      1's from position lo to position hi

	// The T(1) is required to not cause overflow on 32bit machines
	// TODO: consider creating a bitmask only once in the beginning
	T bitmask = (((T(1) << (hi - lo)) - 1) << lo);
    T result = value & bitmask;

    if (!shift_left) {
        // shift resulting bits to the right
        result = result >> lo;
    } else {
        // shift resulting bits to the left
        result = result << (sizeof(T)*8 - hi);
    }
    return result;	
}

template<typename T>
T extractBits(T bits, uint8_t hi) {
    // create a bitmask for first hi bits (LSB 0 numbering)
	T bitmask = T(-1) << (sizeof(T)*8 - hi);

	return (bits & bitmask);
}

// functions for counting the number of leading 0-bits (clz)
//           and counting the number of trailing 0-bits (ctz)
//#ifdef __GNUC__

// TODO: switch between builtin clz and 64_clz based on architecture
//#define clz(x) __builtin_clz(x)
#if 0
static int clz_manual(uint64_t x)
{
  // This uses a binary search (counting down) algorithm from Hacker's Delight.
   uint64_t y;
   int n = 64;
   y = x >>32;  if (y != 0) {n -= 32;  x = y;}
   y = x >>16;  if (y != 0) {n -= 16;  x = y;}
   y = x >> 8;  if (y != 0) {n -=  8;  x = y;}
   y = x >> 4;  if (y != 0) {n -=  4;  x = y;}
   y = x >> 2;  if (y != 0) {n -=  2;  x = y;}
   y = x >> 1;  if (y != 0) return n - 2;
   return n - x;
}
#endif

inline uint32_t clz(const uint32_t x) {
	return __builtin_clz(x);
}

inline uint32_t clz(const uint64_t x) {
    uint32_t u32 = (x >> 32);
    uint32_t result = u32 ? __builtin_clz(u32) : 32;
    if (result == 32) {
        u32 = x & 0xFFFFFFFFUL;
        result += (u32 ? __builtin_clz(u32) : 32);
    }
    return result;
}
//#else

uint32_t clz_log2(const uint64_t w) {
	return 63 - floor(log2(w));
}
//#endif


// TODO: the sparse list may be encoded with variable length encoding
//   see Heule et al., section 5.3.2
// Also, using sets might give a larger overhead as each insertion costs more
//  consider using vector and sort/unique when merging.
typedef set<uint32_t> SparseListType;
typedef uint64_t HashSize;

/**
 * HyperLogLogPlusMinus class
 * typename T corresponds to the hash size - usually either uint32_t or uint64_t (implemented for uint64_t)
 */

typedef uint64_t T_KEY;
template <typename T_KEY>
class HyperLogLogPlusMinus {

private:

	vector<uint8_t> M;  // registers (M) of size m
	uint8_t p;            // precision
	uint32_t m;           // number of registers
	bool sparse;          // sparse representation of the data?
	SparseListType sparseList; // TODO: use a compressed list instead

	// vectors containing data for bias correction
	vector<vector<double> > rawEstimateData; // TODO: make this static
	vector<vector<double> > biasData;

	// sparse versions of p and m
	static const uint8_t  pPrime = 25; // precision when using a sparse representation
	                                   // fixed to 25, because 25 + 6 bits for rank + 1 flag bit = 32
	static const uint32_t mPrime = 1 << (pPrime -1); // 2^pPrime


public:

	~HyperLogLogPlusMinus() {};

	/**
	 * Create new HyperLogLogPlusMinus counter
	 * @param precision
	 * @param sparse
	 */
	HyperLogLogPlusMinus(uint8_t precision=10, bool sparse=true):p(precision),sparse(sparse) {
		if (precision > 18 || precision < 4) {
	        throw std::invalid_argument("precision (number of register = 2^precision) must be between 4 and 18");
		}

		this->m = 1 << precision;

		if (sparse) {
			this->sparseList = SparseListType(); // TODO: if SparseListType is changed, initialize with appropriate size
		} else {
			this->M = vector<uint8_t>(m);
		}
	}

	/**
	 * Add a new item to the counter.
	 * @param item
	 */
	void add(T_KEY item) {
		add(item, sizeof(T_KEY));
	}

	/**
	 * Add a new item to the counter.
	 * @param item
	 * @param size  size of item
	 */
	void add(T_KEY item, size_t size) {

		// compute hash for item
		HashSize hash_value = murmurhash3_finalizer(item);

#ifdef HLL_DEBUG
		cerr << "Value: " << item << "; hash(value): " << hash_value << endl;
		cerr << bitset<64>(hash_value) << endl;
#endif

		if (sparse) {
			// sparse mode: put the encoded hash into sparse list
			uint32_t encoded_hash_value = encodeHashIn32Bit(hash_value);
			this->sparseList.insert(encoded_hash_value);

#ifdef HLL_DEBUG
			idx_n_rank ir = getIndexAndRankFromEncodedHash(encoded_hash_value);
			assert_eq(ir.idx,get_index(hash_value, p));
			assert_eq(ir.rank, get_rank(hash_value, p));
#endif

			// if the sparseList is too large, switch to normal (register) representation
			if (this->sparseList.size() > this->m) { // TODO: is the size of m correct?
				switchToNormalRepresentation();
			}
		} else {
			// normal mode
			// take first p bits as index  {x63,...,x64-p}
			uint32_t idx = get_index(hash_value, p);
			// shift those p values off, and count leading zeros of the remaining string {x63-p,...,x0}
			uint8_t rank = get_rank(hash_value, p);

			// update the register if current rank is bigger
			if (rank > this->M[idx]) {
				this->M[idx] = rank;
			}
		}
	}

	void add(vector<T_KEY> words) {
		for(size_t i = 0; i < words.size(); ++i) {
			this->add(words[i]);
		}
	}

	/**
	 * Reset to its initial state.
	 */
	void reset() {
		this->sparse = true;
		this->sparseList.clear();  // 
		this->M.clear();
	}

	/**
	 * Convert from sparse representation (using tmpSet and sparseList) to normal (using register)
	 */
	void switchToNormalRepresentation() {
#ifdef HLL_DEBUG
		cerr << "switching to normal representation" << endl;
		cerr << " est before: " << cardinality(true) << endl;
#endif
		this->sparse = false;
		this->M = vector<uint8_t>(this->m);
		if (sparseList.size() > 0) { //TDOD: do I need to check this, here?
			addToRegisters(this->sparseList);
			this->sparseList.clear();
		}
#ifdef HLL_DEBUG
		cerr << " est after: " << cardinality(true) << endl;
#endif
	}

	/**
	 * add sparseList to the registers of M
	 */
	void addToRegisters(const SparseListType &sparseList) {
		if (sparseList.size() == 0) {
			return;
		}
		for (SparseListType::const_iterator encoded_hash_value_ptr = sparseList.begin(); encoded_hash_value_ptr != sparseList.end(); ++encoded_hash_value_ptr) {

			idx_n_rank ir = getIndexAndRankFromEncodedHash(*encoded_hash_value_ptr);

			assert_lt(ir.idx,M.size());
			if (ir.rank > this->M[ir.idx]) {
				this->M[ir.idx] = ir.rank;
			}
		}
	}

	/**
	 * Merge another HyperLogLogPlusMinus into this. Converts to normal representation
	 * @param other
	 */
	void merge(const HyperLogLogPlusMinus* other) {
		if (this->p != other->p) {
			throw std::invalid_argument("precisions must be equal");
		}

		if (this->sparse && other->sparse) {
			if (this->sparseList.size()+other->sparseList.size() > this->m) {
				switchToNormalRepresentation();
				addToRegisters(other->sparseList);
			} else {
				this->sparseList.insert(other->sparseList.begin(),other->sparseList.end());
			}
		} else if (other->sparse) {
			// other is sparse, but this is not
			addToRegisters(other->sparseList);
		} else {
			if (this->sparse) {
				switchToNormalRepresentation();
			}

			// merge registers
			for (size_t i = 0; i < other->M.size(); ++i) {
				if (other->M[i] > this->M[i]) {
					this->M[i] = other->M[i];
				}
			}
		}
	}

	/**
	 *
	 * @return cardinality estimate
	 */
	uint64_t cardinality(bool verbose=true) {
		if (sparse) {
			// if we are still 'sparse', then use linear counting, which is more
			//  accurate for low cardinalities, and use increased precision pPrime
			return uint64_t(linearCounting(mPrime, mPrime-uint32_t(sparseList.size())));
		}

		// initialize bias correction data
		if (rawEstimateData.empty()) { initRawEstimateData(); }
		if (biasData.empty())        { initBiasData(); }

		// calculate raw estimate on registers
		//double est = alpha(m) * harmonicMean(M, m);
		double est = calculateEstimate(M);

		// correct for biases if estimate is smaller than 5m
		if (est <= double(m)*5.0) {
			est -= getEstimateBias(est);
		}

		uint32_t v = countZeros(M);
		if (v > 2) {
			// calculate linear counting (lc) estimate if there are more than 2 zeros in the matrix
			double lc_estimate = linearCounting(m, v);

			// check if the lc estimate is below the threshold
			if (lc_estimate <= double(threshold[p-4])) {
				if (lc_estimate < 0) { throw; }
				// return lc estimate of cardinality
				return lc_estimate;
			}
			return lc_estimate; // always use lc_estimate when available
		}

		// return bias-corrected hyperloglog estimate of cardinality
		return uint64_t(est);
	}

private:

    uint8_t rank(HashSize x, uint8_t b) {
        uint8_t v = 1;
        while (v <= b && !(x & 0x80000000)) {
            v++;
            x <<= 1;
        }
        return v;
    }

    template<typename T> inline uint32_t get_index(const T hash_value, const uint8_t p, const uint8_t size) const {
    	// take first p bits as index  {x63,...,x64-p}
    	assert_lt(p,size);
    	uint32_t idx = hash_value >> (size - p);
    	return idx;
    }

    inline uint32_t get_index(const uint64_t hash_value, const uint8_t p) const {
        return get_index(hash_value, p, 64);
    }

    inline uint32_t get_index(const uint32_t hash_value, const uint8_t p) const {
    	return get_index(hash_value, p, 32);
    }

    template<typename T> inline
	T get_trailing_ones(const uint8_t p) const {
    	return (T(1) << p ) - 1;
    }

    template<typename T> inline
    uint8_t get_rank(const T hash_value, const uint8_t p) const {
    	// shift p values off, and count leading zeros of the remaining string {x63-p,...,x0}
    	T_KEY rank_bits = (hash_value << p | get_trailing_ones<T>(p));
#ifdef HLL_DEBUG
    	cerr << "rank bits: " << bitset<32>(rank_bits) << endl;
#endif

    	uint8_t rank_val = (uint8_t) (clz(rank_bits)) + 1;
    	assert_leq(rank_val,64-p+1);
    	return rank_val;
    }

	void initRawEstimateData() {
	    rawEstimateData = vector<vector<double> >();

	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision4,arr_len(rawEstimateData_precision4)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision5,arr_len(rawEstimateData_precision5)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision6,arr_len(rawEstimateData_precision6)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision7,arr_len(rawEstimateData_precision7)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision8,arr_len(rawEstimateData_precision8)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision9,arr_len(rawEstimateData_precision9)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision10,arr_len(rawEstimateData_precision10)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision11,arr_len(rawEstimateData_precision11)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision12,arr_len(rawEstimateData_precision12)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision13,arr_len(rawEstimateData_precision13)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision14,arr_len(rawEstimateData_precision14)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision15,arr_len(rawEstimateData_precision15)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision16,arr_len(rawEstimateData_precision16)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision17,arr_len(rawEstimateData_precision17)));
	    rawEstimateData.push_back(vector<double>(rawEstimateData_precision18,arr_len(rawEstimateData_precision18)));

	}

	void initBiasData() {
		biasData = vector<vector<double> >();

		biasData.push_back(vector<double>(biasData_precision4,arr_len(biasData_precision4)));
		biasData.push_back(vector<double>(biasData_precision5,arr_len(biasData_precision5)));
		biasData.push_back(vector<double>(biasData_precision6,arr_len(biasData_precision6)));
		biasData.push_back(vector<double>(biasData_precision7,arr_len(biasData_precision7)));
		biasData.push_back(vector<double>(biasData_precision8,arr_len(biasData_precision8)));
		biasData.push_back(vector<double>(biasData_precision9,arr_len(biasData_precision9)));
		biasData.push_back(vector<double>(biasData_precision10,arr_len(biasData_precision10)));
		biasData.push_back(vector<double>(biasData_precision11,arr_len(biasData_precision11)));
		biasData.push_back(vector<double>(biasData_precision12,arr_len(biasData_precision12)));
		biasData.push_back(vector<double>(biasData_precision13,arr_len(biasData_precision13)));
		biasData.push_back(vector<double>(biasData_precision14,arr_len(biasData_precision14)));
		biasData.push_back(vector<double>(biasData_precision15,arr_len(biasData_precision15)));
		biasData.push_back(vector<double>(biasData_precision16,arr_len(biasData_precision16)));
		biasData.push_back(vector<double>(biasData_precision17,arr_len(biasData_precision17)));
		biasData.push_back(vector<double>(biasData_precision18,arr_len(biasData_precision18)));
	}

	/**
	 * Estimate the bias using empirically determined values.
	 * Uses weighted average of the two cells between which the estimate falls.
	 * TODO: Check if nearest neighbor average gives better values, as proposed in the paper
	 * @param est
	 * @return correction value for
	 */
	double getEstimateBias(double estimate) {
		vector<double> rawEstimateTable = rawEstimateData[p-4];
		vector<double> biasTable = biasData[p-4];
	
		// check if estimate is lower than first entry, or larger than last
		if (rawEstimateTable.front() >= estimate) { return rawEstimateTable.front() - biasTable.front(); }
		if (rawEstimateTable.back()  <= estimate) { return rawEstimateTable.back() - biasTable.back(); }
	
		// get iterator to first element that is not smaller than estimate
		vector<double>::const_iterator it = lower_bound(rawEstimateTable.begin(),rawEstimateTable.end(),estimate);
		size_t pos = it - rawEstimateTable.begin();

		double e1 = rawEstimateTable[pos-1];
		double e2 = rawEstimateTable[pos];
	
		double c = (estimate - e1) / (e2 - e1);

		return biasTable[pos-1]*(1-c) + biasTable[pos]*c;
	}
	

	/**
	 * Encode the 64-bit hash code x as an 32-bit integer, to be used in the sparse representation.
	 *
	 * Difference from the algorithm described in the paper:
	 * The index always is in the p most significant bits
	 *
	 * see section 5.3 in Heule et al.
	 * @param x the hash bits
	 * @return encoded hash value
	 */
	uint32_t encodeHashIn32Bit(uint64_t hash_value) {
		// extract first pPrime bits, and shift them onto a 32-bit integer
		uint32_t idx = (uint32_t)(extractBits(hash_value,pPrime) >> 32);

#ifdef HLL_DEBUG
		cerr << "value:  " << bitset<64>(hash_value) << endl;
        cerr << "index: " << std::bitset<32>(idx) << " ( bits from 64 to " << 64-pPrime << "; " << idx << ")" << endl;
#endif

		// are the bits {63-p, ..., 63-p'} all 0?
		if (extractBits(hash_value, 64-this->p, 64-pPrime) == 0) {
			// compute the additional rank (minimum rank is already p'-p)
			// the maximal size will be below 2^6=64. We thus combine the 25 bits of the index with 6 bits for the rank, and one bit as flag
			uint8_t additional_rank = get_rank(hash_value, pPrime); // this is rank - (p'-p), as we know that positions p'...p are 0
			return idx | uint32_t(additional_rank<<1) | 1;
		} else {
			// else, return the idx, only - it has enough length to calculate the rank (left-shifted, last bit = 0)
			assert_eq((idx & 1),0);
			return idx;
		}
	}


	/**
	 * struct holding the index and rank/rho of an entry
	 */
	struct idx_n_rank {
		uint32_t idx;
		uint8_t rank;
		idx_n_rank(uint32_t _idx, uint8_t _rank) : idx(_idx), rank(_rank) {}
	};

	//
	//
	/**
	 * Decode a hash from the sparse representation.
	 * Returns the index and number of leading zeros (nlz) with precision p stored in k
	 * @param k the hash bits
	 * @return index and rank in non-sparse format
	 */
	idx_n_rank getIndexAndRankFromEncodedHash(const uint32_t encoded_hash_value) const  {

		// difference to paper: Index can be recovered in the same way for pPrime and normally encoded hashes
		uint32_t idx = get_index(encoded_hash_value, p);
		uint8_t rank_val;

		// check if the last bit is 1
		if ( (encoded_hash_value & 1) == 1) {
			// if yes: the hash was stored with higher precision, bits p to pPrime were 0
			uint8_t additional_rank = pPrime - p;
			rank_val = additional_rank + extractBits(encoded_hash_value, 7, 1);
		} else {
			rank_val = get_rank(encoded_hash_value,p);

			// clz counts 64 bit only, it seems
			if (rank_val > 32)
				rank_val -= 32;
		}

		return(idx_n_rank(idx,rank_val));
	}

};




#endif /* HYPERLOGLOGPLUS_H_ */