Codebase list mozc / debian/1.1.690.102-1 storage / existence_filter.cc
debian/1.1.690.102-1

Tree @debian/1.1.690.102-1 (Download .tar.gz)

existence_filter.cc @debian/1.1.690.102-1raw · history · blame

// Copyright 2010-2011, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <string.h>
#include <cmath>
#include "storage/existence_filter.h"

namespace mozc {

class ExistenceFilter::BlockBitmap {
 public:
  explicit BlockBitmap(uint32 length);
  BlockBitmap(uint32 length, bool is_mutable);
  virtual ~BlockBitmap();

  size_t size() const;
  void Clear();
  bool Get(uint32 index) const;
  void Set(uint32 index);

  // REQUIRES: "iter" is zero, or was set by a preceding call
  // to GetMutableFragment().
  //
  // This allows caller to peek into and write to the underlying bitmap
  // as a series of non-empty fragments in whole number of 4-byte words.
  // If the entire bitmap has been exhausted, return false.  Otherwise,
  // return true and point caller to the next non-empty fragment.
  //
  // Usage:
  //    char** ptr;
  //    size_t bytes;
  //    for (uint32 iter = 0; bm.GetMutableFragment(&iter, &ptr, &bytes); ) {
  //      Process(*ptr, bytes);
  //    }
  bool GetMutableFragment(uint32* itr, char*** ptr, size_t* size);

 private:
  static const int kBlockShift = 21;  // 2^21 bits == 256KB block
  static const int kBlockBits = 1 << kBlockShift;
  static const int kBlockMask = kBlockBits - 1;
  static const int kBlockBytes = kBlockBits >> 3;
  static const int kBlockWords = kBlockBits >> 5;

  // Array of blocks
  uint32 **block_;
  uint32 length_;
  uint32 num_blocks_;
  uint32 bytes_in_last_;
  bool is_mutable_;

  DISALLOW_COPY_AND_ASSIGN(BlockBitmap);
};

ExistenceFilter::BlockBitmap::BlockBitmap(uint32 length, bool is_mutable)
    : length_(length), is_mutable_(is_mutable) {
  CHECK_GT(length, 0);
  const uint32 bits_in_last_block = (length & kBlockMask);

  // Allocate the block array
  num_blocks_ = (length >> kBlockShift);
  if (bits_in_last_block) {
    // Need an extra block for the leftover bits
    num_blocks_++;
  }
  CHECK_GT(num_blocks_, 0);

  block_ = new uint32*[num_blocks_];
  CHECK(block_);

  // Allocate full blocks
  for (size_t i = 0; i < num_blocks_ - 1; ++i) {
    block_[i] = is_mutable_ ? new uint32[kBlockWords] : NULL;
  }

  // Allocate the last block
  if (bits_in_last_block) {
    bytes_in_last_ = sizeof(uint32) * ((bits_in_last_block + 31)/32);
  } else {
    bytes_in_last_ = kBlockBytes;
  }
  CHECK_EQ(bytes_in_last_ % sizeof(uint32), 0);

  block_[num_blocks_-1] =
      is_mutable_ ? new uint32[bytes_in_last_/sizeof(uint32)] : NULL;
}

ExistenceFilter::BlockBitmap::~BlockBitmap() {
  if (is_mutable_) {
    for (int i = 0; i < num_blocks_; ++i) {
      delete [] block_[i];
    }
  }
  delete [] block_;
}

void ExistenceFilter::BlockBitmap::Clear() {
  if (!is_mutable_) {
    return;
  }
  for (int i = 0; i < num_blocks_ - 1; ++i) {
    memset(block_[i], 0, kBlockBytes);
  }
  memset(block_[num_blocks_-1], 0, bytes_in_last_);
}

size_t ExistenceFilter::BlockBitmap::size() const {
  CHECK_GT(num_blocks_, 0);
  return static_cast<size_t>(num_blocks_-1) * kBlockBytes + bytes_in_last_;
}

ExistenceFilter::ExistenceFilter(uint32 m, uint32 n, int k)
    : vec_size_(m ? m : 1),
      is_power_of_two_((vec_size_ & (vec_size_ - 1)) == 0),
      expected_nelts_(n),
      num_hashes_(k) {
  CHECK_LT(num_hashes_, 8);
  rep_.reset(new BlockBitmap(m ? m : 1, true));
  rep_->Clear();
}

// this is private constructor
ExistenceFilter::ExistenceFilter(uint32 m, uint32 n, int k,
                                 bool is_mutable)
    : vec_size_(m ? m : 1),
      is_power_of_two_((vec_size_ & (vec_size_ - 1)) == 0),
      expected_nelts_(n),
      num_hashes_(k) {
  CHECK_LT(num_hashes_, 8);
  rep_.reset(new BlockBitmap(m ? m : 1, is_mutable));
  rep_->Clear();
}

// static
ExistenceFilter *
ExistenceFilter::CreateImmutableExietenceFilter(uint32 m,
                                                uint32 n,
                                                int k) {
  return new ExistenceFilter(m, n, k, false);
}

ExistenceFilter* ExistenceFilter::CreateOptimal(size_t size_in_bytes,
                                                uint32 estimated_insertions) {
  CHECK_LT(size_in_bytes, (1 << 29))
                             << "Requested size is too big";
  CHECK_GT(estimated_insertions, 0);
  const uint32 m = size_in_bytes * 8;
  const uint32 n = estimated_insertions;

  int optimal_k = static_cast<int>((static_cast<float>(m) / n * log(2.0))
                                   + 0.5);
  if (optimal_k < 1) {
    optimal_k = 1;
  }
  if (optimal_k > 7) {
    optimal_k = 7;
  }

  VLOG(1) << "optimal_k: " << optimal_k;

  ExistenceFilter *filter = new ExistenceFilter(m, n, optimal_k);
  CHECK(filter);
  return filter;
}

ExistenceFilter::~ExistenceFilter() {
}

void ExistenceFilter::Clear() {
  rep_->Clear();
}

inline bool ExistenceFilter::BlockBitmap::Get(uint32 index) const {
  const uint32 bindex = index >> kBlockShift;
  const uint32 windex = (index & kBlockMask) >> 5;
  const uint32 bitpos = index & 31;
  return (block_[bindex][windex] >> bitpos) & 1;
}

inline void ExistenceFilter::BlockBitmap::Set(uint32 index) {
  const uint32 bindex = index >> kBlockShift;
  const uint32 windex = (index & kBlockMask) >> 5;
  const uint32 bitpos = index & 31;
  block_[bindex][windex] |= (static_cast<uint32>(1) << bitpos);
}

bool ExistenceFilter::BlockBitmap::GetMutableFragment(uint32 *iter,
                                                      char ***ptr,
                                                      size_t *size) {
  const uint32 b = *iter;
  (*iter)++;
  if (b < num_blocks_) {
    *ptr = reinterpret_cast<char **>(&block_[b]);
    *size = (b == num_blocks_-1) ? bytes_in_last_ : kBlockBytes;
    return true;
  } else {
    LOG(WARNING) << "out of range iterator";
    return false;
  }
}

/* static */
uint64 ExistenceFilter::RotateLeft64(uint64 original, int num_bits) {
  int num_bot_bits = num_bits;
  uint64 mask_bits = (1 << num_bot_bits) - 1;
  uint64 old_bot_part = original & mask_bits;
  uint64 new_bot_part = original >> num_bot_bits;
  uint64 new_top_part = old_bot_part << (64 - num_bot_bits);
  uint64 rotated_original = new_bot_part | new_top_part;
  return rotated_original;
}

bool ExistenceFilter::Exists(uint64 hash) const {
  for (size_t i = 0; i < num_hashes_; ++i) {
    hash = RotateLeft64(hash, 8);
    uint32 index = hash % vec_size_;
    if (!rep_->Get(index)) {
      return false;
    }
  }
  return true;
}

void ExistenceFilter::Insert(uint64 hash) {
  for (size_t i = 0; i < num_hashes_; ++i) {
    hash = RotateLeft64(hash, 8);
    uint32 index = hash % vec_size_;
    rep_->Set(index);
  }
}

size_t ExistenceFilter::Size() const {
  return (BitsToWords(vec_size_) * sizeof(uint32));
}

size_t ExistenceFilter::MinFilterSizeInBytesForErrorRate(float error_rate,
                                                         size_t num_elements) {
  // (-num_hashes * num_elements) / log(1 - error_rate^(1/num_hashes))

  double min_bits = 0;
  for (size_t num_hashes = 1; num_hashes < 8; ++num_hashes) {
    double num_bits = (-1.0 * num_hashes * num_elements) /
        log(1.0 - pow(static_cast<double>(error_rate), (1.0 / num_hashes)));
    if (min_bits == 0 || num_bits < min_bits)
      min_bits = num_bits;
  }
  return static_cast<size_t>(ceil(min_bits / 8));
}

// allocate 'buf' and write filter to the buf.
// 'size' will hold the size of buf
void ExistenceFilter::Write(char **buf, size_t *size) {
  const int require_bytes = sizeof(Header) + Size();

  *buf = new char[require_bytes];
  CHECK(*buf);
  *size = require_bytes;

  char *buf_ptr = *buf;

  // write header
  memcpy(buf_ptr, &vec_size_, sizeof(vec_size_));
  buf_ptr += sizeof(vec_size_);
  memcpy(buf_ptr, &expected_nelts_, sizeof(expected_nelts_));
  buf_ptr += sizeof(expected_nelts_);
  memcpy(buf_ptr, &num_hashes_, sizeof(num_hashes_));
  buf_ptr += sizeof(num_hashes_);
  LOG(INFO) << "Write header : vec_size" << vec_size_ << " expected_nelts "
            << expected_nelts_ << " num_hashes " << num_hashes_;

  // write bitmap
  char **fragment_ptr = NULL;
  size_t bytes = 0;
  size_t write = 0;
  for (uint32 iter = 0;
       rep_->GetMutableFragment(&iter, &fragment_ptr, &bytes);) {
    memcpy(buf_ptr, *fragment_ptr, bytes);
    buf_ptr += bytes;
    write += bytes;
  }
  if (write != Size()) {
    LOG(ERROR) << "Write " << write << " bytes instead of " << Size();
  }
}

bool ExistenceFilter::ReadHeader(const char *buf, Header* header) {
  memcpy(&(header->m), buf, sizeof(header->m));
  buf += sizeof(header->m);
  memcpy(&(header->n), buf, sizeof(header->n));
  buf += sizeof(header->n);
  memcpy(&(header->k), buf, sizeof(header->k));
  buf += sizeof(header->k);
  if (header->k >= 8 || header->k <= 0) {
    LOG(ERROR) << "Bad number of hashes (header->k)";
    return false;
  }
  return true;
}

ExistenceFilter* ExistenceFilter::Read(const char *buf, size_t size) {
  Header header;
  const uint32 header_bytes =
      sizeof(header.m) + sizeof(header.n) + sizeof(header.k);
  if (size < header_bytes) {
    LOG(ERROR) << "Not enough bufsize: could not read header";
    return NULL;
  }

  if (!ReadHeader(buf, &header)) {
    LOG(ERROR) << "Invalid format: could not read header";
    return NULL;
  }
  buf += header_bytes;

  const uint32 filter_size = BitsToWords(header.m);
  const uint32 filter_bytes = filter_size * sizeof(uint32);
  VLOG(1) << "Reading bloom filter with size: " << filter_bytes << " bytes, "
          << "estimated insertions: " << header.n << " (k: " << header.k
          << ")";


  if (size < header_bytes + filter_bytes) {
    LOG(ERROR) << "Not enough bufsize: could not read filter";
    return NULL;
  }

  ExistenceFilter* filter =
      ExistenceFilter::CreateImmutableExietenceFilter(header.m,
                                                      header.n,
                                                      header.k);
  char **ptr = NULL;
  size_t n = 0;
  size_t read = 0;
  for (uint32 iter = 0; filter->rep_->GetMutableFragment(&iter, &ptr, &n);) {
    *ptr = const_cast<char *>(buf);   // TODO(taku): don't want to remove const
    buf += n;
    read += n;
  }
  if (read != filter_bytes) {
    LOG(ERROR) << "Read " << read << " bytes instead of " << filter_bytes;
    delete filter;
    return NULL;
  }
  return filter;
}
}  // namespace mozc