src/data_manager/serialized_dictionary.h - mozc (debian/2.23.2815.102+dfsg-7)

Tree @debian/2.23.2815.102+dfsg-7 (Download .tar.gz)

serialized_dictionary.h @debian/2.23.2815.102+dfsg-7 — raw · history · blame

// Copyright 2010-2018, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef MOZC_DATA_MANAGER_SERIALIZED_DICTIONARY_H_
#define MOZC_DATA_MANAGER_SERIALIZED_DICTIONARY_H_

#include <istream>
#include <iterator>
#include <map>
#include <string>
#include <utility>

#include "base/port.h"
#include "base/serialized_string_array.h"
#include "base/string_piece.h"

namespace mozc {

// This class serializes Mozc's dictionary data and load it at runtime without
// cost of deserialization.  Mozc's dictionary is analogous to
// std::multimap<Key, Value>, where Key is a string (e.g., reading for symbols,
// etc.), and Value is its associated data:
//
// Value {
//   string value;  // Surface form
//   string description;
//   string additional_description;
//   uint16 lid;
//   uint16 rid;
//   int16 cost;
// }
//
// * Prerequisite
// Little endian is assumed.
//
// * Creating serialized data
// The binary data consists of two data, token array and string array.  Use
// SerializedDictionary::Compile() to create the images.
//
// * Map access
// At runtime, we can access map contents just by loading two binary images,
// token array and string array, e.g., from files, onto memory blocks.  But
// these two memory blocks must be aligned at 4 byte boundary.  Accessors are
// designed to have similar interfaces to std::multimap<string, Value>, so
// values can be looked up by equal_range(), etc.
//
// * Binary format
//
// ** String array
// All the strings, such as keys and values, are serialized into one array using
// SerializedStringArray.  In the map structure (see below), every string is
// stored as an index to this array.
//
// ** Token array
// A key value pair of map entry is encoded as data block of fixed byte length:
//
// Token layout (24 bytes)
// +---------------------------------------+
// | Key index  (4 bytes)                  |
// + - - - - - - - - - - - - - - - - - - - +
// | Value index (4 bytes)                 |
// + - - - - - - - - - - - - - - - - - - - +
// | Description index  (4 bytes)          |
// + - - - - - - - - - - - - - - - - - - - +
// | Additional description index (4 bytes)|
// + - - - - - - - - - - - - - - - - - - - +
// | LID (2 bytes)                         |
// + - - - - - - - - - - - - - - - - - - - +
// | RID (2 bytes)                         |
// + - - - - - - - - - - - - - - - - - - - +
// | Cost (2 bytes)                        |
// + - - - - - - - - - - - - - - - - - - - +
// | Padding = 0x0000 (2 bytes)            |
// +---------------------------------------+
//
// The map structure is serialized as a sorted array of tokens where tokens are
// sorted first by key and then by cost, both in ascending order.  Thus, the
// array has 24 * #tokens bytes.  Note that each token is properly aligned at 4
// byte boundary by the insertion of padding.  String values of a token (key,
// value, description, additional_description) can be retrieved from the string
// array by index.
class SerializedDictionary {
 public:
  struct CompilerToken {
    string value;
    string description;
    string additional_description;
    uint16 lid;
    uint16 rid;
    int16 cost;
  };

  using TokenList = std::vector<std::unique_ptr<CompilerToken>>;

  static const size_t kTokenByteLength = 24;

  class iterator : public std::iterator<std::random_access_iterator_tag,
                                        StringPiece> {
   public:
    iterator() : token_ptr_(nullptr), string_array_(nullptr) {}
    iterator(const char *token_ptr, const SerializedStringArray *string_array)
        : token_ptr_(token_ptr), string_array_(string_array) {}
    iterator(const iterator &x) = default;

    uint32 key_index() { return *reinterpret_cast<const uint32 *>(token_ptr_); }
    uint32 key_index() const {
      return *reinterpret_cast<const uint32 *>(token_ptr_);
    }
    StringPiece key() { return (*string_array_)[key_index()]; }
    StringPiece key() const { return (*string_array_)[key_index()]; }

    uint32 value_index() {
      return *reinterpret_cast<const uint32 *>(token_ptr_ + 4);
    }
    uint32 value_index() const {
      return *reinterpret_cast<const uint32 *>(token_ptr_ + 4);
    }
    StringPiece value() { return (*string_array_)[value_index()]; }
    StringPiece value() const { return (*string_array_)[value_index()]; }

    uint32 description_index() {
      return *reinterpret_cast<const uint32 *>(token_ptr_ + 8);
    }
    uint32 description_index() const {
      return *reinterpret_cast<const uint32 *>(token_ptr_ + 8);
    }

    StringPiece description() { return (*string_array_)[description_index()]; }
    StringPiece description() const {
      return (*string_array_)[description_index()];
    }

    uint32 additional_description_index() {
      return *reinterpret_cast<const uint32 *>(token_ptr_ + 12);
    }
    uint32 additional_description_index() const {
      return *reinterpret_cast<const uint32 *>(token_ptr_ + 12);
    }
    StringPiece additional_description() {
      return (*string_array_)[additional_description_index()];
    }

    StringPiece additional_description() const {
      return (*string_array_)[additional_description_index()];
    }

    uint16 lid() { return *reinterpret_cast<const uint16 *>(token_ptr_ + 16); }
    uint16 lid() const {
      return *reinterpret_cast<const uint16 *>(token_ptr_ + 16);
    }

    uint16 rid() { return *reinterpret_cast<const uint16 *>(token_ptr_ + 18); }
    uint16 rid() const {
      return *reinterpret_cast<const uint16 *>(token_ptr_ + 18);
    }

    int16 cost() { return *reinterpret_cast<const uint16 *>(token_ptr_ + 20); }
    int16 cost() const {
      return *reinterpret_cast<const uint16 *>(token_ptr_ + 20);
    }

    StringPiece operator*() { return key(); }
    StringPiece operator*() const { return key(); }

    void swap(iterator &x) {
      using std::swap;
      swap(token_ptr_, x.token_ptr_);
      swap(string_array_, x.string_array_);
    }

    friend void swap(iterator &x, iterator &y) { x.swap(y); }

    iterator &operator++() {
      token_ptr_ += kTokenByteLength;
      return *this;
    }

    iterator operator++(int) {
      const char *tmp = token_ptr_;
      token_ptr_ += kTokenByteLength;
      return iterator(tmp, string_array_);
    }

    iterator &operator--() {
      token_ptr_ -= kTokenByteLength;
      return *this;
    }

    iterator operator--(int) {
      const char *tmp = token_ptr_;
      token_ptr_ -= kTokenByteLength;
      return iterator(tmp, string_array_);
    }

    iterator &operator+=(difference_type n) {
      token_ptr_ += n * kTokenByteLength;
      return *this;
    }

    iterator &operator-=(difference_type n) {
      token_ptr_ -= n * kTokenByteLength;
      return *this;
    }

    friend iterator operator+(iterator x, difference_type n) {
      return iterator(x.token_ptr_ + n * kTokenByteLength, x.string_array_);
    }

    friend iterator operator+(difference_type n, iterator x) {
      return iterator(x.token_ptr_ + n * kTokenByteLength, x.string_array_);
    }

    friend iterator operator-(iterator x, difference_type n) {
      return iterator(x.token_ptr_ - n * kTokenByteLength, x.string_array_);
    }

    friend difference_type operator-(iterator x, iterator y) {
      return (x.token_ptr_ - y.token_ptr_) / kTokenByteLength;
    }

    // The following comparison operators make sense only for iterators obtained
    // from the same dictionary.
    friend bool operator==(iterator x, iterator y) {
      DCHECK_EQ(x.string_array_, y.string_array_);
      return x.token_ptr_ == y.token_ptr_;
    }

    friend bool operator!=(iterator x, iterator y) {
      DCHECK_EQ(x.string_array_, y.string_array_);
      return x.token_ptr_ != y.token_ptr_;
    }

    friend bool operator<(iterator x, iterator y) {
      DCHECK_EQ(x.string_array_, y.string_array_);
      return x.token_ptr_ < y.token_ptr_;
    }

    friend bool operator<=(iterator x, iterator y) {
      DCHECK_EQ(x.string_array_, y.string_array_);
      return x.token_ptr_ <= y.token_ptr_;
    }

    friend bool operator>(iterator x, iterator y) {
      DCHECK_EQ(x.string_array_, y.string_array_);
      return x.token_ptr_ > y.token_ptr_;
    }

    friend bool operator>=(iterator x, iterator y) {
      DCHECK_EQ(x.string_array_, y.string_array_);
      return x.token_ptr_ >= y.token_ptr_;
    }

   private:
    const char *token_ptr_;
    const SerializedStringArray *string_array_;
  };

  using const_iterator = iterator;

  using IterRange = std::pair<const_iterator, const_iterator>;

  // Creates serialized data into buffers.  The first and second StringPieces of
  // returned value points to memory block for token array and string array,
  // respectively.  The input stream should supply TSV file of Mozc's dctionary
  // format; see, e.g., data/symbol/symbol.tsv.
  static std::pair<StringPiece, StringPiece> Compile(
      std::istream *input,
      std::unique_ptr<uint32[]> *output_token_array_buf,
      std::unique_ptr<uint32[]> *output_string_array_buf);
  static std::pair<StringPiece, StringPiece> Compile(
      const std::map<string, TokenList> &dic,
      std::unique_ptr<uint32[]> *output_token_array_buf,
      std::unique_ptr<uint32[]> *output_string_array_buf);

  // Creates serialized data and writes them to files.
  static void CompileToFiles(const string &input,
                             const string &output_token_array,
                             const string &output_string_array);
  static void CompileToFiles(const std::map<string, TokenList> &dic,
                             const string &output_token_array,
                             const string &output_string_array);

  // Validates the serialized data.
  static bool VerifyData(StringPiece token_array_data,
                         StringPiece string_array_data);

  // Both |token_array| and |string_array_data| must be aligned at 4-byte
  // boundary.
  SerializedDictionary(StringPiece token_array, StringPiece string_array_data);
  ~SerializedDictionary();

  std::size_t size() const {
    return token_array_.size() / kTokenByteLength;
  }

  iterator begin() { return iterator(token_array_.data(), &string_array_); }
  const_iterator begin() const {
    return iterator(token_array_.data(), &string_array_);
  }

  iterator end() {
    return iterator(token_array_.data() + token_array_.size(), &string_array_);
  }
  iterator end() const {
    return iterator(token_array_.data() + token_array_.size(), &string_array_);
  }

  // Returns the range of iterators whose keys match the given key.  The range
  // is sorted in ascending order of cost.
  IterRange equal_range(StringPiece key) const;

 private:
  StringPiece token_array_;
  SerializedStringArray string_array_;
};

}  // namespace mozc

#endif  // MOZC_DATA_MANAGER_SERIALIZED_DICTIONARY_H_