Codebase list mozc / debian/1.5.1090.102-3 converter / sparse_connector_builder.cc
debian/1.5.1090.102-3

Tree @debian/1.5.1090.102-3 (Download .tar.gz)

sparse_connector_builder.cc @debian/1.5.1090.102-3raw · history · blame

// Copyright 2010-2012, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "converter/sparse_connector_builder.h"

#include <vector>

#include "base/base.h"
#include "base/file_stream.h"
#include "base/mmap.h"
#include "converter/sparse_connector.h"
#include "storage/sparse_array_image.h"

DEFINE_bool(use_1byte_cost, false,
            "Cost is encoded int a byte, instead of 2 bytes.");
DEFINE_int32(cost_resolution, 64,
             "Cost value is calculated by the value in SparseArray * "
             "cost_resolution. So every cost value should be smaller than "
             "resolution*256.");

namespace mozc {
namespace converter {
namespace {

size_t LoadIDSize(const string &filename) {
  InputFileStream ifs(filename.c_str());
  CHECK(ifs);
  string line;
  vector<string> fields;
  int max_id = 0;
  int line_num = 0;
  while (getline(ifs, line)) {
    fields.clear();
    Util::SplitStringUsing(line, "\t ", &fields);
    CHECK_GE(fields.size(), 2);
    const int id = atoi32(fields[0].c_str());
    max_id = max(id, max_id);
    ++line_num;
  }
  ++max_id;

  CHECK_EQ(line_num, max_id);

  return max_id;
}

size_t LoadSpecialPOSSize(const string &filename) {
  InputFileStream ifs(filename.c_str());
  CHECK(ifs);
  string line;
  int line_num = 0;
  while (getline(ifs, line)) {
    if (line.empty() || line[0] == '#') {
      continue;
    }
    ++line_num;
  }

  return line_num;
}
}  // namespace

SparseConnectorBuilder::SparseConnectorBuilder() {
}
SparseConnectorBuilder::~SparseConnectorBuilder() {
}

void SparseConnectorBuilder::SetIdFile(const string &id_file) {
  id_size_ = LoadIDSize(id_file);
}
void SparseConnectorBuilder::SetSpecialPosFile(
    const string &special_pos_file) {
  special_pos_size_ = LoadSpecialPOSSize(special_pos_file);
}
void SparseConnectorBuilder::ParseTextConnectionFile(
    const string &text_connection_file,
    vector<int16> *matrix) const {
  InputFileStream ifs(text_connection_file.c_str());
  CHECK(ifs);

  string line;
  vector<string> fields;
  getline(ifs, line);
  Util::SplitStringUsing(line, "\t ", &fields);
  CHECK_GE(fields.size(), 2);

  CHECK_EQ(id_size_, atoi32(fields[0].c_str()));
  CHECK_EQ(id_size_, atoi32(fields[1].c_str()));
  const uint16 matrix_size = MatrixSize();

  LOG(INFO) << "Making " << matrix_size << "x" << matrix_size << " matrix.";
  matrix->assign(matrix_size * matrix_size, 0);

  while (getline(ifs, line)) {
    fields.clear();
    Util::SplitStringUsing(line, "\t ", &fields);
    CHECK_GE(fields.size(), 3);
    const int lid = atoi32(fields[0].c_str());
    const int rid = atoi32(fields[1].c_str());
    int cost = atoi32(fields[2].c_str());
    CHECK(lid < matrix_size && rid < matrix_size)
        << "index values are out of range: "
        << lid << ", " << rid << ", " << matrix_size;
    // BOS->EOS connection cost is always 0
    if (lid == 0 && rid == 0) {
      cost = 0;
    }
    (*matrix)[lid + matrix_size * rid] = static_cast<int16>(cost);
  }

  for (int lid = id_size_; lid < matrix_size; ++lid) {
    for (int rid = 1; rid < matrix_size; ++rid) {   // SKIP EOS (rid == 0)
      CHECK(lid < matrix_size && rid < matrix_size)
          << "index values are out of range: "
          << lid << ", " << rid << ", " << matrix_size;
      (*matrix)[lid + matrix_size * rid] = ConnectorInterface::kInvalidCost;
    }
  }

  for (int rid = id_size_; rid < matrix_size; ++rid) {
    for (int lid = 1; lid < matrix_size; ++lid) {   // SKIP BOS (lid == 0)
      CHECK(lid < matrix_size && rid < matrix_size)
          << "index values are out of range: "
          << lid << ", " << rid << ", " << matrix_size;
      (*matrix)[lid + matrix_size * rid] = ConnectorInterface::kInvalidCost;
    }
  }
}

void SparseConnectorBuilder::FillDefaultCost(const vector<int16> &matrix) {
  size_t matrix_size = MatrixSize();
  CHECK_EQ(matrix_size * matrix_size, matrix.size());

  // Initialized by filling 0
  default_cost_.assign(matrix_size, 0);
  for (int lid = 0; lid < matrix_size; ++lid) {
    for (int rid = 0; rid < matrix_size; ++rid) {
      const int16 c = matrix[lid + matrix_size * rid];
      if (ConnectorInterface::kInvalidCost != c) {
        default_cost_[lid] = max(default_cost_[lid], c);  // save default cost
      }
    }
  }
}

void SparseConnectorBuilder::Build(const string &text_connection_file) {
  BuildInternal(text_connection_file, NULL);
}

void SparseConnectorBuilder::BuildInternal(
    const string &text_connection_file,
    vector<int16> *output_matrix) {
  vector<int16> matrix;
  ParseTextConnectionFile(text_connection_file, &matrix);
  FillDefaultCost(matrix);

  LOG(INFO) << "compiling matrix with " << matrix.size();

  builder_.reset(new SparseArrayBuilder);
  if (FLAGS_use_1byte_cost) {
    builder_->SetUse1ByteValue(true);
  }

  size_t matrix_size = MatrixSize();
  for (int lid = 0; lid < matrix_size; ++lid) {
    for (int rid = 0; rid < matrix_size; ++rid) {
      int16 c = matrix[lid + matrix_size * rid];
      if (c != default_cost_[lid]) {
        if (FLAGS_use_1byte_cost) {
          if (c == ConnectorInterface::kInvalidCost) {
            c = SparseConnector::kInvalid1ByteCostValue;
          } else {
            c = c / FLAGS_cost_resolution;
            CHECK(c < 256 && c != SparseConnector::kInvalid1ByteCostValue);
          }
        }
        builder_->AddValue(SparseConnector::EncodeKey(lid, rid), c);
      }
    }
  }

  builder_->Build();
  if (output_matrix != NULL) {
    output_matrix->swap(matrix);
  }
}

void SparseConnectorBuilder::WriteTo(const string& output_file) const {
  CHECK(builder_.get() != NULL);

  OutputFileStream ofs(output_file.c_str(), ios::binary|ios::out);
  CHECK(ofs) << "permission denied: " << output_file;

  const uint16 magic = SparseConnector::kSparseConnectorMagic;
  ofs.write(reinterpret_cast<const char*>(&magic), sizeof(magic));
  uint16 resolution = 1;
  if (FLAGS_use_1byte_cost) {
    resolution = FLAGS_cost_resolution;
  }

  const uint16 matrix_size = MatrixSize();
  CHECK_EQ(matrix_size, default_cost_.size());

  ofs.write(reinterpret_cast<const char*>(&resolution), sizeof(resolution));
  ofs.write(reinterpret_cast<const char*>(&matrix_size), sizeof(matrix_size));
  ofs.write(reinterpret_cast<const char*>(&matrix_size), sizeof(matrix_size));
  ofs.write(reinterpret_cast<const char*>(&default_cost_[0]),
            sizeof(default_cost_[0]) * default_cost_.size());
  // 4byte alignment
  while (ofs.tellp() % 4) {
    ofs.put('\0');
  }
  ofs.write(builder_->GetImage(), builder_->GetSize());
  ofs.close();
}

void SparseConnectorBuilder::Compile(
    const string &text_connection_file,
    const string &id_file,
    const string &special_pos_file,
    const string &output_file) {
  vector<int16> matrix;
  SparseConnectorBuilder builder;
  builder.SetIdFile(id_file);
  builder.SetSpecialPosFile(special_pos_file);
  builder.BuildInternal(text_connection_file, &matrix);
  builder.WriteTo(output_file);

  // verify connector
  {
    Mmap<char> mmap;
    CHECK(mmap.Open(output_file.c_str(), "r"));

    scoped_ptr<SparseConnector> connector(
        new SparseConnector(mmap.begin(), mmap.GetFileSize()));
    CHECK(connector.get());

    const int resolution = connector->GetResolution();
    const size_t matrix_size = builder.MatrixSize();
    for (int rid = 0; rid < matrix_size; ++rid) {
      for (int lid = 0; lid < matrix_size; ++lid) {
        const int diff = abs(connector->GetTransitionCost(lid, rid) -
                             matrix[lid + matrix_size * rid]);
        CHECK_LT(diff, resolution);
      }
    }
  }
}
}  // namespace converter
}  // namespace mozc