Codebase list mozc / upstream/1.1.758.102 converter / sparse_connector_builder.cc
upstream/1.1.758.102

Tree @upstream/1.1.758.102 (Download .tar.gz)

sparse_connector_builder.cc @upstream/1.1.758.102raw · history · blame

// Copyright 2010-2011, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include <vector>
#include "base/base.h"
#include "base/file_stream.h"
#include "base/mmap.h"
#include "converter/sparse_connector.h"
#include "storage/sparse_array_image.h"

DEFINE_bool(use_1byte_cost, false,
            "Cost is encoded int a byte, instead of 2 bytes.");
DEFINE_int32(cost_resolution, 64,
             "Cost value is calculated by the value in SparseArray * "
             "cost_resolution. So every cost value should be smaller than "
             "resolution*256.");

namespace mozc {

namespace {

size_t LoadIDSize(const string &filename) {
  InputFileStream ifs(filename.c_str());
  CHECK(ifs);
  string line;
  vector<string> fields;
  int max_id = 0;
  int line_num = 0;
  while (getline(ifs, line)) {
    fields.clear();
    Util::SplitStringUsing(line, "\t ", &fields);
    CHECK_GE(fields.size(), 2);
    const int id = atoi32(fields[0].c_str());
    max_id = max(id, max_id);
    ++line_num;
  }
  ++max_id;

  CHECK_EQ(line_num, max_id);

  return max_id;
}

size_t LoadSpecialPOSSize(const string &filename) {
  InputFileStream ifs(filename.c_str());
  CHECK(ifs);
  string line;
  int line_num = 0;
  while (getline(ifs, line)) {
    if (line.empty() || line[0] == '#') {
      continue;
    }
    ++line_num;
  }

  return line_num;
}
}  // namespace

void SparseConnectorBuilder::Compile(
    const string &text_connection_file,
    const string &id_file,
    const string &special_pos_file,
    const string &output_file) {
  const size_t id_size = LoadIDSize(id_file);
  const size_t special_pos_size = LoadSpecialPOSSize(special_pos_file);

  InputFileStream ifs(text_connection_file.c_str());
  CHECK(ifs);
  string line;
  vector<string> fields;
  getline(ifs, line);
  Util::SplitStringUsing(line, "\t ", &fields);
  CHECK_GE(fields.size(), 2);

  const uint16 original_lsize = atoi32(fields[0].c_str());
  const uint16 original_rsize = atoi32(fields[1].c_str());

  CHECK_EQ(id_size, original_lsize);
  CHECK_EQ(id_size, original_rsize);

  const uint16 lsize = original_lsize + special_pos_size;
  const uint16 rsize = original_rsize + special_pos_size;

  CHECK_EQ(lsize, rsize);

  LOG(INFO) << "Making " << lsize << " x " << rsize << " matrix";
  vector<int16> matrix(lsize * rsize);
  fill(matrix.begin(), matrix.end(), 0);

  CHECK_EQ(lsize, rsize);

  while (getline(ifs, line)) {
    fields.clear();
    Util::SplitStringUsing(line, "\t ", &fields);
    CHECK_GE(fields.size(), 3);
    const int l = atoi32(fields[0].c_str());
    const int r = atoi32(fields[1].c_str());
    int       c = atoi32(fields[2].c_str());
    CHECK(l < lsize && r < rsize) << "index values are out of range";
    // BOS->EOS connection cost is always 0
    if (l == 0 && r == 0) {
      c = 0;
    }
    matrix[(l + lsize * r)] = static_cast<int16>(c);
  }

  for (int l = original_lsize; l < lsize; ++l) {
    for (int r = 1; r < rsize; ++r) {   // SKIP EOS (r == 0)
      CHECK(l < lsize && r < rsize) << "index values are out of range";
      matrix[(l + lsize * r)] = ConnectorInterface::kInvalidCost;
    }
  }

  for (int r = original_rsize; r < rsize; ++r) {
    for (int l = 1; l < lsize; ++l) {   // SKIP BOS (r == 0)
      CHECK(l < lsize && r < rsize) << "index values are out of range";
      matrix[(l + lsize * r)] = ConnectorInterface::kInvalidCost;
    }
  }

  {
    LOG(INFO) << "compiling matrix with " << lsize * rsize;

    SparseArrayBuilder builder;
    if (FLAGS_use_1byte_cost) {
      builder.SetUse1ByteValue(true);
    }
    vector<int16> default_cost(lsize);
    fill(default_cost.begin(), default_cost.end(), static_cast<int16>(0));

    for (int lid = 0; lid < lsize; ++lid) {
      for (int rid = 0; rid < rsize; ++rid) {
        const int16 c = matrix[lid + lsize * rid];
        if (ConnectorInterface::kInvalidCost != c) {
          default_cost[lid] = max(default_cost[lid], c);  // save default cost
        }
      }
    }

    for (int lid = 0; lid < lsize; ++lid) {
      for (int rid = 0; rid < rsize; ++rid) {
        int16 c = matrix[lid + lsize * rid];
        if (c != default_cost[lid]) {
          if (FLAGS_use_1byte_cost) {
            if (c == ConnectorInterface::kInvalidCost) {
              c = SparseConnector::kInvalid1ByteCostValue;
            } else {
              c = c / FLAGS_cost_resolution;
              CHECK(c < 256 && c != SparseConnector::kInvalid1ByteCostValue);
            }
          }
          builder.AddValue(SparseConnector::EncodeKey(lid, rid), c);
        }
      }
    }

    builder.Build();

    OutputFileStream ofs(output_file.c_str(), ios::binary|ios::out);
    CHECK(ofs) << "permission denied: " << output_file;

    CHECK_EQ(lsize, default_cost.size());

    const uint16 magic = SparseConnector::kSparseConnectorMagic;
    ofs.write(reinterpret_cast<const char*>(&magic), sizeof(magic));
    uint16 resolution = 1;
    if (FLAGS_use_1byte_cost) {
      resolution = FLAGS_cost_resolution;
    }
    ofs.write(reinterpret_cast<const char*>(&resolution), sizeof(resolution));
    ofs.write(reinterpret_cast<const char*>(&lsize), sizeof(lsize));
    ofs.write(reinterpret_cast<const char*>(&rsize), sizeof(rsize));
    ofs.write(reinterpret_cast<const char*>(&default_cost[0]),
              sizeof(default_cost[0]) * default_cost.size());
    ofs.write(builder.GetImage(), builder.GetSize());
    ofs.close();
  }

  // verify connector
  {
    Mmap<char> mmap;
    CHECK(mmap.Open(output_file.c_str(), "r"));

    scoped_ptr<SparseConnector> connector(
        new SparseConnector(mmap.begin(), mmap.GetFileSize()));
    CHECK(connector.get());

    const int resolution = connector->GetResolution();

    for (int rid = 0; rid < rsize; ++rid) {
      for (int lid = 0; lid < lsize; ++lid) {
        const int diff = abs(connector->GetTransitionCost(lid, rid) -
                             matrix[lid + lsize * rid]);
        CHECK_LT(diff, resolution);
      }
    }
  }

  return;
}
}  // namespace mozc