src/gui/base/encoding_util.cc - mozc (280e38f)

encoding_util.cc @280e38f — raw · history · blame

// Copyright 2010-2016, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#include "gui/base/encoding_util.h"

#include "base/port.h"
#include "base/string_piece.h"
#include "base/util.h"

namespace mozc {
namespace {

#include "gui/base/sjis_to_ucs2_table.h"

// Each character of SJIS is encoded in one or two bytes.
//
// For the first byte, there are 4 valid ranges (closed intervals):
//   * FirstByteRange1: [0x00, 0x80]
//   * FirstByteRange2: [0x81, 0x9F]
//   * FirstByteRange3: [0xA1, 0xDF]
//   * FirstByteRange4: [0xE0, 0xFF]
// Ranges 2 and 4 start a two-byte encoding, so one more byte is needed to
// decode a character.  Note that 0xA0 belongs to none of the ranges.
//
// For the second byte, there are 2 valid ranges (closed intervals):
//   * SecondByteRange1: [0x40, 0x7E]
//   * SecondByteRange2: [0x80, 0xFF]
// Two-byte characters are decoded using the conversion table defined in
// sjis_to_ucs2_table.h.

// Returns true if |byte| lies in FirstByteRange1, i.e. [0x00, 0x80].
inline bool IsInFirstByteRange1(uint8_t byte) {
  const uint8_t kUpperBound = 0x80;
  // The lower bound is 0x00, which every uint8_t satisfies.
  return !(byte > kUpperBound);
}

// Returns true if |byte| lies in FirstByteRange2, i.e. [0x81, 0x9F].
inline bool IsInFirstByteRange2(uint8_t byte) {
  if (byte < 0x81) {
    return false;  // Below the range.
  }
  return byte <= 0x9F;
}

// Returns true if |byte| lies in FirstByteRange3, i.e. [0xA1, 0xDF].
inline bool IsInFirstByteRange3(uint8_t byte) {
  const bool out_of_range = (byte < 0xA1) || (byte > 0xDF);
  return !out_of_range;
}

// Returns true if |byte| lies in FirstByteRange4, i.e. [0xE0, 0xFF].
inline bool IsInFirstByteRange4(uint8_t byte) {
  const uint8_t kLowerBound = 0xE0;
  // The upper bound is 0xFF, which every uint8_t satisfies.
  return byte >= kLowerBound;
}

// Returns true if |byte| lies in SecondByteRange1, i.e. [0x40, 0x7E].
inline bool IsInSecondByteRange1(uint8_t byte) {
  if (byte > 0x7E) {
    return false;  // Above the range.
  }
  return byte >= 0x40;
}

// Returns true if |byte| lies in SecondByteRange2, i.e. [0x80, 0xFF].
inline bool IsInSecondByteRange2(uint8_t byte) {
  // For an 8-bit value, being >= 0x80 is the same as having the top bit set.
  return (byte & 0x80) != 0;
}

// Computes the row-major position of the two-byte sequence (|first|,
// |second|) in a grid whose rows are the two-byte lead bytes
// (FirstByteRange2 followed by FirstByteRange4) and whose columns are the
// trail bytes (SecondByteRange1 followed by SecondByteRange2).
//
// A byte that falls in none of the relevant ranges contributes 0 to its
// coordinate; no validation is performed here.
size_t ComputeIndex(uint8_t first, uint8_t second) {
  // Lengths of the closed intervals that make up the grid.
  const size_t kFirstRange2Length = 0x9F - 0x81 + 1;
  const size_t kSecondRange1Length = 0x7E - 0x40 + 1;
  const size_t kSecondRange2Length = 0xFF - 0x80 + 1;
  const size_t kRowWidth = kSecondRange1Length + kSecondRange2Length;

  // Row: offset inside FirstByteRange2, or offset inside FirstByteRange4
  // shifted past all of FirstByteRange2.
  const size_t row =
      IsInFirstByteRange2(first)
          ? static_cast<size_t>(first - 0x81)
          : (IsInFirstByteRange4(first)
                 ? static_cast<size_t>(first - 0xE0) + kFirstRange2Length
                 : 0);

  // Column: offset inside SecondByteRange1, or offset inside
  // SecondByteRange2 shifted past all of SecondByteRange1.
  const size_t column =
      IsInSecondByteRange1(second)
          ? static_cast<size_t>(second - 0x40)
          : (IsInSecondByteRange2(second)
                 ? static_cast<size_t>(second - 0x80) + kSecondRange1Length
                 : 0);

  return row * kRowWidth + column;
}

// Decodes the SJIS byte sequence |input| and appends the decoded characters
// to |output| as UTF-8.
//
// Returns true if every byte was consumed as part of a complete character.
// Returns false if an invalid first or second byte is found, if a two-byte
// sequence has no entry in the conversion table (out of range or mapped to
// 0), or if |input| ends in the middle of a two-byte character.  On failure
// |output| may already hold the characters decoded before the error; the
// caller is responsible for discarding them.
bool SJISToUTF8Internal(StringPiece input, string* output) {
  // Number of entries in the conversion table.  sizeof() on the array alone
  // yields a byte count, which would overstate the valid index range for
  // multi-byte entries, so divide by the size of one entry.
  const size_t kTableLength =
      sizeof(kSJISToUCS2Table) / sizeof(kSJISToUCS2Table[0]);

  bool expect_first_byte = true;
  uint8_t first_byte = 0;
  for (const char c : input) {
    const uint8_t byte = static_cast<uint8_t>(c);

    if (expect_first_byte) {
      if (IsInFirstByteRange1(byte)) {
        // Single-byte character emitted with the byte value as code point.
        Util::UCS4ToUTF8Append(byte, output);
      } else if (IsInFirstByteRange3(byte)) {
        // Single-byte character emitted at a fixed offset: 0xA1..0xDF
        // becomes U+FF61..U+FF9F.
        Util::UCS4ToUTF8Append(byte + 0xFEC0, output);
      } else if (IsInFirstByteRange2(byte) || IsInFirstByteRange4(byte)) {
        // Lead byte of a two-byte character; remember it and wait for the
        // trail byte.
        first_byte = byte;
        expect_first_byte = false;
      } else {
        return false;  // Invalid first byte.
      }
      continue;
    }

    if (!IsInSecondByteRange1(byte) && !IsInSecondByteRange2(byte)) {
      return false;  // Invalid second byte.
    }
    const size_t index = ComputeIndex(first_byte, byte);
    if (index >= kTableLength) {
      return false;  // Outside the conversion table.
    }
    const uint16_t ucs2 = kSJISToUCS2Table[index];
    if (ucs2 == 0) {
      return false;  // A zero entry is treated as "no mapping".
    }
    Util::UCS4ToUTF8Append(ucs2, output);
    expect_first_byte = true;
  }
  // A pending lead byte at the end of input means a truncated character.
  return expect_first_byte;
}

}   // namespace

// Converts the SJIS-encoded |input| to UTF-8 and stores it in |output|.
// Any previous content of |output| is discarded.  If the conversion fails,
// |output| is left empty rather than holding a partial result.
void EncodingUtil::SJISToUTF8(const string &input, string *output) {
  output->clear();
  const bool converted = SJISToUTF8Internal(input, output);
  if (converted) {
    return;
  }
  // Drop whatever was appended before the failure was detected.
  output->clear();
}

}  // namespace mozc
Tree @280e38f (Download .tar.gz)

encoding_util.cc @280e38f — raw · history · blame