Codebase list mozc / 262156b src / dictionary / user_dictionary_importer.h
262156b

Tree @262156b (Download .tar.gz)

user_dictionary_importer.h @262156b

a1dcada
0fdb7a7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ccedd2d
247b13e
8902bb4
a1dcada
0fdb7a7
 
 
ccedd2d
 
 
0fdb7a7
 
247b13e
 
a1dcada
 
 
 
247b13e
 
 
 
 
 
 
 
 
ccedd2d
 
0fdb7a7
 
 
 
 
ccedd2d
0fdb7a7
 
ccedd2d
 
247b13e
0fdb7a7
 
 
 
 
ccedd2d
 
 
 
0fdb7a7
 
 
 
 
ccedd2d
0fdb7a7
 
ccedd2d
 
 
a1dcada
0fdb7a7
ccedd2d
0fdb7a7
 
 
 
 
 
247b13e
 
 
 
 
 
 
a1dcada
247b13e
 
 
a1dcada
247b13e
 
 
a1dcada
ccedd2d
247b13e
 
 
0fdb7a7
 
 
a1dcada
 
 
 
 
0fdb7a7
 
ccedd2d
 
a1dcada
0fdb7a7
ccedd2d
0fdb7a7
 
 
 
 
 
a1dcada
 
 
 
0fdb7a7
 
ccedd2d
a1dcada
0fdb7a7
ccedd2d
a1dcada
0fdb7a7
ccedd2d
0fdb7a7
 
a1dcada
0fdb7a7
 
 
247b13e
0fdb7a7
 
 
 
 
a1dcada
0fdb7a7
 
 
 
 
 
 
 
 
 
 
 
 
ccedd2d
a1dcada
 
0fdb7a7
ccedd2d
 
a1dcada
 
0fdb7a7
ccedd2d
0fdb7a7
a1dcada
247b13e
0fdb7a7
 
247b13e
0fdb7a7
ccedd2d
0fdb7a7
b64a7de
0fdb7a7
// Copyright 2010-2020, Google Inc.
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
//
//     * Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
//     * Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
//     * Neither the name of Google Inc. nor the names of its
// contributors may be used to endorse or promote products derived from
// this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

#ifndef MOZC_DICTIONARY_USER_DICTIONARY_IMPORTER_H_
#define MOZC_DICTIONARY_USER_DICTIONARY_IMPORTER_H_

#include <string>

#include "base/port.h"
#include "protocol/user_dictionary_storage.pb.h"
#include "absl/strings/string_view.h"

namespace mozc {

// A utility class for importing user dictionaries from different devices,
// including text files and MS-IME, Kotoeri, and ATOK (optional) user
// dictionaries.
//
// Every importing entry point is a static member function; the nested types
// describe the inputs (raw entries, entry iterators, text-line iterators) and
// the result codes those functions work with.
class UserDictionaryImporter {
 public:
  // A raw entry to be read.
  // |pos| is stored as it appears in the input; converting it into a Mozc POS
  // is the job of ConvertEntry().
  struct RawEntry {
    std::string key;
    std::string value;
    std::string pos;  // Mozc extension: pos can encode locale. e.g. 名詞:en
    std::string comment;

    // Empties all four fields so that the object can be reused.
    void Clear() {
      key.clear();
      value.clear();
      pos.clear();
      comment.clear();
    }
  };

  // An abstract class for representing an input device for user dictionary.
  // It runs over only valid lines which show entries in input.
  class InputIteratorInterface {
   public:
    InputIteratorInterface() = default;
    virtual ~InputIteratorInterface() = default;

    // Return true if the input iterator is available.
    virtual bool IsAvailable() const = 0;

    // Return true if entry is read successfully; the entry is written to
    // |raw_entry|.
    // Next method doesn't have to convert the POS of entry.
    virtual bool Next(RawEntry *raw_entry) = 0;

   private:
    DISALLOW_COPY_AND_ASSIGN(InputIteratorInterface);
  };

  // An abstract class for reading a text file per line.  It runs over
  // all lines, e.g. comment lines.
  // As we'd like to use QTextFileStream to load UTF16 files, make an
  // interface class for reading text per line.
  class TextLineIteratorInterface {
   public:
    TextLineIteratorInterface() = default;
    virtual ~TextLineIteratorInterface() = default;

    // Return true if the text line iterator is available.
    virtual bool IsAvailable() const = 0;

    // Read a line and convert its encoding to UTF-8.
    // The TextLineIteratorInterface class takes a responsibility of character
    // set conversion. |line| must always be stored in UTF-8.
    virtual bool Next(std::string *line) = 0;

    // Reset the current position.
    virtual void Reset() = 0;

   private:
    DISALLOW_COPY_AND_ASSIGN(TextLineIteratorInterface);
  };

  // A wrapper for string. The string should contain UTF-8 characters.
  // This class should resolve CR/LF differences between inputs.
  // This class does NOT take the ownership of the given string.
  // So it is caller's responsibility to extend the lifetime of the given
  // string until this iterator is destroyed.
  class StringTextLineIterator : public TextLineIteratorInterface {
   public:
    explicit StringTextLineIterator(absl::string_view data);
    ~StringTextLineIterator() override;

    // TextLineIteratorInterface implementation.
    bool IsAvailable() const override;
    bool Next(std::string *line) override;
    void Reset() override;

   private:
    // Non-owning view of the text being iterated (see the class comment).
    const absl::string_view data_;
    // NOTE(review): presumably the current read offset into |data_|; confirm
    // against the implementation file.
    size_t position_;
    DISALLOW_COPY_AND_ASSIGN(StringTextLineIterator);
  };

  // List of IMEs.
  enum IMEType {
    IME_AUTO_DETECT = 0,
    MOZC = 1,
    MSIME = 2,
    ATOK = 3,
    KOTOERI = 4,
    NUM_IMES = 5,
  };

  // Guess IME type from the first line of IME file.
  // Return "NUM_IMES" if the format is unknown.
  static IMEType GuessIMEType(absl::string_view line);

  // Return the final IME type from user_ime_type and guessed_ime_type.
  static IMEType DetermineFinalIMEType(IMEType user_ime_type,
                                       IMEType guessed_ime_type);

  // List of character encodings.
  enum EncodingType {
    ENCODING_AUTO_DETECT = 0,
    UTF8 = 1,
    UTF16 = 2,
    SHIFT_JIS = 3,
    NUM_ENCODINGS = 4
  };

  // Guess encoding type of a string.
  static EncodingType GuessEncodingType(absl::string_view str);

  // Guess encoding type of a file.
  static EncodingType GuessFileEncodingType(const std::string &filename);

  // A special input iterator to read entries from TextLineIteratorInterface.
  class TextInputIterator : public InputIteratorInterface {
   public:
    // NOTE(review): |iter| is kept as a raw pointer; this header does not show
    // who owns it. Presumably it is borrowed and must outlive this object;
    // confirm against the implementation file.
    TextInputIterator(IMEType ime_type, TextLineIteratorInterface *iter);
    ~TextInputIterator() override;

    // InputIteratorInterface implementation.
    bool IsAvailable() const override;
    bool Next(RawEntry *entry) override;

    // Return the IME type held by this iterator.
    IMEType ime_type() const { return ime_type_; }

   private:
    IMEType ime_type_;
    TextLineIteratorInterface *iter_;
    std::string first_line_;

    DISALLOW_COPY_AND_ASSIGN(TextInputIterator);
  };

  // Result codes returned by ImportFromIterator() and
  // ImportFromTextLineIterator().
  enum ErrorType {
    IMPORT_NO_ERROR,
    IMPORT_NOT_SUPPORTED,
    IMPORT_TOO_MANY_WORDS,
    IMPORT_INVALID_ENTRIES,
    IMPORT_FATAL,
    IMPORT_UNKNOWN_ERROR
  };

  // Convert POS's of other IME's into Mozc's.
  // Reads |from| and writes the converted entry to |to|.
  static bool ConvertEntry(const RawEntry &from,
                           user_dictionary::UserDictionary::Entry *to);

  // Import a dictionary from InputIteratorInterface.
  // This is the most generic interface.
  static ErrorType ImportFromIterator(InputIteratorInterface *iter,
                                      user_dictionary::UserDictionary *dic);

  // Import a dictionary from TextLineIterator.
  static ErrorType ImportFromTextLineIterator(
      IMEType ime_type, TextLineIteratorInterface *iter,
      user_dictionary::UserDictionary *dic);

 private:
  DISALLOW_IMPLICIT_CONSTRUCTORS(UserDictionaryImporter);
};

}  // namespace mozc

#endif  // MOZC_DICTIONARY_USER_DICTIONARY_IMPORTER_H_