# Source: mozc codebase, revision d2e045e
# File: client/gen_client_quality_test_data.py

# -*- coding: utf-8 -*-
# Copyright 2010-2012, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

import codecs
import logging
import sys


def escape(string):
  """Returns |string| as a run of C-style hexadecimal byte escapes.

  The text is encoded as UTF-8 and every resulting byte is rendered as a
  backslash, the letter x and two lowercase hex digits, so the result is
  pure ASCII.

  Args:
    string: the text to escape.

  Returns:
    The escaped text; for example u'a' becomes the four characters \\x61.
    An empty input yields an empty string.
  """
  # Iterating a bytearray yields integers on both Python 2 and Python 3.
  # Iterating the encoded string directly yields 1-character strings on
  # Python 2 but integers on Python 3, where ord() would raise TypeError.
  encoded = bytearray(string.encode('utf-8'))
  return ''.join('\\x%02x' % byte for byte in encoded)


def convert_tsv(filename):
  """Prints C++ initializer lines for every usable row of a TSV file.

  The file is read as UTF-8.  Trailing whitespace is stripped from each line
  before it is split on tabs.  Rows with fewer than six fields are reported
  through logging.warning() and skipped.  For every other row two lines are
  printed: a '//' comment showing fields 0, 4 and 5 as plain text, followed
  by the same three fields passed through escape().

  Args:
    filename: path of the TSV file to convert.
  """
  # The context manager closes the file even when printing or escaping
  # raises part-way through the file.
  with codecs.open(filename, 'rb', 'utf-8') as tsv:
    for line in tsv:
      line = line.rstrip()

      fields = line.split('\t')
      if len(fields) < 6:
        logging.warning('invalid row format: %s', line)
        continue
      # Human readable form of the entry, kept as a comment in the output.
      print ('  // {"%s", "%s", "%s"},'
             % (fields[0], fields[4], fields[5]))
      print ('  {"%s", "%s", "%s"},'
             % (escape(fields[0]), escape(fields[4]), escape(fields[5])))


def main():
  """Writes a C++ header containing the converted test cases to stdout.

  The standard streams are wrapped so that they read and write UTF-8, and
  logging is configured at INFO level.  A fixed prologue declaring the
  TestCase struct and opening the test_cases array is printed, then every
  command line argument is passed to convert_tsv() as a TSV file name, and
  finally a {NULL, NULL, NULL} entry and the closing lines are printed.
  """
  # The codecs wrappers need a byte stream.  Python 3 text streams expose
  # theirs as `.buffer`; Python 2 file objects have no such attribute and
  # are wrapped directly, exactly as before.
  sys.stdin = codecs.getreader('utf-8')(
      getattr(sys.stdin, 'buffer', sys.stdin))
  sys.stdout = codecs.getwriter('utf-8')(
      getattr(sys.stdout, 'buffer', sys.stdout))
  sys.stderr = codecs.getwriter('utf-8')(
      getattr(sys.stderr, 'buffer', sys.stderr))
  logging.basicConfig(level=logging.INFO)

  # Single-argument print(...) produces the same output whether it is parsed
  # as a Python 2 print statement or as the Python 3 print function.
  print('// Automatically generated by mozc')
  print('#ifndef MOZC_SESSION_QUALITY_MAIN_DATA_H_')
  print('#define MOZC_SESSION_QUALITY_MAIN_DATA_H_')
  print('')
  print('namespace mozc {')
  print('struct TestCase {')
  print('  const char* source;')
  print('  const char* expected_result;')
  print('  const char* hiragana_sentence;')
  print('};')
  print('')
  print('static TestCase test_cases[] = {')

  for filename in sys.argv[1:]:
    convert_tsv(filename)

  # The all-NULL entry is always the last element of the array.
  print('  {NULL, NULL, NULL}')
  print('};')
  print('}  // namespace mozc')
  print('#endif  // MOZC_SESSION_QUALITY_MAIN_DATA_H_')


# Run the generator only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == '__main__':
  main()