Codebase list mozc / debian/0.12.410.102-1 dictionary / gen_zip_code_seed.py
debian/0.12.410.102-1

Tree @debian/0.12.410.102-1 (Download .tar.gz)

gen_zip_code_seed.py @debian/0.12.410.102-1raw · history · blame

# -*- coding: utf-8 -*-
# Copyright 2010, Google Inc.
# All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are
# met:
#
#     * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
#     * Redistributions in binary form must reproduce the above
# copyright notice, this list of conditions and the following disclaimer
# in the documentation and/or other materials provided with the
# distribution.
#     * Neither the name of Google Inc. nor the names of its
# contributors may be used to endorse or promote products derived from
# this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

"""
 The tool for generating zip code dictionary.
 Input files are shift-jis csv.
 Output lines will be printed as utf-8.

 usage:
 ./gen_zip_code_seed.py --zip_code=zip_code.csv --jigyosyo=jigyosyo.csv > zip_code_seed.tsv

 Zip code sample input line (abbreviated: ReadZipEntry also reads column
 index 12, which lies beyond the columns shown here):
 01101,"060  ","0600007","ホッカイドウ","サッポロシチュウオウク","キタ7ジョウニシ","北海道","札幌市中央区","北七条西",0

 Jigyosyo zip code sample input line:
 01101,"サツポロシチユウオウクヤクシヨ","札幌市中央区役所","北海道","札幌市中央区","南三条西","11丁目","0608612","060  ","札幌",0,0,0
"""

# Module metadata: user name of the script's author.
__author__ = "toshiyuki"

import codecs
import optparse
import re
import sys
import unicodedata

class ZipEntry(object):
  """A zip code together with the address text it stands for.

  Attributes:
    allow_multiple: the value given to the constructor, stored unchanged.
      main() uses it to decide whether a repeated zip code is printed again.
    zip_code: the zip code in XXX-XXXX format.
    address: level1, level2 and level3 concatenated, followed by a space and
      level4 when level4 is not empty, normalized with NFKC.
  """

  def __init__(self, zip_code, level1, level2, level3, level4, allow_multiple):
    """Builds the formatted zip code and the normalized address.

    Args:
      zip_code: zip code digits without a hyphen, e.g. u'0600007'.
      level1: first address part (u'北海道' in the sample input line).
      level2: second address part (u'札幌市中央区' in the sample input line).
      level3: third address part. It is dropped when it is the "not listed
        below" placeholder or lists several areas, and anything from the
        first opening parenthesis on is removed.
      level4: business name, or an empty string when there is none.
      allow_multiple: stored as is on the entry.
    """
    self.allow_multiple = allow_multiple

    # XXX-XXXX format
    self.zip_code = '-'.join([zip_code[0:3], zip_code[3:]])

    # When a postal code corresponds to multiple area, we don't use individual
    # area name.
    if u'以下に掲載がない場合' in level3 or u'、' in level3:
      level3 = u''

    # We ignore additional information here: everything from the first
    # opening parenthesis on. The text has not been width-normalized yet, so
    # both the ASCII '(' and the full-width U+FF08 form are matched. A bare
    # u'(.*' pattern is not a valid regular expression and raises re.error.
    level3 = re.sub(u'[(\uff08].*', u'', level3)

    # Normalize business name: ideographic spaces (U+3000) become plain
    # spaces.
    level4 = level4.replace(u'\u3000', u' ')

    address = u''.join([level1, level2, level3])
    if level4:
      address = u' '.join([address, level4])

    # Normalize character width.
    self.address = unicodedata.normalize('NFKC', address)


def TrimColumnContent(column):
  """Returns column with every leading and trailing '"' removed."""
  quote = '"'
  return column.strip(quote)


def GetColumns(line):
  """Returns columns contents in list.

  The line is stripped of surrounding whitespace and split at every comma;
  each column then loses its leading and trailing '"' characters. Note that
  the split is purely textual, so a comma inside a quoted column also
  separates columns.

  Args:
    line: one line of a csv file.

  Returns:
    A list with one string per column.
  """
  # Build a real list: callers index into the result, and map() only yields
  # an indexable list on Python 2.
  return [column.strip('"') for column in line.strip().split(',')]


def ReadZipEntry(line):
  """Builds a ZipEntry from one line of the zip code csv.

  Column 2 is used as the zip code and columns 6, 7 and 8 as the three
  address levels. No business name is set.

  Args:
    line: one line of the zip code csv.

  Returns:
    A ZipEntry whose allow_multiple is True only when column 12 is '1'.
  """
  fields = GetColumns(line)
  zip_code = fields[2]
  level1, level2, level3 = fields[6], fields[7], fields[8]
  allow_multiple = fields[12] == '1'
  return ZipEntry(zip_code, level1, level2, level3, '', allow_multiple)


def ReadJigyosyoEntry(line):
  """Builds a ZipEntry from one line of the jigyosyo zip code csv.

  Column 7 is used as the zip code, columns 3, 4 and 5 as the three address
  levels and column 2 as the business name.

  Args:
    line: one line of the jigyosyo zip code csv.

  Returns:
    A ZipEntry whose allow_multiple is always False.
  """
  fields = GetColumns(line)
  zip_code = fields[7]
  level1, level2, level3 = fields[3], fields[4], fields[5]
  business_name = fields[2]
  return ZipEntry(zip_code, level1, level2, level3, business_name, False)


def PrintEntry(entry):
  """Print zip code entry.

  Writes one utf-8 encoded, newline-terminated line to standard output with
  three tab-separated columns: the zip code, the address and the constant
  '0'.

  Args:
    entry: object providing zip_code and address attributes, such as a
      ZipEntry.
  """
  line = u'\t'.join([entry.zip_code, entry.address, u'0']) + u'\n'
  # Write the encoded bytes to the binary layer of stdout where one exists
  # (Python 3); on Python 2 sys.stdout itself accepts the encoded string.
  # The bytes are the same as those of a Python 2 print statement.
  stream = getattr(sys.stdout, 'buffer', sys.stdout)
  stream.write(line.encode('utf-8'))


def ParseOptions():
  """Parse command line options.

  Returns:
    The options object produced by optparse. options.zip_code and
    options.jigyosyo hold the csv file paths given on the command line, or
    an empty string when the corresponding flag is absent.
  """
  parser = optparse.OptionParser(usage='Usage: %prog [options]')
  parser.add_option('--zip_code', dest='zip_code',
                    action='store', default='',
                    help='specify zip code csv file path.')
  # The help text used to repeat the --zip_code description.
  parser.add_option('--jigyosyo', dest='jigyosyo',
                    action='store', default='',
                    help='specify jigyosyo zip code csv file path.')
  (options, unused_args) = parser.parse_args()
  return options


def main():
  """Print the zip code dictionary seed for the csv files given as options.

  Writes a header comment line, then one line per printed entry of the
  --zip_code file followed by one line per entry of the --jigyosyo file.
  Both files are decoded as shift_jis with undecodable bytes replaced, and
  blank lines are skipped.

  Returns:
    0, which the script passes to sys.exit().
  """
  options = ParseOptions()
  header = '# zip code dictionary seed generated by %s\n' % __file__
  # Write the encoded bytes to the binary layer of stdout where one exists
  # (Python 3); on Python 2 sys.stdout itself accepts the encoded string.
  # The bytes are the same as those of a Python 2 print statement.
  stdout = getattr(sys.stdout, 'buffer', sys.stdout)
  stdout.write(header.encode('utf-8'))

  seen = set()
  if options.zip_code:
    # The with block closes the input file even when a line fails to parse.
    with codecs.open(options.zip_code, 'r', 'shift_jis',
                     errors='replace') as zip_code_file:
      for line in zip_code_file:
        if not line.strip():
          # A blank line has no columns to read.
          continue
        entry = ReadZipEntry(line)
        if entry.zip_code in seen and not entry.allow_multiple:
          # Single entry may be recorded as multiple lines.
          # Here, simply discard them.

          # TODO(toshiyuki): Support multiple address entry.
          # for example,
          # 440-0032, "愛知県","豊橋市","岩田町居村、北郷中"
          # should be
          # 440-0032, "愛知県豊橋市岩田町居村" and
          # 440-0032, "愛知県豊橋市北郷中".
          continue
        PrintEntry(entry)
        seen.add(entry.zip_code)
  if options.jigyosyo:
    with codecs.open(options.jigyosyo, 'r', 'shift_jis',
                     errors='replace') as jigyosyo_file:
      for line in jigyosyo_file:
        if not line.strip():
          continue
        # Jigyosyo entries are printed without duplicate filtering.
        PrintEntry(ReadJigyosyoEntry(line))
  return 0


# Run main() only when this file is executed as a script, and use its return
# value as the process exit status.
if __name__ == '__main__':
  sys.exit(main())