Codebase list connectagram / upstream/1.1.2 tools / words.py
upstream/1.1.2

Tree @upstream/1.1.2 (Download .tar.gz)

words.py @upstream/1.1.2raw · history · blame

#!/usr/bin/env python3
#
# Copyright (C) 2013 Graeme Gott <graeme@gottcode.org>
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""words.py: Create Connectagram word list from a list of words."""


import argparse
from collections import defaultdict
import re


def replacement_map(language):
    """Return a dict of replacement strings for language."""
    if language == 'he':
        return {ord('ך'): 'כ',
                ord('ם'): 'מ',
                ord('ן'): 'נ',
                ord('ף'): 'פ',
                ord('ץ'): 'צ'}
    else:
        return {}


def skip_regex(language):
    """Return a regex that finds characters not allowed in words."""
    if language == 'en':
        return re.compile("[A-Z0-9'\-]")
    elif language == 'fr':
        return re.compile("[ÀÂÆÉÈÊËÎÏÔŒÙÛÜŸÇA-Z0-9'\-]")
    else:
        return re.compile('\d')


def main():
    parser = argparse.ArgumentParser(
            description='Create Connectagram word list from a list of words')
    parser.add_argument('FILE',
            help='file containing list of words')
    parser.add_argument('--language',
            help='specify language', type=str,
            choices=['en', 'fr', 'he', 'ro', 'ru', 'tr'])
    parser.add_argument('-o', '--out', type=str,
            help='place words in file OUT instead of default file')
    args = parser.parse_args()

    # Read words and replace letters
    words = defaultdict(list)
    with open(args.FILE, encoding='utf-8', mode='r') as f:
        replacements = replacement_map(args.language)
        skip = skip_regex(args.language)
        for word in f:
            word = word.strip()
            if len(word) < 5:
                continue
            if bool(skip.search(word)) == True:
                continue
            key = word.upper().translate(replacements)
            if not word in words[key]:
                words[key].append(word)

    # Join words into lines
    lines = []
    for solution, spellings in words.items():
        if (len(spellings) == 1) and (spellings[0] == solution.lower()):
            # Doesn't need to store proper spellings in this case;
            # Connectagram will fall back to all lowercase of solution.
            lines.append(solution + '\n')
        else:
            # Store solution and proper spellings.
            lines.append(solution + ' ' + ' '.join(spellings) + '\n')
    lines.sort()

    # Save lines to disk
    outf = 'words'
    if args.out:
        outf = args.out
    with open(outf, encoding='utf-8', mode='w') as f:
        for line in lines:
            f.write(line)
    print("Words generated and placed in file '{0}'".format(outf))


if __name__ == '__main__':
    main()