Codebase list apertium-apy / debian/0.9.1_r343-2 util.py
debian/0.9.1_r343-2

Tree @debian/0.9.1_r343-2 (Download .tar.gz)

util.py @debian/0.9.1_r343-2raw · history · blame

#!/usr/bin/env python3
# vim: set ts=4 sw=4 sts=4 et :

import sqlite3, re, os, logging
from subprocess import Popen, PIPE
from datetime import datetime

iso639Codes = {"abk":"ab","aar":"aa","afr":"af","aka":"ak","sqi":"sq","amh":"am","ara":"ar","arg":"an","hye":"hy","asm":"as","ava":"av","ave":"ae","aym":"ay","aze":"az","bam":"bm","bak":"ba","eus":"eu","bel":"be","ben":"bn","bih":"bh","bis":"bi","bos":"bs","bre":"br","bul":"bg","mya":"my","cat":"ca","cha":"ch","che":"ce","nya":"ny","zho":"zh","chv":"cv","cor":"kw","cos":"co","cre":"cr","hrv":"hr","ces":"cs","dan":"da","div":"dv","nld":"nl","dzo":"dz","eng":"en","epo":"eo","est":"et","ewe":"ee","fao":"fo","fij":"fj","fin":"fi","fra":"fr","ful":"ff","glg":"gl","kat":"ka","deu":"de","ell":"el","grn":"gn","guj":"gu","hat":"ht","hau":"ha","heb":"he","her":"hz","hin":"hi","hmo":"ho","hun":"hu","ina":"ia","ind":"id","ile":"ie","gle":"ga","ibo":"ig","ipk":"ik","ido":"io","isl":"is","ita":"it","iku":"iu","jpn":"ja","jav":"jv","kal":"kl","kan":"kn","kau":"kr","kas":"ks","kaz":"kk","khm":"km","kik":"ki","kin":"rw","kir":"ky","kom":"kv","kon":"kg","kor":"ko","kur":"ku","kua":"kj","lat":"la","ltz":"lb","lug":"lg","lim":"li","lin":"ln","lao":"lo","lit":"lt","lub":"lu","lav":"lv","glv":"gv","mkd":"mk","mlg":"mg","msa":"ms","mal":"ml","mlt":"mt","mri":"mi","mar":"mr","mah":"mh","mon":"mn","nau":"na","nav":"nv","nob":"nb","nde":"nd","nep":"ne","ndo":"ng","nno":"nn","nor":"no","iii":"ii","nbl":"nr","oci":"oc","oji":"oj","chu":"cu","orm":"om","ori":"or","oss":"os","pan":"pa","pli":"pi","fas":"fa","pol":"pl","pus":"ps","por":"pt","que":"qu","roh":"rm","run":"rn","ron":"ro","rus":"ru","san":"sa","srd":"sc","snd":"sd","sme":"se","smo":"sm","sag":"sg","srp":"sr","gla":"gd","sna":"sn","sin":"si","slk":"sk","slv":"sl","som":"so","sot":"st","azb":"az","spa":"es","sun":"su","swa":"sw","ssw":"ss","swe":"sv","tam":"ta","tel":"te","tgk":"tg","tha":"th","tir":"ti","bod":"bo","tuk":"tk","tgl":"tl","tsn":"tn","ton":"to","tur":"tr","tso":"ts","tat":"tt","twi":"tw","tah":"ty","uig":"ug","ukr":"uk","urd":"ur","uzb":"uz","ven":"ve","vie":"vi","vol":"vo","wln":"wa","cym":"cy","wol":"wo","fry":"fy","xho":"xh","yid":"yi","yor":"yo","zha":"za","zul":"zu", "hbs":"sh", "arg":"an", "pes":"fa"}
'''
    Bootstrapped from https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes using
        var out = {};
        $.each($('tr', $('table').get(1)), function(i, elem) { var rows = $('td', elem); out[$(rows[5]).text()] = $(rows[4]).text(); });
        JSON.stringify(out);
'''

# TODO: does this need a lock?
langNamesDBConn = None

def toAlpha2Code(code):
    if '_' in code:
        code, variant = code.split('_')
        return '%s_%s' % ((iso639Codes[code], variant) if code in iso639Codes else  (code, variant))
    else:
        return iso639Codes[code] if code in iso639Codes else code

def toAlpha3Code(code):
    iso639CodesInverse = {v: k for k, v in iso639Codes.items()}
    if '_' in code:
        code, variant = code.split('_')
        return '%s_%s' % ((iso639CodesInverse[code], variant) if code in iso639CodesInverse else  (code, variant))
    else:
        return iso639CodesInverse[code] if code in iso639CodesInverse else code

def getLocalizedLanguages(locale, dbPath, languages=[]):
    global langNamesDBConn
    if not langNamesDBConn:
        if os.path.exists(dbPath):
            langNameDBConn = sqlite3.connect(dbPath)
            c = langNameDBConn.cursor()
        else:
            logging.error('Failed to locate language name DB: %s' % dbPath)
            return {}


    locale = toAlpha2Code(locale)
    languages = list(set(languages))

    convertedLanguages, duplicatedLanguages = {}, {}
    for language in languages:
        if language in iso639Codes and iso639Codes[language] in languages:
            duplicatedLanguages[iso639Codes[language]] = language
            duplicatedLanguages[language] = iso639Codes[language]
        convertedLanguages[toAlpha2Code(language)] = language
    output = {}

    languageResults = c.execute('SELECT * FROM languageNames WHERE lg=?', (locale, )).fetchall()
    if languages:
        for languageResult in languageResults:
            if languageResult[2] in convertedLanguages:
                language, languageName = languageResult[2], languageResult[3]
                output[convertedLanguages[language]] = languageName
                if language in duplicatedLanguages:
                    output[language] = languageName
                    output[duplicatedLanguages[language]] = languageName
    else:
        for languageResult in languageResults:
            output[languageResult[2]] = languageResult[3]
    return output


def apertium(input, dir, mode, formatting=None):
    p1 = Popen(['echo', input], stdout=PIPE)
    print(input, dir, mode, formatting)
    if formatting:
        p2 = Popen(['apertium', '-d . -f %s' % formatting, mode], stdin=p1.stdout, stdout=PIPE, cwd=dir)
    else:
        p2 = Popen(['apertium', '-d {}'.format(dir), mode], stdin=p1.stdout, stdout=PIPE)
    p1.stdout.close()
    output = p2.communicate()[0].decode('utf-8')
    return output

def bilingualTranslate(toTranslate, dir, mode):
    p1 = Popen(['echo', toTranslate], stdout=PIPE)
    p2 = Popen(['lt-proc', '-b', mode], stdin=p1.stdout, stdout=PIPE, cwd=dir)
    p1.stdout.close()
    output = p2.communicate()[0].decode('utf-8')
    return output

def removeDotFromDeformat(query, analyses):
    """When using the txt format, a dot is added at EOF (also, double line
    breaks) if the last part of the query isn't itself a dot"""
    if not query[-1] == '.':
        return analyses[:-1]
    else:
        return analyses

def stripTags(analysis):
    if '<' in analysis:
        return analysis[:analysis.index('<')]
    else:
        return analysis

def getCoverages(text, modes, penalize=False):
    coverages = {}
    for mode, modeTuple in modes.items():
        coverages[mode] = getCoverage(text, modeTuple[0], modeTuple[1], penalize=penalize)
    return coverages

def getCoverage(text, mode, modeDir, penalize=False):
    analysis = apertium(text, mode, modeDir)
    lexicalUnits = removeDotFromDeformat(text, re.findall(r'\^([^\$]*)\$([^\^]*)', analysis))
    analyzedLexicalUnits = list(filter(lambda x: not x[0].split('/')[1][0] in '*&#', lexicalUnits))
    if len(lexicalUnits) and not penalize:
        return len(analyzedLexicalUnits) / len(lexicalUnits)
    elif len(lexicalUnits) and len(text) and penalize:
        return len(analyzedLexicalUnits) / len(lexicalUnits) - (1 - sum([len(lexicalUnit[0].split('/')[0]) for lexicalUnit in lexicalUnits]) / len(text))
    else:
        return -1

def processPerWord(analyzers, taggers, lang, modes, query):
    outputs = {}
    morph_lexicalUnits = None
    tagger_lexicalUnits = None
    lexicalUnitRE = r'\^([^\$]*)\$'

    if 'morph' in modes or 'biltrans' in modes:
        if lang in analyzers:
            modeInfo = analyzers[lang]
            analysis = apertium(query, modeInfo[0], modeInfo[1])
            morph_lexicalUnits = removeDotFromDeformat(query, re.findall(lexicalUnitRE, analysis))
            outputs['morph'] = [lexicalUnit.split('/')[1:] for lexicalUnit in morph_lexicalUnits]
            outputs['morph_inputs'] = [stripTags(lexicalUnit.split('/')[0]) for lexicalUnit in morph_lexicalUnits]
        else:
            return

    if 'tagger' in modes or 'disambig' in modes or 'translate' in modes:
        if lang in taggers:
            modeInfo = taggers[lang]
            analysis = apertium(query, modeInfo[0], modeInfo[1])
            tagger_lexicalUnits = removeDotFromDeformat(query, re.findall(lexicalUnitRE, analysis))
            outputs['tagger'] = [lexicalUnit.split('/')[1:] if '/' in lexicalUnit else lexicalUnit for lexicalUnit in tagger_lexicalUnits]
            outputs['tagger_inputs'] = [stripTags(lexicalUnit.split('/')[0]) for lexicalUnit in tagger_lexicalUnits]
        else:
            return

    if 'biltrans' in modes:
        if morph_lexicalUnits:
            outputs['biltrans'] = []
            for lexicalUnit in morph_lexicalUnits:
                splitUnit = lexicalUnit.split('/')
                forms = splitUnit[1:] if len(splitUnit) > 1 else splitUnit
                rawTranslations = bilingualTranslate(''.join(['^%s$' % form for form in forms]), modeInfo[0], lang + '.autobil.bin')
                translations = re.findall(lexicalUnitRE, rawTranslations)
                outputs['biltrans'].append(list(map(lambda x: '/'.join(x.split('/')[1:]), translations)))
                outputs['translate_inputs'] = outputs['morph_inputs']
        else:
            return

    if 'translate' in modes:
        if tagger_lexicalUnits:
            outputs['translate'] = []
            for lexicalUnit in tagger_lexicalUnits:
                splitUnit = lexicalUnit.split('/')
                forms = splitUnit[1:] if len(splitUnit) > 1 else splitUnit
                rawTranslations = bilingualTranslate(''.join(['^%s$' % form for form in forms]), modeInfo[0], lang + '.autobil.bin')
                translations = re.findall(lexicalUnitRE, rawTranslations)
                outputs['translate'].append(list(map(lambda x: '/'.join(x.split('/')[1:]), translations)))
                outputs['translate_inputs'] = outputs['tagger_inputs']
        else:
            return

    return (outputs, tagger_lexicalUnits, morph_lexicalUnits)

def getTimestamp():
    return datetime.now().strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]

def scaleMtLog(status, time, tInfo, key, length):
    logging.getLogger('scale-mt').error("%s %s %s html %s %s %s %s %s %s",
                                        getTimestamp(),
                                        timedeltaToMilliseconds(time),
                                        tInfo.langpair,
                                        key,
                                        tInfo.ip,
                                        tInfo.referer,
                                        status,
                                        length,
                                        'null'
    )


class TranslationInfo:
    def __init__(self, handler):
        self.langpair = handler.get_argument('langpair')
        self.key = handler.get_argument('key', default = 'null')
        self.ip = handler.request.headers.get('X-Real-IP', handler.request.remote_ip)
        self.referer = handler.request.headers.get('Referer', 'null')