from itertools import product
import re

from importer.Phrase import *

from .morph_generation import MorphologyError, select_form

# Position of a dependent relative to its head: before (PRE) or after (POST).
PRE, POST = 0, 1


def _default_order(head_type, dep_type):
    """Return PRE, POST or None for a head/dependent phrase type pair.

    None is returned when no rule below covers the combination.  When a
    dependent type is listed both as preceding and as following, the
    following (POST) position wins, because it is checked last.
    """
    position = None
    if head_type == NP:
        preceding = (AdjP, LexAdjP, LexPPasP, LexPActP, PossP, LexQub, Fixed,)
        # LexAdvP: nic więcej
        following = (NP, LexNP, PrepNP, ComPrepNP, LexPrepNP, LexPrepGerP,
                     CP, LexCP, NCP, XP, LexXP, LexAdvP, OR,)
        if dep_type in preceding:
            position = PRE
        if dep_type in following:
            position = POST
    if head_type == NumP:
        preceding = (AdjP, LexAdjP, PossP)
        # XP: w pół drogi ‹dokądś›
        # NP: na dwóch biegunach ‹kogoś/czegoś›
        following = (NP, XP,)
        if dep_type in preceding:
            position = PRE
        if dep_type in following:
            position = POST
    if head_type == AdjP:
        preceding = (AdvP, LexAdvP, AdjP, LexAdjP, LexQub,)
        # NP: pełny czegoś
        # Fixed: samo przez się
        following = (NP, LexNP, PrepNP, LexPrepNP, XP, LexXP, Compar,
                     LexCompar, Fixed)
        if dep_type in preceding:
            position = PRE
        if dep_type in following:
            position = POST
    if head_type == AdvP:
        preceding = (XP, AdvP,)
        # LexNP: dalej własnego nosa
        # LexPrepNP: prosto w oczy
        # LexCP: tak, że...
        following = (LexCompar, NP, LexNP, PrepNP, LexPrepNP, LexCP,)
        if dep_type in preceding:
            position = PRE
        if dep_type in following:
            position = POST
    if head_type == InfP:
        # every dependent follows an InfP head
        position = POST
    if head_type == Qub:
        if dep_type in (LexQub,):
            position = PRE
    return position


def build_phrase(head, dep, head_type, dep_type, order_override=None):
    """Join a head and its dependent into one space-separated string.

    The dependent is placed before the head for PRE order and after it
    for POST order.  The order comes from `order_override` when given
    ('pre' selects PRE, any other non-None value selects POST), otherwise
    from the head/dependent phrase types.

    Raises RuntimeError when no order can be determined for the types.
    """
    if order_override is None:
        position = _default_order(head_type, dep_type)
    elif order_override == 'pre':
        position = PRE
    else:
        position = POST

    if position == PRE:
        return '{} {}'.format(dep, head)
    if position == POST:
        return '{} {}'.format(head, dep)
    raise RuntimeError('couldn’t build phrase: {} {} {} {}'.format(head, dep, head_type, dep_type))


def correct_lemma(lemma):
    """Strip surrounding apostrophes and map 'bliźnięta' to 'bliźnię'."""
    # TODO see notes
    bare = lemma.strip('\'')
    return 'bliźnię' if bare == 'bliźnięta' else bare


# Numerals written with digits -> their lemmas.
NUM_LEMMA = {
    '2': 'dwa',
    '3': 'trzy',
    '5': 'pięć',
}


def correct_num_lemma(lemma):
    """Return the spelled-out lemma for a digit numeral, else `lemma` itself."""
    if lemma in NUM_LEMMA:
        return NUM_LEMMA[lemma]
    return lemma


def correct_pos(lemma, pos):
    """Return the part-of-speech tag for `lemma`, overriding `pos` for a few lemmas."""
    if lemma == 'siebie':
        result = 'siebie'
    elif lemma in ('ja', 'ty', 'my', 'wy'):
        result = 'ppron12'
    elif lemma == 'on':
        result = 'ppron3'
    elif lemma == 'oba':
        result = 'num'
    elif lemma == 'jeden':
        result = 'adj'
    else:
        result = pos
    return result


def correct_num(lemma, num):
    """Return the number value (or list of alternatives) to use for `lemma`.

    Lemma-specific overrides are checked first; otherwise 'agr' becomes
    'sg' and the wildcard '_' becomes the list ['sg', 'pl'].
    """
    if lemma == 'siebie':
        return ''
    if num == '_' and lemma in ('ja', 'ty'):
        return 'sg'
    if lemma in ('oba', 'plecy', 'usta',):
        return 'pl'
    if lemma in ('pół', 'półtora'):
        return 'sg'
    # TODO (?)
    if num == 'agr':
        return 'sg'
    # TODO _ -> sg or _ -> sg and pl?
    if num == '_':
        return ['sg', 'pl']
    return num


def correct_gend(gend):
    """Replace the 'agr' gender with 'm1'; pass any other value through."""
    return 'm1' if gend == 'agr' else gend


# TODO is the mapping for no function correct?
# TODO the mapping should be more complex, e.g.
# most lex(np)s should be in acc (dać kosza etc.),
# but adjps seem to need nom: chrzest bojowy

# Realisation of the 'str' case: syntactic function -> negativity -> case.
STR_CASE = {
    'subj': {'_': 'nom', 'aff': 'nom', 'neg': 'nom'},
    'obj': {'_': 'acc', 'aff': 'acc', 'neg': 'gen'},
    None: {'_': 'acc', 'aff': 'acc', 'neg': 'gen'},
}

# Realisation of the 'agr' case: syntactic function -> case.
AGR_CASE = {
    'subj': 'nom',
    'obj': 'acc',
    'head': 'nom',
    None: 'nom',
}

# Realisation of the 'part' case: negativity -> case.
# NOTE: this must be a plain dict; a trailing comma after the closing brace
# would turn it into a 1-tuple and break the lookup in correct_case().
PART_CASE = {'_': 'acc', 'aff': 'acc', 'neg': 'gen'}

# Realisation of the 'pred' case: syntactic function -> negativity -> case.
PRED_CASE = {
    'subj': {'_': 'nom', 'aff': 'nom', 'neg': 'nom'},
    'obj': {'_': 'acc', 'aff': 'acc', 'neg': 'gen'},
    None: {'_': 'inst', 'aff': 'inst', 'neg': 'inst'},
}


def correct_case(case, function, negativity='_'):
    """Resolve an abstract case value to a concrete one.

    'str', 'agr', 'part' and 'pred' are looked up in the tables above
    using `function` and/or `negativity`; 'postp' maps to 'dat'; any other
    value is returned unchanged.

    Raises KeyError when `function` or `negativity` is missing from the
    table selected by `case`.
    """
    if case == 'str':
        return STR_CASE[function][negativity]
    if case == 'agr':
        return AGR_CASE[function]
    if case == 'part':
        return PART_CASE[negativity]
    if case == 'pred':
        return PRED_CASE[function][negativity]
    if case == 'postp':
        return 'dat'
    return case


def correct_deg(deg):
    """Return the degree value, or a list of acceptable alternatives."""
    # positive degree = positive or no degree at all
    if deg == 'pos':
        return [deg, '']
    if deg == '_':
        return ['pos', 'com', 'sup', '']
    return deg


def correct_congr(lemma):
    """Return the accommodability value(s) to try for a numeral lemma."""
    if lemma in ('pół', 'półtora'):
        return 'rec'
    # heuristic: if both congr and rec forms available, prefer congr
    # no congr/rec also possible
    return ['congr', 'rec', '']


def correct_aff(aff):
    """Expand the wildcard '_' to both 'aff' and 'neg'; pass other values through."""
    if aff == '_':
        return ['aff', 'neg']
    return aff


# Negativity value -> text prefix (including the trailing space).
NEG = {'_': '(nie) ', 'aff': '', 'neg': 'nie '}


def correct_neg(neg):
    """Return the text prefix for a negativity value; KeyError if unknown."""
    return NEG[neg]


# Reflexive marker value -> text prefix (including the trailing space).
SIE = {'': '', 'się': 'się '}


def correct_sie(sie):
    """Return the text prefix for a reflexive marker value; KeyError if unknown."""
    return SIE[sie]


def correct_feats(lemma, feats, praep=False):
    """Return `feats` extended with extra features required by some lemmas.

    A new list is returned whenever features are appended; for all other
    lemmas the `feats` object itself is returned.  `praep` is only
    consulted for the lemma 'on'.
    """
    if lemma == 'on':
        return feats + ['m1', 'nakc', 'praep' if praep else 'npraep']
    if lemma in ('ja', 'ty',):
        # mi, ci, cię
        # Membership is tested with == rather than a set intersection so
        # that unhashable entries in feats (nested lists of alternatives)
        # do not raise TypeError.
        has_acc_or_gen = any(feat in ('acc', 'gen') for feat in feats)
        if 'dat' in feats or (has_acc_or_gen and lemma == 'ty'):
            akc = 'nakc'
        else:
            akc = 'akc'
        return feats + ['m1', [akc, '']]
    if lemma in ('my', 'wy'):
        return feats + ['m1']
    if lemma == 'oba':
        return feats + ['congr', 'ncol']
    return feats


def get_subst_attrs(lemma, tag):
    """Split a colon-separated tag into a dict of its attributes.

    For 'siebie' only field 1 is read, as 'case'; otherwise fields 1-3
    are read as 'num', 'case' and 'gend'.  Raises IndexError when the tag
    has too few fields.
    """
    feats = tag.split(':')
    if lemma == 'siebie':
        return {'case': feats[1]}
    return {'num': feats[1], 'case': feats[2], 'gend': feats[3]}


def get_gender(lemma):
    """Return the gender features of `lemma` taken from its subst:sg:nom form.

    The result is a two-element list: the gender value and a list of
    acceptable col/ncol alternatives.
    """
    form = get_form(lemma, ['subst', 'sg', 'nom'])
    # 1 or 2 values: ['f'], ['n', 'ncol'], ...
    gend = form[1].split(':')[3:]
    if len(gend) == 2:
        # no col/ncol variant for jeden, wiele etc.
        gend[1] = [gend[1], '']
    else:
        # choose ncol for e.g. czterech/czworo m1
        gend = [gend[0], ['ncol', '']]
    return gend


def get_form(lemma, feats):
    """Select a single form of `lemma` matching `feats` via select_form().

    A lemma starting with 'E(' is not looked up: an empty form with the
    tag 'subst:pl:nom:<lemma with the characters E, ( and ) stripped from
    both ends>' is returned instead.  Callable entries of `feats` are
    replaced by the result of calling them with `lemma`.
    """
    if lemma.startswith('E('):
        return ('', 'subst:pl:nom:{}'.format(lemma.strip('E()')))
    lemma_feats = [f(lemma) if callable(f) else f for f in feats]
    return select_form(lemma, lemma_feats)


def get_forms(lemma, feats):
    """Return every form of `lemma` found for some combination of `feats`.

    Each entry of `feats` may be a single string, an iterable of
    alternatives, or a callable taking the lemma and returning either of
    those.  All combinations of alternatives are tried with select_form();
    combinations raising MorphologyError are skipped.

    Raises MorphologyError (with all collected messages) when no
    combination yields a form.
    """
    resolved = [f(lemma) if callable(f) else f for f in feats]
    # wrap single values so that product() treats them as one alternative
    alternatives = [[f] if isinstance(f, str) else f for f in resolved]
    ret = []
    errors = []
    for combination in product(*alternatives):
        try:
            ret.append(select_form(lemma, combination))
        except MorphologyError as e:
            errors.append(str(e))
    if ret:
        return ret
    raise MorphologyError('couldn’t select form: {}'.format(' + '.join(errors)))


# Preposition -> regex patterns; when the (lower-cased) following text
# matches one of them, the preposition takes the extra 'e' (z -> ze, ...).
WOK_PREP = {
    'bez': ('^mn',),  # beze mnie
    'nad': (
        '^mn',
        '^wszystko',  # nade wszystko, but: nad wszystkim
    ),
    'od': ('^mn',),
    'pod': ('^mn',),
    'przed': (
        '^mn',
        '^wszystkim$',  # przede wszystkim, but: przed wszystkimi
    ),
    'przez': ('^mn',),
    'w': (
        '^dwoje',  # but: w dwojaki
        '^dwój',
        '^fr',
        '^mgl',
        '^mnie$',  # but: w mniejszych
        '^wc',
        '^wn',
        '^wp',
        '^wr',
        '^ws',
        '^wt',
        '^wz',
        '^wł',
        '^znaki$',
        '^śnie',
    ),
    'z': (
        '^mnie$',
        '^mną$',
        '^sobą$',
        '^sc',
        '^sf',
        '^sk',
        '^sm',
        '^sn',
        '^sp',
        '^st',
        '^sw',
        '^szc',
        '^szk',
        '^szp',
        '^szt',
        '^szw',
        '^sł',
        '^wsc',
        '^wsi',
        '^wsk',
        '^wsp',
        '^wst',
        '^wszec',
        '^wszystk',
        '^wz',
        '^zb',
        '^zd',
        '^zg',
        '^zj',
        '^zm',
        '^zn',
        '^zr',
        '^zw',
        '^zł',
        '^łz',
        '^ś',
        '^ź',
    ),
}


def combine_with_prep(prep, rest):
    """Prefix `rest` with `prep`, using the 'e'-extended variant when needed.

    The extended variant (`prep` + 'e') is used when `prep` has patterns
    in WOK_PREP and the lower-cased `rest` matches one of them; `rest`
    itself is emitted with its original casing.
    """
    patterns = WOK_PREP.get(prep)
    if patterns:
        # lower-case once instead of once per pattern
        lowered = rest.lower()
        if any(re.match(pattern, lowered) for pattern in patterns):
            return '{}e {}'.format(prep, rest)
    return '{} {}'.format(prep, rest)