Skip to content
Snippets Groups Projects
Select Git revision
  • 755164cd25393161d85837d1ba51f7be3dd43757
  • main default protected
  • ud_training_script
  • fix_seed
  • merged-with-ner
  • multiword_fix_transformer
  • transformer_encoder
  • combo3
  • save_deprel_matrix_to_npz
  • master protected
  • combo-lambo
  • lambo-sent-attributes
  • adding_lambo
  • develop
  • update_allenlp2
  • develop_tmp
  • tokens_truncation
  • LR_test
  • eud_iwpt
  • iob
  • eud_iwpt_shared_task_bert_finetuning
  • 3.3.1
  • list
  • 3.2.1
  • 3.0.3
  • 3.0.1
  • 3.0.0
  • v1.0.6
  • v1.0.5
  • v1.0.4
  • v1.0.3
  • v1.0.2
  • v1.0.1
  • v1.0.0
34 results

example.conllu

Blame
  • ruljos2wccl.py 3.08 KiB
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    import os, sys, codecs, re
    from optparse import OptionParser
    from StringIO import StringIO
    
    # Usage/help text passed to OptionParser(usage=...) in go(); optparse
    # replaces %prog with the name of the running script.
    descr = """%prog [options] IN OUT
    
    Attempts to convert JOSKIPI rules to WCCL rules.
    NOTE: this is based on very naive heuristics.
    """
    
    # TODO agr bits
    # TODO isbig -> regex
    
    # Patterns for the JOSKIPI constructs rewritten by jos2ccl(). Every backslash
    # is doubled so that no literal depends on Python passing an unrecognised
    # string escape (such as "\$" or "\(") through unchanged.
    # Brace-delimited list of double-quoted strings: {"a", "b"}
    p_strset = re.compile(u'{(\\s*"[^"]*"(\\s*,\\s*"[^"]*")*\\s*)}', re.U)
    # "$-<digits><Name>": reference with a negative numeric offset
    p_negposref = re.compile(u'\\$\\-([0-9]+)([A-Z][A-Za-z0-9]*)')
    # "$<digits><Name>" or "$+<digits><Name>": reference with a positive offset
    p_posposref = re.compile(u'\\$[\\+]?([0-9]+)([A-Z][A-Za-z0-9]*)')
    # isbig(<arg>) and hasnum(<arg>) calls; <arg> may not contain ')'
    p_isbig = re.compile(u'isbig\\(([^)]*)\\)', re.U)
    p_hasnum = re.compile(u'hasnum\\(([^)]*)\\)', re.U)
    # agr*/wagr* call with four arguments: (int, int, {...}, int)
    p_agr = re.compile(u'(w?agr[a-z]*)\\s*\\(\\s*([\\-0-9]+)\\s*,\\s*([\\-0-9]+)\\s*,\\s*({[^}]*})\\s*,\\s*([\\-0-9]+)\\s*\\)', re.U)
    
    def jos2ccl(what):
    	"""Rewrite one fragment of JOSKIPI rule text and return the result.
    
    	The substitutions are purely textual and are applied in this order:
    
    	* '{none}' becomes '{}' and 'flex[' becomes 'class['
    	* '$-1Name' becomes '$Name - 1'; '$1Name' / '$+1Name' become '$Name + 1'
    	* a braced list of quoted strings, '{"a", "b"}', becomes '["a", "b"]'
    	* 'isbig(x)' becomes 'regex(x, "\\\\p{Lu}.*")' (two backslashes emitted)
    	* 'hasnum(x)' becomes 'regex(x, ".*[0-9].*")'
    	* a four-argument agr-style call loses its last argument
    
    	what -- the text to convert
    	"""
    	what = what.replace('{none}', '{}')
    	what = what.replace('flex[', 'class[')
    	# Negative offsets go first: the positive pattern requires a digit (or
    	# '+') right after '$', so it cannot re-match the rewritten '$Name - n'.
    	what = p_negposref.sub(u'$\\2 - \\1', what)
    	what = p_posposref.sub(u'$\\2 + \\1', what)
    	what = p_strset.sub(u'[\\1]', what)
    	# Each backslash wanted in the output is escaped in the template. The
    	# former template ended in the unknown escape "\p", which re.sub() only
    	# passed through on old interpreters and rejects ("bad escape") on
    	# Python 3.7+. The emitted text is unchanged: two backslashes, then p{Lu}.
    	what = p_isbig.sub(u'regex(\\1, "\\\\\\\\p{Lu}.*")', what)
    	what = p_hasnum.sub(u'regex(\\1, ".*[0-9].*")', what)
    	what = p_agr.sub(u'\\1(\\2, \\3, \\4)', what)
    	return what
    	
    class IndentWriter:
    	"""Wraps a writable object and re-indents text by parenthesis depth.
    
    	Each non-blank line is stripped and written with one leading tab per
    	level of the running depth; the depth changes by the number of '('
    	minus the number of ')' found on the line before any '//'.
    	"""
    
    	def __init__(self, out, baseindent = 0):
    		# out: object providing write(); baseindent: initial depth.
    		self.out = out
    		self.indent = baseindent
    
    	def write(self, text):
    		"""Split text on newlines and write every piece, re-indented.
    
    		Blank pieces (including the empty one after a trailing newline)
    		are written as a bare newline.
    		"""
    		emit = self.out.write
    		for raw in text.split('\n'):
    			stripped = raw.strip()
    			if not stripped:
    				emit('\n')
    				continue
    			# Only the part before a '//' takes part in bracket counting.
    			code = stripped.split('//', 1)[0]
    			delta = code.count('(') - code.count(')')
    			depth = self.indent
    			if code.startswith(')'):
    				# A line opening with ')' is placed at the depth it leaves behind.
    				depth += delta
    			emit('\t' * depth)
    			emit(stripped)
    			emit('\n')
    			self.indent += delta
    
    class Rule:
    	"""One rule parsed from text shaped like '<action> # <name> :- <condition>'."""
    
    	def __init__(self, text):
    		# Exactly one ':-' and exactly one '#' (before the ':-') are expected;
    		# any other count makes the tuple unpacking raise ValueError.
    		head, body = text.split(':-')
    		action, label = head.split('#')
    		self.cond = jos2ccl(body.strip())
    		self.act = jos2ccl(action.strip())
    		self.name = label.strip()
    
    	def write(self, out, comma = False):
    		"""Write the rule to out as rule("name", cond, act).
    
    		out   -- object with a write() method
    		comma -- when true, a ',' follows the closing parenthesis
    		"""
    		suffix = ',' if comma else ''
    		pieces = (
    			'rule("%s",\n' % self.name,
    			'%s,\n' % self.cond,
    			'%s\n' % self.act,
    			')%s\n' % suffix,
    		)
    		# One write() call per piece, in order.
    		for piece in pieces:
    			out.write(piece)
    
    def rule_texts(infile):
    	"""Yield the lines of infile grouped into chunks of joined text.
    
    	A new chunk starts at every line containing 'delete'; the text gathered
    	before that line is yielded first (so the first chunk may be empty).
    	The final chunk is always yielded.
    	"""
    	pending = []
    	for line in infile:
    		if 'delete' in line: # TODO: regex with other actions
    			yield ''.join(pending)
    			pending = []
    		pending.append(line)
    	yield ''.join(pending)
    
    def rules(infile):
    	"""Yield a Rule for every chunk from rule_texts(infile) that is not blank.
    
    	On each line, everything from the first '//' onwards is dropped before
    	the chunk is handed to Rule.
    	"""
    	for chunk in rule_texts(infile):
    		kept = []
    		for ln in chunk.split('\n'):
    			kept.append(ln.split('//', 1)[0])
    		body = '\n'.join(kept)
    		if not body.strip():
    			# Nothing left once the comments are removed.
    			continue
    		yield Rule(body)
    
    def go():
    	"""Command-line entry point: convert the rules in IN and write them to OUT.
    
    	Expects exactly two positional arguments (input and output path);
    	otherwise prints a hint and exits with status 1. Both files are treated
    	as UTF-8. The output is the converted rules wrapped in 'rules(' ... ')',
    	comma-separated, written through an IndentWriter.
    	"""
    	parser = OptionParser(usage=descr)
    	parser.add_option('-v', '--verbose', action='store_true', dest='verbose', default=False, help='set the verbose mode')
    	
    	(options, args) = parser.parse_args()
    	
    	if len(args) != 2:
    		# Single-argument parenthesised form prints the same text under
    		# both the print statement and the print function.
    		print('You need to provide input and output paths')
    		print('See %s --help' % os.path.basename(sys.argv[0]))
    		sys.exit(1)
    	
    	inpath, outpath = args
    	
    	# Parse the whole input before the output file is opened (and thereby
    	# truncated): a malformed rule then leaves an existing output untouched,
    	# and using the same path for IN and OUT no longer wipes the input
    	# before it has been read.
    	inf = codecs.open(inpath, 'rb', 'utf-8')
    	try:
    		allrules = list(rules(inf))
    	finally:
    		inf.close()
    	
    	outf = codecs.open(outpath, 'wb', 'utf-8')
    	try:
    		w = IndentWriter(outf)
    		w.write('rules(\n')
    		# Every rule but the last is followed by a comma. An input without
    		# any rules yields an empty 'rules(' ... ')' wrapper rather than
    		# an IndexError.
    		lastidx = len(allrules) - 1
    		for idx, rule in enumerate(allrules):
    			rule.write(w, idx != lastidx)
    		w.write(')\n')
    	finally:
    		outf.close()
    
    # Run the converter only when this file is executed as a script.
    if __name__ == '__main__':
    	go()