From 6b9ad465e6014a5bbdc2789b3bfb048c271cd6d0 Mon Sep 17 00:00:00 2001
From: Adam Radziszewski <adam.radziszewski@pwr.wroc.pl>
Date: Tue, 8 Jan 2013 13:47:33 +0100
Subject: [PATCH] util to extract morphological dictionary from a corpus

---
 utils/get_morpho.py | 103 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 103 insertions(+)
 create mode 100755 utils/get_morpho.py

diff --git a/utils/get_morpho.py b/utils/get_morpho.py
new file mode 100755
index 0000000..ec724b0
--- /dev/null
+++ b/utils/get_morpho.py
@@ -0,0 +1,103 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# Copyright (C) 2013 Adam Radziszewski.
+# This program is free software; you can redistribute and/or modify it
+# under the terms of the GNU Lesser General Public License as published by the Free
+# Software Foundation; either version 3 of the License, or (at your option)
+# any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+# or FITNESS FOR A PARTICULAR PURPOSE.
+#
+# See the LICENCE and COPYING files for more details
+
+descr = """%prog [options] CORPUS OUTDATAFILE
+
+Gathers morphological data from the given corpus.
+
+The extracted morphological data will be saved to OUTDATAFILE in tab-delimited
+format (compliant with Maca analysers).
+NOTE: the morphological data will not be compacted. To do so, you may use
+tabclean.py script from Maca repository.
+"""
+
+import sys, codecs
+from optparse import OptionParser
+import corpus2
+
+class Analyser:
+	"""Morphological dictionary resulting from reading a corpus.
+	Provides consume function that updates the data with the given token."""
+	def __init__(self, tagset, case_sensitive):
+		self.tagset = tagset
+		self.unk_tag = self.tagset.make_ign_tag()
+		self.get_form = (lambda form: form) if case_sensitive else (lambda form: form.lower())
+		self.data = {}
+	
+	def _add_one(self, ready_form, lex):
+		"""Adds tag and lemma from the given lexeme."""
+		if ready_form not in self.data:
+			self.data[ready_form] = set()
+		self.data[ready_form].add(
+			(unicode(lex.lemma()), self.tagset.tag_to_string(lex.tag()))
+		)
+	
+	def consume(self, tok):
+		ready_form = self.get_form(unicode(tok.orth()))
+		for lex in tok.lexemes():
+			if lex.tag() != self.unk_tag:
+				self._add_one(ready_form, lex)
+	
+	def write(self, output):
+		for form in sorted(self.data):
+			entries = sorted(self.data[form])
+			for lemma, tag in entries:
+				output.write(u'%s\t%s\t%s\n' % (form, lemma, tag))
+	
+	def save(self, fname):
+		output = codecs.open(fname, 'wb', 'utf-8')
+		self.write(output)
+		output.close()
+
+def get_morpho(options, corpname, outfname):
+	tagset = corpus2.get_named_tagset(options.tagset)
+	anal = Analyser(tagset, options.case_sens)
+	rdr = corpus2.TokenReader.create_path_reader(options.input_format, tagset, corpname)
+	while True:
+		tok = rdr.get_next_token()
+		if not tok:
+			break
+		anal.consume(tok)
+	del rdr
+	anal.save(outfname)
+
+def go():
+	parser = OptionParser(usage=descr)
+	parser.add_option('-i', '--input-format', type='string', action='store',
+		dest='input_format', default='xces',
+		help='set the input format; default: xces')
+	parser.add_option('-t', '--tagset', type='string', action='store',
+		dest='tagset', default='nkjp',
+		help='set the tagset used in input; default: nkjp')
+	parser.add_option('-s', '--sep', type='string', action='store',
+		dest='sep', default='\t',
+		help='set the separator used in morpho file; default: tab character')
+	parser.add_option('-c', '--case-sensitive', action='store_true', default=False, dest='case_sens')
+	parser.add_option('-q', '--quiet', action='store_false', default=True, dest='verbose')
+	(options, args) = parser.parse_args()
+	
+	if len(args) != 2:
+		print 'You need to provide an input corpus and output path'
+		print
+		parser.print_help()
+		sys.exit(1)
+	
+	corpname = args[0]
+	outfname = args[1]
+	
+	get_morpho(options, corpname, outfname)
+
+if __name__ == '__main__':
+	go()
-- 
GitLab