Skip to content
Snippets Groups Projects
Commit c5771637 authored by Adam Radziszewski
Browse files

corpus-merge: switch to prefix chunk (=par) ids with file name

parent d998c411
Branches
No related merge requests found
#!/usr/bin/python #!/usr/bin/python
# -*- coding: utf-8 -*- # -*- coding: utf-8 -*-
import sys import sys, os
from optparse import OptionParser from optparse import OptionParser
from collections import defaultdict as dd from collections import defaultdict as dd
from itertools import repeat, izip from itertools import repeat, izip
...@@ -45,6 +45,9 @@ def go(): ...@@ -45,6 +45,9 @@ def go():
parser.add_option('-C', '--chunks', action='store_true', parser.add_option('-C', '--chunks', action='store_true',
dest='chunks', default=False, dest='chunks', default=False,
help='Process chunks (select chunks/sentences, not tokens)') help='Process chunks (select chunks/sentences, not tokens)')
parser.add_option('--prefix-chunks', action='store_true',
dest='prefix_chunks', default=False,
help='Prefix chunk ids with filename (file:NAME:ORIGID)')
parser.add_option('-v', '--verbose', action='store_true', parser.add_option('-v', '--verbose', action='store_true',
dest='verbose', default=False, dest='verbose', default=False,
help='verbose mode') help='verbose mode')
...@@ -68,11 +71,23 @@ def go(): ...@@ -68,11 +71,23 @@ def go():
for arg in args: for arg in args:
reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, arg) reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, arg)
if options.chunks: if options.chunks:
fname, _ = os.path.splitext(os.path.basename(arg))
chunk_no = 1
for chunk in chunks(reader): for chunk in chunks(reader):
if options.prefix_chunks:
if chunk.has_attribute('id'):
their_id = chunk.get_attribute('id')
else:
# autogen
their_id = ('auto%03d' % chunk_no)
full_id = 'file:%s:%s' % (fname, their_id)
chunk.set_attribute('id', full_id)
writer.write_chunk(chunk) writer.write_chunk(chunk)
chunk_no += 1
else: else:
for sent in sentences(reader): for sent in sentences(reader):
writer.write_sentence(sent) writer.write_sentence(sent)
del reader
if __name__ == '__main__': if __name__ == '__main__':
go() go()
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment