Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from optparse import OptionParser
from collections import defaultdict as dd
from itertools import repeat, izip
import corpus2
# Usage text shown by OptionParser: a fixed synopsis followed by the reader
# and writer format names (and their help strings) reported by corpus2.
descr = '\n'.join([
    '%prog [options] CORPUSFILE [SENTENCERANGE[:TOKENRANGE]]',
    'Reads a corpus file and outputs all or some tokens.',
    'Available input formats: '
    + ' '.join(corpus2.TokenReader.available_reader_types()),
    ' '.join(corpus2.TokenReader.available_reader_types_help()),
    'Available output formats: '
    + ' '.join(corpus2.TokenWriter.available_writer_types()),
    ' '.join(corpus2.TokenWriter.available_writer_types_help()),
])
def parse_range_info(s):
    """Parse a comma-separated list of numbers and dash-separated ranges.

    Each element of ``s`` is either a single integer ("7") or an inclusive
    range written as two integers joined by a dash ("3-5").  Whitespace
    around elements and bounds is ignored, and the two bounds of a range
    may be given in either order.  An element that cannot be parsed is
    reported as "Fail: <element>" on stderr and skipped.

    Returns the set of all selected integers.
    """
    selection = set()
    for elem in (part.strip() for part in s.split(',')):
        try:
            selection.add(int(elem))
            continue
        except ValueError:
            pass  # not a plain number; try to read it as a range below
        bounds = elem.split('-')
        try:
            if len(bounds) != 2:
                raise ValueError(elem)
            # Order the bounds as integers: sorting the raw strings would
            # put '10' before '9' and turn "9-10" into an empty range.
            low, high = sorted(int(bound) for bound in bounds)
        except ValueError:
            # Diagnostics go to stderr so that they never mix with the
            # corpus output written to stdout.
            sys.stderr.write("Fail: %s\n" % elem)
            continue
        selection.update(range(low, high + 1))
    return selection
def sentences(rdr):
    """Generate the sentences of a reader one at a time, in reading order.

    Iteration stops as soon as ``rdr.get_next_sentence()`` returns a false
    value.  Declared here for demonstration.
    """
    current = rdr.get_next_sentence()
    while current:
        yield current
        current = rdr.get_next_sentence()
def chunks(rdr):
    """Generate the chunks of a reader one at a time, in reading order.

    Iteration stops as soon as ``rdr.get_next_chunk()`` returns a false
    value.
    """
    piece = rdr.get_next_chunk()
    while piece:
        yield piece
        piece = rdr.get_next_chunk()
ilor
committed
def write_selected_sentences(sents, writer, selection, maxsel = None):
    """Write the selected sentences, or selected tokens of them, to writer.

    sents     -- iterable of sentences, numbered from 0 in iteration order
    writer    -- object providing write_sentence() and write_token()
    selection -- mapping from sentence index to a collection of token
                 indices; an empty collection selects the whole sentence
    maxsel    -- optional highest sentence index of interest; iteration
                 stops right after the sentence with that index
    """
    for index, sentence in enumerate(sents):
        if index in selection:
            wanted = selection[index]
            if len(wanted) == 0:
                writer.write_sentence(sentence)
            else:
                for position, token in enumerate(sentence.tokens()):
                    if position in wanted:
                        writer.write_token(token)
        # Nothing beyond maxsel can be selected, so stop reading early.
        if maxsel is not None and index >= maxsel:
            break
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
def _parse_selection(specs, by_chunks):
    """Build the selection mapping from the command-line range arguments.

    Without ``by_chunks`` each argument is SENTENCES or SENTENCES:TOKENS
    and the result maps sentence index -> token indices.  With
    ``by_chunks`` each argument is CHUNKS, CHUNKS:SENTENCES or
    CHUNKS:SENTENCES:TOKENS and the result maps chunk index -> a mapping
    from sentence index to token indices.  An empty collection at any
    level means "everything below this level".

    Raises ValueError carrying the offending argument when it has an
    unsupported number of ':'-separated fields.
    """
    selection = {}
    for spec in specs:
        fields = spec.split(':')
        if len(fields) == 1:
            selection.update(dict.fromkeys(parse_range_info(spec), ()))
        elif len(fields) == 2 and by_chunks:
            outer = parse_range_info(fields[0])
            inner = dict.fromkeys(parse_range_info(fields[1]), ())
            selection.update(dict.fromkeys(outer, inner))
        elif len(fields) == 3 and by_chunks:
            outer = parse_range_info(fields[0])
            middle = parse_range_info(fields[1])
            inner = dict.fromkeys(middle, parse_range_info(fields[2]))
            selection.update(dict.fromkeys(outer, inner))
        elif len(fields) == 2:
            outer = parse_range_info(fields[0])
            selection.update(dict.fromkeys(outer, parse_range_info(fields[1])))
        else:
            raise ValueError(spec)
    return selection


def go():
    """Command-line entry point.

    Parses the options and range arguments, reads the corpus named by the
    first positional argument and writes either everything or only the
    selected chunks, sentences or tokens through a stdout writer.  Exits
    with status 1 when no corpus is given or a range argument is malformed.
    """
    parser = OptionParser(usage=descr)
    # '%default' is expanded by optparse, so the help text cannot drift
    # away from the actual default value.
    parser.add_option('-i', '--input-format', type='string', action='store',
        dest='input_format', default='xces',
        help='set the input format; default: %default')
    parser.add_option('-o', '--output-format', type='string', action='store',
        dest='output_format', default='xces',
        help='set the output format; default: %default')
    parser.add_option('-t', '--tagset', type='string', action='store',
        dest='tagset', default='kipi',
        help='set the tagset used in input; default: %default')
    parser.add_option('-C', '--chunks', action='store_true',
        dest='chunks', default=False,
        help='Process chunks (select chunks/sentences, not tokens)')
    # Accepted for command-line compatibility; not consulted below.
    parser.add_option('-v', '--verbose', action='store_true',
        dest='verbose', default=False,
        help='verbose mode')
    (options, args) = parser.parse_args()
    if len(args) < 1:
        print('You need to provide an input corpus.')
        print('See %s --help' % sys.argv[0])
        sys.exit(1)
    inpath = args[0]

    # Validate the range arguments before any reader or writer is created.
    try:
        selection = _parse_selection(args[1:], options.chunks)
    except ValueError as err:
        sys.stderr.write("Invalid argument: %s\n" % err)
        sys.exit(1)
    # Highest selected top-level index; lets the loops stop reading early.
    maxsel = max(selection) if selection else None

    # load a tagset, create a reader
    tagset = corpus2.get_named_tagset(options.tagset)
    reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath)
    writer = corpus2.TokenWriter.create_stdout_writer(options.output_format, tagset)

    if not selection:
        # No ranges given: copy the whole corpus.
        if options.chunks:
            for chunk in chunks(reader):
                writer.write_chunk(chunk)
        else:
            for sent in sentences(reader):
                writer.write_sentence(sent)
    elif options.chunks:
        for cid, chunk in enumerate(chunks(reader)):
            if cid in selection:
                if len(selection[cid]) == 0:
                    writer.write_chunk(chunk)
                else:
                    write_selected_sentences(chunk.sentences(), writer, selection[cid])
            if cid >= maxsel:
                break
    else:
        write_selected_sentences(sentences(reader), writer, selection, maxsel)