Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/python
# -*- coding: utf-8 -*-
import sys
from optparse import OptionParser
from collections import defaultdict as dd
from itertools import repeat, izip
import corpus2
# Usage/description text passed to OptionParser(usage=descr) in go().
# It is assembled once, when the module is loaded, and embeds the reader and
# writer format names (plus their help strings) that corpus2 reports via
# TokenReader.available_reader_types*() and TokenWriter.available_writer_types*().
# NOTE(review): the usage line only mentions SENTENCERANGE[:TOKENRANGE]; with
# --chunks, go() also accepts CHUNKRANGE[:SENTENCERANGE[:TOKENRANGE]].
descr = """%prog [options] CORPUSFILE [SENTENCERANGE[:TOKENRANGE]]
Reads a corpus file and outputs all or some tokens.
Available input formats: """ + ' '.join(corpus2.TokenReader.available_reader_types()) + """
""" + ' '.join(corpus2.TokenReader.available_reader_types_help()) + """
Available output formats: """ + ' '.join(corpus2.TokenWriter.available_writer_types()) + """
""" + ' '.join(corpus2.TokenWriter.available_writer_types_help())
def parse_range_info(s):
    """Parse a comma-separated list of integers and dash-separated ranges.

    Each comma-separated element is either a single integer ("7") or an
    inclusive range written as two integers joined by a dash ("3-5").
    Range endpoints may be given in either order ("5-3" equals "3-5").
    Whitespace around numbers is ignored.  Elements that cannot be parsed
    are reported on stderr as "Fail: <element>" and skipped.

    Args:
        s: the selection string, e.g. "1,3-5,10".

    Returns:
        A set containing every integer selected by the string.
    """
    selection = set()
    for elem in (x.strip() for x in s.split(',')):
        # Simple case first: the element is a single integer.
        try:
            selection.add(int(elem))
            continue
        except ValueError:
            pass
        bounds = [x.strip() for x in elem.split('-')]
        try:
            if len(bounds) != 2:
                raise ValueError(elem)
            # Order the endpoints as integers.  Sorting the raw strings would
            # put "10" before "9" and silently produce an empty range.
            low, high = sorted(int(b) for b in bounds)
        except ValueError:
            # Diagnostics go to stderr so they never mix with corpus output
            # that the tool writes to stdout.
            sys.stderr.write("Fail: %s\n" % elem)
            continue
        selection.update(range(low, high + 1))
    return selection
def sentences(rdr):
    """Generate the sentences of a reader one after another.

    Keeps calling rdr.get_next_sentence() and yields each result until the
    reader returns a false value (end of input).  Kept in this script as a
    usage demonstration.
    """
    current = rdr.get_next_sentence()
    while current:
        yield current
        current = rdr.get_next_sentence()
def chunks(rdr):
    """Generate the chunks of a reader one after another.

    Keeps calling rdr.get_next_chunk() and yields each result until the
    reader returns a false value (end of input).
    """
    current = rdr.get_next_chunk()
    while current:
        yield current
        current = rdr.get_next_chunk()
def write_selected_sentences(sents, writer, selection):
    """Write the selected sentences, or selected tokens of them, to a writer.

    sents is iterated with a zero-based position.  A sentence whose position
    is not a key of selection is skipped.  When the value stored for the
    position is empty, the whole sentence is written with
    writer.write_sentence(); otherwise only the tokens whose zero-based
    position within sent.tokens() is contained in that value are written,
    one by one, with writer.write_token().
    """
    for sent_pos, sent in enumerate(sents):
        if sent_pos not in selection:
            continue
        if len(selection[sent_pos]) == 0:
            # No token restriction: emit the sentence as a whole.
            writer.write_sentence(sent)
            continue
        for tok_pos, tok in enumerate(sent.tokens()):
            if tok_pos in selection[sent_pos]:
                writer.write_token(tok)
def _parse_selection(specs, use_chunks):
    """Turn the positional range arguments into a selection mapping.

    Each spec is RANGE, RANGE:RANGE or (chunk mode only) RANGE:RANGE:RANGE,
    where every RANGE is parsed by parse_range_info().

    Without chunks the result maps sentence index -> set of token indices
    (an empty value selects the whole sentence).  With chunks it maps
    chunk index -> {sentence index -> token indices}; an empty value at
    either level selects the whole chunk or sentence.

    Raises:
        ValueError: if a spec has more ':'-separated parts than the mode
            supports; the exception message is the offending spec.
    """
    selection = {}
    for spec in specs:
        parts = spec.split(':')
        depth = len(parts)
        # Reject malformed specs before parsing any of their ranges.
        if depth > 3 or (depth == 3 and not use_chunks):
            raise ValueError(spec)
        keys = parse_range_info(parts[0])
        if depth == 1:
            value = ()
        elif use_chunks:
            tokens = parse_range_info(parts[2]) if depth == 3 else ()
            value = dict.fromkeys(parse_range_info(parts[1]), tokens)
        else:
            value = parse_range_info(parts[1])
        # All keys of one spec deliberately share the same (read-only) value.
        selection.update(dict.fromkeys(keys, value))
    return selection


def go():
    """Command-line entry point: copy all or selected parts of a corpus.

    Parses the options and positional arguments, reads the corpus named by
    the first positional argument and writes the selected chunks, sentences
    or tokens to a stdout writer.  Without range arguments (or when none of
    them yields a valid index) the whole corpus is written.

    Exits with status 1 when no corpus path is given or when a range
    argument is malformed.  The --verbose option is accepted but not used
    by this function.
    """
    parser = OptionParser(usage=descr)
    # "%default" is expanded by optparse, so the help text cannot drift away
    # from the real default (it used to advertise "xces-fast" for -i).
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='xces',
                      help='set the input format; default: %default')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='xces',
                      help='set the output format; default: %default')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='kipi',
                      help='set the tagset used in input; default: %default')
    parser.add_option('-C', '--chunks', action='store_true',
                      dest='chunks', default=False,
                      help='Process chunks (select chunks/sentences, not tokens)')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='verbose mode')
    (options, args) = parser.parse_args()
    if len(args) < 1:
        # stdout is reserved for corpus output, so complain on stderr.
        sys.stderr.write('You need to provide an input corpus.\n')
        sys.stderr.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)
    inpath = args[0]

    # Validate the range arguments before creating the reader and writer, so
    # a malformed argument fails fast and with a non-zero exit status.
    try:
        selection = _parse_selection(args[1:], options.chunks)
    except ValueError as err:
        sys.stderr.write("Invalid argument: %s\n" % err)
        sys.exit(1)

    # load a tagset, create a reader and a writer
    tagset = corpus2.get_named_tagset(options.tagset)
    reader = corpus2.TokenReader.create_path_reader(options.input_format, tagset, inpath)
    writer = corpus2.TokenWriter.create_stdout_writer(options.output_format, tagset)

    if not selection:
        # Nothing selected: copy the entire corpus.
        if options.chunks:
            for chunk in chunks(reader):
                writer.write_chunk(chunk)
        else:
            for sent in sentences(reader):
                writer.write_sentence(sent)
    elif options.chunks:
        for cid, chunk in enumerate(chunks(reader)):
            if cid not in selection:
                continue
            if len(selection[cid]) == 0:
                writer.write_chunk(chunk)
            else:
                write_selected_sentences(chunk.sentences(), writer, selection[cid])
    else:
        write_selected_sentences(sentences(reader), writer, selection)
# Run the tool only when this file is executed as a script, not on import.
if __name__ == "__main__":
    go()