From 50043d3464e56c6f8fbc7a6fd42ecb988828d585 Mon Sep 17 00:00:00 2001 From: ilor <kailoran@gmail.com> Date: Thu, 2 Jun 2011 17:18:01 +0200 Subject: [PATCH] corpus-get upgrade: stop reading corpus after reading the last to-be-returned chunk/sentence --- corpus2tools/corpus-get | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/corpus2tools/corpus-get b/corpus2tools/corpus-get index 6e62ae3..98a5af0 100755 --- a/corpus2tools/corpus-get +++ b/corpus2tools/corpus-get @@ -52,7 +52,7 @@ def chunks(rdr): yield chunk -def write_selected_sentences(sents, writer, selection): +def write_selected_sentences(sents, writer, selection, maxsel = None): sid = 0 for sent in sents: if sid in selection: @@ -65,6 +65,7 @@ def write_selected_sentences(sents, writer, selection): writer.write_token(tok) tid += 1 sid += 1 + if maxsel is not None and sid > maxsel: break def go(): parser = OptionParser(usage=descr) @@ -110,6 +111,7 @@ def go(): return else: selection.update(izip(parse_range_info(arg), repeat(()))) + maxsel = max(selection.keys()) if selection == {}: if options.chunks: for chunk in chunks(reader): @@ -127,8 +129,9 @@ def go(): else: write_selected_sentences(chunk.sentences(), writer, selection[cid]) cid += 1 + if cid > maxsel: break else: - write_selected_sentences(sentences(reader), writer, selection) + write_selected_sentences(sentences(reader), writer, selection, maxsel) if __name__ == '__main__': go() -- GitLab