From 11e898ebdbf4cd0bb0aecfe1d88d18226d3fc6c9 Mon Sep 17 00:00:00 2001 From: Maja Jablonska <majajjablonska@gmail.com> Date: Mon, 4 Mar 2024 23:53:19 +1100 Subject: [PATCH] Fix sentence IDs in turns --- combo/data/tokenizers/lambo_tokenizer.py | 4 ++-- pyproject.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py index 635ab5d..fafec8e 100644 --- a/combo/data/tokenizers/lambo_tokenizer.py +++ b/combo/data/tokenizers/lambo_tokenizer.py @@ -71,9 +71,9 @@ class LamboTokenizer(Tokenizer): if split_level.upper() == "TURN": for turn in document.turns: + _reset_idx() sentence_tokens = [] for sentence in turn.sentences: - _reset_idx() for token in sentence.tokens: sentence_tokens.extend(_sentence_tokens(token, split_multiwords)) tokens.append(sentence_tokens) @@ -96,8 +96,8 @@ class LamboTokenizer(Tokenizer): tokens.append(sentence_tokens) else: for turn in document.turns: + _reset_idx() for sentence in turn.sentences: - _reset_idx() for token in sentence.tokens: tokens.extend(_sentence_tokens(token, split_multiwords)) tokens = [tokens] diff --git a/pyproject.toml b/pyproject.toml index c17c0df..5952c1b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools"] [project] name = "combo" -version = "3.2.1" +version = "3.2.2" authors = [ {name = "Maja Jablonska", email = "maja.jablonska@ipipan.waw.pl"} ] -- GitLab