diff --git a/src/winer_worker.py b/src/winer_worker.py
index b4d18011a2cb0636c170450a6b00762ac0689239..47d522a3e46f594b20db5533b575bcf01fd78a1e 100644
--- a/src/winer_worker.py
+++ b/src/winer_worker.py
@@ -34,6 +34,25 @@ class WinerWorker(nlp_ws.NLPWorker):
 
     @staticmethod
     def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs, sent_starts, max_tokens=512):
+        """
+        Pack sentences into chunks so that the token count per chunk does not exceed a given maximum.
+
+        This method takes plain text sentences, their tokenized versions, and sentence start indices,
+        and groups them into chunks such that the total token count per chunk does not exceed the
+        specified max_tokens limit.
+
+        Args:
+            plain_inputs (list): List of plain text sentences.
+            tokenized_inputs (list): List of tokenized versions of the sentences.
+            sent_starts (list): List of sentence start indices.
+            max_tokens (int, optional): The maximum number of tokens allowed per chunk. Defaults to 512.
+
+        Returns:
+            tuple:
+                - packed_plain_inputs (list): List of packed plain text sentences, where each element is a chunk of sentences.
+                - packed_tokenized_inputs (list): List of packed tokenized inputs corresponding to the plain text chunks.
+                - packed_sent_starts (list): List of sentence start indices for each chunk.
+        """
         packed_plain_inputs = []
         packed_tokenized_inputs = []
         packed_sent_starts = []
@@ -87,7 +106,6 @@ class WinerWorker(nlp_ws.NLPWorker):
             worker will store result.
         :type output_path: str
        """
-
         # Read inputs and open output
         F_ASCII = task_options.get("ensure_ascii", False)
         with clarin_json.open(output_path, "w", ensure_ascii=F_ASCII) as fout:
@@ -98,7 +116,7 @@ class WinerWorker(nlp_ws.NLPWorker):
                     get_sentences_from_document(document)
 
                 packed_plain_texts, packed_tokenized_inputs, packed_sent_starts = pack_sentences_to_max_tokens(
-                    plain_inputs, tokenized_inputs, sent_starts, max_tokens=512
+                    plain_inputs, tokenized_inputs, sent_starts, max_tokens=510
                 )
 
                 # Process data
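
For reference, here is a minimal sketch of the greedy packing behaviour the new docstring describes. The function name, parameters, and return shape mirror the diff, but the body below is an illustrative assumption, not the actual implementation (which lies outside the hunks shown). The call-site change from `max_tokens=512` to `max_tokens=510` presumably leaves headroom for the two special tokens that BERT-style tokenizers add to each sequence.

```python
def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs,
                                 sent_starts, max_tokens=510):
    # Sketch only: greedily fill a chunk until the next sentence would
    # push its token count past max_tokens, then start a new chunk.
    packed_plain_inputs, packed_tokenized_inputs, packed_sent_starts = [], [], []
    cur_plain, cur_tokens, cur_starts = [], [], []

    for plain, tokens, start in zip(plain_inputs, tokenized_inputs, sent_starts):
        if cur_tokens and len(cur_tokens) + len(tokens) > max_tokens:
            packed_plain_inputs.append(cur_plain)
            packed_tokenized_inputs.append(cur_tokens)
            packed_sent_starts.append(cur_starts)
            cur_plain, cur_tokens, cur_starts = [], [], []
        cur_plain.append(plain)
        cur_tokens.extend(tokens)
        # Assumption: original start indices are carried over per chunk.
        cur_starts.append(start)

    if cur_tokens:  # flush the last, possibly partial, chunk
        packed_plain_inputs.append(cur_plain)
        packed_tokenized_inputs.append(cur_tokens)
        packed_sent_starts.append(cur_starts)

    return packed_plain_inputs, packed_tokenized_inputs, packed_sent_starts
```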