From 44b886c36f769302e09132ceaa542384eab687c4 Mon Sep 17 00:00:00 2001
From: Marek Maziarz <marek.maziarz@pwr.edu.pl>
Date: Wed, 2 Oct 2024 17:25:38 +0200
Subject: [PATCH] docstring added to pack_sentences_to_max_tokens method

---
 src/winer_worker.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/winer_worker.py b/src/winer_worker.py
index b4d1801..47d522a 100644
--- a/src/winer_worker.py
+++ b/src/winer_worker.py
@@ -34,6 +34,25 @@ class WinerWorker(nlp_ws.NLPWorker):
     @staticmethod
     def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs,
                                      sent_starts, max_tokens=512):
+        """
+        Pack sentences into chunks, ensuring that the token count per chunk does not exceed a given maximum.
+
+        This method takes in plain text sentences, their tokenized versions, and sentence start indices,
+        and it creates batches of sentences such that the total token count per batch does not exceed the
+        specified max_tokens limit.
+
+        Args:
+            plain_inputs (list): List of plain text sentences.
+            tokenized_inputs (list): List of tokenized versions of the sentences.
+            sent_starts (list): List of sentence start indices.
+            max_tokens (int, optional): The maximum number of tokens allowed per chunk. Defaults to 512.
+
+        Returns:
+            tuple:
+            - packed_plain_inputs (list): List of packed plain text sentences, where each element is a chunk of sentences.
+            - packed_tokenized_inputs (list): List of packed tokenized inputs corresponding to the plain text chunks.
+            - packed_sent_starts (list): List of sentence start indices for each chunk.
+        """
         packed_plain_inputs = []
         packed_tokenized_inputs = []
         packed_sent_starts = []
@@ -87,7 +106,6 @@ class WinerWorker(nlp_ws.NLPWorker):
             worker will store result.
         :type output_path: str
         """
-        # Read inputs and open output
         F_ASCII = task_options.get("ensure_ascii", False)
         with clarin_json.open(output_path, "w", ensure_ascii=F_ASCII) as fout:
@@ -98,7 +116,7 @@ class WinerWorker(nlp_ws.NLPWorker):
                 get_sentences_from_document(document)

             packed_plain_texts, packed_tokenized_inputs, packed_sent_starts = pack_sentences_to_max_tokens(
-                plain_inputs, tokenized_inputs, sent_starts, max_tokens=512
+                plain_inputs, tokenized_inputs, sent_starts, max_tokens=510
             )

             # Process data
--
GitLab
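
The new docstring describes a greedy packing strategy: consecutive sentences are accumulated into one chunk until adding the next sentence would push the chunk past max_tokens. The patch shows only the docstring and the three list initialisations, not the packing loop itself, so the following is a minimal sketch of what such logic might look like under those assumptions; the standalone function name, the type hints, and the choice to keep each chunk as a list of sentences are illustrative, not taken from src/winer_worker.py.

    from typing import List, Tuple


    def pack_sentences_to_max_tokens_sketch(
            plain_inputs: List[str],
            tokenized_inputs: List[List[str]],
            sent_starts: List[int],
            max_tokens: int = 512) -> Tuple[list, list, list]:
        # Hypothetical sketch of the greedy packing described in the docstring;
        # not the implementation from the repository.
        packed_plain_inputs, packed_tokenized_inputs, packed_sent_starts = [], [], []
        cur_plain, cur_tokens, cur_starts = [], [], []

        for plain, tokens, start in zip(plain_inputs, tokenized_inputs, sent_starts):
            # Close the current chunk if adding this sentence would exceed the budget.
            # (A single sentence longer than max_tokens still becomes its own,
            # over-sized chunk in this sketch.)
            if cur_tokens and len(cur_tokens) + len(tokens) > max_tokens:
                packed_plain_inputs.append(cur_plain)
                packed_tokenized_inputs.append(cur_tokens)
                packed_sent_starts.append(cur_starts)
                cur_plain, cur_tokens, cur_starts = [], [], []
            cur_plain.append(plain)
            cur_tokens.extend(tokens)
            cur_starts.append(start)

        if cur_tokens:
            # Flush the final, possibly smaller, chunk.
            packed_plain_inputs.append(cur_plain)
            packed_tokenized_inputs.append(cur_tokens)
            packed_sent_starts.append(cur_starts)

        return packed_plain_inputs, packed_tokenized_inputs, packed_sent_starts

The call site now passes max_tokens=510 instead of 512, which leaves headroom below the common 512-token transformer input limit (for example, for special tokens added at encoding time); the patch itself does not state the reason for the change.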