From 44b886c36f769302e09132ceaa542384eab687c4 Mon Sep 17 00:00:00 2001
From: Marek Maziarz <marek.maziarz@pwr.edu.pl>
Date: Wed, 2 Oct 2024 17:25:38 +0200
Subject: [PATCH] docstring added to pack_sentences_to_max_tokens method; call-site max_tokens lowered to 510

---
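Note: the hunk below adds a docstring for pack_sentences_to_max_tokens but does
not show the method body. For orientation, a minimal greedy sketch consistent
with the new docstring might look like this (hypothetical, not the actual
implementation in src/winer_worker.py, which may join tokens or re-base
sent_starts per chunk differently):

    def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs,
                                     sent_starts, max_tokens=512):
        packed_plain, packed_tok, packed_starts = [], [], []
        cur_plain, cur_tok, cur_starts, cur_len = [], [], [], 0
        for sent, toks, start in zip(plain_inputs, tokenized_inputs, sent_starts):
            # Close the current chunk if this sentence would exceed the budget.
            # (A single oversized sentence still becomes its own chunk.)
            if cur_tok and cur_len + len(toks) > max_tokens:
                packed_plain.append(cur_plain)
                packed_tok.append(cur_tok)
                packed_starts.append(cur_starts)
                cur_plain, cur_tok, cur_starts, cur_len = [], [], [], 0
            cur_plain.append(sent)
            cur_tok.extend(toks)
            cur_starts.append(start)
            cur_len += len(toks)
        if cur_tok:  # flush the last, possibly short, chunk
            packed_plain.append(cur_plain)
            packed_tok.append(cur_tok)
            packed_starts.append(cur_starts)
        return packed_plain, packed_tok, packed_starts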
 src/winer_worker.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/src/winer_worker.py b/src/winer_worker.py
index b4d1801..47d522a 100644
--- a/src/winer_worker.py
+++ b/src/winer_worker.py
@@ -34,6 +34,25 @@ class WinerWorker(nlp_ws.NLPWorker):
 
     @staticmethod
     def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs, sent_starts, max_tokens=512):
+        """
+        Pack sentences into chunks, ensuring that the token count per chunk does not exceed a given maximum.
+
+        This method takes in plain text sentences, their tokenized versions, and sentence start indices,
+        and it creates batches of sentences such that the total token count per batch does not exceed the
+        specified max_tokens limit.
+
+        Args:
+            plain_inputs (list): List of plain text sentences.
+            tokenized_inputs (list): List of tokenized versions of the sentences.
+            sent_starts (list): List of sentence start indices.
+            max_tokens (int, optional): The maximum number of tokens allowed per chunk. Defaults to 512.
+
+        Returns:
+            tuple: 
+                - packed_plain_inputs (list): List of packed plain text sentences, where each element is a chunk of sentences.
+                - packed_tokenized_inputs (list): List of packed tokenized inputs corresponding to the plain text chunks.
+                - packed_sent_starts (list): List of sentence start indices for each chunk.
+        """
         packed_plain_inputs = []
         packed_tokenized_inputs = []
         packed_sent_starts = []
@@ -87,7 +106,6 @@ class WinerWorker(nlp_ws.NLPWorker):
             worker will store result.
         :type output_path: str
         """
-        
         # Read inputs and open output
         F_ASCII = task_options.get("ensure_ascii", False)
         with clarin_json.open(output_path, "w", ensure_ascii=F_ASCII) as fout:
@@ -98,7 +116,7 @@ class WinerWorker(nlp_ws.NLPWorker):
                         get_sentences_from_document(document)
 
                     packed_plain_texts, packed_tokenized_inputs, packed_sent_starts = pack_sentences_to_max_tokens(
-                        plain_inputs, tokenized_inputs, sent_starts, max_tokens=512
+                        plain_inputs, tokenized_inputs, sent_starts, max_tokens=510
                     )
 
                     # Process data
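
Note: the call-site budget drops from 512 to 510, presumably to leave room for
the two special tokens a BERT-style tokenizer adds per sequence, so the final
model input stays within the 512-token limit. A quick way to check that
overhead (assuming a Hugging Face tokenizer; not part of this patch):

    from transformers import AutoTokenizer

    tok = AutoTokenizer.from_pretrained("bert-base-cased")
    ids = tok("An example sentence.")["input_ids"]
    # len(ids) = word-piece count + 2 ([CLS] and [SEP]), hence 510 + 2 = 512.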
-- 
GitLab