Commit 44b886c3 authored by Marek Maziarz

docstring added to pack_sentences_to_max_tokens method

parent ec05cf26
Pipeline #19505 failed
@@ -34,6 +34,25 @@ class WinerWorker(nlp_ws.NLPWorker):
     @staticmethod
     def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs, sent_starts, max_tokens=512):
"""
Pack sentences into chunks, ensuring that the token count per chunk does not exceed a given maximum.
This method takes in plain text sentences, their tokenized versions, and sentence start indices,
and it creates batches of sentences such that the total token count per batch does not exceed the
specified max_tokens limit.
Args:
plain_inputs (list): List of plain text sentences.
tokenized_inputs (list): List of tokenized versions of the sentences.
sent_starts (list): List of sentence start indices.
max_tokens (int, optional): The maximum number of tokens allowed per chunk. Defaults to 512.
Returns:
tuple:
- packed_plain_inputs (list): List of packed plain text sentences, where each element is a chunk of sentences.
- packed_tokenized_inputs (list): List of packed tokenized inputs corresponding to the plain text chunks.
- packed_sent_starts (list): List of sentence start indices for each chunk.
"""
         packed_plain_inputs = []
         packed_tokenized_inputs = []
         packed_sent_starts = []
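
The diff collapses the body of the packing loop, so for readers skimming the commit, here is a minimal sketch of how such a greedy packing pass might look. This is an assumption, not the committed code: the real method may represent chunks differently or rebase sentence-start offsets per chunk.

    # Hypothetical sketch of a greedy packing loop (assumed, not the committed code).
    def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs,
                                     sent_starts, max_tokens=512):
        packed_plain_inputs, packed_tokenized_inputs, packed_sent_starts = [], [], []
        cur_plain, cur_tokens, cur_starts = [], [], []  # chunk currently being filled
        for plain, tokens, start in zip(plain_inputs, tokenized_inputs, sent_starts):
            # Close the current chunk if adding this sentence would overflow it.
            if cur_plain and len(cur_tokens) + len(tokens) > max_tokens:
                packed_plain_inputs.append(cur_plain)
                packed_tokenized_inputs.append(cur_tokens)
                packed_sent_starts.append(cur_starts)
                cur_plain, cur_tokens, cur_starts = [], [], []
            cur_plain.append(plain)
            cur_tokens.extend(tokens)
            cur_starts.append(start)
        if cur_plain:  # flush the final, possibly partial, chunk
            packed_plain_inputs.append(cur_plain)
            packed_tokenized_inputs.append(cur_tokens)
            packed_sent_starts.append(cur_starts)
        return packed_plain_inputs, packed_tokenized_inputs, packed_sent_starts

Note that in this sketch a single sentence longer than max_tokens still becomes its own oversized chunk; whether the real implementation splits or truncates such sentences is not visible in the diff.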
@@ -87,7 +106,6 @@ class WinerWorker(nlp_ws.NLPWorker):
             worker will store result.
         :type output_path: str
         """
-
         # Read inputs and open output
         F_ASCII = task_options.get("ensure_ascii", False)
         with clarin_json.open(output_path, "w", ensure_ascii=F_ASCII) as fout:
@@ -98,7 +116,7 @@ class WinerWorker(nlp_ws.NLPWorker):
                 get_sentences_from_document(document)
             packed_plain_texts, packed_tokenized_inputs, packed_sent_starts = pack_sentences_to_max_tokens(
-                plain_inputs, tokenized_inputs, sent_starts, max_tokens=512
+                plain_inputs, tokenized_inputs, sent_starts, max_tokens=510
             )
             # Process data
...
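
The only functional change in this commit is the max_tokens argument dropping from 512 to 510. A plausible reading, not stated in the commit itself: BERT-style encoders with a 512-position limit add two special tokens ([CLS] and [SEP]) around each input, so packing to 510 content tokens keeps each chunk plus its special tokens within the model's window.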