Commit 10076064 authored by Marek Maziarz

shortening lines

parent 44b886c3
Pipeline #19506 failed

@@ -33,25 +33,40 @@ class WinerWorker(nlp_ws.NLPWorker):
         self._model._tokenizer.model_max_length = 512
 
     @staticmethod
-    def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs, sent_starts, max_tokens=512):
+    def pack_sentences_to_max_tokens(
+            plain_inputs,
+            tokenized_inputs,
+            sent_starts,
+            max_tokens=512
+    ):
         """
-        Pack sentences into chunks, ensuring that the token count per chunk does not exceed a given maximum.
+        Pack sentences into chunks, ensuring that the token count
+        per chunk does not exceed a given maximum.
 
-        This method takes in plain text sentences, their tokenized versions, and sentence start indices,
-        and it creates batches of sentences such that the total token count per batch does not exceed the
+        This method takes in plain text sentences, their tokenized
+        versions, and sentence start indices,
+        and it creates batches of sentences such that the total
+        token count per batch does not exceed the
         specified max_tokens limit.
 
         Args:
             plain_inputs (list): List of plain text sentences.
-            tokenized_inputs (list): List of tokenized versions of the sentences.
+            tokenized_inputs (list): List of tokenized versions
+                of the sentences.
             sent_starts (list): List of sentence start indices.
-            max_tokens (int, optional): The maximum number of tokens allowed per chunk. Defaults to 512.
+            max_tokens (int, optional): The maximum number of
+                tokens allowed per chunk. Defaults to 512.
 
         Returns:
             tuple:
-                - packed_plain_inputs (list): List of packed plain text sentences, where each element is a chunk of sentences.
-                - packed_tokenized_inputs (list): List of packed tokenized inputs corresponding to the plain text chunks.
-                - packed_sent_starts (list): List of sentence start indices for each chunk.
+                - packed_plain_inputs (list): List of packed
+                  plain text sentences, where each element
+                  is a chunk of sentences.
+                - packed_tokenized_inputs (list): List of packed
+                  tokenized inputs corresponding to the plain
+                  text chunks.
+                - packed_sent_starts (list): List of sentence
+                  start indices for each chunk.
         """
         packed_plain_inputs = []
         packed_tokenized_inputs = []
@@ -62,7 +77,8 @@ class WinerWorker(nlp_ws.NLPWorker):
         current_sent_start = []
         current_token_count = 0
 
-        for sentence, sentence_tokens, sent_start in zip(plain_inputs, tokenized_inputs, sent_starts):
+        for sentence, sentence_tokens, sent_start in zip(
+                plain_inputs, tokenized_inputs, sent_starts):
             if current_token_count + len(sentence_tokens) <= max_tokens:
                 current_plain_inputs.append(sentence)
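
Only the head of the packing loop is visible in this diff. As a reading aid, here is a minimal standalone sketch of the greedy behaviour the docstring describes; it reuses the names that appear in the diff (current_plain_inputs, current_sent_start, current_token_count), while current_tokenized_inputs and the chunk-flushing details are assumptions rather than the repository's exact code.

def pack_sentences_to_max_tokens(plain_inputs, tokenized_inputs,
                                 sent_starts, max_tokens=512):
    """Greedily pack consecutive sentences into chunks of at most max_tokens tokens."""
    packed_plain_inputs = []
    packed_tokenized_inputs = []
    packed_sent_starts = []

    current_plain_inputs = []
    current_tokenized_inputs = []
    current_sent_start = []
    current_token_count = 0

    for sentence, sentence_tokens, sent_start in zip(
            plain_inputs, tokenized_inputs, sent_starts):
        if current_token_count + len(sentence_tokens) <= max_tokens:
            # The sentence still fits: extend the current chunk.
            current_plain_inputs.append(sentence)
            current_tokenized_inputs.append(sentence_tokens)
            current_sent_start.append(sent_start)
            current_token_count += len(sentence_tokens)
        else:
            # The chunk is full: flush it and start a new one with this sentence.
            if current_plain_inputs:
                packed_plain_inputs.append(current_plain_inputs)
                packed_tokenized_inputs.append(current_tokenized_inputs)
                packed_sent_starts.append(current_sent_start)
            current_plain_inputs = [sentence]
            current_tokenized_inputs = [sentence_tokens]
            current_sent_start = [sent_start]
            current_token_count = len(sentence_tokens)

    # Flush the last, partially filled chunk.
    if current_plain_inputs:
        packed_plain_inputs.append(current_plain_inputs)
        packed_tokenized_inputs.append(current_tokenized_inputs)
        packed_sent_starts.append(current_sent_start)

    return packed_plain_inputs, packed_tokenized_inputs, packed_sent_starts
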
@@ -115,8 +131,17 @@ class WinerWorker(nlp_ws.NLPWorker):
         plain_inputs, tokenized_inputs, sent_starts = \
             get_sentences_from_document(document)
 
-        packed_plain_texts, packed_tokenized_inputs, packed_sent_starts = pack_sentences_to_max_tokens(
-            plain_inputs, tokenized_inputs, sent_starts, max_tokens=510
-        )
+        (
+            packed_plain_texts,
+            packed_tokenized_inputs,
+            packed_sent_starts
+        ) = (
+            pack_sentences_to_max_tokens(
+                plain_inputs,
+                tokenized_inputs,
+                sent_starts,
+                max_tokens=510
+            )
+        )
 
         # Process data
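
The call site packs to max_tokens=510 although the tokenizer's model_max_length is set to 512 earlier in the file; presumably the two spare positions are reserved for the special tokens a BERT-style tokenizer adds around each chunk, though the diff itself does not state the reason. A toy run of the sketch above, on made-up data, illustrates the packing:

plain = ["Ala ma kota.", "Kot ma Ale.", "Koniec."]   # made-up sentences
tokens = [["Ala", "ma", "kota"],                     # 3 tokens
          ["Kot", "ma", "Ale", "."],                 # 4 tokens
          ["Koniec", "."]]                           # 2 tokens
starts = [0, 13, 25]                                 # hypothetical character offsets

chunks, chunk_tokens, chunk_starts = pack_sentences_to_max_tokens(
    plain, tokens, starts, max_tokens=6)

# The first chunk holds only the first sentence (3 + 4 > 6); the rest go together:
# chunks == [["Ala ma kota."], ["Kot ma Ale.", "Koniec."]]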