Skip to content
Snippets Groups Projects
Commit 6da40619 authored by Michał Marcińczuk
Browse files

Auto segment size.

parent 923c30f0
Branches
1 merge request: !41 "Dev v07"
Pipeline #5077 failed in 2 minutes and 45 seconds
......@@ -23,7 +23,7 @@ RUN pip3 install wheel
WORKDIR "/poldeepner2"
ADD ./requirements.txt /poldeepner2/requirements.txt
RUN pip3 install -r requirements.txt
RUN python3.8 -m spacy download pl_core_news_sm
RUN pip3 install protobuf==3.20.1
RUN apt-get install -y wget
RUN apt-get install -y unzip
......
......@@ -204,11 +204,11 @@ class FeatureGeneratorSingleSentenceWithContext(FeatureGenerator):
@dataclass
class FeatureGeneratorWindowContext(FeatureGenerator):
max_segment_length: int = 64
max_segment_length: int
def __init__(self, label_list: List[str], max_seq_length: int, encode_method: Any, max_segment_length: int):
def __init__(self, label_list: List[str], max_seq_length: int, encode_method: Any):
super().__init__(label_list, max_seq_length, encode_method)
self.max_segment_length = max_segment_length
self.max_segment_length = int(max_seq_length/4)
def generate(self, examples: List[InputExample]) -> List[InputFeatures]:
sentences_tokens_features = [
......@@ -247,7 +247,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
if idx > 0:
segment_context.add_token(segments[idx-1], mask_out=True)
segment_context.add_token(segments[idx], mask_out=False)
while idx + 1 < len(segments) and segment_context.length() + segments[idx+1].length() < self.max_seq_length:
while idx + 1 < len(segments) \
and segment_context.length() + segments[idx+1].length() < self.max_seq_length:
segment_context.add_token(segments[idx+1], mask_out=True)
idx += 1
features.append(segment_context)
......@@ -258,34 +259,34 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
return features
class FeatureGeneratorSingleSentenceWithContextMix(FeatureGenerator):
def generate(self, examples: List[InputExample]) -> List[InputFeatures]:
sentences = []
for (ex_index, example) in enumerate(examples):
sentences.append(SentenceTokenFeatures(
self.tokens_and_labels_into_token_features(example.tokens, example.labels)))
features = []
for idx, sentence in enumerate(sentences):
feature = SequenceFeatures()
feature.add_sentence(sentence)
offset = 1
while idx + offset < len(sentences) \
and feature.length() + 1 + sentences[idx+offset].length() + 1 < self.max_seq_length:
feature.add_separator()
feature.add_sentence(sentences[idx+offset], mask_out=True)
offset += 1
feature.close_and_fill(self.max_seq_length)
assert feature.length() == self.max_seq_length, "Length of the sequence does not match"
features.append(feature)
feature_single = SequenceFeatures()
feature_single.add_sentence(sentence)
feature_single.close_and_fill(self.max_seq_length)
features.append(feature_single)
return features
# class FeatureGeneratorSingleSentenceWithContextMix(FeatureGenerator):
#
# def generate(self, examples: List[InputExample]) -> List[InputFeatures]:
# sentences = []
# for (ex_index, example) in enumerate(examples):
# sentences.append(SentenceTokenFeatures(
# self.tokens_and_labels_into_token_features(example.tokens, example.labels)))
#
# features = []
# for idx, sentence in enumerate(sentences):
# feature = SequenceFeatures()
# feature.add_sentence(sentence)
# offset = 1
# while idx + offset < len(sentences) \
# and feature.length() + 1 + sentences[idx+offset].length() + 1 < self.max_seq_length:
# feature.add_separator()
# feature.add_sentence(sentences[idx+offset], mask_out=True)
# offset += 1
# feature.close_and_fill(self.max_seq_length)
# assert feature.length() == self.max_seq_length, "Length of the sequence does not match"
# features.append(feature)
#
# feature_single = SequenceFeatures()
# feature_single.add_sentence(sentence)
# feature_single.close_and_fill(self.max_seq_length)
# features.append(feature_single)
#
# return features
class FeatureGeneratorUnion(FeatureGenerator):
......@@ -313,9 +314,9 @@ class FeatureGeneratorFactory:
elif method == "context-right":
return FeatureGeneratorSingleSentenceWithContext(label_list, max_seq_length, encode_method)
elif method == "context-window":
return FeatureGeneratorWindowContext(label_list, max_seq_length, encode_method, 64)
elif method == "context-single":
return FeatureGeneratorSingleSentenceWithContextMix(label_list, max_seq_length, encode_method)
return FeatureGeneratorWindowContext(label_list, max_seq_length, encode_method)
# elif method == "context-single":
# return FeatureGeneratorSingleSentenceWithContextMix(label_list, max_seq_length, encode_method)
elif method == "union":
generators = [FeatureGeneratorFactory.create(m, label_list, max_seq_length, encode_method)
for m in ["single", "merged", "context-right", "context-window"]]
......
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment