Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
poldeepner2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Information extraction
poldeepner2
Commits
6da40619
Commit
6da40619
authored
May 30, 2022
by
Michał Marcińczuk
Browse files
Options
Downloads
Patches
Plain Diff
Auto segment size.
parent
923c30f0
Branches
Branches containing commit
No related tags found
1 merge request
!41
Dev v07
Pipeline
#5077
failed
May 30, 2022
Stage: test
Changes
2
Pipelines
1
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
Dockerfiles/base/Dockerfile
+1
-1
1 addition, 1 deletion
Dockerfiles/base/Dockerfile
poldeepner2/utils/sequences.py
+36
-35
36 additions, 35 deletions
poldeepner2/utils/sequences.py
with
37 additions
and
36 deletions
Dockerfiles/base/Dockerfile
+
1
−
1
View file @
6da40619
...
...
@@ -23,7 +23,7 @@ RUN pip3 install wheel
WORKDIR
"/poldeepner2"
ADD
./requirements.txt /poldeepner2/requirements.txt
RUN
pip3
install
-r
requirements.txt
RUN
p
ython3.8
-m
spacy download pl_core_news_sm
RUN
p
ip3
install
protobuf
==
3.20.1
RUN
apt-get
install
-y
wget
RUN
apt-get
install
-y
unzip
...
...
This diff is collapsed.
Click to expand it.
poldeepner2/utils/sequences.py
+
36
−
35
View file @
6da40619
...
...
@@ -204,11 +204,11 @@ class FeatureGeneratorSingleSentenceWithContext(FeatureGenerator):
@dataclass
class
FeatureGeneratorWindowContext
(
FeatureGenerator
):
max_segment_length
:
int
=
64
max_segment_length
:
int
def
__init__
(
self
,
label_list
:
List
[
str
],
max_seq_length
:
int
,
encode_method
:
Any
,
max_segment_length
:
int
):
def
__init__
(
self
,
label_list
:
List
[
str
],
max_seq_length
:
int
,
encode_method
:
Any
):
super
().
__init__
(
label_list
,
max_seq_length
,
encode_method
)
self
.
max_segment_length
=
max_se
gment
_length
self
.
max_segment_length
=
int
(
max_se
q
_length
/
4
)
def
generate
(
self
,
examples
:
List
[
InputExample
])
->
List
[
InputFeatures
]:
sentences_tokens_features
=
[
...
...
@@ -247,7 +247,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
if
idx
>
0
:
segment_context
.
add_token
(
segments
[
idx
-
1
],
mask_out
=
True
)
segment_context
.
add_token
(
segments
[
idx
],
mask_out
=
False
)
while
idx
+
1
<
len
(
segments
)
and
segment_context
.
length
()
+
segments
[
idx
+
1
].
length
()
<
self
.
max_seq_length
:
while
idx
+
1
<
len
(
segments
)
\
and
segment_context
.
length
()
+
segments
[
idx
+
1
].
length
()
<
self
.
max_seq_length
:
segment_context
.
add_token
(
segments
[
idx
+
1
],
mask_out
=
True
)
idx
+=
1
features
.
append
(
segment_context
)
...
...
@@ -258,34 +259,34 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
return
features
class
FeatureGeneratorSingleSentenceWithContextMix
(
FeatureGenerator
):
def
generate
(
self
,
examples
:
List
[
InputExample
])
->
List
[
InputFeatures
]:
sentences
=
[]
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
sentences
.
append
(
SentenceTokenFeatures
(
self
.
tokens_and_labels_into_token_features
(
example
.
tokens
,
example
.
labels
)))
features
=
[]
for
idx
,
sentence
in
enumerate
(
sentences
):
feature
=
SequenceFeatures
()
feature
.
add_sentence
(
sentence
)
offset
=
1
while
idx
+
offset
<
len
(
sentences
)
\
and
feature
.
length
()
+
1
+
sentences
[
idx
+
offset
].
length
()
+
1
<
self
.
max_seq_length
:
feature
.
add_separator
()
feature
.
add_sentence
(
sentences
[
idx
+
offset
],
mask_out
=
True
)
offset
+=
1
feature
.
close_and_fill
(
self
.
max_seq_length
)
assert
feature
.
length
()
==
self
.
max_seq_length
,
"
Length of the sequence does not match
"
features
.
append
(
feature
)
feature_single
=
SequenceFeatures
()
feature_single
.
add_sentence
(
sentence
)
feature_single
.
close_and_fill
(
self
.
max_seq_length
)
features
.
append
(
feature_single
)
return
features
#
class FeatureGeneratorSingleSentenceWithContextMix(FeatureGenerator):
#
#
def generate(self, examples: List[InputExample]) -> List[InputFeatures]:
#
sentences = []
#
for (ex_index, example) in enumerate(examples):
#
sentences.append(SentenceTokenFeatures(
#
self.tokens_and_labels_into_token_features(example.tokens, example.labels)))
#
#
features = []
#
for idx, sentence in enumerate(sentences):
#
feature = SequenceFeatures()
#
feature.add_sentence(sentence)
#
offset = 1
#
while idx + offset < len(sentences) \
#
and feature.length() + 1 + sentences[idx+offset].length() + 1 < self.max_seq_length:
#
feature.add_separator()
#
feature.add_sentence(sentences[idx+offset], mask_out=True)
#
offset += 1
#
feature.close_and_fill(self.max_seq_length)
#
assert feature.length() == self.max_seq_length, "Length of the sequence does not match"
#
features.append(feature)
#
#
feature_single = SequenceFeatures()
#
feature_single.add_sentence(sentence)
#
feature_single.close_and_fill(self.max_seq_length)
#
features.append(feature_single)
#
#
return features
class
FeatureGeneratorUnion
(
FeatureGenerator
):
...
...
@@ -313,9 +314,9 @@ class FeatureGeneratorFactory:
elif
method
==
"
context-right
"
:
return
FeatureGeneratorSingleSentenceWithContext
(
label_list
,
max_seq_length
,
encode_method
)
elif
method
==
"
context-window
"
:
return
FeatureGeneratorWindowContext
(
label_list
,
max_seq_length
,
encode_method
,
64
)
elif
method
==
"
context-single
"
:
return
FeatureGeneratorSingleSentenceWithContextMix
(
label_list
,
max_seq_length
,
encode_method
)
return
FeatureGeneratorWindowContext
(
label_list
,
max_seq_length
,
encode_method
)
#
elif method == "context-single":
#
return FeatureGeneratorSingleSentenceWithContextMix(label_list, max_seq_length, encode_method)
elif
method
==
"
union
"
:
generators
=
[
FeatureGeneratorFactory
.
create
(
m
,
label_list
,
max_seq_length
,
encode_method
)
for
m
in
[
"
single
"
,
"
merged
"
,
"
context-right
"
,
"
context-window
"
]]
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment