Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
poldeepner2
Manage
Activity
Members
Labels
Plan
Issues
29
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Information extraction
poldeepner2
Commits
6da40619
Commit
6da40619
authored
3 years ago
by
Michał Marcińczuk
Browse files
Options
Downloads
Patches
Plain Diff
Auto segment size.
parent
923c30f0
Branches
Branches containing commit
1 merge request
!41
Dev v07
Pipeline
#5077
failed with stage
in 2 minutes and 45 seconds
Changes
2
Pipelines
1
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
Dockerfiles/base/Dockerfile
+1
-1
1 addition, 1 deletion
Dockerfiles/base/Dockerfile
poldeepner2/utils/sequences.py
+36
-35
36 additions, 35 deletions
poldeepner2/utils/sequences.py
with
37 additions
and
36 deletions
Dockerfiles/base/Dockerfile
+
1
−
1
View file @
6da40619
...
...
@@ -23,7 +23,7 @@ RUN pip3 install wheel
WORKDIR
"/poldeepner2"
ADD
./requirements.txt /poldeepner2/requirements.txt
RUN
pip3
install
-r
requirements.txt
RUN
p
ython3.8
-m
spacy download pl_core_news_sm
RUN
p
ip3
install
protobuf
==
3.20.1
RUN
apt-get
install
-y
wget
RUN
apt-get
install
-y
unzip
...
...
This diff is collapsed.
Click to expand it.
poldeepner2/utils/sequences.py
+
36
−
35
View file @
6da40619
...
...
@@ -204,11 +204,11 @@ class FeatureGeneratorSingleSentenceWithContext(FeatureGenerator):
@dataclass
class
FeatureGeneratorWindowContext
(
FeatureGenerator
):
max_segment_length
:
int
=
64
max_segment_length
:
int
def
__init__
(
self
,
label_list
:
List
[
str
],
max_seq_length
:
int
,
encode_method
:
Any
,
max_segment_length
:
int
):
def
__init__
(
self
,
label_list
:
List
[
str
],
max_seq_length
:
int
,
encode_method
:
Any
):
super
().
__init__
(
label_list
,
max_seq_length
,
encode_method
)
self
.
max_segment_length
=
max_se
gment
_length
self
.
max_segment_length
=
int
(
max_se
q
_length
/
4
)
def
generate
(
self
,
examples
:
List
[
InputExample
])
->
List
[
InputFeatures
]:
sentences_tokens_features
=
[
...
...
@@ -247,7 +247,8 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
if
idx
>
0
:
segment_context
.
add_token
(
segments
[
idx
-
1
],
mask_out
=
True
)
segment_context
.
add_token
(
segments
[
idx
],
mask_out
=
False
)
while
idx
+
1
<
len
(
segments
)
and
segment_context
.
length
()
+
segments
[
idx
+
1
].
length
()
<
self
.
max_seq_length
:
while
idx
+
1
<
len
(
segments
)
\
and
segment_context
.
length
()
+
segments
[
idx
+
1
].
length
()
<
self
.
max_seq_length
:
segment_context
.
add_token
(
segments
[
idx
+
1
],
mask_out
=
True
)
idx
+=
1
features
.
append
(
segment_context
)
...
...
@@ -258,34 +259,34 @@ class FeatureGeneratorWindowContext(FeatureGenerator):
return
features
class
FeatureGeneratorSingleSentenceWithContextMix
(
FeatureGenerator
):
def
generate
(
self
,
examples
:
List
[
InputExample
])
->
List
[
InputFeatures
]:
sentences
=
[]
for
(
ex_index
,
example
)
in
enumerate
(
examples
):
sentences
.
append
(
SentenceTokenFeatures
(
self
.
tokens_and_labels_into_token_features
(
example
.
tokens
,
example
.
labels
)))
features
=
[]
for
idx
,
sentence
in
enumerate
(
sentences
):
feature
=
SequenceFeatures
()
feature
.
add_sentence
(
sentence
)
offset
=
1
while
idx
+
offset
<
len
(
sentences
)
\
and
feature
.
length
()
+
1
+
sentences
[
idx
+
offset
].
length
()
+
1
<
self
.
max_seq_length
:
feature
.
add_separator
()
feature
.
add_sentence
(
sentences
[
idx
+
offset
],
mask_out
=
True
)
offset
+=
1
feature
.
close_and_fill
(
self
.
max_seq_length
)
assert
feature
.
length
()
==
self
.
max_seq_length
,
"
Length of the sequence does not match
"
features
.
append
(
feature
)
feature_single
=
SequenceFeatures
()
feature_single
.
add_sentence
(
sentence
)
feature_single
.
close_and_fill
(
self
.
max_seq_length
)
features
.
append
(
feature_single
)
return
features
#
class FeatureGeneratorSingleSentenceWithContextMix(FeatureGenerator):
#
#
def generate(self, examples: List[InputExample]) -> List[InputFeatures]:
#
sentences = []
#
for (ex_index, example) in enumerate(examples):
#
sentences.append(SentenceTokenFeatures(
#
self.tokens_and_labels_into_token_features(example.tokens, example.labels)))
#
#
features = []
#
for idx, sentence in enumerate(sentences):
#
feature = SequenceFeatures()
#
feature.add_sentence(sentence)
#
offset = 1
#
while idx + offset < len(sentences) \
#
and feature.length() + 1 + sentences[idx+offset].length() + 1 < self.max_seq_length:
#
feature.add_separator()
#
feature.add_sentence(sentences[idx+offset], mask_out=True)
#
offset += 1
#
feature.close_and_fill(self.max_seq_length)
#
assert feature.length() == self.max_seq_length, "Length of the sequence does not match"
#
features.append(feature)
#
#
feature_single = SequenceFeatures()
#
feature_single.add_sentence(sentence)
#
feature_single.close_and_fill(self.max_seq_length)
#
features.append(feature_single)
#
#
return features
class
FeatureGeneratorUnion
(
FeatureGenerator
):
...
...
@@ -313,9 +314,9 @@ class FeatureGeneratorFactory:
elif
method
==
"
context-right
"
:
return
FeatureGeneratorSingleSentenceWithContext
(
label_list
,
max_seq_length
,
encode_method
)
elif
method
==
"
context-window
"
:
return
FeatureGeneratorWindowContext
(
label_list
,
max_seq_length
,
encode_method
,
64
)
elif
method
==
"
context-single
"
:
return
FeatureGeneratorSingleSentenceWithContextMix
(
label_list
,
max_seq_length
,
encode_method
)
return
FeatureGeneratorWindowContext
(
label_list
,
max_seq_length
,
encode_method
)
#
elif method == "context-single":
#
return FeatureGeneratorSingleSentenceWithContextMix(label_list, max_seq_length, encode_method)
elif
method
==
"
union
"
:
generators
=
[
FeatureGeneratorFactory
.
create
(
m
,
label_list
,
max_seq_length
,
encode_method
)
for
m
in
[
"
single
"
,
"
merged
"
,
"
context-right
"
,
"
context-window
"
]]
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment