Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
7e52f5e3
Commit
7e52f5e3
authored
12 years ago
by
Pawel Orlowicz
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of nlp.pwr.wroc.pl:corpus2
parents
89e3a78c
ff0a41e5
Branches
Branches containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
libcorpus2/io/conllwriter.cpp
+64
-13
64 additions, 13 deletions
libcorpus2/io/conllwriter.cpp
libcorpus2/io/conllwriter.h
+9
-0
9 additions, 0 deletions
libcorpus2/io/conllwriter.h
utils/tagger-eval.py
+1
-0
1 addition, 0 deletions
utils/tagger-eval.py
with
74 additions
and
13 deletions
libcorpus2/io/conllwriter.cpp
+
64
−
13
View file @
7e52f5e3
...
...
@@ -7,12 +7,52 @@
namespace
Corpus2
{
// Registers ConllWriter with the TokenWriter factory under the format
// name "conll"; the result of the registration call is kept in this flag.
bool ConllWriter::registered =
		TokenWriter::register_writer<ConllWriter>("conll");

// Name of the tagset attribute that the constructor looks up and requires.
const std::string ConllWriter::SUPERPOS_ATTR("superpos");
/**
 * Constructs the writer and validates the tagset.
 *
 * The tagset must declare the attribute named by SUPERPOS_ATTR, and for
 * every grammatical class that attribute must be flagged as required and
 * must be the first attribute of the class.
 *
 * @param os      output stream, forwarded to TokenWriter
 * @param tagset  tagset to write with; copied into myTagset
 * @param params  writer parameters, forwarded to TokenWriter
 * @throws Corpus2Error when the tagset violates any of the conditions above
 */
ConllWriter::ConllWriter(std::ostream& os, const Tagset& tagset,
		const string_range_vector& params)
	: TokenWriter(os, tagset, params)
{
	myTagset = tagset;

	// The tagset has to know the 'superpos' attribute at all.
	const idx_t superpos_idx = myTagset.get_attribute_index(SUPERPOS_ATTR);
	if (superpos_idx == -1) {
		throw Corpus2Error(
			"Tagset " + myTagset.name()
			+ " contains no 'superpos' attribute"
			" (required by CONLL format)");
	}

	// Every grammatical class must have 'superpos' as a required
	// attribute and as its very first attribute.
	for (idx_t cls = 0; cls < myTagset.pos_count(); ++cls) {
		const std::vector<bool> required =
				myTagset.get_pos_required_attributes(cls);

		// The required-flags vector is indexed by attribute index, so
		// the 'superpos' index has to fall inside it ...
		if (superpos_idx >= static_cast<idx_t>(required.size())) {
			throw Corpus2Error(
				"Tagset " + myTagset.name()
				+ " should define 'superpos' attribute for each"
				" grammatical class (req. by CONLL writer)");
		}

		// ... and the flag stored there has to be set.
		if (!required[superpos_idx]) {
			throw Corpus2Error(
				"Tagset " + myTagset.name()
				+ " should define 'superpos' attribute"
				" as REQUIRED for each class"
				" (req. by CONLL writer)");
		}

		// No other attribute may precede 'superpos' in this class.
		if (tagset.get_pos_attributes(cls)[0] != superpos_idx) {
			throw Corpus2Error(
				"Tagset " + myTagset.name()
				+ " should define 'superpos' attribute"
				" as the FIRST one for each class"
				" (req. by CONLL writer)");
		}
	}
}
ConllWriter
::~
ConllWriter
()
...
...
@@ -22,25 +62,36 @@ ConllWriter::~ConllWriter()
void
ConllWriter
::
write_token
(
const
Token
&
t
)
{
os
()
<<
t
.
orth_utf8
()
<<
"
\t
"
;
Lexeme
lex
=
t
.
get_preferred_lexeme
(
myTagset
);
os
()
<<
lex
.
lemma_utf8
()
+
"
\t
"
;
std
::
string
tag
=
myTagset
.
tag_to_string
(
lex
.
tag
());
std
::
vector
<
std
::
string
>
strs
;
std
::
transform
(
tag
.
begin
(),
tag
.
end
(),
tag
.
begin
(),
::
tolower
);
boost
::
split
(
strs
,
tag
,
boost
::
is_any_of
(
":"
));
os
()
<<
strs
[
1
]
<<
"
\t
"
<<
strs
[
0
]
<<
"
\t
"
;
if
(
strs
.
size
()
>
2
)
const
Lexeme
&
lex
=
t
.
get_preferred_lexeme
(
myTagset
);
os
()
<<
t
.
orth_utf8
()
<<
"
\t
"
<<
lex
.
lemma_utf8
()
<<
"
\t
"
;
// get lower-case tag representation
std
::
string
tagstr
=
myTagset
.
tag_to_string
(
lex
.
tag
());
std
::
transform
(
tagstr
.
begin
(),
tagstr
.
end
(),
tagstr
.
begin
(),
::
tolower
);
// ugly, but should work: split the lower tag repr on colons
std
::
vector
<
std
::
string
>
segs
;
boost
::
split
(
segs
,
tagstr
,
boost
::
is_any_of
(
":"
));
// now write each segment of the split string, padding the columns that
// have no value with _
// (the constructor has verified that the 'superpos' attribute directly
// follows the obligatory grammatical class, so it is safe to assume
// there are always at least 2 segments)
os
()
<<
segs
[
1
]
<<
"
\t
"
<<
segs
[
0
]
<<
"
\t
"
;
if
(
segs
.
size
()
>
2
)
{
size_t
i
;
for
(
i
=
2
;
i
<
str
s
.
size
()
-
1
;
i
++
)
for
(
i
=
2
;
i
<
seg
s
.
size
()
-
1
;
i
++
)
{
os
()
<<
str
s
[
i
]
<<
"|"
;
os
()
<<
seg
s
[
i
]
<<
"|"
;
}
os
()
<<
str
s
[
i
]
<<
"
\t
_
\t
_
\t
_
\t
_"
;
os
()
<<
seg
s
[
i
]
<<
"
\t
_
\t
_
\t
_
\t
_"
;
}
else
os
()
<<
"_
\t
_
\t
_
\t
_
\t
_"
;
{
os
()
<<
"_
\t
_
\t
_
\t
_
\t
_"
;
}
}
void
ConllWriter
::
write_sentence
(
const
Sentence
&
s
)
...
...
This diff is collapsed.
Click to expand it.
libcorpus2/io/conllwriter.h
+
9
−
0
View file @
7e52f5e3
...
...
@@ -5,6 +5,14 @@
namespace
Corpus2
{
/**
* Writer for the CoNLL format (as required by MaltParser). The writer
* assumes that the tagset in use defines an attribute named 'superpos'
* (this name is obligatory) and that the attribute is declared first
* and marked as required for each grammatical class. This attribute is
* used to designate a more general POS category for each token (e.g. all
* verb classes could be marked as VERB there).
*/
class
ConllWriter
:
public
TokenWriter
{
public:
...
...
@@ -19,6 +27,7 @@ public:
void
write_chunk
(
const
Chunk
&
c
);
const
static
std
::
string
SUPERPOS_ATTR
;
static
bool
registered
;
protected:
...
...
This diff is collapsed.
Click to expand it.
utils/tagger-eval.py
+
1
−
0
View file @
7e52f5e3
...
...
@@ -135,6 +135,7 @@ class Metric:
# as above but metric for POS hits
POS_WC
=
([
Feat
.
WEAK_POS_HIT
],
None
)
POS_SC
=
([
Feat
.
STRONG_POS_HIT
],
None
)
POS_WC_LOWER
=
([
Feat
.
WEAK_POS_HIT
,
Feat
.
SEG_NOCHANGE
],
None
)
# lower bound for POS WC
# separate stats for known and unknown forms
KN_WC
=
([
Feat
.
WEAK_TAG_HIT
,
Feat
.
KNOWN
],
[
Feat
.
KNOWN
])
UNK_WC
=
([
Feat
.
WEAK_TAG_HIT
,
Feat
.
UNKNOWN
],
[
Feat
.
UNKNOWN
])
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment