Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
dd9a3136
Commit
dd9a3136
authored
13 years ago
by
ilor
Browse files
Options
Downloads
Patches
Plain Diff
IOB-CHAN~~~~
parent
cbf8a71a
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
libcorpus2/CMakeLists.txt
+1
-0
1 addition, 0 deletions
libcorpus2/CMakeLists.txt
libcorpus2/io/iob-chan.cpp
+216
-0
216 additions, 0 deletions
libcorpus2/io/iob-chan.cpp
libcorpus2/io/iob-chan.h
+88
-0
88 additions, 0 deletions
libcorpus2/io/iob-chan.h
with
305 additions
and
0 deletions
libcorpus2/CMakeLists.txt
+
1
−
0
View file @
dd9a3136
...
@@ -59,6 +59,7 @@ SET(libcorpus2_STAT_SRC
...
@@ -59,6 +59,7 @@ SET(libcorpus2_STAT_SRC
io/cclwriter.cpp
io/cclwriter.cpp
io/helpers.cpp
io/helpers.cpp
io/fastxces.cpp
io/fastxces.cpp
io/iob-chan.cpp
io/nonewriter.cpp
io/nonewriter.cpp
io/orthwriter.cpp
io/orthwriter.cpp
io/pathwriter.cpp
io/pathwriter.cpp
...
...
This diff is collapsed.
Click to expand it.
libcorpus2/io/iob-chan.cpp
0 → 100644
+
216
−
0
View file @
dd9a3136
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#include
<libcorpus2/io/iob-chan.h>
#include
<libpwrutils/foreach.h>
#include
<libcorpus2/ann/annotatedsentence.h>
#include
<boost/algorithm/string.hpp>
#include
<boost/make_shared.hpp>
#include
<fstream>
#include
<boost/algorithm/string/split.hpp>
namespace
Corpus2
{
/// Result of registering IobChanWriter with the TokenWriter factory under
/// the "iob-chan" format name; the second argument names the parameters
/// this writer's constructor recognises ("nowarn" and "noforce").
bool IobChanWriter::registered =
	TokenWriter::register_writer<IobChanWriter>("iob-chan", "nowarn,noforce");
/**
 * Constructs a writer and applies the recognised parameters.
 *
 * Both flags start enabled: warnings about lexeme-less tokens are on and
 * IOB tags are regenerated from segments before a sentence is written.
 *
 * @param os     output stream handed to the TokenWriter base
 * @param tagset tagset handed to the TokenWriter base
 * @param params writer parameters; "nowarn" switches the warning off,
 *               "noforce" switches the IOB regeneration off, anything
 *               else is ignored here
 */
IobChanWriter::IobChanWriter(std::ostream& os, const Tagset& tagset,
		const string_range_vector& params)
	: TokenWriter(os, tagset, params)
	, warn_on_no_lexemes_(true)
	, force_(true)
{
	foreach (const string_range& param, params) {
		const std::string option = boost::copy_range<std::string>(param);
		if (option == "noforce") {
			force_ = false;
		} else if (option == "nowarn") {
			warn_on_no_lexemes_ = false;
		}
	}
}
void
IobChanWriter
::
write_token
(
const
Token
&
t
)
{
os
()
<<
t
.
orth_utf8
();
if
(
t
.
lexemes
().
empty
())
{
if
(
warn_on_no_lexemes_
)
{
std
::
cerr
<<
"No lexemes for token!"
;
}
}
else
{
const
Lexeme
&
pref
=
t
.
get_preferred_lexeme
(
tagset
());
os
()
<<
"
\t
"
;
write_tag
(
pref
.
tag
());
}
os
()
<<
"
\n
"
;
}
/**
 * Writes a sentence, one token per line, followed by a blank line.
 *
 * Each token line is: orth, tab, lemma of the preferred lexeme, tab, its
 * tag, tab, and then -- when the sentence is an AnnotatedSentence -- a
 * comma-separated list of "channel-IOB" entries for that token position.
 *
 * Unless "noforce" was given, the IOB tags of every channel are first
 * regenerated from the channel segments. This mutates the (logically const)
 * sentence, hence the const_cast below.
 *
 * @param s sentence to write; plain (non-annotated) sentences are written
 *          without the annotation column
 */
void IobChanWriter::write_sentence(const Sentence& s)
{
	// Null when s is not an AnnotatedSentence.
	const AnnotatedSentence* ann = dynamic_cast<const AnnotatedSentence*>(&s);
	// Only touch the channels when there actually is an annotated sentence;
	// force_ defaults to true, so an unguarded access would dereference a
	// null pointer for every plain Sentence.
	if (force_ && ann) {
		// I sincerely apologize
		AnnotatedSentence* hax = const_cast<AnnotatedSentence*>(ann);
		foreach (const AnnotatedSentence::chan_map_t::value_type& v,
				hax->all_channels()) {
			hax->get_channel(v.first).make_iob_from_segments();
		}
	}
	for (size_t idx = 0; idx < s.size(); ++idx) {
		const Token* t = s.tokens()[idx];
		os() << t->orth_utf8();
		if (t->lexemes().empty()) {
			// NOTE(review): the lemma/tag columns and their tab separators
			// are skipped here, so any annotations end up glued to the orth
			// and IobChanReader (which expects four tab-separated fields)
			// will reject the line -- confirm whether this is intended.
			if (warn_on_no_lexemes_) {
				std::cerr << "No lexemes for token!";
			}
		} else {
			const Lexeme& pref = t->get_preferred_lexeme(tagset());
			os() << "\t";
			os() << pref.lemma_utf8();
			os() << "\t";
			write_tag(pref.tag());
			os() << "\t";
		}
		if (ann) {
			bool first = true;
			foreach (const AnnotatedSentence::chan_map_t::value_type& v,
					ann->all_channels()) {
				if (!first) {
					os() << ",";
				}
				os() << v.first << "-";
				os() << Corpus2::IOB::to_string(v.second.get_iob_at(idx));
				first = false;
			}
		}
		os() << "\n";
	}
	os() << "\n";
}
/**
 * Writes every sentence of the chunk in order via write_sentence().
 * No chunk-level markup is emitted.
 */
void IobChanWriter::write_chunk(const Chunk& c)
{
	foreach (const Sentence::ConstPtr& sentence, c.sentences()) {
		write_sentence(*sentence);
	}
}
/**
 * Writes the textual form of a tag, as produced by the writer's tagset,
 * to the output stream. No separator is added before or after it.
 */
void IobChanWriter::write_tag(const Tag& tag)
{
	os() << tagset().tag_to_string(tag);
}
/// Result of registering IobChanReader with the TokenReader factory under
/// the "iob-chan" format name; the second argument lists option names
/// (of these, only "no_set_disamb" is handled by this class itself).
bool IobChanReader::registered =
	TokenReader::register_reader<IobChanReader>(
		"iob-chan", "ign,loose,strict,no_set_disamb");
/**
 * Constructs a reader on top of a caller-provided stream.
 *
 * Only the address of the stream is stored (is_owned_ stays empty), so the
 * stream must outlive the reader. Disamb marking starts enabled.
 */
IobChanReader::IobChanReader(const Tagset& tagset, std::istream& is)
	: BufferedSentenceReader(tagset)
	, is_(&is)
	, disamb_(true)
{
}
/**
 * Constructs a reader that opens and owns a file stream.
 *
 * @param tagset   tagset handed to the BufferedSentenceReader base
 * @param filename path of the file to open for reading
 * @throws Corpus2Error if the freshly opened stream is not in a good state
 */
IobChanReader::IobChanReader(const Tagset& tagset, const std::string& filename)
	: BufferedSentenceReader(tagset)
	, is_()
	, disamb_(true)
{
	is_owned_.reset(new std::ifstream(filename.c_str(), std::ifstream::in));
	if (!is_owned_->good()) {
		throw Corpus2Error("File not found!");
	}
	// The owned stream doubles as the stream returned by is().
	is_ = is_owned_.get();
}
/**
 * Reads the next sentence from the input stream.
 *
 * Input is consumed line by line until a blank line follows at least one
 * accepted token line, or until the stream is no longer good. Blank lines
 * met before any token has been accepted are skipped, so stray leading or
 * repeated blank lines do not produce an empty result prematurely.
 *
 * A token line must consist of exactly four tab-separated fields:
 * orth, lemma, tag and a comma-separated list of "channel-IOB" annotations.
 * Malformed lines, annotations and IOB tags are reported on std::cerr and
 * skipped.
 *
 * The first token of a sentence gets Newline preceding whitespace, the
 * remaining ones Space. After the last line, every channel's segments are
 * rebuilt from the IOB tags that were read.
 *
 * @return the sentence read, or a null pointer if no token line was accepted
 */
Sentence::Ptr IobChanReader::actual_next_sentence()
{
	std::string line;
	boost::shared_ptr<AnnotatedSentence> s;

	while (is().good()) {
		std::getline(is(), line);
		if (line.empty()) {
			if (s) {
				// A blank line closes the sentence collected so far.
				break;
			}
			// Nothing collected yet: ignore the blank line. At end of input
			// the loop condition fails on the next check.
			continue;
		}
		std::vector<std::string> spl;
		boost::algorithm::split(spl, line, boost::is_any_of("\t"));
		if (spl.size() != 4) {
			std::cerr << "Invalid line: " << line << "(" << spl.size() << ")\n";
			continue;
		}
		const std::string& orth = spl[0];
		const std::string& lemma = spl[1];
		const std::string& tag_string = spl[2];
		const std::string& anns = spl[3];

		// Parse the tag before allocating the token so that a parse failure
		// cannot leak it.
		Tag tag = parse_tag(tag_string);
		Token* t = new Token();
		t->set_orth(UnicodeString::fromUTF8(orth));
		t->set_wa(PwrNlp::Whitespace::Space);
		t->add_lexeme(Lexeme(UnicodeString::fromUTF8(lemma), tag));
		if (disamb_) {
			t->lexemes().back().set_disamb(true);
		}
		if (!s) {
			// First token: create the sentence lazily and mark the token as
			// preceded by a newline.
			s = boost::make_shared<AnnotatedSentence>();
			t->set_wa(PwrNlp::Whitespace::Newline);
		}
		s->append(t);

		std::vector<std::string> annsplit;
		boost::algorithm::split(annsplit, anns, boost::is_any_of(","));
		foreach (const std::string& a, annsplit) {
			std::vector<std::string> one_ann_split;
			boost::algorithm::split(one_ann_split, a, boost::is_any_of("-"));
			if (one_ann_split.size() != 2) {
				std::cerr << "Invalid annotation:" << a << "\n";
				continue;
			}
			const std::string& aname = one_ann_split[0];
			const std::string& aiob = one_ann_split[1];
			Corpus2::IOB::Enum iob = Corpus2::IOB::from_string(aiob);
			// PostLast is treated as the "could not parse" result here.
			if (iob == Corpus2::IOB::PostLast) {
				std::cerr << "Invalid IOB tag: " << aiob << "\n";
				continue;
			}
			if (!s->has_channel(aname)) {
				s->create_channel(aname);
			}
			// The token was just appended, so it sits at the last index.
			s->get_channel(aname).set_iob_at(s->size() - 1, iob);
		}
	}

	if (s) {
		foreach (const AnnotatedSentence::chan_map_t::value_type& v,
				s->all_channels()) {
			s->get_channel(v.first).make_segments_from_iob();
		}
	}
	return s;
}
/**
 * Sets a reader option.
 *
 * "no_set_disamb" turns off marking of read lexemes as disamb; every other
 * option is forwarded to BufferedSentenceReader::set_option().
 */
void IobChanReader::set_option(const std::string& option)
{
	if (option != "no_set_disamb") {
		BufferedSentenceReader::set_option(option);
		return;
	}
	disamb_ = false;
}
/**
 * Queries a reader option.
 *
 * For "no_set_disamb" the option name itself is returned when disamb
 * marking is off and an empty string when it is on. Every other option is
 * forwarded to BufferedSentenceReader::get_option().
 */
std::string IobChanReader::get_option(const std::string& option) const
{
	if (option != "no_set_disamb") {
		return BufferedSentenceReader::get_option(option);
	}
	if (disamb_) {
		return "";
	}
	return option;
}
}
/* end ns Corpus2 */
This diff is collapsed.
Click to expand it.
libcorpus2/io/iob-chan.h
0 → 100644
+
88
−
0
View file @
dd9a3136
/*
Copyright (C) 2010 Tomasz Śniatowski, Adam Radziszewski
Part of the libcorpus2 project
This program is free software; you can redistribute it and/or modify it
under the terms of the GNU General Public License as published by the Free
Software Foundation; either version 3 of the License, or (at your option)
any later version.
This program is distributed in the hope that it will be useful, but
WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
or FITNESS FOR A PARTICULAR PURPOSE.
See the LICENSE and COPYING files for more details.
*/
#ifndef LIBCORPUS2_IO_IOB_CHAN_H
#define LIBCORPUS2_IO_IOB_CHAN_H
#include
<libcorpus2/io/reader.h>
#include
<libcorpus2/io/writer.h>
#include
<boost/scoped_ptr.hpp>
namespace
Corpus2
{
/**
 * Writer for the "iob-chan" format: tab-separated token lines carrying
 * per-channel IOB annotations.
 *
 * When writing a sentence, each token occupies one line made of the orth,
 * the lemma and the tag of the preferred lexeme, and a comma-separated list
 * of "channel-IOB" entries (only for annotated sentences). Each sentence is
 * followed by a blank line.
 *
 * Recognised parameters:
 *  - nowarn: do not warn on std::cerr about tokens without lexemes,
 *  - noforce: do not regenerate IOB tags from channel segments before
 *    writing a sentence.
 */
class IobChanWriter : public TokenWriter
{
public:
	/// Creates the writer; see the class description for accepted params.
	IobChanWriter(std::ostream& os, const Tagset& tagset,
			const string_range_vector& params);

	/// Writes a lone token as "orth<TAB>tag" followed by a newline.
	void write_token(const Token& t);

	/// Writes all tokens of a sentence, then a blank line.
	void write_sentence(const Sentence& s);

	/// Writes all sentences of a chunk.
	void write_chunk(const Chunk& p);

	/// Writes the string form of a tag.
	void write_tag(const Tag& tag);

	/// Value returned by the writer registration call.
	static bool registered;

private:
	/// Whether to report tokens that have no lexemes.
	bool warn_on_no_lexemes_;

	/// Whether to regenerate IOB tags from segments before writing.
	bool force_;
};
/**
 * Reader for the "iob-chan" format.
 *
 * Each token line holds four tab-separated fields (orth, lemma, tag and a
 * comma-separated list of "channel-IOB" annotations); a blank line ends a
 * sentence. Sentences are produced as AnnotatedSentence objects.
 *
 * The "no_set_disamb" option disables marking the read lexemes as disamb.
 */
class IobChanReader : public BufferedSentenceReader
{
public:
	/// Reads from a caller-owned stream, which must outlive the reader.
	IobChanReader(const Tagset& tagset, std::istream& is);

	/// Opens the named file and reads from it; throws if it cannot be read.
	IobChanReader(const Tagset& tagset, const std::string& filename);

	/// The stream the reader consumes.
	std::istream& is()
	{
		return *is_;
	}

	/// Handles "no_set_disamb", forwards everything else to the base class.
	void set_option(const std::string& option);

	/// Reports "no_set_disamb", forwards everything else to the base class.
	std::string get_option(const std::string& option) const;

	/// Value returned by the reader registration call.
	static bool registered;

protected:
	/// BufferedSentenceReader override
	Sentence::Ptr actual_next_sentence();

	/// Stream being read; points either at an external stream or at
	/// the one held by is_owned_.
	std::istream* is_;

	/// Stream owned by the reader (set by the filename constructor).
	boost::scoped_ptr<std::istream> is_owned_;

	/// Whether to mark all incoming tags as disambiguated
	bool disamb_;
};
}
/* end ns Corpus2 */
#endif // LIBCORPUS2_IO_IOB_CHAN_H
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment