Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
GitLab community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
6968244d
Commit
6968244d
authored
Feb 16, 2011
by
ilor
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' into annotations
parents
381668c5
b6c9b5fa
Branches
Branches containing commit
No related tags found
No related merge requests found
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
libcorpus2/io/xcesreader.cpp
+36
-11
36 additions, 11 deletions
libcorpus2/io/xcesreader.cpp
libcorpus2/tagset.cpp
+12
-0
12 additions, 0 deletions
libcorpus2/tagset.cpp
tagset-tool/main.cpp
+2
-2
2 additions, 2 deletions
tagset-tool/main.cpp
with
50 additions
and
13 deletions
libcorpus2/io/xcesreader.cpp
+
36
−
11
View file @
6968244d
...
...
@@ -18,6 +18,7 @@ or FITNESS FOR A PARTICULAR PURPOSE.
#include
<libcorpus2/io/sax.h>
#include
<libpwrutils/foreach.h>
#include
<libxml++/libxml++.h>
#include
<libxml2/libxml/parser.h>
#include
<boost/make_shared.hpp>
#include
<fstream>
...
...
@@ -37,6 +38,8 @@ protected:
const
AttributeList
&
attributes
);
void
on_end_element
(
const
Glib
::
ustring
&
name
);
void
finish_sentence
();
const
Tagset
&
tagset_
;
enum
state_t
{
XS_NONE
,
XS_CHUNK
,
XS_SENTENCE
,
XS_TOK
,
XS_ORTH
,
XS_LEX
,
...
...
@@ -45,6 +48,8 @@ protected:
bool
chunkless_
;
bool
out_of_chunk_
;
PwrNlp
::
Whitespace
::
Enum
wa_
;
Glib
::
ustring
sbuf_
;
...
...
@@ -105,7 +110,7 @@ XcesReaderImpl::XcesReaderImpl(const Tagset& tagset,
std
::
deque
<
boost
::
shared_ptr
<
Chunk
>
>&
obuf
,
bool
disamb_only
,
bool
disamb_sh
)
:
BasicSaxParser
()
,
tagset_
(
tagset
),
state_
(
XS_NONE
),
chunkless_
(
false
)
,
tagset_
(
tagset
),
state_
(
XS_NONE
),
chunkless_
(
false
)
,
out_of_chunk_
(
false
)
,
wa_
(
PwrNlp
::
Whitespace
::
Newline
)
,
sbuf_
(),
tok_
(
NULL
),
sent_
(),
chunk_
(),
obuf_
(
obuf
)
,
disamb_only_
(
disamb_only
),
disamb_sh_
(
disamb_sh
)
...
...
@@ -127,6 +132,10 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
type
=
a
.
value
;
}
}
if
(
out_of_chunk_
)
{
finish_sentence
();
out_of_chunk_
=
false
;
}
if
(
state_
==
XS_NONE
)
{
if
(
type
==
"s"
)
{
//throw XcesError("Top level <chunk> is type=\"s\"");
...
...
@@ -191,6 +200,31 @@ void XcesReaderImpl::on_start_element(const Glib::ustring &name,
clear_buf
();
}
else
if
(
name
==
"ns"
)
{
wa_
=
PwrNlp
::
Whitespace
::
None
;
}
else
if
(
name
==
"tok"
&&
state_
==
XS_NONE
)
{
std
::
cerr
<<
"Warning: out-of-chunk token, assuming sentence start on line "
;
std
::
cerr
<<
this
->
context_
->
input
->
line
<<
"
\n
"
;
chunkless_
=
true
;
out_of_chunk_
=
true
;
chunk_
=
boost
::
make_shared
<
Chunk
>
();
sent_
=
boost
::
make_shared
<
Sentence
>
();
state_
=
XS_TOK
;
tok_
=
new
Token
();
tok_
->
set_wa
(
wa_
);
wa_
=
PwrNlp
::
Whitespace
::
Space
;
}
}
/**
 * Close the sentence currently being assembled.
 *
 * The sentence held in sent_ is appended to chunk_ and sent_ is released.
 * What happens next depends on chunkless_:
 *  - not set: the parser state simply returns to XS_CHUNK, leaving chunk_
 *    open for further sentences;
 *  - set: chunk_ is pushed onto the output buffer obuf_, released, the
 *    parser state returns to XS_NONE and chunkless_ is cleared.
 *
 * NOTE(review): chunk_ is dereferenced unconditionally, so callers are
 * presumably expected to invoke this only while a chunk is open -- confirm.
 */
void XcesReaderImpl::finish_sentence()
{
	// Hand the finished sentence over to the enclosing chunk.
	chunk_->append(sent_);
	sent_.reset();

	if (!chunkless_) {
		// Regular case: remain inside the enclosing chunk.
		state_ = XS_CHUNK;
		return;
	}

	// Chunkless case: emit the chunk right away and go back to top level.
	obuf_.push_back(chunk_);
	chunk_.reset();
	state_ = XS_NONE;
	chunkless_ = false;
}
...
...
@@ -216,16 +250,7 @@ void XcesReaderImpl::on_end_element(const Glib::ustring &name)
tok_
=
NULL
;
state_
=
XS_SENTENCE
;
}
else
if
(
state_
==
XS_SENTENCE
&&
name
==
"chunk"
)
{
chunk_
->
append
(
sent_
);
sent_
.
reset
();
if
(
chunkless_
)
{
obuf_
.
push_back
(
chunk_
);
chunk_
.
reset
();
state_
=
XS_NONE
;
chunkless_
=
false
;
}
else
{
state_
=
XS_CHUNK
;
}
finish_sentence
();
}
else
if
(
state_
==
XS_CHUNK
&&
name
==
"chunk"
)
{
obuf_
.
push_back
(
chunk_
);
chunk_
.
reset
();
...
...
This diff is collapsed.
Click to expand it.
libcorpus2/tagset.cpp
+
12
−
0
View file @
6968244d
...
...
@@ -141,6 +141,7 @@ namespace {
std
::
vector
<
mask_t
>
&
current
,
const
std
::
vector
<
mask_t
>
&
to_add
,
mask_t
to_add_attr
)
{
if
(
to_add
.
empty
())
return
;
size_t
current_size
=
current
.
size
();
for
(
size_t
ai
=
1
;
ai
<
to_add
.
size
();
++
ai
)
{
for
(
size_t
oi
=
0
;
oi
<
current_size
;
++
oi
)
{
...
...
@@ -176,6 +177,7 @@ void Tagset::parse_tag(const string_range_vector &fields, bool allow_extra,
std
::
vector
<
mask_t
>
values
;
mask_t
amask
;
foreach
(
string_range
&
dot
,
dots
)
{
if
(
dot
.
empty
())
continue
;
mask_t
v
=
get_value_mask
(
boost
::
copy_range
<
std
::
string
>
(
dot
));
mask_t
curr
=
get_attribute_mask
(
get_value_attribute
(
v
));
...
...
@@ -281,6 +283,16 @@ Tag Tagset::make_tag(idx_t pos_idx, mask_t values, bool allow_extra) const
// << " of " << pos_required_attributes_idx_[pos_idx].size() << "\n";
size_t
has_req
=
PwrNlp
::
count_bits_set
(
required_values
&
values
);
if
(
has_req
!=
pos_required_attributes_idx_
[
pos_idx
].
size
())
{
foreach
(
idx_t
a
,
get_pos_attributes
(
pos_idx
))
{
if
(
pos_requires_attribute
(
pos_idx
,
a
))
{
mask_t
amask
=
get_attribute_mask
(
a
);
if
((
values
&
amask
).
none
())
{
throw
TagParseError
(
"Required attribute missing"
,
tag_to_string
(
Tag
(
get_pos_mask
(
pos_idx
),
values
)),
get_attribute_name
(
a
),
id_string
());
}
}
}
throw
TagParseError
(
"Required attribute missing"
,
tag_to_string
(
Tag
(
get_pos_mask
(
pos_idx
),
values
)),
get_pos_name
(
pos_idx
),
id_string
());
...
...
This diff is collapsed.
Click to expand it.
tagset-tool/main.cpp
+
2
−
2
View file @
6968244d
...
...
@@ -96,7 +96,7 @@ void tagset_info(const Corpus2::Tagset& tagset)
for
(
Corpus2
::
idx_t
a
=
0
;
a
<
tagset
.
attribute_count
();
++
a
)
{
std
::
cerr
<<
tagset
.
get_attribute_values
(
a
).
size
()
<<
" "
;
}
std
::
cerr
<<
"
\n
"
;
std
::
cerr
<<
"
]
\n
"
;
std
::
cerr
<<
"Size is "
<<
tagset
.
size
()
<<
" (extra size is "
<<
tagset
.
size_extra
()
<<
")
\n
"
;
std
::
cerr
<<
"POSes: "
;
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment