Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
2b8f6a60
Commit
2b8f6a60
authored
13 years ago
by
ilor
Browse files
Options
Downloads
Plain Diff
Merge branch 'master' of nlp.pwr.wroc.pl:corpus2
Conflicts: swig/libcorpustokenreader.i
parents
56a53217
69967758
Branches
Branches containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
doc/corpstats.py
+101
-0
101 additions, 0 deletions
doc/corpstats.py
swig/libcorpustokenreader.i
+12
-3
12 additions, 3 deletions
swig/libcorpustokenreader.i
with
113 additions
and
3 deletions
doc/corpstats.py
0 → 100755
+
101
−
0
View file @
2b8f6a60
#!/usr/bin/python
# -*- coding: utf-8 -*-
import
sys
from
optparse
import
OptionParser
from
collections
import
defaultdict
as
dd
import
corpus2
descr
=
"""
%prog [options] CORPUSFILE
Reads a corpus file and reports some statistics.
This script is a demo of the Python API.
"""
def tokens(rdr):
    """Generator over all tokens produced by a reader.

    Declared here for demonstration of the Python API; repeatedly
    calls get_next_token() until the reader yields a falsy value.
    """
    tok = rdr.get_next_token()
    while tok:
        yield tok
        tok = rdr.get_next_token()
def sentences(rdr):
    """Generator over all sentences produced by a reader.

    Declared here for demonstration of the Python API; repeatedly
    calls get_next_sentence() until the reader yields a falsy value.
    """
    sent = rdr.get_next_sentence()
    while sent:
        yield sent
        sent = rdr.get_next_sentence()
def chunks(rdr):
    """Yields subsequent chunks from a reader.

    NOTE: the original docstring said "sentences", which was a
    copy-paste error — this generator drains get_next_chunk(),
    stopping at the first falsy result.
    """
    while True:
        chunk = rdr.get_next_chunk()
        if not chunk:
            break
        yield chunk
def
go
():
parser
=
OptionParser
(
usage
=
descr
)
parser
.
add_option
(
'
-i
'
,
'
--input-format
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
input_format
'
,
default
=
'
xces
'
,
help
=
'
set the input format; default: xces
'
)
#parser.add_option('-o', '--output-format', type='string', action='store',
#dest='output_format', default='xces',
#help='set the output format; default: xces')
parser
.
add_option
(
'
-t
'
,
'
--tagset
'
,
type
=
'
string
'
,
action
=
'
store
'
,
dest
=
'
tagset
'
,
default
=
'
kipi
'
,
help
=
'
set the tagset used in input; default: kipi
'
)
parser
.
add_option
(
'
-v
'
,
'
--verbose
'
,
action
=
'
store_true
'
,
dest
=
'
verbose
'
,
default
=
False
,
help
=
'
report each token
'
)
parser
.
add_option
(
'
-n
'
,
'
--number-of-tags
'
,
type
=
'
int
'
,
action
=
'
store
'
,
dest
=
'
num_tags
'
,
default
=
10
,
help
=
'
set the max number of tags to report
'
)
(
options
,
args
)
=
parser
.
parse_args
()
if
len
(
args
)
!=
1
:
print
'
You need to provide an input corpus.
'
print
'
See %s --help
'
%
sys
.
argv
[
0
]
sys
.
exit
(
1
)
inpath
=
args
[
0
]
# load a tagset, create a reader
tagset
=
corpus2
.
get_named_tagset
(
options
.
tagset
)
rdr
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
inpath
)
# init stats (for this example)
num_toks
,
num_sents
,
num_chunks
=
0
,
0
,
0
tag_count
=
dd
(
int
)
for
chunk
in
chunks
(
rdr
):
for
sent
in
chunk
.
sentences
():
for
tok
in
sent
.
tokens
():
if
options
.
verbose
:
print
tok
.
orth_utf8
()
for
lex
in
tok
.
lexemes
():
tag_str
=
tagset
.
tag_to_string
(
lex
.
tag
())
tag_count
[
tag_str
]
+=
1
if
options
.
verbose
:
lemma
=
lex
.
lemma_utf8
()
print
(
'
+
'
if
lex
.
is_disamb
()
else
'
'
),
lemma
,
tag_str
# if you want a unicode object, orth_utf8().decode('utf-8')
num_toks
+=
1
num_sents
+=
1
num_chunks
+=
1
print
'
Tokens:
'
,
num_toks
print
'
Sents:
'
,
num_sents
print
'
Chunks:
'
,
num_chunks
print
print
'
Most frequent tags:
'
for
tc
in
sorted
(
tag_count
.
items
(),
key
=
lambda
tc
:
(
-
tc
[
1
],
tc
[
0
]))[:
options
.
num_tags
]:
print
'
\t
%s
\t
%d
'
%
tc
if
__name__
==
'
__main__
'
:
go
()
This diff is collapsed.
Click to expand it.
swig/libcorpustokenreader.i
+
12
−
3
View file @
2b8f6a60
...
@@ -18,13 +18,15 @@
...
@@ -18,13 +18,15 @@
%
nodefaultctor
Corpus2
::
TokenReader
;
%
nodefaultctor
Corpus2
::
TokenReader
;
%
template
(
TokenReaderPtr
)
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
;
%
template
(
TokenReaderPtr
)
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
;
%
template
(
TokenPtr
)
boost
::
shared_ptr
<
Corpus2
::
Token
>
;
// %template(StdStringVector) std::vector<std::string>;
// %template(StdStringVector) std::vector<std::string>;
// %template(ChunkPtr) boost::shared_ptr<Corpus2::Chunk>;
// %template(ChunkPtr) boost::shared_ptr<Corpus2::Chunk>;
typedef
boost
::
shared_ptr
<
Corpus2
::
Token
>
TokenPtr
;
namespace
Corpus2
{
namespace
Corpus2
{
class TokenReader {
class TokenReader {
public:
public:
typedef boost::shared_ptr<TokenReader> TokenReaderPtr;
typedef boost::shared_ptr<TokenReader> TokenReaderPtr;
//typedef boost::shared_ptr<Token> TokenPtr;
/* --------------------------------------------------------------------- */
/* --------------------------------------------------------------------- */
explicit TokenReader(const Tagset& tagset);
explicit TokenReader(const Tagset& tagset);
...
@@ -60,7 +62,7 @@ namespace Corpus2 {
...
@@ -60,7 +62,7 @@ namespace Corpus2 {
std::istream& stream);
std::istream& stream);
/* --------------------------------------------------------------------- */
/* --------------------------------------------------------------------- */
virtual Token* get_next_token() = 0;
/*
virtual Token* get_next_token() = 0;
*/
virtual Sentence::Ptr get_next_sentence() = 0;
virtual Sentence::Ptr get_next_sentence() = 0;
virtual boost::shared_ptr<Chunk> get_next_chunk() = 0;
virtual boost::shared_ptr<Chunk> get_next_chunk() = 0;
...
@@ -77,7 +79,14 @@ namespace Corpus2 {
...
@@ -77,7 +79,14 @@ namespace Corpus2 {
static std::vector<std::string> available_reader_types_help();
static std::vector<std::string> available_reader_types_help();
};
};
%feature("autodoc", "1");
%extend TokenReader {
/* modfify the native get_next_token to wrap the tokens into shared_ptr */
boost::shared_ptr<Corpus2::Token> get_next_token() {
return boost::shared_ptr<Corpus2::Token>(self->get_next_token());
}
}
%feature("autodoc", "1");
std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
std::vector<boost::shared_ptr<Chunk> > read_chunks_from_utf8_string(
const std::string& data, const Tagset& tagset, const std::string& format);
const std::string& data, const Tagset& tagset, const std::string& format);
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment