Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
27e5a477
Commit
27e5a477
authored
13 years ago
by
ilor
Browse files
Options
Downloads
Patches
Plain Diff
rewrite corpus-get in python
parent
6cc8e28a
Branches
Branches containing commit
No related merge requests found
Changes
3
Hide whitespace changes
Inline
Side-by-side
Showing
3 changed files
corpus2tools/CMakeLists.txt
+5
-5
5 additions, 5 deletions
corpus2tools/CMakeLists.txt
corpus2tools/corpus-get
+134
-0
134 additions, 0 deletions
corpus2tools/corpus-get
corpus2tools/corpus-get.cpp
+0
-77
0 additions, 77 deletions
corpus2tools/corpus-get.cpp
with
139 additions
and
82 deletions
corpus2tools/CMakeLists.txt
+
5
−
5
View file @
27e5a477
...
...
@@ -12,15 +12,15 @@ include_directories( ${CMAKE_SOURCE_DIR} )
add_executable
(
tagset-tool tagset-tool.cpp
)
target_link_libraries
(
tagset-tool corpus2 pwrutils
${
Boost_LIBRARIES
}
${
LIBS
}
)
add_executable
(
corpus-get corpus-get.cpp
)
target_link_libraries
(
corpus-get corpus2 pwrutils
${
Boost_LIBRARIES
}
${
LIBS
}
)
include_directories
(
${
Boost_INCLUDE_DIR
}
)
link_directories
(
${
Boost_LIBRARY_DIRS
}
)
if
(
UNIX
)
install
(
TARGETS tagset-tool
corpus-get
install
(
TARGETS tagset-tool
RUNTIME DESTINATION bin
)
install
(
FILES corpus-get
DESTINATION bin
PERMISSIONS OWNER_READ OWNER_WRITE OWNER_EXECUTE
GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE
)
endif
(
UNIX
)
This diff is collapsed.
Click to expand it.
corpus2tools/corpus-get
0 → 100755
+
134
−
0
View file @
27e5a477
#!/usr/bin/python
# -*- coding: utf-8 -*-
import
sys
from
optparse
import
OptionParser
from
collections
import
defaultdict
as
dd
from
itertools
import
repeat
,
izip
import
corpus2
# Usage/description text passed to OptionParser(usage=...) in go().
# Built by concatenating the fixed usage lines with the joined results of
# corpus2.TokenReader.available_reader_types(), available_reader_types_help(),
# corpus2.TokenWriter.available_writer_types() and available_writer_types_help().
# NOTE(review): whitespace inside the string literals below was lost when this
# page was extracted -- restore it from the repository before relying on it.
descr
=
"""
%prog [options] CORPUSFILE [SENTENCERANGE[:TOKENRANGE]]
Reads a corpus file and outputs all or some tokens.
Available input formats:
"""
+
'
'
.
join
(
corpus2
.
TokenReader
.
available_reader_types
())
+
"""
"""
+
'
'
.
join
(
corpus2
.
TokenReader
.
available_reader_types_help
())
+
"""
Available output formats:
"""
+
'
'
.
join
(
corpus2
.
TokenWriter
.
available_writer_types
())
+
"""
"""
+
'
'
.
join
(
corpus2
.
TokenWriter
.
available_writer_types_help
())
def parse_range_info(s):
    """
    Parses a comma-separated list of numbers that
    can also be dash-separated ranges.

    Each element is either a single integer ("7") or an inclusive range
    written as two integers joined by a dash ("3-9"). Range bounds may be
    given in either order and are compared numerically, so "9-10" and
    "100-98" both expand as expected.

    Elements that cannot be parsed are reported on stderr (stdout carries the
    corpus output) and skipped; the remaining elements are still processed.

    Returns a set of ints.
    """
    selection = set()
    for elem in (x.strip() for x in s.split(',')):
        try:
            selection.add(int(elem))
            continue
        except ValueError:
            # Not a plain integer; fall through and try it as a range.
            pass
        try:
            bounds = [int(x) for x in elem.split('-')]
            if len(bounds) != 2:
                raise ValueError(elem)
        except ValueError:
            sys.stderr.write("Fail: %s\n" % elem)
            continue
        # Order the bounds as numbers, not as strings.
        low, high = sorted(bounds)
        selection.update(range(low, high + 1))
    return selection
def sentences(rdr):
    """
    Generator over the sentences of a reader.

    Calls rdr.get_next_sentence() repeatedly and yields each result until
    the reader hands back a false value. Declared here for demonstration.
    """
    current = rdr.get_next_sentence()
    while current:
        yield current
        current = rdr.get_next_sentence()
def chunks(rdr):
    """
    Generator over the chunks of a reader.

    Calls rdr.get_next_chunk() repeatedly and yields each result until the
    reader hands back a false value.
    """
    current = rdr.get_next_chunk()
    while current:
        yield current
        current = rdr.get_next_chunk()
def write_selected_sentences(sents, writer, selection):
    """
    Writes the selected sentences (or selected tokens of them) to a writer.

    sents     -- iterable of sentences, numbered from 0 in iteration order
    writer    -- object providing write_sentence() and write_token()
    selection -- mapping from sentence index to a collection of token
                 indices; an empty collection means "the whole sentence"

    Sentences whose index is not a key of selection are skipped. Tokens are
    numbered from 0 in the order returned by sentence.tokens().
    """
    for index, sentence in enumerate(sents):
        if index not in selection:
            continue
        wanted_tokens = selection[index]
        if len(wanted_tokens) == 0:
            writer.write_sentence(sentence)
            continue
        for position, token in enumerate(sentence.tokens()):
            if position in wanted_tokens:
                writer.write_token(token)
def go():
    """
    Command-line entry point: reads a corpus and writes all or part of it
    to stdout.

    Positional arguments are CORPUSFILE followed by any number of selection
    arguments. Indices are zero-based and each part is a list accepted by
    parse_range_info():

      without --chunks:  SENTENCES            whole sentences
                         SENTENCES:TOKENS     chosen tokens of the sentences
      with --chunks:     CHUNKS               whole chunks
                         CHUNKS:SENTENCES     chosen sentences of the chunks
                         CHUNKS:SENTENCES:TOKENS

    With no selection arguments the whole corpus is copied. When several
    arguments name the same index, the last one wins.

    Exits with status 1 when the corpus path is missing or a selection
    argument has an unsupported number of ':'-separated parts.
    """
    parser = OptionParser(usage=descr)
    parser.add_option('-i', '--input-format', type='string', action='store',
                      dest='input_format', default='xces',
                      help='set the input format; default: xces')
    parser.add_option('-o', '--output-format', type='string', action='store',
                      dest='output_format', default='xces',
                      help='set the output format; default: xces')
    parser.add_option('-t', '--tagset', type='string', action='store',
                      dest='tagset', default='kipi',
                      help='set the tagset used in input; default: kipi')
    parser.add_option('-C', '--chunks', action='store_true',
                      dest='chunks', default=False,
                      help='Process chunks (select chunks/sentences, not tokens)')
    parser.add_option('-v', '--verbose', action='store_true',
                      dest='verbose', default=False,
                      help='verbose mode')
    (options, args) = parser.parse_args()

    if len(args) < 1:
        sys.stdout.write('You need to provide an input corpus.\n')
        sys.stdout.write('See %s --help\n' % sys.argv[0])
        sys.exit(1)

    inpath = args[0]
    # load a tagset, create a reader and a writer
    tagset = corpus2.get_named_tagset(options.tagset)
    reader = corpus2.TokenReader.create_path_reader(
        options.input_format, tagset, inpath)
    writer = corpus2.TokenWriter.create_stdout_writer(
        options.output_format, tagset)

    # selection maps an index to what should be taken from that element;
    # an empty value means "take the whole element".
    selection = {}
    for arg in args[1:]:
        if ':' not in arg:
            selection.update(dict.fromkeys(parse_range_info(arg), ()))
            continue
        sp = arg.split(':')
        if len(sp) == 2 and options.chunks:
            # chunk -> {sentence -> whole sentence}
            inner = dict.fromkeys(parse_range_info(sp[1]), ())
            selection.update(dict.fromkeys(parse_range_info(sp[0]), inner))
        elif len(sp) == 3 and options.chunks:
            # chunk -> {sentence -> token indices}
            inner = dict.fromkeys(parse_range_info(sp[1]),
                                  parse_range_info(sp[2]))
            selection.update(dict.fromkeys(parse_range_info(sp[0]), inner))
        elif len(sp) == 2:
            # sentence -> token indices
            selection.update(dict.fromkeys(parse_range_info(sp[0]),
                                           parse_range_info(sp[1])))
        else:
            sys.stderr.write("Invalid argument: %s\n" % arg)
            # Signal failure to the caller, like the missing-input case above.
            sys.exit(1)

    if not selection:
        # Nothing selected: copy the whole corpus.
        if options.chunks:
            for chunk in chunks(reader):
                writer.write_chunk(chunk)
        else:
            for sent in sentences(reader):
                writer.write_sentence(sent)
    elif options.chunks:
        for cid, chunk in enumerate(chunks(reader)):
            if cid not in selection:
                continue
            if len(selection[cid]) == 0:
                writer.write_chunk(chunk)
            else:
                write_selected_sentences(chunk.sentences(), writer,
                                         selection[cid])
    else:
        write_selected_sentences(sentences(reader), writer, selection)


if __name__ == '__main__':
    go()
This diff is collapsed.
Click to expand it.
corpus2tools/corpus-get.cpp
deleted
100644 → 0
+
0
−
77
View file @
6cc8e28a
#include
<libcorpus2/tagsetmanager.h>
#include
<libcorpus2/util/ioformat-options.h>
#include
<boost/program_options.hpp>
#include
<boost/algorithm/string.hpp>
int
main
(
int
argc
,
char
**
argv
)
{
std
::
string
tagset_name
,
filename
;
std
::
string
input_format
,
output_format
;
int
sentence
,
token
=
-
1
;
size_t
stats
=
0
;
using
boost
::
program_options
::
value
;
boost
::
program_options
::
options_description
desc
(
"Allowed options"
);
desc
.
add_options
()
(
"filename,F"
,
value
(
&
filename
),
"filename"
)
(
"sentence,S"
,
value
(
&
sentence
),
"Sentence idx"
)
(
"stats,s"
,
value
(
&
stats
),
"Stats"
)
(
"token,T"
,
value
(
&
token
),
"Token idx "
)
(
"tagset,t"
,
value
(
&
tagset_name
)
->
default_value
(
"kipi"
),
"Tagset name"
)
;
Corpus2
::
add_input_options
(
desc
);
Corpus2
::
add_output_options
(
desc
);
boost
::
program_options
::
variables_map
vm
;
boost
::
program_options
::
positional_options_description
p
;
p
.
add
(
"filename"
,
1
);
p
.
add
(
"sentence"
,
1
);
p
.
add
(
"token"
,
1
);
try
{
boost
::
program_options
::
store
(
boost
::
program_options
::
command_line_parser
(
argc
,
argv
)
.
options
(
desc
).
positional
(
p
).
run
(),
vm
);
}
catch
(
boost
::
program_options
::
error
&
e
)
{
std
::
cerr
<<
e
.
what
()
<<
"
\n
"
;
return
2
;
}
boost
::
program_options
::
notify
(
vm
);
if
(
vm
.
count
(
"help"
))
{
std
::
cout
<<
desc
<<
"
\n
"
;
return
1
;
}
const
Corpus2
::
Tagset
&
tagset
=
Corpus2
::
get_named_tagset
(
tagset_name
);
boost
::
shared_ptr
<
Corpus2
::
TokenReader
>
reader
;
reader
=
Corpus2
::
create_reader
(
vm
,
tagset
,
filename
);
Corpus2
::
Sentence
::
Ptr
s
;
boost
::
shared_ptr
<
Corpus2
::
TokenWriter
>
writer
;
writer
=
Corpus2
::
create_writer
(
vm
,
tagset
);
std
::
map
<
int
,
int
>
lens
;
for
(
int
i
=
0
;
i
<=
sentence
;
++
i
)
{
s
=
reader
->
get_next_sentence
();
if
(
s
)
{
lens
[
s
->
size
()]
++
;
if
(
s
->
size
()
>
stats
)
{
std
::
cerr
<<
i
<<
"
\n
"
;
writer
->
write_sentence
(
*
s
);
}
}
}
if
(
s
)
{
if
(
token
==
-
1
)
{
writer
->
write_sentence
(
*
s
);
}
else
if
(
static_cast
<
size_t
>
(
token
)
<
s
->
size
())
{
writer
->
write_token
(
*
(
*
s
)[
token
]);
}
}
if
(
stats
)
{
typedef
std
::
pair
<
int
,
int
>
pp
;
foreach
(
const
pp
&
p
,
lens
)
{
std
::
cerr
<<
p
.
first
<<
" "
<<
p
.
second
<<
"
\n
"
;
}
}
}
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment