Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
c5771637
Commit
c5771637
authored
12 years ago
by
Adam Radziszewski
Browse files
Options
Downloads
Patches
Plain Diff
corpus-merge: switch to prefix chunk (=par) ids with file name
parent
d998c411
Branches
Branches containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
corpus2tools/corpus-merge
+16
-1
16 additions, 1 deletion
corpus2tools/corpus-merge
with
16 additions
and
1 deletion
corpus2tools/corpus-merge
+
16
−
1
View file @
c5771637
#!/usr/bin/python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import
sys
import
sys
,
os
from
optparse
import
OptionParser
from
optparse
import
OptionParser
from
collections
import
defaultdict
as
dd
from
collections
import
defaultdict
as
dd
from
itertools
import
repeat
,
izip
from
itertools
import
repeat
,
izip
...
@@ -45,6 +45,9 @@ def go():
...
@@ -45,6 +45,9 @@ def go():
parser
.
add_option
(
'
-C
'
,
'
--chunks
'
,
action
=
'
store_true
'
,
parser
.
add_option
(
'
-C
'
,
'
--chunks
'
,
action
=
'
store_true
'
,
dest
=
'
chunks
'
,
default
=
False
,
dest
=
'
chunks
'
,
default
=
False
,
help
=
'
Process chunks (select chunks/sentences, not tokens)
'
)
help
=
'
Process chunks (select chunks/sentences, not tokens)
'
)
parser
.
add_option
(
'
--prefix-chunks
'
,
action
=
'
store_true
'
,
dest
=
'
prefix_chunks
'
,
default
=
False
,
help
=
'
Prefix chunk ids with filename (file:NAME:ORIGID)
'
)
parser
.
add_option
(
'
-v
'
,
'
--verbose
'
,
action
=
'
store_true
'
,
parser
.
add_option
(
'
-v
'
,
'
--verbose
'
,
action
=
'
store_true
'
,
dest
=
'
verbose
'
,
default
=
False
,
dest
=
'
verbose
'
,
default
=
False
,
help
=
'
verbose mode
'
)
help
=
'
verbose mode
'
)
...
@@ -68,11 +71,23 @@ def go():
...
@@ -68,11 +71,23 @@ def go():
for
arg
in
args
:
for
arg
in
args
:
reader
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
arg
)
reader
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
arg
)
if
options
.
chunks
:
if
options
.
chunks
:
fname
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
arg
))
chunk_no
=
1
for
chunk
in
chunks
(
reader
):
for
chunk
in
chunks
(
reader
):
if
options
.
prefix_chunks
:
if
chunk
.
has_attribute
(
'
id
'
):
their_id
=
chunk
.
get_attribute
(
'
id
'
)
else
:
# autogen
their_id
=
(
'
auto%03d
'
%
chunk_no
)
full_id
=
'
file:%s:%s
'
%
(
fname
,
their_id
)
chunk
.
set_attribute
(
'
id
'
,
full_id
)
writer
.
write_chunk
(
chunk
)
writer
.
write_chunk
(
chunk
)
chunk_no
+=
1
else
:
else
:
for
sent
in
sentences
(
reader
):
for
sent
in
sentences
(
reader
):
writer
.
write_sentence
(
sent
)
writer
.
write_sentence
(
sent
)
del
reader
if
__name__
==
'
__main__
'
:
if
__name__
==
'
__main__
'
:
go
()
go
()
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment