Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
C
corpus2
Manage
Activity
Members
Labels
Plan
Issues
4
Issue boards
Milestones
Wiki
Redmine
Code
Merge requests
0
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container Registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Analysers
corpus2
Commits
a3c9f89d
Commit
a3c9f89d
authored
11 years ago
by
blaz
Browse files
Options
Downloads
Patches
Plain Diff
Zmiany w corpus-merge
parent
831e4113
Branches
Branches containing commit
No related merge requests found
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
corpus2tools/corpus-merge
+29
-2
29 additions, 2 deletions
corpus2tools/corpus-merge
libcorpus2_whole/relation.h
+1
-1
1 addition, 1 deletion
libcorpus2_whole/relation.h
with
30 additions
and
3 deletions
corpus2tools/corpus-merge
+
29
−
2
View file @
a3c9f89d
#!/usr/bin/python
#!/usr/bin/python
# -*- coding: utf-8 -*-
# -*- coding: utf-8 -*-
import
sys
,
os
import
sys
,
os
from
xml.sax.saxutils
import
escape
from
optparse
import
OptionParser
from
optparse
import
OptionParser
from
collections
import
defaultdict
as
dd
from
collections
import
defaultdict
as
dd
from
itertools
import
repeat
,
izip
from
itertools
import
repeat
,
izip
...
@@ -48,6 +49,12 @@ def go():
...
@@ -48,6 +49,12 @@ def go():
parser
.
add_option
(
'
--prefix-chunks
'
,
action
=
'
store_true
'
,
parser
.
add_option
(
'
--prefix-chunks
'
,
action
=
'
store_true
'
,
dest
=
'
prefix_chunks
'
,
default
=
False
,
dest
=
'
prefix_chunks
'
,
default
=
False
,
help
=
'
Prefix chunk ids with filename (file:NAME:ORIGID)
'
)
help
=
'
Prefix chunk ids with filename (file:NAME:ORIGID)
'
)
parser
.
add_option
(
'
--prefix-sentences
'
,
action
=
'
store_true
'
,
dest
=
'
prefix_sentences
'
,
default
=
False
,
help
=
'
Prefix sentneces ids with filename (file:NAME:ORIGID)
'
)
parser
.
add_option
(
'
--documents-as-chunks
'
,
action
=
'
store_true
'
,
dest
=
'
documents_as_chunks
'
,
default
=
False
,
help
=
'
Writes every document into single chunk node
'
)
parser
.
add_option
(
'
-v
'
,
'
--verbose
'
,
action
=
'
store_true
'
,
parser
.
add_option
(
'
-v
'
,
'
--verbose
'
,
action
=
'
store_true
'
,
dest
=
'
verbose
'
,
default
=
False
,
dest
=
'
verbose
'
,
default
=
False
,
help
=
'
verbose mode
'
)
help
=
'
verbose mode
'
)
...
@@ -70,8 +77,9 @@ def go():
...
@@ -70,8 +77,9 @@ def go():
writer
=
corpus2
.
TokenWriter
.
create_stdout_writer
(
options
.
output_format
,
tagset
)
writer
=
corpus2
.
TokenWriter
.
create_stdout_writer
(
options
.
output_format
,
tagset
)
for
arg
in
args
:
for
arg
in
args
:
reader
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
arg
)
reader
=
corpus2
.
TokenReader
.
create_path_reader
(
options
.
input_format
,
tagset
,
arg
)
fname
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
arg
))
fname
=
escape
(
fname
)
if
options
.
chunks
:
if
options
.
chunks
:
fname
,
_
=
os
.
path
.
splitext
(
os
.
path
.
basename
(
arg
))
chunk_no
=
1
chunk_no
=
1
for
chunk
in
chunks
(
reader
):
for
chunk
in
chunks
(
reader
):
if
options
.
prefix_chunks
:
if
options
.
prefix_chunks
:
...
@@ -85,8 +93,27 @@ def go():
...
@@ -85,8 +93,27 @@ def go():
writer
.
write_chunk
(
chunk
)
writer
.
write_chunk
(
chunk
)
chunk_no
+=
1
chunk_no
+=
1
else
:
else
:
big_chunk
=
None
if
options
.
documents_as_chunks
:
big_chunk
=
corpus2
.
Chunk
()
big_chunk
.
set_attribute
(
'
id
'
,
'
file:%s:%s
'
%
(
fname
,
'
ch1
'
))
sent_no
=
1
for
sent
in
sentences
(
reader
):
for
sent
in
sentences
(
reader
):
writer
.
write_sentence
(
sent
)
if
options
.
prefix_sentences
:
if
not
sent
.
id
():
their_id
=
sent
.
id
()
else
:
#autogen
their_id
=
(
'
s%d
'
%
sent_no
)
full_id
=
'
file:%s:%s
'
%
(
fname
,
their_id
)
sent
.
set_id
(
full_id
)
if
big_chunk
:
big_chunk
.
append
(
sent
)
else
:
writer
.
write_sentence
(
sent
)
sent_no
+=
1
if
big_chunk
:
writer
.
write_chunk
(
big_chunk
)
del
reader
del
reader
if
__name__
==
'
__main__
'
:
if
__name__
==
'
__main__
'
:
...
...
This diff is collapsed.
Click to expand it.
libcorpus2_whole/relation.h
+
1
−
1
View file @
a3c9f89d
...
@@ -33,7 +33,7 @@ public:
...
@@ -33,7 +33,7 @@ public:
/**
/**
* @param sentence_id Sentence identifier
* @param sentence_id Sentence identifier
* @param channel_name Channel name
* @param channel_name Channel name
* @param annotation_number Annotation number
* @param annotation_number Annotation number
aka annotation segment
*/
*/
DirectionPoint
(
const
std
::
string
sentence_id
,
DirectionPoint
(
const
std
::
string
sentence_id
,
const
std
::
string
channel_name
,
const
std
::
string
channel_name
,
...
...
This diff is collapsed.
Click to expand it.
Preview
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment