Commit 98b6f93d authored by Maja Jabłońska

Add ConllReader

parent def607ea
1 merge request: !46 Merge COMBO 3.0 into master
@@ -5,4 +5,5 @@ from .instance import Instance
 from .token_indexers import (SingleIdTokenIndexer, TokenIndexer, TokenFeatsIndexer)
 from .tokenizers import (Tokenizer, TokenizerToken, CharacterTokenizer, PretrainedTransformerTokenizer,
                          SpacyTokenizer, WhitespaceTokenizer)
-from .dataset_readers import DatasetReader, TextClassificationJSONReader
+from .dataset_readers import (ConllDatasetReader, DatasetReader,
+                              TextClassificationJSONReader, UniversalDependenciesDatasetReader)

 from .dataset_reader import DatasetReader
 from .text_classification_json_reader import TextClassificationJSONReader
 from .universal_dependencies_dataset_reader import UniversalDependenciesDatasetReader
+from .conll import ConllDatasetReader
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/main/allennlp/data/dataset_readers/conll2003.py
"""
from typing import Dict, List, Optional, Sequence, Iterable
import itertools
import logging
from combo.utils import ConfigurationError
from .dataset_reader import DatasetReader, PathOrStr
from combo.data.token_indexers.token_indexer import TokenIndexer, TokenizerToken
from combo.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from .dataset_utils.span_utils import to_bioul
from .. import Instance
from ..fields import MetadataField, TextField, Field, SequenceLabelField
from ...utils.file_utils import cached_path
logger = logging.getLogger(__name__)
def _is_divider(line: str) -> bool:
empty_line = line.strip() == ""
if empty_line:
return True
else:
first_token = line.split()[0]
if first_token == "-DOCSTART-":
return True
else:
return False
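# Illustration of the divider behaviour:
#   _is_divider("")                      -> True   (blank line)
#   _is_divider("-DOCSTART- -X- -X- O")  -> True   (document marker)
#   _is_divider("SOCCER NN B-NP O")      -> False  (token line)
# itertools.groupby over this predicate is what later carves the file
# into per-sentence chunks in _read.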
# TODO: maybe one should note whether the format is IOB1 or IOB2 in the processed dataset?
class ConllDatasetReader(DatasetReader):
"""
Reads instances from a pretokenised file where each line is in the following format:
```
WORD POS-TAG CHUNK-TAG NER-TAG
```
with a blank line indicating the end of each sentence
and `-DOCSTART- -X- -X- O` indicating the end of each article,
and converts it into a `Dataset` suitable for sequence tagging.
Each `Instance` contains the words in the `"tokens"` `TextField`.
The values corresponding to the `tag_label`
values will get loaded into the `"tags"` `SequenceLabelField`.
And if you specify any `feature_labels` (you probably shouldn't),
the corresponding values will get loaded into their own `SequenceLabelField` s.
This dataset reader ignores the "article" divisions and simply treats
each sentence as an independent `Instance`. (Technically the reader splits sentences
on any combination of blank lines and "DOCSTART" tags; in particular, it does the right
    thing on well-formed inputs.)
Registered as a `DatasetReader` with name "conll2003".
# Parameters
token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
tag_label : `str`, optional (default=`ner`)
        Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tags`.
feature_labels : `Sequence[str]`, optional (default=`()`)
These labels will be loaded as features into the corresponding instance fields:
`pos` -> `pos_tags`, `chunk` -> `chunk_tags`, `ner` -> `ner_tags`
Each will have its own namespace : `pos_tags`, `chunk_tags`, `ner_tags`.
If you want to use one of the tags as a `feature` in your model, it should be
specified here.
convert_to_coding_scheme : `Optional[str]`, optional (default=`None`)
Specifies the coding scheme for `ner_labels` and `chunk_labels`.
If `None` is passed, no change will be applied.
Valid options are `None` and `BIOUL`.
        In the IOB1 scheme, I is a token inside a span, O is a token outside
        a span, and B is the beginning of a span that immediately follows
        another span of the same type; in IOB2, every span starts with B.
    coding_scheme : `str`, optional (default=`IOB1`)
        Specifies the coding scheme of the input file.
        Valid options are `IOB1` and `IOB2`.
label_namespace : `str`, optional (default=`labels`)
Specifies the namespace for the chosen `tag_label`.
"""
_VALID_LABELS = {"ner", "pos", "chunk"}
_VALID_CODING_OPTIONS = ('IOB1', 'IOB2')
_VALID_CONVERT_TO_CODING_OPTIONS = (None, 'BIOUL')
def __init__(
self,
token_indexers: Dict[str, TokenIndexer] = None,
tag_label: str = "ner",
feature_labels: Sequence[str] = (),
convert_to_coding_scheme: Optional[str] = None,
coding_scheme: str = 'IOB1',
label_namespace: str = "labels",
**kwargs,
) -> None:
super().__init__(**kwargs)
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
if tag_label is not None and tag_label not in self._VALID_LABELS:
raise ConfigurationError("unknown tag label type: {}".format(tag_label))
for label in feature_labels:
if label not in self._VALID_LABELS:
raise ConfigurationError("unknown feature label type: {}".format(label))
if coding_scheme not in self._VALID_CODING_OPTIONS:
raise ConfigurationError(
"unknown coding_scheme: {}".format(coding_scheme)
)
if convert_to_coding_scheme not in self._VALID_CONVERT_TO_CODING_OPTIONS:
raise ConfigurationError(
"unknown convert_to_coding_scheme: {}".format(convert_to_coding_scheme)
)
self.tag_label = tag_label
self.feature_labels = set(feature_labels)
self.__convert_to_coding_scheme = convert_to_coding_scheme
self.label_namespace = label_namespace
self.__coding_scheme = coding_scheme
@property
def convert_to_coding_scheme(self) -> str:
return self.__convert_to_coding_scheme
@property
def coding_scheme(self) -> str:
return self.__coding_scheme
def _read(self) -> Iterable[Instance]:
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(self.file_path)
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
# Group lines into sentence chunks based on the divider.
line_chunks = (
lines
for is_divider, lines in itertools.groupby(data_file, _is_divider)
# Ignore the divider chunks, so that `lines` corresponds to the words
# of a single sentence.
if not is_divider
)
for lines in line_chunks:
fields = [line.strip().split() for line in lines]
# unzipping trick returns tuples, but our Fields need lists
fields = [list(field) for field in zip(*fields)]
tokens_, pos_tags, chunk_tags, ner_tags = fields
# TextField requires `Token` objects
tokens = [TokenizerToken(token) for token in tokens_]
yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
def text_to_instance( # type: ignore
self,
tokens: List[TokenizerToken],
pos_tags: List[str] = None,
chunk_tags: List[str] = None,
ner_tags: List[str] = None,
) -> Instance:
"""
We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
"""
sequence = TextField(tokens)
instance_fields: Dict[str, Field] = {"tokens": sequence,
"metadata": MetadataField({"words": [x.text for x in tokens]})}
# Recode the labels if necessary.
if self.__convert_to_coding_scheme == "BIOUL":
coded_chunks = (
to_bioul(chunk_tags, encoding=self.__coding_scheme)
if chunk_tags is not None
else None
)
coded_ner = (
to_bioul(ner_tags, encoding=self.__coding_scheme)
if ner_tags is not None
else None
)
else:
# the default IOB1/IOB2
coded_chunks = chunk_tags
coded_ner = ner_tags
# Add "feature labels" to instance
if "pos" in self.feature_labels:
if pos_tags is None:
raise ConfigurationError(
"Dataset reader was specified to use pos_tags as "
"features. Pass them to text_to_instance."
)
instance_fields["pos_tags"] = SequenceLabelField(pos_tags, sequence, "pos_tags")
if "chunk" in self.feature_labels:
if coded_chunks is None:
raise ConfigurationError(
"Dataset reader was specified to use chunk tags as "
"features. Pass them to text_to_instance."
)
instance_fields["chunk_tags"] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
if "ner" in self.feature_labels:
if coded_ner is None:
raise ConfigurationError(
"Dataset reader was specified to use NER tags as "
" features. Pass them to text_to_instance."
)
instance_fields["ner_tags"] = SequenceLabelField(coded_ner, sequence, "ner_tags")
# Add "tag label" to instance
if self.tag_label == "ner" and coded_ner is not None:
instance_fields["tags"] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
elif self.tag_label == "pos" and pos_tags is not None:
instance_fields["tags"] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
elif self.tag_label == "chunk" and coded_chunks is not None:
instance_fields["tags"] = SequenceLabelField(
coded_chunks, sequence, self.label_namespace
)
return Instance(instance_fields)
def __call__(self, file_path: str):
self.file_path = file_path
return self
def apply_token_indexers(self, instance: Instance) -> None:
instance.fields["tokens"]._token_indexers = self._token_indexers # type: ignore
@@ -12,12 +12,11 @@ from combo.utils import ConfigurationError, InvalidTagSequence
TypedSpan = Tuple[int, Tuple[int, int]]
TypedStringSpan = Tuple[str, Tuple[int, int]]
T = TypeVar("T", str, TokenizerToken)
def bio_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BIO tags, extracts spans.
@@ -85,8 +84,10 @@ def bio_tags_to_spans(
spans.add((active_conll_tag, (span_start, span_end)))
return list(spans)
-def iob1_tags_to_spans(
-        tag_sequence: List[str], classes_to_ignore: List[str] = None
+def _iob_tags_to_spans(
+        start_of_chunk_fun: Callable[[Optional[str], Optional[str], str, str], bool],
+        tag_sequence: List[str], classes_to_ignore: List[str] = None,
 ) -> List[TypedStringSpan]:
"""
    Given a sequence corresponding to IOB1 or IOB2 tags, extracts spans.
@@ -122,7 +123,7 @@ def iob1_tags_to_spans(
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
active_conll_tag = None
-        elif _iob1_start_of_chunk(prev_bio_tag, prev_conll_tag, curr_bio_tag, curr_conll_tag):
+        elif start_of_chunk_fun(prev_bio_tag, prev_conll_tag, curr_bio_tag, curr_conll_tag):
# We are entering a new span; reset indices
# and active tag to new span.
if active_conll_tag is not None:
@@ -144,10 +145,10 @@ def iob1_tags_to_spans(
def _iob1_start_of_chunk(
        prev_bio_tag: Optional[str],
        prev_conll_tag: Optional[str],
        curr_bio_tag: str,
        curr_conll_tag: str,
) -> bool:
if curr_bio_tag == "B":
return True
@@ -158,8 +159,35 @@ def _iob1_start_of_chunk(
return False
def _iob2_start_of_chunk(
prev_bio_tag: Optional[str],
prev_conll_tag: Optional[str],
curr_bio_tag: str,
curr_conll_tag: str,
) -> bool:
if curr_bio_tag == "B":
return True
if curr_bio_tag != "O" and prev_conll_tag != curr_conll_tag:
return True
return False
def iob1_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None,
) -> List[TypedStringSpan]:
return _iob_tags_to_spans(_iob1_start_of_chunk,
tag_sequence, classes_to_ignore)
def iob2_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None,
) -> List[TypedStringSpan]:
return _iob_tags_to_spans(_iob2_start_of_chunk,
tag_sequence, classes_to_ignore)
def bioul_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BIOUL tags, extracts spans.
@@ -201,14 +229,6 @@ def bioul_tags_to_spans(
return [span for span in spans if span[0] not in classes_to_ignore]
-def iob1_to_bioul(tag_sequence: List[str]) -> List[str]:
-    warnings.warn(
-        "iob1_to_bioul has been replaced with 'to_bioul' to allow more encoding options.",
-        FutureWarning,
-    )
-    return to_bioul(tag_sequence)
def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
"""
Given a tag sequence encoded with IOB1 labels, recode to BIOUL.
@@ -221,12 +241,12 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
tag_sequence : `List[str]`, required.
The tag sequence encoded in IOB1, e.g. ["I-PER", "I-PER", "O"].
encoding : `str`, optional, (default = `"IOB1"`).
-        The encoding type to convert from. Must be either "IOB1" or "BIO".
+        The encoding type to convert from. Must be either "IOB1", "IOB2", or "BIO".
# Returns
bioul_sequence : `List[str]`
The tag sequence encoded in IOB1, e.g. ["B-PER", "L-PER", "O"].
"""
if encoding not in {"IOB1", "BIO"}:
if encoding not in {"IOB1", "IOB2", "BIO"}:
raise ConfigurationError(f"Invalid encoding {encoding} passed to 'to_bioul'.")
def replace_label(full_label, new_label):
@@ -275,12 +295,14 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
process_stack(stack, bioul_sequence)
bioul_sequence.append(label)
elif label[0] == "I":
# IOB1:
# check if the previous type is the same as this one
# if it is then append to stack
            # otherwise this starts a new entity if the type
            # is different
if len(stack) == 0:
if encoding == "BIO":
# Beginning of the sequence
if encoding in {"IOB2", "BIO"}:
raise InvalidTagSequence(tag_sequence)
stack.append(label)
else:
@@ -290,7 +312,7 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
if this_type == prev_type:
stack.append(label)
else:
if encoding == "BIO":
if encoding in {"IOB2", "BIO"}:
raise InvalidTagSequence(tag_sequence)
# a new entity
process_stack(stack, bioul_sequence)
@@ -310,7 +332,7 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
def bmes_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BMES tags, extracts spans.
......
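A short sketch of what the IOB2 additions provide. The import path is inferred from the relative imports in conll.py (an assumption), and the expected outputs follow the AllenNLP semantics this module adapts; spans come back from a set, hence the sorted calls:

from combo.data.dataset_readers.dataset_utils.span_utils import (
    iob1_tags_to_spans, iob2_tags_to_spans, to_bioul)

# IOB1: B- only marks a span that directly follows another span of the
# same type, so a sequence may legally begin with I-.
print(sorted(iob1_tags_to_spans(["I-PER", "I-PER", "B-PER", "O"])))
# -> [('PER', (0, 1)), ('PER', (2, 2))]

# IOB2: every span begins with B-.
print(sorted(iob2_tags_to_spans(["B-PER", "I-PER", "B-PER", "O"])))
# -> [('PER', (0, 1)), ('PER', (2, 2))]

# to_bioul now accepts "IOB2" as a source encoding as well.
print(to_bioul(["B-PER", "I-PER", "B-PER", "O"], encoding="IOB2"))
# -> ['B-PER', 'L-PER', 'U-PER', 'O']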
from .checks import *
from .sequence import *
from .exceptions import *
\ No newline at end of file
-DOCSTART- -X- -X- O

SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O

Nadim NNP B-NP B-PER
Ladki NNP I-NP I-PER
AL-AIN NNP B-NP B-LOC
, , O O
United NNP B-NP B-LOC
Arab NNP I-NP I-LOC
Emirates NNPS I-NP I-LOC
1996-12-06 CD I-NP O

Japan NNP B-NP B-LOC
began VBD B-VP O
the DT B-NP O
defence NN I-NP O
of IN B-PP O
their PRP$ B-NP O
Asian JJ I-NP B-MISC
Cup NNP I-NP I-MISC
title NN I-NP O
with IN B-PP O
a DT B-NP O
lucky JJ I-NP O
2-1 CD I-NP O
win VBP B-VP O
against IN B-PP O
Syria NNP B-NP B-LOC
in IN B-PP O
a DT B-NP O
Group NNP I-NP O
C NNP I-NP O
championship NN I-NP O
match NN I-NP O
on IN B-PP O
Friday NNP B-NP O
. . O O

But CC O O
China NNP B-NP B-LOC
saw VBD B-VP O
their PRP$ B-NP O
luck NN I-NP O
desert VB B-VP O
them PRP B-NP O
in IN B-PP O
the DT B-NP O
second NN I-NP O
match NN I-NP O
of IN B-PP O
the DT B-NP O
group NN I-NP O
, , O O
crashing VBG B-VP O
to TO B-PP O
a DT B-NP O
surprise NN I-NP O
2-0 CD I-NP O
defeat NN I-NP O
to TO B-PP O
newcomers NNS B-NP O
Uzbekistan NNP I-NP B-LOC
. . O O

China NNP B-NP B-LOC
controlled VBD B-VP O
most JJS B-NP O
of IN B-PP O
the DT B-NP O
match NN I-NP O
and CC O O
saw VBD B-VP O
several JJ B-NP O
chances NNS I-NP O
missed VBD B-VP O
until IN B-SBAR O
the DT B-NP O
78th JJ I-NP O
minute NN I-NP O
when WRB B-ADVP O
Uzbek NNP B-NP B-MISC
striker NN I-NP O
Igor JJ B-NP B-PER
Shkvyrin NNP I-NP I-PER
took VBD B-VP O
advantage NN B-NP O
of IN B-PP O
a DT B-NP O
misdirected JJ I-NP O
defensive JJ I-NP O
header NN I-NP O
to TO B-VP O
lob VB I-VP O
the DT B-NP O
ball NN I-NP O
over IN B-PP O
the DT B-NP O
advancing VBG I-NP O
Chinese JJ I-NP B-MISC
keeper NN I-NP O
and CC O O
into IN B-PP O
an DT B-NP O
empty JJ I-NP O
net NN I-NP O
. . O O

Oleg NNP B-NP B-PER
Shatskiku NNP I-NP I-PER
made VBD B-VP O
sure JJ B-ADJP O
of IN B-PP O
the DT B-NP O
win VBP B-VP O
in IN B-PP O
injury NN B-NP O
time NN I-NP O
, , O O
hitting VBG B-VP O
an DT B-NP O
unstoppable JJ I-NP O
left VBD B-VP O
foot NN B-NP O
shot NN I-NP O
from IN B-PP O
just RB B-NP O
outside IN B-PP O
the DT B-NP O
area NN I-NP O
. . O O
\ No newline at end of file
import unittest
from combo.data import ConllDatasetReader
class ConllDatasetReaderTest(unittest.TestCase):
def test_read_all_tokens(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
        instances = list(reader('conll_test_file.txt'))
        self.assertEqual(len(instances), 6)
def test_tokenize_correct_tokens(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
        instance = next(iter(reader('conll_test_file.txt')))
        self.assertListEqual([str(t) for t in instance['tokens'].tokens],
['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',',
'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.'])
def test_tokenize_correct_tags(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
        instance = next(iter(reader('conll_test_file.txt')))
        self.assertListEqual(instance['tags'].labels,
['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O',
'B-PER', 'O', 'O', 'O', 'O'])
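These tests rely on the divider logic in the reader: blank lines and the -DOCSTART- line split the fixture above into six sentence blocks. The module has no main guard, so it is meant for a discovery-based runner such as python -m unittest; to allow direct execution one could append the usual snippet (a sketch):

if __name__ == "__main__":
    unittest.main()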