Commit 98b6f93d authored by Maja Jabłońska

Add ConllReader

parent def607ea
1 merge request: !46 Merge COMBO 3.0 into master
@@ -5,4 +5,5 @@ from .instance import Instance
 from .token_indexers import (SingleIdTokenIndexer, TokenIndexer, TokenFeatsIndexer)
 from .tokenizers import (Tokenizer, TokenizerToken, CharacterTokenizer, PretrainedTransformerTokenizer,
                          SpacyTokenizer, WhitespaceTokenizer)
-from .dataset_readers import DatasetReader, TextClassificationJSONReader
+from .dataset_readers import (ConllDatasetReader, DatasetReader,
+                              TextClassificationJSONReader, UniversalDependenciesDatasetReader)

 from .dataset_reader import DatasetReader
 from .text_classification_json_reader import TextClassificationJSONReader
 from .universal_dependencies_dataset_reader import UniversalDependenciesDatasetReader
+from .conll import ConllDatasetReader
"""
Adapted from AllenNLP
https://github.com/allenai/allennlp/blob/main/allennlp/data/dataset_readers/conll2003.py
"""
from typing import Dict, List, Optional, Sequence, Iterable
import itertools
import logging
from combo.utils import ConfigurationError
from .dataset_reader import DatasetReader, PathOrStr
from combo.data.token_indexers.token_indexer import TokenIndexer, TokenizerToken
from combo.data.token_indexers.single_id_token_indexer import SingleIdTokenIndexer
from .dataset_utils.span_utils import to_bioul
from .. import Instance
from ..fields import MetadataField, TextField, Field, SequenceLabelField
from ...utils.file_utils import cached_path
logger = logging.getLogger(__name__)
def _is_divider(line: str) -> bool:
empty_line = line.strip() == ""
if empty_line:
return True
else:
first_token = line.split()[0]
if first_token == "-DOCSTART-":
return True
else:
return False
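# Illustration of the divider behaviour:
#   _is_divider("")                      -> True   (blank line)
#   _is_divider("-DOCSTART- -X- -X- O")  -> True   (document marker)
#   _is_divider("SOCCER NN B-NP O")      -> False  (token line)
# itertools.groupby over this predicate is what later carves the file
# into per-sentence chunks in _read.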
# TODO: maybe one should note whether the format is IOB1 or IOB2 in the processed dataset?
class ConllDatasetReader(DatasetReader):
"""
Reads instances from a pretokenised file where each line is in the following format:
```
WORD POS-TAG CHUNK-TAG NER-TAG
```
with a blank line indicating the end of each sentence
and `-DOCSTART- -X- -X- O` indicating the end of each article,
and converts it into a `Dataset` suitable for sequence tagging.
Each `Instance` contains the words in the `"tokens"` `TextField`.
The values corresponding to the `tag_label`
values will get loaded into the `"tags"` `SequenceLabelField`.
And if you specify any `feature_labels` (you probably shouldn't),
the corresponding values will get loaded into their own `SequenceLabelField` s.
This dataset reader ignores the "article" divisions and simply treats
each sentence as an independent `Instance`. (Technically the reader splits sentences
on any combination of blank lines and "DOCSTART" tags; in particular, it does the right
    thing on well-formed inputs.)
Registered as a `DatasetReader` with name "conll2003".
# Parameters
token_indexers : `Dict[str, TokenIndexer]`, optional (default=`{"tokens": SingleIdTokenIndexer()}`)
We use this to define the input representation for the text. See :class:`TokenIndexer`.
tag_label : `str`, optional (default=`ner`)
        Specify `ner`, `pos`, or `chunk` to have that tag loaded into the instance field `tags`.
feature_labels : `Sequence[str]`, optional (default=`()`)
These labels will be loaded as features into the corresponding instance fields:
`pos` -> `pos_tags`, `chunk` -> `chunk_tags`, `ner` -> `ner_tags`
Each will have its own namespace : `pos_tags`, `chunk_tags`, `ner_tags`.
If you want to use one of the tags as a `feature` in your model, it should be
specified here.
convert_to_coding_scheme : `Optional[str]`, optional (default=`None`)
Specifies the coding scheme for `ner_labels` and `chunk_labels`.
If `None` is passed, no change will be applied.
Valid options are `None` and `BIOUL`.
        In the IOB1 scheme, I is a token inside a span, O is a token outside
        a span, and B is the beginning of a span that immediately follows
        another span of the same type; in IOB2, every span starts with B.
    coding_scheme : `str`, optional (default=`IOB1`)
        Specifies the coding scheme of the input file.
        Valid options are `IOB1` and `IOB2`.
label_namespace : `str`, optional (default=`labels`)
Specifies the namespace for the chosen `tag_label`.
"""
_VALID_LABELS = {"ner", "pos", "chunk"}
_VALID_CODING_OPTIONS = ('IOB1', 'IOB2')
_VALID_CONVERT_TO_CODING_OPTIONS = (None, 'BIOUL')
def __init__(
self,
token_indexers: Dict[str, TokenIndexer] = None,
tag_label: str = "ner",
feature_labels: Sequence[str] = (),
convert_to_coding_scheme: Optional[str] = None,
coding_scheme: str = 'IOB1',
label_namespace: str = "labels",
**kwargs,
) -> None:
super().__init__(**kwargs)
self._token_indexers = token_indexers or {"tokens": SingleIdTokenIndexer()}
if tag_label is not None and tag_label not in self._VALID_LABELS:
raise ConfigurationError("unknown tag label type: {}".format(tag_label))
for label in feature_labels:
if label not in self._VALID_LABELS:
raise ConfigurationError("unknown feature label type: {}".format(label))
if coding_scheme not in self._VALID_CODING_OPTIONS:
raise ConfigurationError(
"unknown coding_scheme: {}".format(coding_scheme)
)
if convert_to_coding_scheme not in self._VALID_CONVERT_TO_CODING_OPTIONS:
raise ConfigurationError(
"unknown convert_to_coding_scheme: {}".format(convert_to_coding_scheme)
)
self.tag_label = tag_label
self.feature_labels = set(feature_labels)
self.__convert_to_coding_scheme = convert_to_coding_scheme
self.label_namespace = label_namespace
self.__coding_scheme = coding_scheme
@property
def convert_to_coding_scheme(self) -> str:
return self.__convert_to_coding_scheme
@property
def coding_scheme(self) -> str:
return self.__coding_scheme
def _read(self) -> Iterable[Instance]:
# if `file_path` is a URL, redirect to the cache
file_path = cached_path(self.file_path)
with open(file_path, "r") as data_file:
logger.info("Reading instances from lines in file at: %s", file_path)
# Group lines into sentence chunks based on the divider.
line_chunks = (
lines
for is_divider, lines in itertools.groupby(data_file, _is_divider)
# Ignore the divider chunks, so that `lines` corresponds to the words
# of a single sentence.
if not is_divider
)
for lines in line_chunks:
fields = [line.strip().split() for line in lines]
# unzipping trick returns tuples, but our Fields need lists
fields = [list(field) for field in zip(*fields)]
tokens_, pos_tags, chunk_tags, ner_tags = fields
# TextField requires `Token` objects
tokens = [TokenizerToken(token) for token in tokens_]
yield self.text_to_instance(tokens, pos_tags, chunk_tags, ner_tags)
def text_to_instance( # type: ignore
self,
tokens: List[TokenizerToken],
pos_tags: List[str] = None,
chunk_tags: List[str] = None,
ner_tags: List[str] = None,
) -> Instance:
"""
We take `pre-tokenized` input here, because we don't have a tokenizer in this class.
"""
sequence = TextField(tokens)
instance_fields: Dict[str, Field] = {"tokens": sequence,
"metadata": MetadataField({"words": [x.text for x in tokens]})}
# Recode the labels if necessary.
if self.__convert_to_coding_scheme == "BIOUL":
coded_chunks = (
to_bioul(chunk_tags, encoding=self.__coding_scheme)
if chunk_tags is not None
else None
)
coded_ner = (
to_bioul(ner_tags, encoding=self.__coding_scheme)
if ner_tags is not None
else None
)
else:
# the default IOB1/IOB2
coded_chunks = chunk_tags
coded_ner = ner_tags
# Add "feature labels" to instance
if "pos" in self.feature_labels:
if pos_tags is None:
raise ConfigurationError(
"Dataset reader was specified to use pos_tags as "
"features. Pass them to text_to_instance."
)
instance_fields["pos_tags"] = SequenceLabelField(pos_tags, sequence, "pos_tags")
if "chunk" in self.feature_labels:
if coded_chunks is None:
raise ConfigurationError(
"Dataset reader was specified to use chunk tags as "
"features. Pass them to text_to_instance."
)
instance_fields["chunk_tags"] = SequenceLabelField(coded_chunks, sequence, "chunk_tags")
if "ner" in self.feature_labels:
if coded_ner is None:
raise ConfigurationError(
"Dataset reader was specified to use NER tags as "
" features. Pass them to text_to_instance."
)
instance_fields["ner_tags"] = SequenceLabelField(coded_ner, sequence, "ner_tags")
# Add "tag label" to instance
if self.tag_label == "ner" and coded_ner is not None:
instance_fields["tags"] = SequenceLabelField(coded_ner, sequence, self.label_namespace)
elif self.tag_label == "pos" and pos_tags is not None:
instance_fields["tags"] = SequenceLabelField(pos_tags, sequence, self.label_namespace)
elif self.tag_label == "chunk" and coded_chunks is not None:
instance_fields["tags"] = SequenceLabelField(
coded_chunks, sequence, self.label_namespace
)
return Instance(instance_fields)
def __call__(self, file_path: str):
self.file_path = file_path
return self
def apply_token_indexers(self, instance: Instance) -> None:
instance.fields["tokens"]._token_indexers = self._token_indexers # type: ignore
@@ -12,12 +12,11 @@ from combo.utils import ConfigurationError, InvalidTagSequence
TypedSpan = Tuple[int, Tuple[int, int]]
TypedStringSpan = Tuple[str, Tuple[int, int]]
T = TypeVar("T", str, TokenizerToken)
def bio_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BIO tags, extracts spans.
@@ -85,8 +84,10 @@ def bio_tags_to_spans(
spans.add((active_conll_tag, (span_start, span_end)))
return list(spans)
-def iob1_tags_to_spans(
-        tag_sequence: List[str], classes_to_ignore: List[str] = None
+def _iob_tags_to_spans(
+        start_of_chunk_fun: Callable[[Optional[str], Optional[str], str, str], bool],
+        tag_sequence: List[str], classes_to_ignore: List[str] = None,
 ) -> List[TypedStringSpan]:
"""
    Given a sequence corresponding to IOB1 or IOB2 tags, extracts spans.
@@ -122,7 +123,7 @@ def iob1_tags_to_spans(
if active_conll_tag is not None:
spans.add((active_conll_tag, (span_start, span_end)))
active_conll_tag = None
-        elif _iob1_start_of_chunk(prev_bio_tag, prev_conll_tag, curr_bio_tag, curr_conll_tag):
+        elif start_of_chunk_fun(prev_bio_tag, prev_conll_tag, curr_bio_tag, curr_conll_tag):
# We are entering a new span; reset indices
# and active tag to new span.
if active_conll_tag is not None:
@@ -144,10 +145,10 @@ def iob1_tags_to_spans(
def _iob1_start_of_chunk(
        prev_bio_tag: Optional[str],
        prev_conll_tag: Optional[str],
        curr_bio_tag: str,
        curr_conll_tag: str,
) -> bool:
if curr_bio_tag == "B":
return True
@@ -158,8 +159,35 @@ def _iob1_start_of_chunk(
return False
def _iob2_start_of_chunk(
prev_bio_tag: Optional[str],
prev_conll_tag: Optional[str],
curr_bio_tag: str,
curr_conll_tag: str,
) -> bool:
if curr_bio_tag == "B":
return True
if curr_bio_tag != "O" and prev_conll_tag != curr_conll_tag:
return True
return False
def iob1_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None,
) -> List[TypedStringSpan]:
return _iob_tags_to_spans(_iob1_start_of_chunk,
tag_sequence, classes_to_ignore)
def iob2_tags_to_spans(
tag_sequence: List[str], classes_to_ignore: List[str] = None,
) -> List[TypedStringSpan]:
return _iob_tags_to_spans(_iob2_start_of_chunk,
tag_sequence, classes_to_ignore)
def bioul_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BIOUL tags, extracts spans.
@@ -201,14 +229,6 @@ def bioul_tags_to_spans(
return [span for span in spans if span[0] not in classes_to_ignore]
-def iob1_to_bioul(tag_sequence: List[str]) -> List[str]:
-    warnings.warn(
-        "iob1_to_bioul has been replaced with 'to_bioul' to allow more encoding options.",
-        FutureWarning,
-    )
-    return to_bioul(tag_sequence)
def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
"""
Given a tag sequence encoded with IOB1 labels, recode to BIOUL.
@@ -221,12 +241,12 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
tag_sequence : `List[str]`, required.
The tag sequence encoded in IOB1, e.g. ["I-PER", "I-PER", "O"].
encoding : `str`, optional, (default = `"IOB1"`).
-        The encoding type to convert from. Must be either "IOB1" or "BIO".
+        The encoding type to convert from. Must be either "IOB1", "IOB2", or "BIO".
# Returns
bioul_sequence : `List[str]`
The tag sequence encoded in IOB1, e.g. ["B-PER", "L-PER", "O"].
"""
if encoding not in {"IOB1", "BIO"}:
if encoding not in {"IOB1", "IOB2", "BIO"}:
raise ConfigurationError(f"Invalid encoding {encoding} passed to 'to_bioul'.")
def replace_label(full_label, new_label):
@@ -275,12 +295,14 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
process_stack(stack, bioul_sequence)
bioul_sequence.append(label)
elif label[0] == "I":
# IOB1:
# check if the previous type is the same as this one
# if it is then append to stack
            # otherwise this starts a new entity if the type
            # is different
if len(stack) == 0:
if encoding == "BIO":
# Beginning of the sequence
if encoding in {"IOB2", "BIO"}:
raise InvalidTagSequence(tag_sequence)
stack.append(label)
else:
@@ -290,7 +312,7 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
if this_type == prev_type:
stack.append(label)
else:
if encoding == "BIO":
if encoding in {"IOB2", "BIO"}:
raise InvalidTagSequence(tag_sequence)
# a new entity
process_stack(stack, bioul_sequence)
@@ -310,7 +332,7 @@ def to_bioul(tag_sequence: List[str], encoding: str = "IOB1") -> List[str]:
def bmes_tags_to_spans(
        tag_sequence: List[str], classes_to_ignore: List[str] = None
) -> List[TypedStringSpan]:
"""
Given a sequence corresponding to BMES tags, extracts spans.
......
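A short sketch of what the IOB2 additions provide. The import path is inferred from the relative imports in conll.py (an assumption), and the expected outputs follow the AllenNLP semantics this module adapts; spans come back from a set, hence the sorted calls:

from combo.data.dataset_readers.dataset_utils.span_utils import (
    iob1_tags_to_spans, iob2_tags_to_spans, to_bioul)

# IOB1: B- only marks a span that directly follows another span of the
# same type, so a sequence may legally begin with I-.
print(sorted(iob1_tags_to_spans(["I-PER", "I-PER", "B-PER", "O"])))
# -> [('PER', (0, 1)), ('PER', (2, 2))]

# IOB2: every span begins with B-.
print(sorted(iob2_tags_to_spans(["B-PER", "I-PER", "B-PER", "O"])))
# -> [('PER', (0, 1)), ('PER', (2, 2))]

# to_bioul now accepts "IOB2" as a source encoding as well.
print(to_bioul(["B-PER", "I-PER", "B-PER", "O"], encoding="IOB2"))
# -> ['B-PER', 'L-PER', 'U-PER', 'O']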
from .checks import *
from .sequence import *
from .exceptions import *
\ No newline at end of file
-DOCSTART- -X- -X- O

SOCCER NN B-NP O
- : O O
JAPAN NNP B-NP B-LOC
GET VB B-VP O
LUCKY NNP B-NP O
WIN NNP I-NP O
, , O O
CHINA NNP B-NP B-PER
IN IN B-PP O
SURPRISE DT B-NP O
DEFEAT NN I-NP O
. . O O

Nadim NNP B-NP B-PER
Ladki NNP I-NP I-PER
AL-AIN NNP B-NP B-LOC
, , O O
United NNP B-NP B-LOC
Arab NNP I-NP I-LOC
Emirates NNPS I-NP I-LOC
1996-12-06 CD I-NP O

Japan NNP B-NP B-LOC
began VBD B-VP O
the DT B-NP O
defence NN I-NP O
of IN B-PP O
their PRP$ B-NP O
Asian JJ I-NP B-MISC
Cup NNP I-NP I-MISC
title NN I-NP O
with IN B-PP O
a DT B-NP O
lucky JJ I-NP O
2-1 CD I-NP O
win VBP B-VP O
against IN B-PP O
Syria NNP B-NP B-LOC
in IN B-PP O
a DT B-NP O
Group NNP I-NP O
C NNP I-NP O
championship NN I-NP O
match NN I-NP O
on IN B-PP O
Friday NNP B-NP O
. . O O

But CC O O
China NNP B-NP B-LOC
saw VBD B-VP O
their PRP$ B-NP O
luck NN I-NP O
desert VB B-VP O
them PRP B-NP O
in IN B-PP O
the DT B-NP O
second NN I-NP O
match NN I-NP O
of IN B-PP O
the DT B-NP O
group NN I-NP O
, , O O
crashing VBG B-VP O
to TO B-PP O
a DT B-NP O
surprise NN I-NP O
2-0 CD I-NP O
defeat NN I-NP O
to TO B-PP O
newcomers NNS B-NP O
Uzbekistan NNP I-NP B-LOC
. . O O

China NNP B-NP B-LOC
controlled VBD B-VP O
most JJS B-NP O
of IN B-PP O
the DT B-NP O
match NN I-NP O
and CC O O
saw VBD B-VP O
several JJ B-NP O
chances NNS I-NP O
missed VBD B-VP O
until IN B-SBAR O
the DT B-NP O
78th JJ I-NP O
minute NN I-NP O
when WRB B-ADVP O
Uzbek NNP B-NP B-MISC
striker NN I-NP O
Igor JJ B-NP B-PER
Shkvyrin NNP I-NP I-PER
took VBD B-VP O
advantage NN B-NP O
of IN B-PP O
a DT B-NP O
misdirected JJ I-NP O
defensive JJ I-NP O
header NN I-NP O
to TO B-VP O
lob VB I-VP O
the DT B-NP O
ball NN I-NP O
over IN B-PP O
the DT B-NP O
advancing VBG I-NP O
Chinese JJ I-NP B-MISC
keeper NN I-NP O
and CC O O
into IN B-PP O
an DT B-NP O
empty JJ I-NP O
net NN I-NP O
. . O O

Oleg NNP B-NP B-PER
Shatskiku NNP I-NP I-PER
made VBD B-VP O
sure JJ B-ADJP O
of IN B-PP O
the DT B-NP O
win VBP B-VP O
in IN B-PP O
injury NN B-NP O
time NN I-NP O
, , O O
hitting VBG B-VP O
an DT B-NP O
unstoppable JJ I-NP O
left VBD B-VP O
foot NN B-NP O
shot NN I-NP O
from IN B-PP O
just RB B-NP O
outside IN B-PP O
the DT B-NP O
area NN I-NP O
. . O O
\ No newline at end of file
import unittest
from combo.data import ConllDatasetReader
class ConllDatasetReaderTest(unittest.TestCase):
def test_read_all_tokens(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
        instances = list(reader('conll_test_file.txt'))
        self.assertEqual(len(instances), 6)
def test_tokenize_correct_tokens(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
        instance = next(iter(reader('conll_test_file.txt')))
        self.assertListEqual([str(t) for t in instance['tokens'].tokens],
['SOCCER', '-', 'JAPAN', 'GET', 'LUCKY', 'WIN', ',',
'CHINA', 'IN', 'SURPRISE', 'DEFEAT', '.'])
def test_tokenize_correct_tags(self):
reader = ConllDatasetReader(coding_scheme='IOB2')
        instance = next(iter(reader('conll_test_file.txt')))
        self.assertListEqual(instance['tags'].labels,
['O', 'O', 'B-LOC', 'O', 'O', 'O', 'O',
'B-PER', 'O', 'O', 'O', 'O'])
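These tests rely on the divider logic in the reader: blank lines and the -DOCSTART- line split the fixture above into six sentence blocks. The module has no main guard, so it is meant for a discovery-based runner such as python -m unittest; to allow direct execution one could append the usual snippet (a sketch):

if __name__ == "__main__":
    unittest.main()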