diff --git a/combo/data/dataset_readers/classification_textfile_dataset_reader.py b/combo/data/dataset_readers/classification_textfile_dataset_reader.py index 9ec7ca90be3bda7e1e7c835f4546d873da31ee98..a535a89e6437a74cc075862d7294edd4495ba9c6 100644 --- a/combo/data/dataset_readers/classification_textfile_dataset_reader.py +++ b/combo/data/dataset_readers/classification_textfile_dataset_reader.py @@ -1,8 +1,8 @@ from typing import Dict, Iterable, Optional -from .dataset_reader import DatasetReader, DatasetReaderInput from overrides import overrides +from .dataset_reader import DatasetReader from .. import Instance, Tokenizer, TokenIndexer from ..fields.label_field import LabelField from ..fields.text_field import TextField @@ -10,12 +10,10 @@ from ..fields.text_field import TextField class ClassificationTextfileDatasetReader(DatasetReader): def __init__(self, - file_path: Optional[DatasetReaderInput] = None, tokenizer: Optional[Tokenizer] = None, - token_indexers: Optional[Dict[str, TokenIndexer]] = None, - separator: str = ',') -> None: - super().__init__(file_path, tokenizer, token_indexers) - self.__separator = separator + token_indexers: Optional[Dict[str, TokenIndexer]] = None) -> None: + super().__init__(tokenizer, token_indexers) + self.__separator = None @property def separator(self) -> str: @@ -27,6 +25,11 @@ class ClassificationTextfileDatasetReader(DatasetReader): @overrides def _read(self) -> Iterable[Instance]: + if self.file_path is None: + raise ValueError('File path is None') + elif self.separator is None: + raise ValueError('Separator is None') + with open(self.file_path, 'r') as lines: for line in lines: text, label = line.strip().split(self.separator) @@ -35,3 +38,8 @@ class ClassificationTextfileDatasetReader(DatasetReader): label_field = LabelField(label) fields = {'text': text_field, 'label': label_field} yield Instance(fields) + + def __call__(self, file_path: str, separator: str): + self.file_path = file_path + self.separator = separator + return self diff --git a/combo/data/dataset_readers/dataset_reader.py b/combo/data/dataset_readers/dataset_reader.py index 6a0227427b0a0990050065694b0263f4ac4d90db..0b1ffcec7fb9b370da800544a24da992975ad7ec 100644 --- a/combo/data/dataset_readers/dataset_reader.py +++ b/combo/data/dataset_readers/dataset_reader.py @@ -9,7 +9,8 @@ from typing import Iterable, Iterator, Optional, Union, TypeVar, Dict, List from overrides import overrides from torch.utils.data import IterableDataset -from combo.data import Instance, Tokenizer +from combo.data.instance import Instance +from combo.data.tokenizers import Tokenizer from combo.data.token_indexers import TokenIndexer logger = logging.getLogger(__name__) @@ -25,11 +26,10 @@ class DatasetReader(IterableDataset): of `Instance`s. """ def __init__(self, - file_path: Optional[DatasetReaderInput] = None, tokenizer: Optional[Tokenizer] = None, token_indexers: Optional[Dict[str, TokenIndexer]] = None) -> None: super(DatasetReader).__init__() - self.__file_path = file_path + self.__file_path = None self.__tokenizer = tokenizer self.__token_indexers = token_indexers @@ -50,7 +50,7 @@ class DatasetReader(IterableDataset): return self.__token_indexers @overrides - def __getitem__(self, item) -> Instance: + def __getitem__(self, item, **kwargs) -> Instance: raise NotImplementedError @overrides diff --git a/combo/data/fields/adjacency_field.py b/combo/data/fields/adjacency_field.py index 492b5b9b2f28653f2f7d35e7217d9304edc3e243..a0ac7abd542862ae83fd40c0a8bcbc764cb4b9a8 100644 --- a/combo/data/fields/adjacency_field.py +++ b/combo/data/fields/adjacency_field.py @@ -9,8 +9,8 @@ import textwrap import torch -from combo.data import Vocabulary -from combo.data.fields import Field +from combo.data.vocabulary import Vocabulary +from combo.data.fields.field import Field from combo.data.fields.sequence_field import SequenceField from combo.utils import ConfigurationError diff --git a/combo/data/fields/label_field.py b/combo/data/fields/label_field.py index 12bad9aa7e4ad05a4d850a62e75dd0520197f1d4..3ba609751abedc090b501611f7161eb94cf9008c 100644 --- a/combo/data/fields/label_field.py +++ b/combo/data/fields/label_field.py @@ -9,7 +9,7 @@ import logging import torch -from combo.data import Vocabulary +from combo.data.vocabulary import Vocabulary from combo.data.fields import Field from combo.utils import ConfigurationError diff --git a/combo/data/fields/list_field.py b/combo/data/fields/list_field.py index 24e57f096960df2f95c0f22202820aa7d4c1e15a..63d7cd5d2e1efd916c1936e7d1d511b9e118d72a 100644 --- a/combo/data/fields/list_field.py +++ b/combo/data/fields/list_field.py @@ -4,7 +4,7 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/fields/list_field.py """ from typing import Dict, List, Iterator, Sequence, Any -from combo.data import Vocabulary +from combo.data.vocabulary import Vocabulary from combo.data.fields.field import DataArray, Field from combo.data.fields.sequence_field import SequenceField from combo.utils import pad_sequence_to_length diff --git a/combo/data/fields/sequence_label_field.py b/combo/data/fields/sequence_label_field.py index 4d0299d95f30d2e8a45f8133ccc8bec70f9c58ec..2a33940b1b314450f7e57ed5b1d50d64cefb20fc 100644 --- a/combo/data/fields/sequence_label_field.py +++ b/combo/data/fields/sequence_label_field.py @@ -10,7 +10,7 @@ import textwrap import torch -from combo.data import Vocabulary +from combo.data.vocabulary import Vocabulary from combo.data.fields import Field from combo.data.fields.sequence_field import SequenceField from combo.utils import ConfigurationError, pad_sequence_to_length diff --git a/combo/data/fields/sequence_multilabel_field.py b/combo/data/fields/sequence_multilabel_field.py index a1e6683011d45c79aec380c02fd1cc3dd6473a81..a13a499e64daafb55b273303bb898ee0ad24ab1f 100644 --- a/combo/data/fields/sequence_multilabel_field.py +++ b/combo/data/fields/sequence_multilabel_field.py @@ -11,7 +11,7 @@ from typing import Set, List, Callable, Iterator, Union, Dict import torch from overrides import overrides -from combo.data import Vocabulary +from combo.data.vocabulary import Vocabulary from combo.data.fields import Field from combo.data.fields.sequence_field import SequenceField from combo.utils import ConfigurationError diff --git a/combo/data/instance.py b/combo/data/instance.py index b4c008a0bf1b359cb14af83938b7d2e0d8aa0c44..34771aec12bb13a7a53a047dddba4d09b3da939b 100644 --- a/combo/data/instance.py +++ b/combo/data/instance.py @@ -4,7 +4,7 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/instance.py """ from typing import Dict, MutableMapping, Mapping, Any -from combo.data import Vocabulary +from combo.data.vocabulary import Vocabulary from combo.data.fields import Field from combo.data.fields.field import DataArray diff --git a/requirements.txt b/requirements.txt index 119654b0065fdd212f76b9c37d190d22b354684c..f7222b4321a04fe45bdf1f8ad96336675d5451b4 100644 --- a/requirements.txt +++ b/requirements.txt @@ -2,8 +2,10 @@ absl-py~=1.4.0 base58~=2.1.1 cached-path~=1.3.3 conllu~=4.4.1 +conllutils~=1.1.4 dependency-injector~=4.41.0 dill~=0.3.6 +importlib-resources~=5.12.0 overrides~=7.3.1 torch~=2.0.0 torchtext~=0.15.1 diff --git a/requirements_no_deps.txt b/requirements_no_deps.txt new file mode 100644 index 0000000000000000000000000000000000000000..1609845e84634fb353d179e1db214b74431cfdbb --- /dev/null +++ b/requirements_no_deps.txt @@ -0,0 +1 @@ +lambo @ git+https://gitlab.clarin-pl.eu/syntactic-tools/lambo.git \ No newline at end of file