Skip to content
Snippets Groups Projects
Commit 5d55691a authored by Maja Jablonska
Browse files

Add ClassificationTextfileDatasetReader

parent 755164cd
No related branches found
No related tags found
1 merge request: !46 "Merge COMBO 3.0 into master"
from typing import Dict, Iterable, Optional from typing import Dict, Iterable, Optional
from .dataset_reader import DatasetReader, DatasetReaderInput
from overrides import overrides from overrides import overrides
from .dataset_reader import DatasetReader
from .. import Instance, Tokenizer, TokenIndexer from .. import Instance, Tokenizer, TokenIndexer
from ..fields.label_field import LabelField from ..fields.label_field import LabelField
from ..fields.text_field import TextField from ..fields.text_field import TextField
...@@ -10,12 +10,10 @@ from ..fields.text_field import TextField ...@@ -10,12 +10,10 @@ from ..fields.text_field import TextField
class ClassificationTextfileDatasetReader(DatasetReader): class ClassificationTextfileDatasetReader(DatasetReader):
def __init__(self, def __init__(self,
file_path: Optional[DatasetReaderInput] = None,
tokenizer: Optional[Tokenizer] = None, tokenizer: Optional[Tokenizer] = None,
token_indexers: Optional[Dict[str, TokenIndexer]] = None, token_indexers: Optional[Dict[str, TokenIndexer]] = None) -> None:
separator: str = ',') -> None: super().__init__(tokenizer, token_indexers)
super().__init__(file_path, tokenizer, token_indexers) self.__separator = None
self.__separator = separator
@property @property
def separator(self) -> str: def separator(self) -> str:
...@@ -27,6 +25,11 @@ class ClassificationTextfileDatasetReader(DatasetReader): ...@@ -27,6 +25,11 @@ class ClassificationTextfileDatasetReader(DatasetReader):
@overrides @overrides
def _read(self) -> Iterable[Instance]: def _read(self) -> Iterable[Instance]:
if self.file_path is None:
raise ValueError('File path is None')
elif self.separator is None:
raise ValueError('Separator is None')
with open(self.file_path, 'r') as lines: with open(self.file_path, 'r') as lines:
for line in lines: for line in lines:
text, label = line.strip().split(self.separator) text, label = line.strip().split(self.separator)
...@@ -35,3 +38,8 @@ class ClassificationTextfileDatasetReader(DatasetReader): ...@@ -35,3 +38,8 @@ class ClassificationTextfileDatasetReader(DatasetReader):
label_field = LabelField(label) label_field = LabelField(label)
fields = {'text': text_field, 'label': label_field} fields = {'text': text_field, 'label': label_field}
yield Instance(fields) yield Instance(fields)
def __call__(self, file_path: str, separator: str):
self.file_path = file_path
self.separator = separator
return self
...@@ -9,7 +9,8 @@ from typing import Iterable, Iterator, Optional, Union, TypeVar, Dict, List ...@@ -9,7 +9,8 @@ from typing import Iterable, Iterator, Optional, Union, TypeVar, Dict, List
from overrides import overrides from overrides import overrides
from torch.utils.data import IterableDataset from torch.utils.data import IterableDataset
from combo.data import Instance, Tokenizer from combo.data.instance import Instance
from combo.data.tokenizers import Tokenizer
from combo.data.token_indexers import TokenIndexer from combo.data.token_indexers import TokenIndexer
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
...@@ -25,11 +26,10 @@ class DatasetReader(IterableDataset): ...@@ -25,11 +26,10 @@ class DatasetReader(IterableDataset):
of `Instance`s. of `Instance`s.
""" """
def __init__(self, def __init__(self,
file_path: Optional[DatasetReaderInput] = None,
tokenizer: Optional[Tokenizer] = None, tokenizer: Optional[Tokenizer] = None,
token_indexers: Optional[Dict[str, TokenIndexer]] = None) -> None: token_indexers: Optional[Dict[str, TokenIndexer]] = None) -> None:
super(DatasetReader).__init__() super(DatasetReader).__init__()
self.__file_path = file_path self.__file_path = None
self.__tokenizer = tokenizer self.__tokenizer = tokenizer
self.__token_indexers = token_indexers self.__token_indexers = token_indexers
...@@ -50,7 +50,7 @@ class DatasetReader(IterableDataset): ...@@ -50,7 +50,7 @@ class DatasetReader(IterableDataset):
return self.__token_indexers return self.__token_indexers
@overrides @overrides
def __getitem__(self, item) -> Instance: def __getitem__(self, item, **kwargs) -> Instance:
raise NotImplementedError raise NotImplementedError
@overrides @overrides
......
...@@ -9,8 +9,8 @@ import textwrap ...@@ -9,8 +9,8 @@ import textwrap
import torch import torch
from combo.data import Vocabulary from combo.data.vocabulary import Vocabulary
from combo.data.fields import Field from combo.data.fields.field import Field
from combo.data.fields.sequence_field import SequenceField from combo.data.fields.sequence_field import SequenceField
from combo.utils import ConfigurationError from combo.utils import ConfigurationError
......
...@@ -9,7 +9,7 @@ import logging ...@@ -9,7 +9,7 @@ import logging
import torch import torch
from combo.data import Vocabulary from combo.data.vocabulary import Vocabulary
from combo.data.fields import Field from combo.data.fields import Field
from combo.utils import ConfigurationError from combo.utils import ConfigurationError
......
...@@ -4,7 +4,7 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/fields/list_field.py ...@@ -4,7 +4,7 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/fields/list_field.py
""" """
from typing import Dict, List, Iterator, Sequence, Any from typing import Dict, List, Iterator, Sequence, Any
from combo.data import Vocabulary from combo.data.vocabulary import Vocabulary
from combo.data.fields.field import DataArray, Field from combo.data.fields.field import DataArray, Field
from combo.data.fields.sequence_field import SequenceField from combo.data.fields.sequence_field import SequenceField
from combo.utils import pad_sequence_to_length from combo.utils import pad_sequence_to_length
......
...@@ -10,7 +10,7 @@ import textwrap ...@@ -10,7 +10,7 @@ import textwrap
import torch import torch
from combo.data import Vocabulary from combo.data.vocabulary import Vocabulary
from combo.data.fields import Field from combo.data.fields import Field
from combo.data.fields.sequence_field import SequenceField from combo.data.fields.sequence_field import SequenceField
from combo.utils import ConfigurationError, pad_sequence_to_length from combo.utils import ConfigurationError, pad_sequence_to_length
......
...@@ -11,7 +11,7 @@ from typing import Set, List, Callable, Iterator, Union, Dict ...@@ -11,7 +11,7 @@ from typing import Set, List, Callable, Iterator, Union, Dict
import torch import torch
from overrides import overrides from overrides import overrides
from combo.data import Vocabulary from combo.data.vocabulary import Vocabulary
from combo.data.fields import Field from combo.data.fields import Field
from combo.data.fields.sequence_field import SequenceField from combo.data.fields.sequence_field import SequenceField
from combo.utils import ConfigurationError from combo.utils import ConfigurationError
......
...@@ -4,7 +4,7 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/instance.py ...@@ -4,7 +4,7 @@ https://github.com/allenai/allennlp/blob/main/allennlp/data/instance.py
""" """
from typing import Dict, MutableMapping, Mapping, Any from typing import Dict, MutableMapping, Mapping, Any
from combo.data import Vocabulary from combo.data.vocabulary import Vocabulary
from combo.data.fields import Field from combo.data.fields import Field
from combo.data.fields.field import DataArray from combo.data.fields.field import DataArray
......
...@@ -2,8 +2,10 @@ absl-py~=1.4.0 ...@@ -2,8 +2,10 @@ absl-py~=1.4.0
base58~=2.1.1 base58~=2.1.1
cached-path~=1.3.3 cached-path~=1.3.3
conllu~=4.4.1 conllu~=4.4.1
conllutils~=1.1.4
dependency-injector~=4.41.0 dependency-injector~=4.41.0
dill~=0.3.6 dill~=0.3.6
importlib-resources~=5.12.0
overrides~=7.3.1 overrides~=7.3.1
torch~=2.0.0 torch~=2.0.0
torchtext~=0.15.1 torchtext~=0.15.1
......
lambo @ git+https://gitlab.clarin-pl.eu/syntactic-tools/lambo.git
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment