Skip to content
Snippets Groups Projects

Add helper utilities for converting between API formats.

5 files
+ 97
102
Compare changes
  • Side-by-side
  • Inline

Files

from typing import Optional, List
import collections
from dataclasses import dataclass, field
from dataclasses import dataclass, field
 
from typing import Optional, List, Dict, Any, Union, Tuple
 
 
import conllu
 
from dataclasses_json import dataclass_json
 
from overrides import overrides
 
@dataclass_json
@dataclass
class Token:
    """One token of a sentence, described by CoNLL-U style fields.

    Every field is optional and defaults to ``None``. Dict conversion
    (``to_dict`` / ``from_dict``, used by the converter helpers in this
    module) is presumably supplied by the ``dataclass_json`` decorator,
    so no hand-written ``from_json`` is kept here.
    """

    # Token index. May be a tuple rather than an int: range-token ids are
    # turned into tuples before being handed to the conllu library.
    id: Optional[Union[int, Tuple]] = None
    token: Optional[str] = None
    lemma: Optional[str] = None
    upostag: Optional[str] = None
    xpostag: Optional[str] = None
    feats: Optional[str] = None
    head: Optional[int] = None
    deprel: Optional[str] = None
    deps: Optional[str] = None
    misc: Optional[str] = None
 
@dataclass_json
@dataclass
class Sentence:
    """A sentence: its tokens, a sentence-level embedding and metadata.

    All fields use ``default_factory`` so each instance gets its own
    fresh container instead of sharing a mutable default.
    """

    tokens: List[Token] = field(default_factory=list)
    sentence_embedding: List[float] = field(default_factory=list)
    # Ordered mapping so metadata keeps its insertion order.
    metadata: Dict[str, Any] = field(default_factory=collections.OrderedDict)
 
 
 
class _TokenList(conllu.TokenList):
    """``conllu.TokenList`` variant whose repr shows each element's ``'token'`` value."""

    @overrides
    def __repr__(self):
        # Collect the 'token' entry of every element and join them with ', '.
        joined = ', '.join(entry['token'] for entry in self)
        return 'TokenList<' + joined + '>'
 
 
 
def sentence2conllu(sentence: Sentence) -> conllu.TokenList:
    """Convert a ``Sentence`` into a conllu token list.

    Args:
        sentence: Sentence whose tokens and metadata are exported.

    Returns:
        A ``_TokenList`` holding one ``OrderedDict`` per token (built from
        ``Token.to_dict()``) and carrying ``sentence.metadata``.
    """
    tokens = [collections.OrderedDict(t.to_dict()) for t in sentence.tokens]
    # Range-token ids must be tuples, not lists: the conllu library requires it.
    for t in tokens:
        # isinstance (not an exact type comparison) also catches list subclasses.
        if isinstance(t["id"], list):
            t["id"] = tuple(t["id"])
    return _TokenList(tokens=tokens,
                      metadata=sentence.metadata)
 
 
 
def tokens2conllu(tokens: List[str]) -> conllu.TokenList:
    """Wrap plain token strings in a conllu token list.

    Each string becomes an ``OrderedDict`` with the keys ``"id"`` (its
    1-based position in ``tokens``) and ``"token"`` (the string itself).
    The resulting list gets an empty ``OrderedDict`` as metadata.
    """
    rows = []
    for position, text in enumerate(tokens, start=1):
        rows.append(collections.OrderedDict({"id": position, "token": text}))
    return _TokenList(rows, metadata=collections.OrderedDict())
 
def conllu2sentence(conllu_sentence: conllu.TokenList,
                    sentence_embedding: List[float]) -> Sentence:
    """Build a ``Sentence`` from a conllu token list.

    Args:
        conllu_sentence: Token list whose ``tokens`` and ``metadata`` are read.
        sentence_embedding: Embedding stored on the resulting sentence as-is.

    Returns:
        A ``Sentence`` whose tokens are created with ``Token.from_dict`` from
        each entry of ``conllu_sentence.tokens``.
    """
    return Sentence(
        tokens=[Token.from_dict(t) for t in conllu_sentence.tokens],
        sentence_embedding=sentence_embedding,
        metadata=conllu_sentence.metadata
    )
Loading