Commit 8ec75f2d authored by Łukasz Kopociński

Prepare new approach to worker prediction

parent 767b0dbd
......@@ -18,8 +18,10 @@ class Vectorizer(abc.ABC):
class ElmoVectorizer(Vectorizer):
def __init__(self, options_path: str, weights_path: str):
self.model = ElmoEmbedder(options_path, weights_path, cuda_device=0)
def __init__(self, options_path: str, weights_path: str, device: int = 0):
    # Wraps an ELMo embedder built from the given options/weights files.
    # `device` is the CUDA device index passed straight through as
    # `cuda_device` (0 by default).
    # NOTE(review): presumably -1 or None selects CPU per the embedder's
    # convention — confirm what callers pass on CPU-only hosts.
    self.model = ElmoEmbedder(
        options_path, weights_path, cuda_device=device
    )
def embed(self, context: List[str]) -> torch.Tensor:
vectors = self.model.embed_sentence(context)
......
......@@ -26,7 +26,7 @@ RUN apt update && apt install -y dvc
# download resources
WORKDIR /home/
# NOTE(review): diff residue — the next two COPY lines are the old and new
# versions of the same instruction; only the ./deps/credentials path is
# current in this commit.
COPY ./credentials /root/.aws/credentials
COPY ./deps/credentials /root/.aws/credentials
# Fetch DVC-tracked model resources from the project repo (pinned to the
# lkopocinski/code-clean-up branch).
RUN dvc get https://gitlab.clarin-pl.eu/team-semantics/semrel-extraction --rev lkopocinski/code-clean-up semrel/data/data/elmo
RUN dvc get https://gitlab.clarin-pl.eu/team-semantics/semrel-extraction --rev lkopocinski/code-clean-up semrel/data/data/fasttext
......
import abc
from collections import deque
from itertools import product
from corpus_ccl import cclutils as ccl
from corpus_ccl import corpus_object_utils as cou
from corpus_ccl import token_utils as tou
class Parser(object):
    """Streams candidate token pairs from a document.

    Slides a fixed-size sentence window over the document and asks the
    configured extractor for pairs of named-entity-annotated tokens in
    each window.
    """

    def __init__(self, extractor):
        self._extractor = extractor
        self._slicer = SentenceWindow(window_size=3)

    def __call__(self, document):
        """Yield (first, second) candidate pairs for `document`."""
        windows = self._slicer.contextify(document)
        for window in windows:
            pairs = self._extractor.extract(window, attr='NE')
            for pair in pairs:
                yield pair
class ContextType(object):
    """Abstract strategy for splitting a document into contexts."""
    # NOTE(review): Python 2-style metaclass declaration; under Python 3
    # this attribute is ignored, so @abc.abstractmethod is not actually
    # enforced there — confirm which interpreter runs this module.
    __metaclass__ = abc.ABCMeta

    def __init__(self, name='default'):
        # Human-readable strategy name; informational only.
        self._name = name

    @abc.abstractmethod
    def contextify(self, document):
        """Yield contexts (e.g. sentence windows) extracted from `document`."""
        pass
class ExtractorType(object):
    """Base class for extractors that pair annotated tokens with candidates.

    Subclasses implement `_extract` to propose candidate tokens; `extract`
    crosses those candidates with every token carrying a given annotation.
    """
    __metaclass__ = abc.ABCMeta

    def __init__(self, name='default', tagset='nkjp'):
        self._name = name
        self._tagset = ccl.get_tagset(tagset)

    def find_attribute_tokens(self, context, attr='default'):
        """Return (index, sentence_orths) for each token annotated with `attr`."""
        found = []
        for sentence in context:
            for index, token in enumerate(sentence.tokens()):
                annotated = tou.get_annotation(
                    sentence, token, attr, index, default=0
                )
                if annotated:
                    orths = [tok.orth_utf8() for tok in sentence.tokens()]
                    found.append((index, orths))
        return found

    def is_noun(self, token):
        """True when the token's coarse part of speech is 'noun'."""
        return cou.get_coarse_pos(token, self._tagset) == 'noun'

    def is_ne(self, index, token, sentence):
        """True when the token carries a named-entity ('NE') annotation."""
        return tou.get_annotation(sentence, token, 'NE', index, default=0)

    def extract(self, context, attr='default'):
        """Cross attribute-annotated tokens with `_extract` candidates."""
        annotated = self.find_attribute_tokens(context, attr)
        candidates = self._extract(context)
        return product(annotated, candidates)

    @abc.abstractmethod
    def _extract(self, context):
        """Return (index, sentence_orths) candidate tokens from `context`."""
        pass
class NounExtractor(ExtractorType):
    """Extractor whose candidates are all noun tokens in the window."""

    def __init__(self, name='NounExtractor'):
        super(NounExtractor, self).__init__(name)

    def _extract(self, context):
        """Return (index, sentence_orths) for every noun in `context`."""
        return [
            (index, [tok.orth_utf8() for tok in sentence.tokens()])
            for sentence in context
            for index, token in enumerate(sentence.tokens())
            if self.is_noun(token)
        ]
class NERExtractor(ExtractorType):
    """Extractor whose candidates are all named-entity tokens in the window."""

    def __init__(self, name='NERExtractor'):
        super(NERExtractor, self).__init__(name)

    def _extract(self, context):
        """Return (index, sentence_orths) for every named entity in `context`."""
        return [
            (index, [tok.orth_utf8() for tok in sentence.tokens()])
            for sentence in context
            for index, token in enumerate(sentence.tokens())
            if self.is_ne(index, token, sentence)
        ]
class SentenceWindow(ContextType):
    """Sliding window of consecutive sentences over a document."""

    def __init__(self, name='SentenceWindow', window_size=3, tagset='nkjp'):
        # `tagset` is accepted for interface compatibility but unused here.
        super(SentenceWindow, self).__init__(name)
        self._size = window_size

    def contextify(self, document):
        """Yield deques of up to `window_size` consecutive sentences.

        Documents shorter than the window produce a single partial window.
        NOTE: the same deque object is yielded each time and keeps mutating;
        consumers must process each window before advancing.
        """
        sentences = (
            sentence
            for paragraph in document.paragraphs()
            for sentence in paragraph.sentences()
        )
        window = deque(maxlen=self._size)
        for sentence in sentences:
            window.append(sentence)
            if len(window) == self._size:
                yield window
        if len(window) < self._size:
            yield window
class Predictor(object):
    """Predicts the relation class for a single pair of annotated tokens.

    net_model -- trained classifier mapping a concatenated embedding vector
                 to per-class scores
    elmo      -- vectorizer exposing embed(tokens) -> per-token vectors
    fasttext  -- vectorizer with the same embed interface
    device    -- torch device the input vector is moved to ('cpu' default)
    """

    def __init__(self, net_model, elmo, fasttext, device='cpu'):
        self._net = net_model
        self._elmo = elmo
        self._fasttext = fasttext
        self.device = device

    def _make_vectors(self, pair):
        """Concatenate both tokens' ELMo and fastText vectors into one input.

        `pair` is ((index, context_tokens), (index, context_tokens)).
        """
        # FIX: removed leftover debug `print(pair)` that spammed stdout
        # on every prediction.
        (idx1, ctx1), (idx2, ctx2) = pair

        ev1 = self._elmo.embed(ctx1)[idx1]
        ev2 = self._elmo.embed(ctx2)[idx2]
        fv1 = self._fasttext.embed(ctx1)[idx1]
        fv2 = self._fasttext.embed(ctx2)[idx2]

        # Feature layout: [elmo_1 | elmo_2 | fasttext_1 | fasttext_2].
        vector = torch.cat([ev1, ev2, fv1, fv2])
        return vector.to(self.device)

    def _predict(self, vectors):
        """Run the network without gradients and return the argmax class."""
        with torch.no_grad():
            scores = self._net(vectors)
            decision = torch.argmax(scores)
        return decision.item()

    def predict(self, pair):
        """Return the predicted class index (int) for `pair`."""
        return self._predict(self._make_vectors(pair))
\ No newline at end of file
import os

# Filesystem paths to model resources, supplied through environment
# variables (presumably set by the worker's deployment/Docker config —
# TODO confirm).
ELMO_MODEL_OPTIONS = os.getenv('ELMO_MODEL_OPTIONS')
ELMO_MODEL_WEIGHTS = os.getenv('ELMO_MODEL_WEIGHTS')
FASTTEXT_MODEL = os.getenv('FASTTEXT_MODEL')
PREDICTION_MODEL = os.getenv('PREDICTION_MODEL')

# Task-options key that switches the worker to named-entity extraction.
NER_KEY = 'ner'
import abc
from abc import ABC
from collections import deque
from typing import Deque, Tuple, Callable, List
from semrel.data.scripts.corpus import Document, DocSentence
WINDOW_SIZE = 3
class Slicer(ABC):
    """Abstract base class for splitting a Document into context chunks."""

    def __init__(self, name: str = 'default'):
        # Human-readable strategy name; not used in the slicing logic itself.
        self._name = name

    @abc.abstractmethod
    def contextify(self, document: Document):
        """Yield successive contexts (e.g. sentence windows) from `document`."""
        pass
class SentenceWindow(Slicer):
    """Slices a document into sliding windows of consecutive sentences."""

    def __init__(self, name: str = 'SentenceWindow', window_size: int = 3):
        super(SentenceWindow, self).__init__(name)
        self._size = window_size

    def contextify(self, document: Document) -> Deque:
        """Yield deques of up to `window_size` consecutive sentences.

        A document shorter than the window yields one partial window.
        NOTE: the same deque instance is yielded repeatedly and mutates as
        the window slides; consume each window before advancing.
        """
        window = deque(maxlen=self._size)
        for sentence in document.sentences:
            window.append(sentence)
            if len(window) == self._size:
                yield window
        if len(window) < self._size:
            yield window
class Parser:
    """Applies an extractor function to each sentence window of a document."""

    def __init__(self, extractor: Callable):
        self._extractor = extractor
        self._slicer = SentenceWindow(window_size=WINDOW_SIZE)

    def __call__(self, document: Document) -> List[Tuple]:
        """Yield the extractor's result for every window of `document`."""
        windows = self._slicer.contextify(document)
        for window in windows:
            yield self._extractor(window)
def find_nouns(context: Deque[DocSentence]) -> List[Tuple]:
    """Return (noun_indices, orths) for each sentence in the window."""
    result = []
    for sentence in context:
        result.append((sentence.noun_indices, sentence.orths))
    return result
def find_named_entities(context: Deque[DocSentence]) -> List[Tuple]:
    """Return (named_entities_indices, orths) for each sentence in the window."""
    pairs = []
    for sentence in context:
        pairs.append((sentence.named_entities_indices, sentence.orths))
    return pairs
from itertools import permutations
from typing import Tuple, List
import torch
from semrel.data.scripts.vectorizers import ElmoVectorizer, FastTextVectorizer
from semrel.model.scripts import RelNet
class Predictor:
    """Runs the relation network over all ordered pairs of selected tokens.

    For every ordered pair (from, to) of selected tokens in the context
    window, the classifier input row is the concatenation
    [elmo_from | elmo_to | fasttext_from | fasttext_to] — the same layout
    the single-pair Predictor used ([ev1, ev2, fv1, fv2]).
    """

    def __init__(
            self,
            net_model: 'RelNet',
            elmo: 'ElmoVectorizer',
            fasttext: 'FastTextVectorizer'
    ):
        self._net = net_model
        self._elmo = elmo
        self._fasttext = fasttext
        # Inputs are moved to whatever device the network lives on.
        self._device = self._net.get_device()

    def _make_vectors(self, indices_context: List[Tuple]) -> torch.Tensor:
        """Build the (num_pairs, features) input matrix for the network.

        `indices_context` is a list of (token_indices, sentence_orths)
        pairs, one entry per sentence in the context window.

        Raises ValueError when fewer than two tokens are selected.
        """
        orths = []
        vectors = []

        # FIX: the original iterated `zip(*indices_context)`, which
        # transposes the list and only unpacks correctly for windows of
        # exactly two sentences.
        for indices, context in indices_context:
            selected = [
                orth
                for position, orth in enumerate(context)
                if position in indices
            ]

            vectors_elmo = self._elmo.embed(context)[indices]
            vectors_fasttext = self._fasttext.embed(context)[indices]

            orths.extend(selected)
            vectors.append((vectors_elmo, vectors_fasttext))

        if not vectors:
            raise ValueError('Empty context: nothing to embed.')

        elmo_parts, fasttext_parts = zip(*vectors)
        all_elmo = torch.cat(elmo_parts)
        all_fasttext = torch.cat(fasttext_parts)

        size = len(orths)
        if size < 2:
            raise ValueError('Need at least two tokens to form a pair.')

        # FIX: ordered pairs of *two* tokens. The original called
        # permutations(range(size)) without r=2, producing size-length
        # tuples that cannot be unpacked into (from, to) index sets.
        idx_from, idx_to = zip(*permutations(range(size), 2))
        # FIX: lists, not tuples — a tuple index triggers multi-dimensional
        # indexing on a tensor instead of row selection.
        idx_from = list(idx_from)
        idx_to = list(idx_to)

        # FIX: concatenate along the feature dimension (dim=1) so that each
        # row is one pair's full feature vector; the original concatenated
        # along dim 0, stacking pairs instead of joining features.
        vector = torch.cat([
            all_elmo[idx_from],
            all_elmo[idx_to],
            all_fasttext[idx_from],
            all_fasttext[idx_to],
        ], dim=1)

        return vector.to(self._device)

    def _predict(self, vectors: torch.Tensor) -> torch.Tensor:
        """Return the argmax class index for every pair (row) in `vectors`."""
        with torch.no_grad():
            scores = self._net(vectors)
            # FIX: argmax per row; a global argmax would flatten the batch
            # and mix all pairs together.
            predictions = torch.argmax(scores, dim=1)
        return predictions

    def predict(self, indices_context: List[Tuple]) -> torch.Tensor:
        """Predict relation classes for all ordered token pairs in the context."""
        return self._predict(self._make_vectors(indices_context))
import logging
import os
from typing import Iterator
from pathlib import Path
from typing import Dict, Iterator
import nlp_ws
from corpus_ccl import cclutils
from semrel.data.scripts.corpus import Document
from semrel.data.scripts.vectorizers import ElmoVectorizer
from semrel.data.scripts.utils.io import save_lines
from semrel.data.scripts.vectorizers import ElmoVectorizer, FastTextVectorizer
from semrel.model.scripts import RelNet
from semrel.model.scripts.utils.utils import get_device
from worker.extractor import Parser, NounExtractor
from worker.prediction import Predictor
from worker.scripts import constant
from worker.scripts.extractor import Parser, find_named_entities, find_nouns
from worker.scripts.prediction import Predictor
_log = logging.getLogger(__name__)
def load_model(model_path, vector_size=2648) -> RelNet:
def load_model(model_path: str, vector_size: int = 2648) -> RelNet:
net = RelNet(in_dim=vector_size)
net.load(model_path)
net.eval()
......@@ -29,59 +31,55 @@ class SemrelWorker(nlp_ws.NLPWorker):
pass
def init(self):
    """Load the ELMo and fastText vectorizers once per worker process.

    FIX: this span was diff residue — the old and new method bodies were
    interleaved (unbalanced parentheses, duplicate log lines); resolved
    to the new version of the method.
    """
    _log.critical("Loading models.")

    self._device = get_device()

    _log.critical("Loading ELMO model ...")
    # NOTE(review): on a CPU device `self._device.index` is None — confirm
    # ElmoVectorizer accepts that as its `device` argument.
    self._elmo = ElmoVectorizer(
        options_path=constant.ELMO_MODEL_OPTIONS,
        weights_path=constant.ELMO_MODEL_WEIGHTS,
        device=self._device.index
    )

    _log.critical("Loading FASTTEXT model ...")
    self._fasttext = FastTextVectorizer(
        model_path=constant.FASTTEXT_MODEL
    )

    _log.critical("Loading models completed.")
def process(self, input_path: str, task_options: Dict, output_path: str):
    """Read a CCL document, predict relations, and write results to disk.

    task_options[constant.NER_KEY] truthy selects named-entity extraction;
    otherwise all nouns are considered.

    FIX: this span was diff residue — the new `process` was interleaved
    with the old `_get_extractor`/`predict`/`save_predictions` methods;
    resolved to the new version.
    """
    net = load_model(constant.PREDICTION_MODEL)
    net = net.to(self._device)

    if task_options.get(constant.NER_KEY, False):
        parser = Parser(find_named_entities)
    else:
        parser = Parser(find_nouns)

    predictor = Predictor(net, self._elmo, self._fasttext)
    document = Document(cclutils.read_ccl(input_path))

    # FIX: collect predictions from every context window; the diffed
    # version rebound `predictions` each iteration, so only the last
    # window's output reached save_lines.
    predictions = [
        predictor.predict(indices_context)
        for indices_context in parser(document)
    ]

    save_lines(Path(output_path), predictions)
# def _predict(self, predictor: Predictor, pairs: Iterator):
# pairs = list(pairs)
# members_from, members_to = zip(*pairs)
# orths_from = [context[index] for index, context in members_from]
# orths_to = [context[index] for index, context in members_to]
#
# predictions = [predictor.predict(pair) for pair in pairs]
#
# return [
# f'{orth_from}\t{orth_to}: {decision}\n'
# for orth_from, orth_to, decision
# in zip(orths_from, orths_to, predictions)
# ]
if __name__ == '__main__':
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment