Commit 1f617a33 authored by Piotr Miłkowski's avatar Piotr Miłkowski

Merge branch 'feature-266' into 'dev'

Feature 266

See merge request !1
parents c7f98d2c 099ca27b
Pipeline #2816 failed with stage
in 27 seconds
.idea
/idea
log.txt
__pycache__
service.log
# CI image matching the runtime base image used in the Dockerfile.
image: clarinpl/python:3.8

# Keep the tox virtualenvs between pipeline runs to speed up the style check.
cache:
  paths:
    - .tox

stages:
  - check_style
  - build

before_script:
  - pip install tox==2.9.1

# Style check: run flake8 through tox (configured in tox.ini).
pep8:
  stage: check_style
  script:
    - tox -v -e pep8
\ No newline at end of file
FROM clarinpl/python:3.8

WORKDIR /home/worker

# Application code, models and the vendored fastai_contrib/multifit packages.
COPY ./src ./src
COPY ./main.py .
COPY ./requirements.txt .
COPY ./config.ini .
COPY ./models ./models
COPY ./data ./data
COPY ./fastai_contrib ./fastai_contrib
COPY ./multifit ./multifit

# `apt-get update` must run in the same layer: without it the package index
# may be missing/stale and the install fails on a fresh build. Clean the
# lists afterwards to keep the image small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends build-essential libffi-dev \
    && rm -rf /var/lib/apt/lists/*

RUN pip install --index-url https://pypi.clarin-pl.eu/simple/ -r requirements.txt
# CPU-only torch wheels pinned to match the exported fastai learners.
RUN pip install torch==1.5.1+cpu torchvision==0.6.1+cpu -f https://download.pytorch.org/whl/torch_stable.html

CMD ["python3.8", "main.py"]
[service]
tool = multifit
root = /samba/requests/
rabbit_host = 10.17.0.85
rabbit_user = clarin
rabbit_password = clarin123
[tool]
workers_number = 1
[logging]
port = 9981
local_log_level = INFO
[model]
model_a = {"file": "models/model_a.hdf5", "labels": ["__label__meta_minus_m", "__label__meta_plus_m", "__label__meta_zero", "__label__meta_amb"]}
model_b = {"file": "models/model_b.hdf5", "labels": ["__label__z_zero", "__label__z_plus_m", "__label__z_minus_m", "__label__z_amb"]}
# NOTE(review): model_c points at model_b.hdf5 and repeats model_b's labels — confirm this is intentional and not a copy-paste error.
model_c = {"file": "models/model_b.hdf5", "labels": ["__label__z_zero", "__label__z_plus_m", "__label__z_minus_m", "__label__z_amb"]}
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
import pathlib
from typing import Collection
from pandas import DataFrame
from sacremoses import MosesTokenizer
import fastai
from fastai.basic_data import DataBunch
from fastai.core import ListRules, PathOrStr, IntsOrStrs, is_listy
from fastai.data_block import ItemLists
from fastai.text import *
class MosesPreprocessingFunc():
    """Callable wrapper around a Moses tokenizer for a fixed language.

    Instances are used as text pre-processing rules: calling one returns
    the tokenized text as a single string.
    """

    def __init__(self, lang: str):
        # One tokenizer per language, created once and reused per call.
        self.mt = MosesTokenizer(lang)

    def __call__(self, t: str) -> str:
        tokenized = self.mt.tokenize(t, return_str=True, escape=True)
        return tokenized
try:
from fastai.text import SPProcessor
except ImportError:
def _join_texts(texts:Collection[str], mark_fields:bool=False, include_bos:bool=True, include_eos:bool=False):
    """Join the column(s) of `texts` into one string per row.

    Optionally prefixes BOS, appends EOS, and tags each column with a FLD
    field marker (1-based) when `mark_fields` is set.
    """
    if not isinstance(texts, np.ndarray):
        texts = np.array(texts)
    if is1d(texts):
        # Promote a flat array to a single-column 2-D array.
        texts = texts[:, None]
    df = pd.DataFrame({col: texts[:, col] for col in range(texts.shape[1])})
    prefix = f'{BOS} ' if include_bos else ''
    if mark_fields:
        joined = f'{prefix}{FLD} {1} ' + df[0].astype(str)
    else:
        joined = prefix + df[0].astype(str)
    for col in range(1, len(df.columns)):
        separator = f' {FLD} {col+1} ' if mark_fields else ' '
        joined = joined + separator + df[col].astype(str)
    if include_eos:
        joined = joined + f' {EOS}'
    return joined.values
def apply_rules(text, pre_rules=None, post_rules=None):
    """Run `pre_rules` on the raw text, then `post_rules` on its tokens.

    Falls back to fastai's default rule lists when either argument is None.
    """
    cleaned = text.strip(' ')
    for rule in ifnone(pre_rules, defaults.text_pre_rules):
        cleaned = rule(cleaned)
    tokens = cleaned.split()
    for rule in ifnone(post_rules, defaults.text_post_rules):
        tokens = rule(tokens)
    return ' '.join(tokens)
def get_default_size(texts, max_vocab_sz):
    """Pick a sentencepiece vocab size for `texts`.

    Returns `max_vocab_sz` when a quarter of the unique whitespace-split
    words exceeds it; otherwise that quarter, rounded up to a multiple of 8.
    """
    word_counts = Counter()
    for text in texts:
        word_counts.update(text.split())
    quarter = len(word_counts) // 4
    if quarter > max_vocab_sz:
        return max_vocab_sz
    # Round up to the next multiple of 8.
    remainder = quarter % 8
    return quarter if remainder == 0 else quarter + (8 - remainder)
# European languages assumed safe for (near-)full sentencepiece character
# coverage; used to pick the default --character_coverage in training.
full_char_coverage_langs = ["bg", "cs", "da", "de", "el", "en", "es", "et", "fi", "fr", "ga", "hr", "hu",
                            "it","lt","lv","mt","nl","pl","pt","ro","sk","sl","sv"] # all European langs
def train_sentencepiece(texts:Collection[str], path:PathOrStr, pre_rules: ListRules=None, post_rules:ListRules=None,
                        vocab_sz:int=None, max_vocab_sz:int=30000, model_type:str='unigram', max_sentence_len:int=20480, lang='en',
                        char_coverage=None, tmp_dir='tmp', enc='utf8'):
    "Train a sentencepiece tokenizer on `texts` and save it in `path/tmp_dir`"
    from sentencepiece import SentencePieceTrainer
    cache_dir = Path(path) / tmp_dir
    os.makedirs(cache_dir, exist_ok=True)
    if vocab_sz is None:
        vocab_sz = get_default_size(texts, max_vocab_sz)
    # Dump all texts to a temporary file that the trainer reads back.
    raw_text_path = cache_dir / 'all_text.out'
    with open(raw_text_path, 'w', encoding=enc) as f:
        f.write("\n".join(texts))
    # Special tokens are registered with the sentencepiece word-start marker.
    spec_tokens = ['\u2581' + tok for tok in defaults.text_spec_tok]
    coverage = ifnone(char_coverage, 0.99999 if lang in full_char_coverage_langs else 0.9998)
    SentencePieceTrainer.Train(" ".join([
        f"--input={raw_text_path} --max_sentence_length={max_sentence_len}",
        f"--character_coverage={coverage}",
        f"--unk_id={len(defaults.text_spec_tok)} --pad_id=-1 --bos_id=-1 --eos_id=-1",
        f"--user_defined_symbols={','.join(spec_tokens)}",
        f"--model_prefix={cache_dir/'spm'} --vocab_size={vocab_sz} --model_type={model_type}"]))
    raw_text_path.unlink()
    return cache_dir
class SPProcessor(PreProcessor):
    "`PreProcessor` that tokenizes and numericalizes with `sentencepiece`"
    def __init__(self, ds:ItemList=None, pre_rules: ListRules=None, post_rules:ListRules=None, vocab_sz:int=None,
                 max_vocab_sz:int=30000, model_type:str='unigram', max_sentence_len:int=20480, lang='en',
                 char_coverage=None, tmp_dir='tmp', mark_fields:bool=False, include_bos:bool=True,
                 include_eos:bool=False, sp_model=None, sp_vocab=None, n_cpus:int=None, enc='utf8'):
        # sentencepiece is an optional dependency: fail at construction time
        # with an actionable message instead of later inside `process`.
        try: from sentencepiece import SentencePieceTrainer,SentencePieceProcessor
        except ImportError:
            raise Exception('sentencepiece module is missing: run `pip install sentencepiece`')
        self.pre_rules,self.post_rules,self.enc = pre_rules,post_rules,enc
        self.mark_fields,self.include_bos,self.include_eos = mark_fields,include_bos,include_eos
        # If sp_model/sp_vocab paths are supplied, `process` skips training.
        self.sp_model,self.sp_vocab,self.n_cpus = sp_model,sp_vocab,ifnone(n_cpus,defaults.cpus)
        # Training is deferred: `process` calls this with (texts, path) on demand.
        self.train_func = partial(train_sentencepiece, pre_rules=pre_rules, post_rules=post_rules, vocab_sz=vocab_sz,
                                  max_vocab_sz=max_vocab_sz, model_type=model_type, max_sentence_len=max_sentence_len, lang=lang,
                                  char_coverage=char_coverage, tmp_dir=tmp_dir, enc=enc)

    def process_one(self, item, join=True):
        # Numericalize one text item into an array of sentencepiece ids.
        # NOTE(review): when join=False, `text` is never bound and the next
        # line raises NameError — confirm all callers use join=True.
        if join: text = _join_texts([item], self.mark_fields, self.include_bos, self.include_eos)[0]
        text = apply_rules(text, pre_rules=self.pre_rules, post_rules=self.post_rules)
        return self._encode_batch([text])[0]

    def process(self, ds):
        # Join, clean and numericalize all items of `ds` in place.
        ds.items = _join_texts(ds.items, self.mark_fields, self.include_bos, self.include_eos)
        ds.items = [apply_rules(t, pre_rules=self.pre_rules, post_rules=self.post_rules)
                    for t in progress_bar(ds.items, leave=False)]
        # Train a sentencepiece model on this dataset unless one was supplied.
        if self.sp_model is None or self.sp_vocab is None:
            cache_dir = self.train_func(ds.items, ds.path)
            self.sp_model,self.sp_vocab = cache_dir/'spm.model',cache_dir/'spm.vocab'
        # Build the fastai Vocab from the sentencepiece vocab file (first
        # tab-separated column is the token) only on the first call.
        if not getattr(self, 'vocab', False):
            with open(self.sp_vocab, 'r', encoding=self.enc) as f: self.vocab = Vocab([line.split('\t')[0] for line in f.readlines()])
        if self.n_cpus <= 1: ds.items = self._encode_batch(ds.items)
        else:
            # Encode partitions in parallel processes, then flatten the result.
            with ProcessPoolExecutor(self.n_cpus) as e:
                ds.items = np.array(sum(e.map(self._encode_batch, partition_by_cores(ds.items, self.n_cpus)), []))
        ds.vocab = self.vocab

    def _encode_batch(self, texts):
        # Imported here so each worker process loads its own processor.
        from sentencepiece import SentencePieceProcessor
        tok = SentencePieceProcessor()
        tok.Load(str(self.sp_model))
        return [np.array(tok.EncodeAsIds(t)) for t in texts]

    @classmethod
    def load(cls, path:PathOrStr, tmp_dir:PathOrStr='tmp', name:str='spm'):
        # Alternate constructor: reuse a previously trained model in `path/tmp_dir`.
        cache_dir = Path(path)/tmp_dir
        return cls(sp_model=cache_dir/f'{name}.model', sp_vocab=cache_dir/f'{name}.vocab')
class SPProcessor2(SPProcessor):
    """`SPProcessor` that also records the sentencepiece model/vocab paths
    on the processed dataset's vocab."""
    def process(self, ds):
        super().process(ds)
        ds.vocab.sp_model = self.sp_model
        ds.vocab.sp_vocab = self.sp_vocab
# temporary loading function as from_df does not support processors
# temporary loading function as from_df does not support processors
def make_data_bunch_from_df(cls, path: PathOrStr, train_df: DataFrame, valid_df: DataFrame,
                            tokenizer: Tokenizer = None, vocab: Vocab = None, classes: Collection[str] = None,
                            text_cols: IntsOrStrs = 1,
                            label_cols: IntsOrStrs = 0, label_delim: str = None, chunksize: int = 10000,
                            max_vocab: int = 60000,
                            min_freq: int = 2, mark_fields: bool = False, include_bos: bool = True,
                            include_eos: bool = False, processor=None, **kwargs) -> DataBunch:
    "Create a `TextDataBunch` from DataFrames. `kwargs` are passed to the dataloader creation."
    assert processor is None or tokenizer is None, "Processor and tokenizer are mutually exclusive."
    if processor is None:
        # Fall back to fastai's default processor chain built from the
        # tokenizer/vocab settings.
        processor = fastai.text.data._get_processor(tokenizer=tokenizer, vocab=vocab, chunksize=chunksize, max_vocab=max_vocab,
                                                    min_freq=min_freq, mark_fields=mark_fields,
                                                    include_bos=include_bos, include_eos=include_eos)
    # Multiple label columns double as the class list when none is given.
    if classes is None and is_listy(label_cols) and len(label_cols) > 1:
        classes = label_cols
    train_list = TextList.from_df(train_df, path, cols=text_cols, processor=processor)
    valid_list = TextList.from_df(valid_df, path, cols=text_cols, processor=processor)
    src = ItemLists(path, train_list, valid_list)
    if cls == TextLMDataBunch:
        labelled = src.label_for_lm()
    elif label_delim is not None:
        labelled = src.label_from_df(cols=label_cols, classes=classes, label_delim=label_delim)
    else:
        labelled = src.label_from_df(cols=label_cols, classes=classes)
    return labelled.databunch(**kwargs)
\ No newline at end of file
import nlp_ws

from src.Multifit_worker import MultifitWorker

# Service entry point: run MultifitWorker under the nlp_ws service framework.
if __name__ == '__main__':
    nlp_ws.NLPService.main(MultifitWorker)
multifit @ 6f1d1232
Subproject commit 6f1d1232b4c24850214b1473305d512902306715
nlp_ws
fastai==1.0.61
sentencepiece==0.1.85
sacremoses==0.0.35
ninja==1.10.0.post1
spacy==2.3.0
2020-10-05 10:03:13,352 [INFO]: MainProcess Initializing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 10:03:13,353 [INFO]: MainProcess Starting log server listening on ('localhost', 9981)
2020-10-05 10:03:13,364 [INFO]: worker-0 Initializing <src.Multifit_worker.MultifitWorker object at 0x7f0943032110>
2020-10-05 10:03:13,386 [INFO]: worker-0 Starting worker <src.Multifit_worker.MultifitWorker object at 0x7f0943032110> with queue: nlp_multifit
2020-10-05 10:03:22,772 [INFO]: worker-0 Started processing task 13e1a586-0ec4-48cb-8b8f-e5df70d54ddd
2020-10-05 10:03:22,772 [INFO]: worker-0 Options passed to task: {'type': 'text'}
2020-10-05 10:03:22,792 [ERROR]: worker-0 Unable to process task 13e1a586-0ec4-48cb-8b8f-e5df70d54ddd with message: {"tool":"multifit","progressQueue":"amq.gen-0lVSqsREs05ja3XK8g2Esg","file":"/samba/requests/predictLang/4e3c9f7e-88fa-48e2-956c-28e5d35fa596","options":{"type":"text"}}
Traceback (most recent call last):
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/nlp_ws/_service.py", line 383, in __call__
self._process_task(result, out_file, props, body)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/nlp_ws/_service.py", line 448, in _process_task
self._wrk.process(in_file, opts, out_file)
File "/mnt/praca/multifit_classifier/multifit/src/Multifit_worker.py", line 27, in process
result = self._classifier.predict(text, lang=lang, task_options=task)
File "/mnt/praca/multifit_classifier/multifit/src/Multifit_worker.py", line 50, in predict
learner = load_learner("models", path)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/basic_train.py", line 621, in load_learner
state = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/torch/serialization.py", line 593, in load
return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/torch/serialization.py", line 773, in _legacy_load
result = unpickler.load()
ModuleNotFoundError: No module named 'multifit'
2020-10-05 10:03:22,793 [INFO]: worker-0 Done with task 13e1a586-0ec4-48cb-8b8f-e5df70d54ddd
2020-10-05 10:06:34,930 [INFO]: worker-0 Shutting down on user interrupt
2020-10-05 10:06:34,930 [INFO]: worker-0 Stopping worker <src.Multifit_worker.MultifitWorker object at 0x7f0943032110> with queue: nlp_multifit
2020-10-05 10:06:34,940 [INFO]: worker-0 Finalizing <src.Multifit_worker.MultifitWorker object at 0x7f0943032110>
2020-10-05 10:06:34,947 [INFO]: MainProcess Log server on ('localhost', 9981) is going down
2020-10-05 10:06:35,560 [INFO]: MainProcess Finalizing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 10:06:37,952 [INFO]: MainProcess Initializing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 10:06:37,953 [INFO]: MainProcess Starting log server listening on ('localhost', 9981)
2020-10-05 10:06:37,959 [INFO]: worker-0 Initializing <src.Multifit_worker.MultifitWorker object at 0x7f032725d450>
2020-10-05 10:06:37,978 [INFO]: worker-0 Starting worker <src.Multifit_worker.MultifitWorker object at 0x7f032725d450> with queue: nlp_multifit
2020-10-05 10:06:48,453 [INFO]: worker-0 Started processing task e708d5f7-8906-474b-a97f-a7fc6202431e
2020-10-05 10:06:48,453 [INFO]: worker-0 Options passed to task: {'type': 'text'}
2020-10-05 10:06:48,504 [ERROR]: worker-0 Unable to process task e708d5f7-8906-474b-a97f-a7fc6202431e with message: {"tool":"multifit","progressQueue":"amq.gen-bnwONgQHa0wpdyhvH8X4eA","file":"/samba/requests/predictLang/b1e98c45-3414-401d-a148-3bb2e354f61a","options":{"type":"text"}}
Traceback (most recent call last):
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/nlp_ws/_service.py", line 383, in __call__
self._process_task(result, out_file, props, body)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/nlp_ws/_service.py", line 448, in _process_task
self._wrk.process(in_file, opts, out_file)
File "/mnt/praca/multifit_classifier/multifit/src/Multifit_worker.py", line 27, in process
result = self._classifier.predict(text, lang=lang, task_options=task)
File "/mnt/praca/multifit_classifier/multifit/src/Multifit_worker.py", line 50, in predict
learner = load_learner("models", path)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/basic_train.py", line 621, in load_learner
state = torch.load(source, map_location='cpu') if defaults.device == torch.device('cpu') else torch.load(source)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/torch/serialization.py", line 593, in load
return _legacy_load(opened_file, map_location, pickle_module, **pickle_load_args)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/torch/serialization.py", line 773, in _legacy_load
result = unpickler.load()
File "/mnt/praca/multifit_classifier/multifit/multifit/__init__.py", line 1, in <module>
from .datasets import Dataset, ULMFiTDataset
File "/mnt/praca/multifit_classifier/multifit/multifit/datasets/__init__.py", line 1, in <module>
from .dataset import Dataset, ULMFiTDataset, read_clas_csv, read_wiki_articles, ULMFiTTokenizer
File "/mnt/praca/multifit_classifier/multifit/multifit/datasets/dataset.py", line 4, in <module>
from fastai_contrib.text_data import MosesPreprocessingFunc, \
ModuleNotFoundError: No module named 'fastai_contrib'
2020-10-05 10:06:48,505 [INFO]: worker-0 Done with task e708d5f7-8906-474b-a97f-a7fc6202431e
2020-10-05 10:06:51,490 [INFO]: worker-0 Shutting down on user interrupt
2020-10-05 10:06:51,491 [INFO]: worker-0 Stopping worker <src.Multifit_worker.MultifitWorker object at 0x7f032725d450> with queue: nlp_multifit
2020-10-05 10:06:51,502 [INFO]: worker-0 Finalizing <src.Multifit_worker.MultifitWorker object at 0x7f032725d450>
2020-10-05 10:06:51,509 [INFO]: MainProcess Log server on ('localhost', 9981) is going down
2020-10-05 10:06:51,968 [INFO]: MainProcess Finalizing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 10:07:20,890 [INFO]: MainProcess Initializing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 10:07:20,891 [INFO]: MainProcess Starting log server listening on ('localhost', 9981)
2020-10-05 10:07:20,898 [INFO]: worker-0 Initializing <src.Multifit_worker.MultifitWorker object at 0x7fb6ef6db210>
2020-10-05 10:07:20,918 [INFO]: worker-0 Starting worker <src.Multifit_worker.MultifitWorker object at 0x7fb6ef6db210> with queue: nlp_multifit
2020-10-05 10:07:20,919 [INFO]: worker-0 Started processing task 8b5d993e-6bf6-42d7-a97c-3188f503d3b7
2020-10-05 10:07:20,919 [INFO]: worker-0 Options passed to task: {'type': 'text'}
2020-10-05 10:07:21,860 [ERROR]: worker-0 Unable to process task 8b5d993e-6bf6-42d7-a97c-3188f503d3b7 with message: {"tool":"multifit","progressQueue":"amq.gen-OpnDhRXfotydq2HtAdXD4g","file":"/samba/requests/predictLang/6d498814-05fa-43ef-b037-53e90a6cd48f","options":{"type":"text"}}
Traceback (most recent call last):
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/nlp_ws/_service.py", line 383, in __call__
self._process_task(result, out_file, props, body)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/nlp_ws/_service.py", line 448, in _process_task
self._wrk.process(in_file, opts, out_file)
File "/mnt/praca/multifit_classifier/multifit/src/Multifit_worker.py", line 27, in process
result = self._classifier.predict(text, lang=lang, task_options=task)
File "/mnt/praca/multifit_classifier/multifit/src/Multifit_worker.py", line 51, in predict
results = learner.predict("xxbos " + str(ccl))
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/basic_train.py", line 372, in predict
batch = self.data.one_item(item)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/basic_data.py", line 181, in one_item
with ds.set_item(item):
File "/home/bbojanowski/anaconda3/lib/python3.7/contextlib.py", line 112, in __enter__
return next(self.gen)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/data_block.py", line 615, in set_item
self.item = self.x.process_one(item)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/data_block.py", line 91, in process_one
for p in self.processor: item = p.process_one(item)
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/text/data.py", line 459, in process_one
return self._encode_batch([text])[0]
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/fastai/text/data.py", line 479, in _encode_batch
tok.Load(str(self.sp_model))
File "/mnt/praca/multifit_classifier/multifit_my/venv/lib/python3.7/site-packages/sentencepiece.py", line 118, in Load
return _sentencepiece.SentencePieceProcessor_Load(self, filename)
OSError: Not found: "data/cls/de-sent/models/sp15k/spm.model": No such file or directory Error #2
2020-10-05 10:07:21,868 [INFO]: worker-0 Done with task 8b5d993e-6bf6-42d7-a97c-3188f503d3b7
2020-10-05 10:07:36,333 [INFO]: worker-0 Shutting down on user interrupt
2020-10-05 10:07:36,333 [INFO]: worker-0 Stopping worker <src.Multifit_worker.MultifitWorker object at 0x7fb6ef6db210> with queue: nlp_multifit
2020-10-05 10:07:36,342 [INFO]: worker-0 Finalizing <src.Multifit_worker.MultifitWorker object at 0x7fb6ef6db210>
2020-10-05 10:07:36,354 [INFO]: MainProcess Log server on ('localhost', 9981) is going down
2020-10-05 10:07:36,908 [INFO]: MainProcess Finalizing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 10:08:59,570 [INFO]: MainProcess Initializing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 10:08:59,570 [INFO]: MainProcess Starting log server listening on ('localhost', 9981)
2020-10-05 10:08:59,577 [INFO]: worker-0 Initializing <src.Multifit_worker.MultifitWorker object at 0x7f935a6ae090>
2020-10-05 10:08:59,597 [INFO]: worker-0 Starting worker <src.Multifit_worker.MultifitWorker object at 0x7f935a6ae090> with queue: nlp_multifit
2020-10-05 10:08:59,598 [INFO]: worker-0 Started processing task 51402ff4-3000-4bc8-ad05-aad16aa6e777
2020-10-05 10:08:59,598 [INFO]: worker-0 Options passed to task: {'type': 'text'}
2020-10-05 10:08:59,876 [INFO]: worker-0 Finished processing task 51402ff4-3000-4bc8-ad05-aad16aa6e777 in 0.277475
2020-10-05 10:08:59,876 [INFO]: worker-0 Done with task 51402ff4-3000-4bc8-ad05-aad16aa6e777
2020-10-05 10:13:49,281 [INFO]: worker-0 Shutting down on user interrupt
2020-10-05 10:13:49,281 [INFO]: worker-0 Stopping worker <src.Multifit_worker.MultifitWorker object at 0x7f935a6ae090> with queue: nlp_multifit
2020-10-05 10:13:49,290 [INFO]: worker-0 Finalizing <src.Multifit_worker.MultifitWorker object at 0x7f935a6ae090>
2020-10-05 10:13:49,307 [INFO]: MainProcess Log server on ('localhost', 9981) is going down
2020-10-05 10:13:49,866 [INFO]: MainProcess Finalizing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 11:17:29,852 [INFO]: MainProcess Initializing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 11:17:29,852 [INFO]: MainProcess Starting log server listening on ('localhost', 9981)
2020-10-05 11:17:29,859 [INFO]: worker-0 Initializing <src.Multifit_worker.MultifitWorker object at 0x7f468e187650>
2020-10-05 11:17:29,879 [INFO]: worker-0 Starting worker <src.Multifit_worker.MultifitWorker object at 0x7f468e187650> with queue: nlp_multifit
2020-10-05 11:17:37,475 [INFO]: worker-0 Started processing task 35c54cb3-9cd4-403c-a1e4-887982008a1d
2020-10-05 11:17:37,476 [INFO]: worker-0 Options passed to task: {'type': 'text'}
2020-10-05 11:17:37,747 [INFO]: worker-0 Finished processing task 35c54cb3-9cd4-403c-a1e4-887982008a1d in 0.270498
2020-10-05 11:17:37,747 [INFO]: worker-0 Done with task 35c54cb3-9cd4-403c-a1e4-887982008a1d
2020-10-05 11:19:36,887 [INFO]: worker-0 Shutting down on user interrupt
2020-10-05 11:19:36,887 [INFO]: worker-0 Stopping worker <src.Multifit_worker.MultifitWorker object at 0x7f468e187650> with queue: nlp_multifit
2020-10-05 11:19:36,896 [INFO]: worker-0 Finalizing <src.Multifit_worker.MultifitWorker object at 0x7f468e187650>
2020-10-05 11:19:36,909 [INFO]: MainProcess Log server on ('localhost', 9981) is going down
2020-10-05 11:19:37,979 [INFO]: MainProcess Finalizing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 11:32:08,200 [INFO]: MainProcess Initializing <class 'src.Multifit_worker.MultifitWorker'>
2020-10-05 11:32:08,200 [INFO]: MainProcess Starting log server listening on ('localhost', 9981)
2020-10-05 11:32:08,206 [INFO]: worker-0 Initializing <src.Multifit_worker.MultifitWorker object at 0x7feedc700590>
2020-10-05 11:32:08,227 [INFO]: worker-0 Starting worker <src.Multifit_worker.MultifitWorker object at 0x7feedc700590> with queue: nlp_multifit
2020-10-05 11:32:14,886 [INFO]: worker-0 Started processing task 8eb53aaf-2fca-4412-ba90-ac869202a1c9
2020-10-05 11:32:14,886 [INFO]: worker-0 Options passed to task: {'type': 'text'}
2020-10-05 11:32:15,185 [INFO]: worker-0 Finished processing task 8eb53aaf-2fca-4412-ba90-ac869202a1c9 in 0.298381
2020-10-05 11:32:15,185 [INFO]: worker-0 Done with task 8eb53aaf-2fca-4412-ba90-ac869202a1c9
2020-10-05 11:32:19,832 [INFO]: worker-0 Shutting down on user interrupt
2020-10-05 11:32:19,833 [INFO]: worker-0 Stopping worker <src.Multifit_worker.MultifitWorker object at 0x7feedc700590> with queue: nlp_multifit
2020-10-05 11:32:19,840 [INFO]: worker-0 Finalizing <src.Multifit_worker.MultifitWorker object at 0x7feedc700590>
2020-10-05 11:32:19,849 [INFO]: MainProcess Log server on ('localhost', 9981) is going down
2020-10-05 11:32:20,213 [INFO]: MainProcess Finalizing <class 'src.Multifit_worker.MultifitWorker'>
import json
import logging
import operator
import nlp_ws
from fastai.text import *
import torch
log = logging.getLogger(__name__)
class MultifitWorker(nlp_ws.NLPWorker):
    """nlp_ws worker that classifies input text with MultiFit models."""

    @classmethod
    def static_init(cls, config):
        """Store the service configuration on the class (idiomatic `cls`,
        not `self`, for a classmethod)."""
        cls.config = config
        log.debug("static_init(%s)", config)

    def init(self):
        log.debug("init()")
        self._classifier = MultifitClassifier()

    def process(self, input_path, task_options, output_path):
        """Read the input file, run prediction and write a JSON result.

        The input is expected to contain the text followed by
        '__label__<lang>' — TODO(review): confirm this format with callers;
        input without the marker raises IndexError.
        """
        task = task_options.get("type", None)
        # Explicit utf-8: the container locale may default to ASCII.
        with open(input_path, "r", encoding="utf-8") as f:
            text = f.read()
        parts = text.split('__label__')
        lang = parts[1]
        text = parts[0]
        result = self._classifier.predict(text, lang=lang, task_options=task)
        # Probabilities are stored as strings; compare numerically — string
        # comparison is wrong for values like '1e-05' vs '0.9'.
        result["decision"] = max(result.items(), key=lambda item: float(item[1]))[0]
        result["language"] = lang
        log.debug("result: %s", result)  # was a bare print()
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(result, f, indent=4)
class MultifitClassifier(object):
    """Loads exported fastai learners and predicts sentiment-style labels."""

    def __init__(self):
        # NOTE(review): '___label__meta_zero' / '___label__z_zero' use a
        # triple underscore while every other label uses a double one —
        # confirm whether this is intentional before changing it, since
        # these strings become the keys of the result dict.
        self.labels_text = ["__label__meta_amb", "__label__meta_minus_m",
                            "__label__meta_plus_m", "___label__meta_zero"]
        self.labels_sen = ["__label__z_amb", "__label__z_minus_m",
                           "__label__z_plus_m", "___label__z_zero"]

    def predict(self, ccl, lang=None, task_options=None):
        """Classify `ccl` with the exported model for `lang`.

        :param ccl: input text
        :param lang: language code used to select the model file under models/
        :param task_options: "sentence" picks the sentence-level model,
            anything else the text-level one
        :return: dict mapping label -> probability (stringified)
        """
        # (dead `path = ""` initializer removed: both branches assign it)
        if task_options == "sentence":
            path = lang + "-sent-sen.pkl"
            labels = self.labels_sen
        else:
            path = lang + "-sent.pkl"
            labels = self.labels_text
        learner = load_learner("models", path)
        results = learner.predict("xxbos " + str(ccl))
        # NOTE(review): label order must match the class order of the
        # exported learner — verify against the training code.
        probabilities = [str(x) for x in to_np(results[2])]
        return dict(zip(labels, probabilities))
[tox]
envlist = pep8
skipsdist = True
[testenv:pep8]
deps =
flake8
basepython = python3.8
commands =
flake8 {posargs}
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment