Commit 1bf88a88 authored by Maja Jabłońska's avatar Maja Jabłońska

General structure

parent e4b2e1c5
1 merge request: !46 Merge COMBO 3.0 into master

Showing changed files with 380 additions and 1 deletion
combo/models/__init__.py

from .base import FeedForwardPredictor
from .graph_parser import GraphDependencyRelationModel
from .parser import DependencyRelationModel
from .embeddings import CharacterBasedWordEmbeddings
from .encoder import ComboEncoder
from .lemma import LemmatizerModel
from .model import ComboModel
from .morpho import MorphologicalFeatures
combo/models/base.py

@@ -4,7 +4,11 @@
 import torch
 import torch.nn as nn
 import utils
 import combo.models.combo_nn as combo_nn
-import combo.checks as checks
+import combo.utils.checks as checks
+
+class Model:
+    pass
+
 class Predictor(nn.Module):
     ...
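base.py only pins down the class hierarchy at this point; Model and Predictor have no behaviour yet. As a rough illustration of where this is headed, a hypothetical fill-in for the FeedForwardPredictor that models/__init__.py already imports from this module (all names, shapes and layer choices below are assumptions, not part of the commit):

import torch
import torch.nn as nn

class FeedForwardPredictorSketch(nn.Module):
    """Hypothetical sketch: a per-token MLP classifier over encoder states."""

    def __init__(self, input_dim: int, hidden_dim: int, num_labels: int):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.Tanh(),
            nn.Linear(hidden_dim, num_labels),
        )

    def forward(self, encoded: torch.Tensor) -> torch.Tensor:
        # (batch, tokens, input_dim) -> (batch, tokens, num_labels)
        return self.net(encoded)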
combo/models/embeddings.py

class Embedding:
    pass

class TokenEmbedder:
    pass

class CharacterBasedWordEmbeddings(TokenEmbedder):
    pass

class ProjectedWordEmbedder(TokenEmbedder):
    pass

class PretrainedTransformerMismatchedEmbedder(TokenEmbedder):
    pass

class TransformersWordEmbedder(PretrainedTransformerMismatchedEmbedder):
    pass

class FeatsTokenEmbedder(TokenEmbedder):
    pass
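The embedder stubs form one hierarchy: TokenEmbedder as the base, with character-level, projected and transformer-backed variants. A minimal sketch of what the character-based variant might compute, assuming a convolution-over-characters design (dimensions and kernel size are illustrative, not the commit's choices):

import torch
import torch.nn as nn

class CharacterBasedWordEmbeddingsSketch(nn.Module):
    """Hypothetical sketch: embed characters, convolve, then max-pool
    over the character axis to get one vector per word."""

    def __init__(self, num_chars: int, char_dim: int = 64, word_dim: int = 128):
        super().__init__()
        self.char_embedding = nn.Embedding(num_chars, char_dim, padding_idx=0)
        self.conv = nn.Conv1d(char_dim, word_dim, kernel_size=3, padding=1)

    def forward(self, char_ids: torch.Tensor) -> torch.Tensor:
        # char_ids: (batch, words, chars) -> (batch, words, word_dim)
        b, w, c = char_ids.shape
        x = self.char_embedding(char_ids.view(b * w, c))  # (b*w, c, char_dim)
        x = self.conv(x.transpose(1, 2))                  # (b*w, word_dim, c)
        return x.max(dim=-1).values.view(b, w, -1)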
combo/models/encoder.py

class Encoder:
    pass

class StackedBidirectionalLstm(Encoder):
    pass

class ComboEncoder(Encoder):
    pass
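StackedBidirectionalLstm and ComboEncoder suggest a recurrent encoder over the word representations. A minimal sketch under that assumption (layer count and sizes are illustrative):

import torch
import torch.nn as nn

class StackedBidirectionalLstmSketch(nn.Module):
    """Hypothetical sketch: a multi-layer BiLSTM over embedded tokens."""

    def __init__(self, input_dim: int, hidden_dim: int, num_layers: int = 2):
        super().__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=num_layers,
                            batch_first=True, bidirectional=True)

    def forward(self, embedded: torch.Tensor) -> torch.Tensor:
        # (batch, tokens, input_dim) -> (batch, tokens, 2 * hidden_dim)
        output, _ = self.lstm(embedded)
        return output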
combo/models/graph_parser.py

from combo.models.base import Predictor

class GraphHeadPredictionModel(Predictor):
    pass

class GraphDependencyRelationModel(Predictor):
    pass
combo/models/lemma.py

from combo.models.base import Predictor

class LemmatizerModel(Predictor):
    pass
combo/models/model.py

from combo.models.base import Model

class ComboModel(Model):
    pass
combo/models/morpho.py

from combo.models.base import Predictor

class MorphologicalFeatures(Predictor):
    pass
combo/models/parser.py

from combo.models.base import Predictor

class HeadPredictionModel(Predictor):
    pass

class DependencyRelationModel(Predictor):
    pass
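HeadPredictionModel will have to score, for every token, each candidate head. One common shape for this (a sketch, not the commit's design) projects tokens into separate head and dependent spaces and takes pairwise dot products:

import torch
import torch.nn as nn

class HeadPredictionSketch(nn.Module):
    """Hypothetical sketch: pairwise head-dependent scores."""

    def __init__(self, dim: int, proj_dim: int = 256):
        super().__init__()
        self.head_proj = nn.Linear(dim, proj_dim)
        self.dep_proj = nn.Linear(dim, proj_dim)

    def forward(self, encoded: torch.Tensor) -> torch.Tensor:
        heads = self.head_proj(encoded)  # (batch, tokens, proj_dim)
        deps = self.dep_proj(encoded)    # (batch, tokens, proj_dim)
        # (batch, tokens, tokens): score of token j being the head of token i
        return deps @ heads.transpose(1, 2)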
combo/predict.py

import logging
import os
import sys
from typing import List, Union, Dict, Any

from combo import data
from combo.data import sentence2conllu, tokens2conllu, conllu2sentence
from combo.models.base import Predictor
from combo.utils import download, graph

logger = logging.getLogger(__name__)

class COMBO(Predictor):
    pass
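COMBO is the user-facing predictor. Assuming the pre-3.0 interface carries over, usage would look roughly like this (the model name is illustrative; kept commented out since the class is still a stub):

# from combo.predict import COMBO
# nlp = COMBO.from_pretrained("polish-herbert-base")
# sentence = nlp("To jest przykładowe zdanie.")  # "This is an example sentence."
# print(sentence.tokens)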
combo/training/__init__.py

from .checkpointer import FinishingTrainingCheckpointer
from .scheduler import Scheduler
from .trainer import GradientDescentTrainer
combo/training/checkpointer.py

class Checkpointer:
    pass

class FinishingTrainingCheckpointer:
    pass
combo/training/scheduler.py

class Scheduler:
    pass
combo/training/tensorboard_writer.py

class NullTensorboardWriter:
    pass
combo/training/trainer.py

from pytorch_lightning import Trainer

class Callback:
    pass

class TransferPatienceEpochCallback:
    pass

class GradientDescentTrainer(Trainer):
    pass
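GradientDescentTrainer subclasses the PyTorch Lightning Trainer, so TransferPatienceEpochCallback will presumably end up as a Lightning callback rather than the bare stub above. A minimal sketch of that shape (the hook body is a placeholder, not the commit's logic):

import pytorch_lightning as pl

class TransferPatienceEpochCallbackSketch(pl.Callback):
    """Hypothetical sketch: hook into the epoch boundary, e.g. to adjust
    early-stopping patience when training switches phase."""

    def on_train_epoch_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        # Placeholder: inspect trainer.callback_metrics and update state here.
        pass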
combo/utils/checks.py

import torch

class ConfigurationError(Exception):
    def __init__(self, message: str):
        super().__init__()
        self.message = message

def file_exists(*paths):
    pass

def check_size_match(size_1: torch.Size, size_2: torch.Size,
                     tensor_1_name: str, tensor_2_name: str):
    pass
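Both helpers are still stubs. A minimal guess at their eventual behaviour, reusing the ConfigurationError and torch import defined in the module above (names and exact messages are assumptions):

import os

def file_exists_sketch(*paths):
    """Hypothetical fill-in: fail loudly on any missing path."""
    for path in paths:
        if not os.path.exists(path):
            raise ConfigurationError(f"File {path} does not exist.")

def check_size_match_sketch(size_1: torch.Size, size_2: torch.Size,
                            tensor_1_name: str, tensor_2_name: str):
    """Hypothetical fill-in: fail loudly on a shape mismatch."""
    if size_1 != size_2:
        raise ConfigurationError(
            f"{tensor_1_name} size {size_1} does not match "
            f"{tensor_2_name} size {size_2}.")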
combo/utils/download.py

import errno
import logging
import os

import requests
import tqdm
import urllib3
from requests import adapters, exceptions

logger = logging.getLogger(__name__)

DATA_TO_PATH = {
    "enhanced": "iwpt_2020",
    "iwpt2021": "iwpt_2021",
    "ud25": "ud_25",
    "ud27": "ud_27",
    "ud29": "ud_29"}
_URL = "http://s3.clarin-pl.eu/dspace/combo/{data}/{model}.tar.gz"
_HOME_DIR = os.getenv("HOME", os.curdir)
_CACHE_DIR = os.getenv("COMBO_DIR", os.path.join(_HOME_DIR, ".combo"))

def download_file(model_name, force=False):
    _make_cache_dir()
    data = model_name.split("-")[-1]
    url = _URL.format(model=model_name, data=DATA_TO_PATH[data])
    local_filename = url.split("/")[-1]
    location = os.path.join(_CACHE_DIR, local_filename)
    if os.path.exists(location) and not force:
        logger.debug("Using cached model.")
        return location
    chunk_size = 1024
    logger.info(url)
    try:
        with _requests_retry_session(retries=2).get(url, stream=True) as r:
            pbar = tqdm.tqdm(unit="B", total=int(r.headers.get("content-length")),
                             unit_divisor=chunk_size, unit_scale=True)
            with open(location, "wb") as f:
                with pbar:
                    for chunk in r.iter_content(chunk_size):
                        if chunk:
                            f.write(chunk)
                            pbar.update(len(chunk))
    except exceptions.RetryError:
        raise ConnectionError(f"Couldn't find or download model {model_name}.tar.gz. "
                              "Check if model name is correct or try again later!")
    return location

def _make_cache_dir():
    try:
        os.makedirs(_CACHE_DIR)
        logger.info(f"Making cache dir {_CACHE_DIR}")
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

def _requests_retry_session(
        retries=3,
        backoff_factor=0.3,
        status_forcelist=(404, 500, 502, 504),
        session=None,
):
    """Source: https://www.peterbe.com/plog/best-practice-with-retries-with-requests"""
    session = session or requests.Session()
    retry = urllib3.Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = adapters.HTTPAdapter(max_retries=retry)
    session.mount("http://", adapter)
    session.mount("https://", adapter)
    return session
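For reference, a hypothetical call (the model name is illustrative; it follows the "{model}-{data}" pattern that download_file splits on, and is commented out so nothing is actually fetched):

# location = download_file("polish-herbert-base-ud29")
# Resolves to http://s3.clarin-pl.eu/dspace/combo/ud_29/polish-herbert-base-ud29.tar.gz
# and caches the archive under ~/.combo (or $COMBO_DIR).

combo/utils/graph.py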
"""Based on https://github.com/emorynlp/iwpt-shared-task-2020."""
import numpy as np

_ACL_REL_CL = "acl:relcl"

def graph_and_tree_merge(tree_arc_scores,
                         tree_rel_scores,
                         graph_arc_scores,
                         graph_rel_scores,
                         label2idx,
                         idx2label,
                         graph_label2idx,
                         graph_idx2label,
                         tokens):
    graph_arc_scores = np.copy(graph_arc_scores)
    # Exclude self-loops, in-place operation.
    np.fill_diagonal(graph_arc_scores, 0)
    # Connection to root will be handled by tree.
    graph_arc_scores[:, 0] = False
    # The same with labels.
    root_idx = graph_label2idx["root"]
    graph_rel_scores[:, :, root_idx] = -float('inf')
    graph_rel_pred = graph_rel_scores.argmax(-1)

    # Add tree edges to graph.
    tree_heads = [0] + tree_arc_scores
    graph = [[] for _ in range(len(tree_heads))]
    labeled_graph = [[] for _ in range(len(tree_heads))]
    for d, h in enumerate(tree_heads):
        if not d:
            continue
        label = idx2label[tree_rel_scores[d - 1]]
        # graph_label = graph_idx2label[graph_rel_pred[d - 1][h - 1]]
        # if ">" in graph_label and label in graph_label:
        #     print("Using graph label instead of tree.")
        #     label = graph_label
        if label != _ACL_REL_CL:
            graph[h].append(d)
            labeled_graph[h].append((d, label))
    # Debug only

    # Extract graph edges.
    graph_edges = np.argwhere(graph_arc_scores)

    # Add graph edges which aren't creating a cycle.
    for (d, h) in graph_edges:
        if not d or not h or d in graph[h]:
            continue
        try:
            path = next(_dfs(graph, d, h))
        except StopIteration:
            # There is no path from d to h.
            label = graph_idx2label[graph_rel_pred[d][h]]
            if label != _ACL_REL_CL:
                graph[h].append(d)
                labeled_graph[h].append((d, label))

    # Add 'acl:relcl' without checking for cycles.
    for d, h in enumerate(tree_heads):
        if not d:
            continue
        label = idx2label[tree_rel_scores[d - 1]]
        if label == _ACL_REL_CL:
            graph[h].append(d)
            labeled_graph[h].append((d, label))

    assert len(labeled_graph[0]) == 1
    d = graph[0][0]
    graph[d].append(0)
    labeled_graph[d].append((0, "root"))

    parse_graph = [[] for _ in range(len(tree_heads))]
    for h in range(len(tree_heads)):
        for d, label in labeled_graph[h]:
            parse_graph[d].append((h, label))
            parse_graph[d] = sorted(parse_graph[d])

    for i, g in enumerate(parse_graph):
        heads = np.array([x[0] for x in g])
        rels = np.array([x[1] for x in g])
        indices = rels.argsort()
        heads = heads[indices].tolist()
        rels = rels[indices].tolist()
        deps = '|'.join(f'{h}:{r}' for h, r in zip(heads, rels))
        tokens[i - 1]["deps"] = deps
    return
def _dfs(graph, start, end):
    fringe = [(start, [])]
    while fringe:
        state, path = fringe.pop()
        if path and state == end:
            yield path
            continue
        for next_state in graph[state]:
            if next_state in path:
                continue
            fringe.append((next_state, path + [next_state]))
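# Toy check of _dfs on hypothetical data. The adjacency lists map head -> dependents,
# here 0 -> 1 -> 2. graph_and_tree_merge calls next(_dfs(graph, d, h)) to test
# whether h is already reachable from d: if it is, the candidate edge h -> d
# would close a cycle, so it is skipped.
# >>> toy = [[1], [2], []]
# >>> next(_dfs(toy, 0, 2), None)  # [1, 2]: 2 is reachable from 0, so edge 2 -> 0 is unsafe
# >>> next(_dfs(toy, 2, 0), None)  # None: 0 is not reachable from 2, so edge 0 -> 2 cannot close a cycle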
def restore_collapse_edges(tree_tokens):
    # https://gist.github.com/hankcs/776e7d95c19e5ff5da8469fe4e9ab050
    empty_tokens = []
    for token in tree_tokens:
        deps = token["deps"].split("|")
        for i, d in enumerate(deps):
            if ">" in d:
                # {head}:{empty_node_relation}>{current_node_relation}
                # should map to
                # For new, empty node:
                # {head}:{empty_node_relation}
                # For current node:
                # {new_empty_node_id}:{current_node_relation}
                # TODO consider where to put new_empty_node_id (currently at the end)
                head, relation = d.split(':', 1)
                ehead = f"{len(tree_tokens)}.{len(empty_tokens) + 1}"
                empty_node_relation, current_node_relation = relation.split(">", 1)
                # Edge case, double >
                if ">" in current_node_relation:
                    second_empty_node_relation, current_node_relation = current_node_relation.split(">")
                    deps[i] = f"{ehead}:{current_node_relation}"
                    second_ehead = f"{len(tree_tokens)}.{len(empty_tokens) + 2}"
                    empty_tokens.append(
                        {
                            "id": ehead,
                            "deps": f"{second_ehead}:{empty_node_relation}"
                        }
                    )
                    empty_tokens.append(
                        {
                            "id": second_ehead,
                            "deps": f"{head}:{second_empty_node_relation}"
                        }
                    )
                else:
                    deps[i] = f"{ehead}:{current_node_relation}"
                    empty_tokens.append(
                        {
                            "id": ehead,
                            "deps": f"{head}:{empty_node_relation}"
                        }
                    )
        deps = sorted([d.split(":", 1) for d in deps], key=lambda x: float(x[0]))
        token["deps"] = "|".join([f"{k}:{v}" for k, v in deps])
    return empty_tokens
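A worked example on hypothetical input, traced through the code above: each collapsed edge {head}:{empty_rel}>{current_rel} becomes a new empty node attached to the original head, with the current token re-attached to that empty node.

tokens = [
    {"id": 1, "deps": "2:nsubj"},
    {"id": 2, "deps": "0:root"},
    {"id": 3, "deps": "2:conj|1:nsubj>obj"},
]
empty = restore_collapse_edges(tokens)
print(tokens[2]["deps"])  # "2:conj|3.1:obj" -- token 3 now hangs off empty node 3.1
print(empty)              # [{"id": "3.1", "deps": "1:nsubj"}] -- 3.1 attaches to head 1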
combo/utils/metrics.py

class Metric:
    pass

class LemmaAccuracy(Metric):
    pass

class SequenceBoolAccuracy(Metric):
    pass

class AttachmentScores(Metric):
    pass

class SemanticMetrics(Metric):
    pass
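The metric classes are placeholders as well. For orientation, a minimal sketch of what AttachmentScores conventionally accumulates for dependency parsing (UAS/LAS); method names and tensor shapes are assumptions:

import torch

class AttachmentScoresSketch:
    """Hypothetical sketch: unlabeled/labeled attachment scores."""

    def __init__(self):
        self.correct_heads = 0
        self.correct_labeled = 0
        self.total = 0

    def __call__(self, pred_heads, pred_labels, gold_heads, gold_labels, mask):
        # mask: bool tensor marking real (non-padding) tokens.
        head_match = (pred_heads == gold_heads) & mask
        labeled_match = head_match & (pred_labels == gold_labels)
        self.correct_heads += head_match.sum().item()
        self.correct_labeled += labeled_match.sum().item()
        self.total += mask.sum().item()

    def get_metric(self):
        return {"UAS": self.correct_heads / max(self.total, 1),
                "LAS": self.correct_labeled / max(self.total, 1)}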