diff --git a/combo/data/api.py b/combo/data/api.py
index b04192f04b2db84981ea34cfcfa1297769078f26..0f407cdc7f3b3a71e88c96f12a0b45d175ff2c1d 100644
--- a/combo/data/api.py
+++ b/combo/data/api.py
@@ -2,14 +2,14 @@ import collections
 import dataclasses
 import json
 from dataclasses import dataclass, field
-from typing import Optional, List, Dict, Any, Union, Tuple
+from typing import List, Dict, Any
 
+import conllu
 from conllu.models import Metadata
+from overrides import overrides
 
 from combo.data.tokenizers import Token
 
-import conllu
-from overrides import overrides
 
 @dataclass
 class Sentence:
diff --git a/combo/data/samplers/__init__.py b/combo/data/samplers/__init__.py
index 7e90dca36a0bf3ca8e0c7cbf939ca0f4305d03e8..db8c55c67dbd4158915dfd120b483d6bfe5accd7 100644
--- a/combo/data/samplers/__init__.py
+++ b/combo/data/samplers/__init__.py
@@ -1,2 +1,2 @@
 from .batch_sampler import BatchSampler
-from .samplers import TokenCountBatchSampler
+from .token_count_batch_sampler import TokenCountBatchSampler
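The package import now resolves from a module named after the class; a minimal usage sketch, assuming the relocated token_count_batch_sampler module keeps the constructor shown in the deleted samplers.py below (the fake instances mimic the only attribute the sampler reads, instance.fields["sentence"].tokens):

    # Hedged sketch: instance indices grouped by approximate token count.
    from types import SimpleNamespace
    from combo.data.samplers import TokenCountBatchSampler

    def fake_instance(n_tokens):
        sentence = SimpleNamespace(tokens=list(range(n_tokens)))
        return SimpleNamespace(fields={"sentence": sentence})

    dataset = [fake_instance(n) for n in (3, 7, 12, 5, 9)]
    sampler = TokenCountBatchSampler(dataset, word_batch_size=15, shuffle_dataset=False)
    for batch_indices in sampler:  # each batch is a list of indices into dataset
        print(batch_indices)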
diff --git a/combo/data/samplers/samplers.py b/combo/data/samplers/samplers.py
deleted file mode 100644
index 3dcab19fc6621cb25c7d34321f0b4feeab20612c..0000000000000000000000000000000000000000
--- a/combo/data/samplers/samplers.py
+++ /dev/null
@@ -1,58 +0,0 @@
-"""
-Adapted from COMBO
-Author: Mateusz Klimaszewski
-"""
-
-from typing import List
-
-import numpy as np
-
-from combo.data.samplers import BatchSampler
-
-
-class TokenCountBatchSampler(BatchSampler):
-
-    def __init__(self, dataset, word_batch_size: int = 2500, shuffle_dataset: bool = True):
-        self._index = 0
-        self.shuffle_dataset = shuffle_dataset
-        self.batch_dataset = self._batchify(dataset, word_batch_size)
-        if shuffle_dataset:
-            self._shuffle()
-
-    @staticmethod
-    def _batchify(dataset, word_batch_size) -> List[List[int]]:
-        dataset = list(dataset)
-        batches = []
-        batch = []
-        words_count = 0
-        lengths = [len(instance.fields["sentence"].tokens) for instance in dataset]
-        argsorted_lengths = np.argsort(lengths)
-        for idx in argsorted_lengths:
-            words_count += lengths[idx]
-            batch.append(idx)
-            if words_count > word_batch_size:
-                batches.append(batch)
-                words_count = 0
-                batch = []
-        return batches
-
-    def __iter__(self):
-        return self
-
-    def __next__(self):
-        if self._index >= len(self.batch_dataset):
-            if self.shuffle_dataset:
-                self._index = 0
-                self._shuffle()
-            raise StopIteration()
-
-        batch = self.batch_dataset[self._index]
-        self._index += 1
-        return batch
-
-    def _shuffle(self):
-        indices = np.random.permutation(range(len(self.batch_dataset)))
-        self.batch_dataset = np.array(self.batch_dataset)[indices].tolist()
-
-    def __len__(self):
-        return len(self.batch_dataset)
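One behavioral detail worth noting as this code moves: _batchify above appends a batch only once the running word count exceeds word_batch_size, so a final batch that never reaches the threshold is silently dropped. A variant that keeps the tail, as a sketch of one possible fix (an assumption about intent, not a description of the new token_count_batch_sampler.py):

    from typing import List

    import numpy as np

    def batchify_keep_tail(dataset, word_batch_size: int) -> List[List[int]]:
        lengths = [len(instance.fields["sentence"].tokens) for instance in dataset]
        batches, batch, words_count = [], [], 0
        for idx in np.argsort(lengths):
            words_count += lengths[idx]
            batch.append(int(idx))
            if words_count > word_batch_size:
                batches.append(batch)
                batch, words_count = [], 0
        if batch:  # keep the final under-threshold batch instead of dropping it
            batches.append(batch)
        return batches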
diff --git a/combo/data/tokenizers/lambo_tokenizer.py b/combo/data/tokenizers/lambo_tokenizer.py
index fc3bbb4639052d7b8fb132456339b64da7738dc8..f37b07640e8cf7f22add0cc56ba2a9d049c3d201 100644
--- a/combo/data/tokenizers/lambo_tokenizer.py
+++ b/combo/data/tokenizers/lambo_tokenizer.py
@@ -6,7 +6,6 @@ from combo.config import Registry
 from combo.config.from_parameters import register_arguments
 from combo.data.tokenizers.token import Token
 from combo.data.tokenizers.tokenizer import Tokenizer
-from combo.data.api import Sentence
 
 
 @Registry.register('lambo_tokenizer')
diff --git a/combo/data/tokenizers/token.py b/combo/data/tokenizers/token.py
index 33182ddd3a30dafa7886fed5c337af43b1b116d3..16c3a8e57131de512201fc27981d363d7fa7e79b 100644
--- a/combo/data/tokenizers/token.py
+++ b/combo/data/tokenizers/token.py
@@ -2,7 +2,6 @@
 Adapted from AllenNLP
 https://github.com/allenai/allennlp/blob/main/allennlp/data/tokenizers/token_class.py
 """
-from collections import defaultdict
 from typing import Any, Dict, List, Optional, Tuple, Union
 import logging
 from dataclasses import dataclass, field
diff --git a/combo/main.py b/combo/main.py
index b471f67861a82f744f24ef1be0b172c581e08644..9e40120761bc44c9575bf339cf8611e9bbd0cce6 100755
--- a/combo/main.py
+++ b/combo/main.py
@@ -16,16 +16,16 @@ from combo.training.trainable_combo import TrainableCombo
 from combo.utils import checks, ComboLogger
 
 from combo.config import resolve
-from combo.default_model import default_ud_dataset_reader, default_data_loader
+from combo.default_model import default_ud_dataset_reader, default_data_loader, default_model
 from combo.modules.archival import load_archive, archive
 from combo.predict import COMBO
 from combo.data import api
-from config import override_parameters
-from data import LamboTokenizer, Vocabulary, DatasetReader
-from data.dataset_loaders import DataLoader
-from modules.model import Model
-from utils import ConfigurationError
-from utils.matrices import extract_combo_matrices
+from combo.config import override_parameters
+from combo.data import LamboTokenizer, Vocabulary, DatasetReader
+from combo.data.dataset_loaders import DataLoader
+from combo.modules.model import Model
+from combo.utils import ConfigurationError
+from combo.utils.matrices import extract_combo_matrices
 
 logging.setLoggerClass(ComboLogger)
 logger = logging.getLogger(__name__)
@@ -275,6 +275,17 @@ def run(_):
             if FLAGS.config_path:
                 logger.info(f'Reading parameters from configuration path {FLAGS.config_path}', prefix=prefix)
                 model, dataset_reader, training_data_loader, validation_data_loader, vocabulary = read_model_from_config(prefix)
+            else:
+                dataset_reader, training_data_loader, validation_data_loader, vocabulary = get_defaults(
+                    dataset_reader,
+                    training_data_loader,
+                    validation_data_loader,
+                    vocabulary,
+                    FLAGS.training_data_path,
+                    FLAGS.validation_data_path,
+                    prefix
+                )
+                model = default_model(FLAGS.pretrained_transformer_name, vocabulary)
 
             if FLAGS.use_pure_config and model is None:
                 logger.error('Error in configuration - model could not be read from parameters. ' +
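The new else branch gives train mode a working default pipeline when no --config_path is supplied. The call shape, pulled from the hunk above (only default_model's two positional arguments are confirmed by this diff; the transformer name is a placeholder, not a project default, and vocabulary would come from get_defaults as in the branch):

    from combo.default_model import default_model

    # Hedged sketch of the fallback: defaults for reader/loaders/vocabulary,
    # then a model built from a pretrained transformer name plus the vocabulary.
    model = default_model("bert-base-cased", vocabulary)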
diff --git a/combo/modules/graph_parser.py b/combo/modules/graph_parser.py
index 8796f1e6937ec289fc62d421941e14c001fb2be7..3f03211c78005d2d00b6e78483dc4fbd0ddd8e9b 100644
--- a/combo/modules/graph_parser.py
+++ b/combo/modules/graph_parser.py
@@ -11,12 +11,11 @@ from combo import data
 from combo.config import Registry
 from combo.config.from_parameters import register_arguments
 from combo.nn import base
-from combo.nn.base import Predictor
-
+from combo.predictors import Predictor
 import torch
 import torch.nn.functional as F
 
 
 @Registry.register("graph_head_predictor")
 class GraphHeadPredictionModel(Predictor):
     """Head prediction model."""
diff --git a/combo/nn/util.py b/combo/nn/util.py
deleted file mode 100644
index 69c8d017760606cfdbf4b09999d813e1932b71f7..0000000000000000000000000000000000000000
--- a/combo/nn/util.py
+++ /dev/null
@@ -1,259 +0,0 @@
-"""
-Adapted from AllenNLP
-https://github.com/allenai/allennlp/blob/80fb6061e568cb9d6ab5d45b661e86eb61b92c82/allennlp/nn/util.py
-"""
-from typing import Union, Dict, Optional, List, Any
-
-import torch
-
-from combo.common.util import int_to_device
-from combo.utils import ConfigurationError
-
-
-def move_to_device(obj, device: Union[torch.device, int]):
-    """
-    Given a structure (possibly) containing Tensors,
-    move all the Tensors to the specified device (or do nothing, if they are already on
-    the target device).
-    """
-    device = int_to_device(device)
-
-    if isinstance(obj, torch.Tensor):
-        # You may be wondering why we don't just always call `obj.to(device)` since that would
-        # be a no-op anyway if `obj` is already on `device`. Well that works fine except
-        # when PyTorch is not compiled with CUDA support, in which case even calling
-        # `obj.to(torch.device("cpu"))` would result in an error.
-        return obj if obj.device == device else obj.to(device=device)
-    elif isinstance(obj, dict):
-        for key, value in obj.items():
-            obj[key] = move_to_device(value, device)
-        return obj
-    elif isinstance(obj, list):
-        for i, item in enumerate(obj):
-            obj[i] = move_to_device(item, device)
-        return obj
-    elif isinstance(obj, tuple) and hasattr(obj, "_fields"):
-        # This is the best way to detect a NamedTuple, it turns out.
-        return obj.__class__(*(move_to_device(item, device) for item in obj))
-    elif isinstance(obj, tuple):
-        return tuple(move_to_device(item, device) for item in obj)
-    else:
-        return obj
-
-
-def device_mapping(cuda_device: int):
-    """
-    In order to `torch.load()` a GPU-trained model onto a CPU (or specific GPU),
-    you have to supply a `map_location` function. Call this with
-    the desired `cuda_device` to get the function that `torch.load()` needs.
-    """
-
-    def inner_device_mapping(storage: torch.Storage, location) -> torch.Storage:
-        if cuda_device >= 0:
-            return storage.cuda(cuda_device)
-        else:
-            return storage
-
-    return inner_device_mapping
-
-
-def get_lengths_from_binary_sequence_mask(mask: torch.BoolTensor) -> torch.LongTensor:
-    """
-    Compute sequence lengths for each batch element in a tensor using a
-    binary mask.
-    # Parameters
-    mask : `torch.BoolTensor`, required.
-        A 2D binary mask of shape (batch_size, sequence_length) to
-        calculate the per-batch sequence lengths from.
-    # Returns
-    `torch.LongTensor`
-        A torch.LongTensor of shape (batch_size,) representing the lengths
-        of the sequences in the batch.
-    """
-    return mask.sum(-1)
-
-
-def sort_batch_by_length(tensor: torch.Tensor, sequence_lengths: torch.Tensor):
-    """
-    Sort a batch first tensor by some specified lengths.
-    # Parameters
-    tensor : `torch.FloatTensor`, required.
-        A batch first Pytorch tensor.
-    sequence_lengths : `torch.LongTensor`, required.
-        A tensor representing the lengths of some dimension of the tensor which
-        we want to sort by.
-    # Returns
-    sorted_tensor : `torch.FloatTensor`
-        The original tensor sorted along the batch dimension with respect to sequence_lengths.
-    sorted_sequence_lengths : `torch.LongTensor`
-        The original sequence_lengths sorted by decreasing size.
-    restoration_indices : `torch.LongTensor`
-        Indices into the sorted_tensor such that
-        `sorted_tensor.index_select(0, restoration_indices) == original_tensor`
-    permutation_index : `torch.LongTensor`
-        The indices used to sort the tensor. This is useful if you want to sort many
-        tensors using the same ordering.
-    """
-
-    if not isinstance(tensor, torch.Tensor) or not isinstance(sequence_lengths, torch.Tensor):
-        raise ConfigurationError("Both the tensor and sequence lengths must be torch.Tensors.")
-
-    sorted_sequence_lengths, permutation_index = sequence_lengths.sort(0, descending=True)
-    sorted_tensor = tensor.index_select(0, permutation_index)
-
-    index_range = torch.arange(0, len(sequence_lengths), device=sequence_lengths.device)
-    # This is the equivalent of zipping with index, sorting by the original
-    # sequence lengths and returning the now sorted indices.
-    _, reverse_mapping = permutation_index.sort(0, descending=False)
-    restoration_indices = index_range.index_select(0, reverse_mapping)
-    return sorted_tensor, sorted_sequence_lengths, restoration_indices, permutation_index
-
-def get_text_field_mask(
-    text_field_tensors: Dict[str, Dict[str, torch.Tensor]],
-    num_wrapping_dims: int = 0,
-    padding_id: int = 0,
-) -> torch.BoolTensor:
-    """
-    Takes the dictionary of tensors produced by a `TextField` and returns a mask
-    with 0 where the tokens are padding, and 1 otherwise. `padding_id` specifies the id of padding tokens.
-    We also handle `TextFields` wrapped by an arbitrary number of `ListFields`, where the number of wrapping
-    `ListFields` is given by `num_wrapping_dims`.
-    If `num_wrapping_dims == 0`, the returned mask has shape `(batch_size, num_tokens)`.
-    If `num_wrapping_dims > 0` then the returned mask has `num_wrapping_dims` extra
-    dimensions, so the shape will be `(batch_size, ..., num_tokens)`.
-    There could be several entries in the tensor dictionary with different shapes (e.g., one for
-    word ids, one for character ids).  In order to get a token mask, we use the tensor in
-    the dictionary with the lowest number of dimensions.  After subtracting `num_wrapping_dims`,
-    if this tensor has two dimensions we assume it has shape `(batch_size, ..., num_tokens)`,
-    and use it for the mask.  If instead it has three dimensions, we assume it has shape
-    `(batch_size, ..., num_tokens, num_features)`, and sum over the last dimension to produce
-    the mask.  Most frequently this will be a character id tensor, but it could also be a
-    featurized representation of each token, etc.
-    If the input `text_field_tensors` contains the "mask" key, this is returned instead of inferring the mask.
-    """
-    masks = []
-    for indexer_name, indexer_tensors in text_field_tensors.items():
-        if "mask" in indexer_tensors:
-            masks.append(indexer_tensors["mask"].bool())
-    if len(masks) == 1:
-        return masks[0]
-    elif len(masks) > 1:
-        # TODO(mattg): My guess is this will basically never happen, so I'm not writing logic to
-        # handle it.  Should be straightforward to handle, though.  If you see this error in
-        # practice, open an issue on github.
-        raise ValueError("found two mask outputs; not sure which to use!")
-
-    tensor_dims = [
-        (tensor.dim(), tensor)
-        for indexer_output in text_field_tensors.values()
-        for tensor in indexer_output.values()
-    ]
-    tensor_dims.sort(key=lambda x: x[0])
-
-    smallest_dim = tensor_dims[0][0] - num_wrapping_dims
-    if smallest_dim == 2:
-        token_tensor = tensor_dims[0][1]
-        return token_tensor != padding_id
-    elif smallest_dim == 3:
-        character_tensor = tensor_dims[0][1]
-        return (character_tensor != padding_id).any(dim=-1)
-    else:
-        raise ValueError("Expected a tensor with dimension 2 or 3, found {}".format(smallest_dim))
-
-
-def get_dropout_mask(dropout_probability: float, tensor_for_masking: torch.Tensor):
-    """
-    Computes and returns an element-wise dropout mask for a given tensor, where
-    each element in the mask is dropped out with probability dropout_probability.
-    Note that the mask is NOT applied to the tensor - the tensor is passed to retain
-    the correct CUDA tensor type for the mask.
-    # Parameters
-    dropout_probability : `float`, required.
-        Probability of dropping a dimension of the input.
-    tensor_for_masking : `torch.Tensor`, required.
-    # Returns
-    `torch.FloatTensor`
-        A torch.FloatTensor consisting of the binary mask scaled by 1/ (1 - dropout_probability).
-        This scaling ensures expected values and variances of the output of applying this mask
-        and the original tensor are the same.
-    """
-    binary_mask = (torch.rand(tensor_for_masking.size()) > dropout_probability).to(
-        tensor_for_masking.device
-    )
-    # Scale mask by 1/keep_prob to preserve output statistics.
-    dropout_mask = binary_mask.float().div(1.0 - dropout_probability)
-    return dropout_mask
-
-
-def find_embedding_layer(model: torch.nn.Module) -> torch.nn.Module:
-    """
-    Takes a model (typically an AllenNLP `Model`, but this works for any `torch.nn.Module`) and
-    makes a best guess about which module is the embedding layer.  For typical AllenNLP models,
-    this often is the `TextFieldEmbedder`, but if you're using a pre-trained contextualizer, we
-    really want layer 0 of that contextualizer, not the output.  So there are a bunch of hacks in
-    here for specific pre-trained contextualizers.
-    """
-    # We'll look for a few special cases in a first pass, then fall back to just finding a
-    # TextFieldEmbedder in a second pass if we didn't find a special case.
-    from transformers.models.gpt2.modeling_gpt2 import GPT2Model
-    from transformers.models.bert.modeling_bert import BertEmbeddings
-    from transformers.models.albert.modeling_albert import AlbertEmbeddings
-    from transformers.models.roberta.modeling_roberta import RobertaEmbeddings
-
-    for module in model.modules():
-        if isinstance(module, BertEmbeddings):
-            return module.word_embeddings
-        if isinstance(module, RobertaEmbeddings):
-            return module.word_embeddings
-        if isinstance(module, AlbertEmbeddings):
-            return module.word_embeddings
-        if isinstance(module, GPT2Model):
-            return module.wte
-
-    return None
-
-    # for module in model.modules():
-    #     if isinstance(module, TextFieldEmbedder):
-    #
-    #         if isinstance(module, BasicTextFieldEmbedder):
-    #             # We'll have a check for single Embedding cases, because we can be more efficient
-    #             # in cases like this.  If this check fails, then for something like hotflip we need
-    #             # to actually run the text field embedder and construct a vector for each token.
-    #             if len(module._token_embedders) == 1:
-    #                 embedder = list(module._token_embedders.values())[0]
-    #                 if isinstance(embedder, Embedding):
-    #                     if embedder._projection is None:
-    #                         # If there's a projection inside the Embedding, then we need to return
-    #                         # the whole TextFieldEmbedder, because there's more computation that
-    #                         # needs to be done than just multiply by an embedding matrix.
-    #                         return embedder
-    #         return module
-    raise RuntimeError("No embedding module found!")
-
-
-
-def get_token_offsets_from_text_field_inputs(
-    text_field_inputs: List[Any],
-) -> Optional[torch.Tensor]:
-    """
-    Given a list of inputs to a TextFieldEmbedder, tries to find token offsets from those inputs, if
-    there are any.  You will have token offsets if you are using a mismatched token embedder; if
-    you're not, the return value from this function should be None.  This function is intended to be
-    called from a `forward_hook` attached to a `TextFieldEmbedder`, so the inputs are formatted just
-    as a list.
-    It's possible in theory that you could have multiple offsets as inputs to a single call to a
-    `TextFieldEmbedder`, but that's an extremely rare use case (I can't really imagine anyone
-    wanting to do that).  In that case, we'll only return the first one.  If you need different
-    behavior for your model, open an issue on github describing what you're doing.
-    """
-    for input_index, text_field_input in enumerate(text_field_inputs):
-        if not isinstance(text_field_input, dict):
-            continue
-        for input_value in text_field_input.values():
-            if not isinstance(input_value, dict):
-                continue
-            for embedder_arg_name, embedder_arg_value in input_value.items():
-                if embedder_arg_name == "offsets":
-                    return embedder_arg_value
-    return None
-
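With these AllenNLP-derived helpers deleted, call sites that only needed the mask arithmetic can fall back to plain torch; a self-contained sketch of the two one-liners the file contained (not a claim about combo's replacement API):

    import torch

    mask = torch.tensor([[True, True, False], [True, False, False]])
    print(mask.sum(-1))    # get_lengths_from_binary_sequence_mask -> tensor([2, 1])

    padded = torch.tensor([[5, 9, 0], [7, 0, 0]])
    print(padded != 0)     # the 2-D case of get_text_field_mask (padding_id=0)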
diff --git a/combo/utils/cached_transformers.py b/combo/utils/cached_transformers.py
deleted file mode 100644
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..0000000000000000000000000000000000000000
diff --git a/combo/utils/matrices.py b/combo/utils/matrices.py
index 395e23d2b4628313551aa061328cbeef87895086..61b357d7263f5e2da145f2668ee9b704c19df9f2 100644
--- a/combo/utils/matrices.py
+++ b/combo/utils/matrices.py
@@ -8,8 +8,8 @@ import numpy as np
 import pandas as pd
 from pathlib import Path
 
-from data import Sentence
-from utils import ComboLogger
+from combo.data import Sentence
+from combo.utils import ComboLogger
 
 
 def extract_combo_matrices(predictions: List[Sentence],
diff --git a/docs/Configuration.md b/docs/Configuration.md
index c7e5761b38ccc71166b1ca483dfed38039faf774..128c02558768663827b055e26358aaf5ac96a913 100644
--- a/docs/Configuration.md
+++ b/docs/Configuration.md
@@ -20,4 +20,4 @@ The ```"type"``` field serves as a dependency-injection like mechanism and
 sets which class and constructor is used.
 The ```"parameters"``` dictionary is passed to the constructor method. Every
 parameter is attempted to be resolved from the class registry and only if that's
-not possible they are passed as-is.
\ No newline at end of file
+not possible they are passed as-is.
diff --git a/docs/training.md b/docs/Training.md
similarity index 100%
rename from docs/training.md
rename to docs/Training.md