diff --git a/combo/config.template.jsonnet b/combo/config.template.jsonnet
index 53013efd44f7acc3864566d79632799bd6dc3ecf..70af62af09ad9752be76262e961ff1c01abfb384 100644
--- a/combo/config.template.jsonnet
+++ b/combo/config.template.jsonnet
@@ -13,7 +13,7 @@ local pretrained_tokens = if std.length(std.extVar("pretrained_tokens")) > 0 the
 # Name of pretrained transformer model, str or null
 local pretrained_transformer_name = if std.length(std.extVar("pretrained_transformer_name")) > 0 then std.extVar("pretrained_transformer_name");
 # Learning rate value, float
-local learning_rate = 0.002;
+local learning_rate = 0.0000001;
 # Number of epochs, int
 local num_epochs = std.parseInt(std.extVar("num_epochs"));
 # Cuda device id, -1 for cpu, int
@@ -49,7 +49,7 @@ local lemma_char_dim = 64;
 # Character embedding dim, int
 local char_dim = 64;
 # Word embedding projection dim, int
-local projected_embedding_dim = 100;
+local projected_embedding_dim = 128;
 # Loss weights, dict[str, int]
 local loss_weights = {
     xpostag: 0.05,
@@ -78,9 +78,9 @@ local in_targets(name) = !(std.length(std.find(name, targets)) == 0);
 local use_transformer = pretrained_transformer_name != null;
 
 # Transformer encoder options
-local use_transformer_encoder = if std.length(std.extVar("use_transformer_encoder")) == "True" then true else false;
-local num_layers_transformer_encoder = 6;
-local num_attention_heads = 8;
+
+local num_layers_transformer_encoder = 2;
+local num_attention_heads = 16;
 
 # Verify some configuration requirements
 assert in_features("token"): "Key 'token' must be in features!";
@@ -257,7 +257,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             },
         },
         loss_weights: loss_weights,
-        seq_encoder: if use_transformer_encoder then {
+        seq_encoder: {
             type: "pytorch_transformer",
             input_dim: (char_dim + projected_embedding_dim +
                 (if in_features('xpostag') then xpostag_dim else 0) +
@@ -265,24 +265,8 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
                 (if in_features('upostag') then upostag_dim else 0) +
                 (if in_features('feats') then feats_dim else 0)),
             num_layers: num_layers_transformer_encoder,
-            feedforward_hidden_dim: hidden_size,
-            num_attention_heads: num_attention_heads,
-            positional_encoding: "sinusoidal"} else {
-            type: "combo_encoder",
-            layer_dropout_probability: 0.33,
-            stacked_bilstm: {
-                input_size:
-                (char_dim + projected_embedding_dim +
-                (if in_features('xpostag') then xpostag_dim else 0) +
-                (if in_features('lemma') then lemma_char_dim else 0) +
-                (if in_features('upostag') then upostag_dim else 0) +
-                (if in_features('feats') then feats_dim else 0)),
-                hidden_size: hidden_size,
-                num_layers: num_layers,
-                recurrent_dropout_probability: 0.33,
-                layer_dropout_probability: 0.33
-            }
-        },
+            feedforward_hidden_dim: hidden_size ,
+            num_attention_heads: num_attention_heads},
         dependency_relation: {
             type: "combo_dependency_parsing_from_vocab",
             vocab_namespace: 'deprel_labels',
@@ -290,25 +274,25 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
                 local projection_dim = 512,
                 cycle_loss_n: cycle_loss_n,
                 head_projection_layer: {
-                    in_features: hidden_size * 2,
+                    in_features: char_dim + projected_embedding_dim, #hidden_size * 2,
                     out_features: projection_dim,
                     activation: "tanh",
                 },
                 dependency_projection_layer: {
-                    in_features: hidden_size * 2,
+                    in_features: char_dim + projected_embedding_dim, #hidden_size * 2,
                     out_features: projection_dim,
                     activation: "tanh",
                 },
             },
             local projection_dim = 128,
             head_projection_layer: {
-                in_features: hidden_size * 2,
+                in_features: char_dim + projected_embedding_dim,#hidden_size * 2,
                 out_features: projection_dim,
                 dropout_rate: predictors_dropout,
                 activation: "tanh"
             },
             dependency_projection_layer: {
-                in_features: hidden_size * 2,
+                in_features: char_dim + projected_embedding_dim, #hidden_size * 2,
                 out_features: projection_dim,
                 dropout_rate: predictors_dropout,
                 activation: "tanh"
@@ -317,7 +301,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
         morphological_feat: if in_targets("feats") then {
             type: "combo_morpho_from_vocab",
             vocab_namespace: "feats_labels",
-            input_dim: hidden_size * 2,
+            input_dim: char_dim + projected_embedding_dim, #hidden_size * 2,
             hidden_dims: [128],
             activations: ["tanh", "linear"],
             dropout: [predictors_dropout, 0.0],
@@ -329,7 +313,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             lemma_vocab_namespace: "lemma_characters",
             embedding_dim: 256,
             input_projection_layer: {
-                in_features: hidden_size * 2,
+                in_features: char_dim + projected_embedding_dim, #hidden_size * 2,
                 out_features: 32,
                 dropout_rate: predictors_dropout,
                 activation: "tanh"
@@ -342,7 +326,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             activations: ["relu", "relu", "relu", "linear"],
         },
         upos_tagger: if in_targets("upostag") then {
-            input_dim: hidden_size * 2,
+            input_dim: char_dim + projected_embedding_dim, #hidden_size * 2,
             hidden_dims: [64],
             activations: ["tanh", "linear"],
             dropout: [predictors_dropout, 0.0],
@@ -350,7 +334,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             vocab_namespace: "upostag_labels"
         },
         xpos_tagger: if in_targets("xpostag") then {
-            input_dim: hidden_size * 2,
+            input_dim: char_dim + projected_embedding_dim, #hidden_size * 2,
             hidden_dims: [128],
             activations: ["tanh", "linear"],
             dropout: [predictors_dropout, 0.0],
@@ -358,7 +342,7 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             vocab_namespace: "xpostag_labels"
         },
         semantic_relation: if in_targets("semrel") then {
-            input_dim: hidden_size * 2,
+            input_dim: char_dim + projected_embedding_dim, #hidden_size * 2,
             hidden_dims: [64],
             activations: ["tanh", "linear"],
             dropout: [predictors_dropout, 0.0],
@@ -386,14 +370,14 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
             type: "adam",
             lr: learning_rate,
             betas: [0.9, 0.9],
+            parameter_groups: [
+              [["seq_encoder"],{"lr": learning_rate}]
+            ],
         },
         patience: 1, # it will be overwriten by callback
         epoch_callbacks: [
             { type: "transfer_patience" },
         ],
-        learning_rate_scheduler: {
-            type: "combo_scheduler",
-        },
         tensorboard_writer: if use_tensorboard then {
             should_log_learning_rate: false,
             should_log_parameter_statistics: false,
@@ -404,4 +388,4 @@ assert pretrained_tokens == null || pretrained_transformer_name == null: "Can't
     random_seed: 8787,
     pytorch_seed: 8787,
     numpy_seed: 8787,
-}
+}
\ No newline at end of file