# experiments.conf
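#
# HOCON-format experiment configurations for an Entity Equalization
# coreference model. Later blocks reuse earlier ones via ${...}
# substitution, and an experiment is typically selected by its top-level
# name (best, baseline, ...) when the training script is launched.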
# Word embeddings.
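# glove.840B.300d.txt is the standard 300d Common Crawl GloVe release;
# the .filtered variant is presumably the same vectors restricted to the
# task vocabulary, and glove_50_300_2.txt (used for head-word embeddings
# below) appears from its name to be a 300d model trained with a 2-word
# context window.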
glove_300d {
  path = glove.840B.300d.txt
  size = 300
}
glove_300d_filtered {
  path = glove.840B.300d.txt.filtered
  size = 300
}
glove_300d_2w {
  path = glove_50_300_2.txt
  size = 300
}
# Distributed training configurations.
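# The ps/worker address lists follow a TensorFlow ClusterSpec-style
# ps/worker layout (an assumption based on the surrounding
# e2e-coref-style keys). Note that despite its name, this block
# configures four workers and four GPUs.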
two_local_gpus {
  addresses {
    ps = [localhost:2222]
    worker = [localhost:2223, localhost:2224, localhost:2225, localhost:2226]
  }
  gpus = [0, 1, 2, 3]
}
# Main configuration.
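# `best` holds the complete hyperparameter set; the experiment blocks at
# the bottom of the file inherit it via ${best} and override a few keys.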
best {
  # Computation limits.
  max_top_antecedents = 50
  max_training_sentences = 50
  top_span_ratio = 0.4

  # Model hyperparameters.
  filter_widths = [3, 4, 5]
  filter_size = 50
  char_embedding_size = 8
  char_vocab_path = "char_vocab.english.txt"
  context_embeddings = ${glove_300d_filtered}
  head_embeddings = ${glove_300d_2w}
  contextualization_size = 200
  contextualization_layers = 3
  ffnn_size = 150
  ffnn_depth = 2
  feature_size = 20
  max_span_width = 30
  use_metadata = true
  use_features = true
  model_heads = true
  coref_depth = 2
  lm_layers = 4
  lm_size = 1024
  coarse_to_fine = true
  refinement_sharing = false

  # Learning hyperparameters.
  max_gradient_norm = 5.0
  lstm_dropout_rate = 0.4
  lexical_dropout_rate = 0.5
  dropout_rate = 0.2
  optimizer = adam
  learning_rate = 0.001
  decay_rate = 1.0
  decay_frequency = 100
  ema_decay = 0.9999

  # Other.
  train_path = train.english.jsonlines
  eval_path = dev.english.jsonlines
  conll_eval_path = dev.english.v4_gold_conll
  lm_path = bert_features.hdf5
  genres = ["bc", "bn", "mz", "nw", "pt", "tc", "wb"]
  eval_frequency = 5000
  # eval_frequency = 1
  report_frequency = 100
  log_root = logs
  cluster = ${two_local_gpus}
  multi_gpu = false
  gold_loss = false
  b3_loss = false
  mention_loss = false
  antecedent_loss = true

  # Entity Equalization.
  entity_equalization = true
  antecedent_averaging = false
  use_cluster_size = true
  entity_average = false
}
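
# Experiment variants. `entity_equalization` reuses `best` unchanged
# (equalization is already enabled there). `baseline` switches to
# antecedent averaging, enables refinement sharing, applies learning-rate
# decay, and sets ema_decay = 1.0 (presumably to neutralize the moving
# average). `antecedent_averaging` is an alias for `baseline`.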
entity_equalization = ${best}
baseline = ${best} {
  decay_rate = 0.999
  entity_equalization = false
  antecedent_averaging = true
  ema_decay = 1.0
  refinement_sharing = true
}
antecedent_averaging = ${baseline}
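
# A minimal sketch of loading one of these experiments, assuming an
# e2e-coref-style setup where this file is parsed with pyhocon (the
# surrounding training script is hypothetical):
#
#   from pyhocon import ConfigFactory
#
#   # Each top-level block of experiments.conf is one experiment.
#   config = ConfigFactory.parse_file("experiments.conf")
#
#   # ${best} substitution is resolved at parse time, so `baseline`
#   # contains every key from `best` plus its own overrides.
#   baseline = config["baseline"]
#   assert baseline["learning_rate"] == 0.001  # inherited from best
#   assert baseline["decay_rate"] == 0.999     # overridden in baseline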