Support lazy, recursive sentence splitting #7

Closed · wants to merge 19 commits
173 changes: 173 additions & 0 deletions projects/biaffine_parser/configs/base-config-transition.cfg
@@ -0,0 +1,173 @@
[paths]
train = null
dev = null

[system]
gpu_allocator = "pytorch"
seed = 0

[nlp]
lang = null
pipeline = ["transformer","tagger","morphologizer","senter","parser"]
disabled = ["senter"]
before_creation = null
after_creation = null
after_pipeline_creation = null
batch_size = 64
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}

[components]

[components.parser]
factory = "parser"
learn_tokens = false
min_action_freq = 30
moves = null
scorer = {"@scorers":"spacy.parser_scorer.v1"}
update_with_oracle_cut_size = 100

[components.parser.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "parser"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.parser.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
upstream = "transformer"
pooling = {"@layers":"reduce_mean.v1"}

[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
name = null
tokenizer_config = {"use_fast": true}
mixed_precision = true

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.morphologizer]
factory = "morphologizer"

[components.morphologizer.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.morphologizer.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
upstream = "transformer"
pooling = {"@layers":"reduce_mean.v1"}

[components.senter]
factory = "senter"
save_activations = true

[components.senter.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.senter.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
upstream = "transformer"
pooling = {"@layers":"reduce_mean.v1"}

[components.tagger]
factory = "tagger"

[components.tagger.model]
@architectures = "spacy.Tagger.v1"
nO = null

[components.tagger.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
upstream = "transformer"
pooling = {"@layers":"reduce_mean.v1"}

[corpora]

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 500
gold_preproc = false
limit = 0
augmenter = null

[corpora.dev]
@readers = "spacy.Corpus.v1"
max_length = 0
path = ${paths.dev}
gold_preproc = false
limit = 0
augmenter = null

[training]
train_corpus = "corpora.train"
dev_corpus = "corpora.dev"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
accumulate_gradient = 3
patience = 5000
max_epochs = 0
max_steps = 20000
eval_frequency = 1000
frozen_components = []
before_to_disk = null
annotating_components = []

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
get_length = null
size = 2000
buffer = 256

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = true

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = true
eps = 0.00000001

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005

[training.score_weights]
tag_acc = 0.2
pos_acc = 0.2
morph_acc = 0.2
morph_per_feat = null
dep_uas = 0.0
dep_las = 0.2
dep_las_per_type = null
sents_p = null
sents_r = null
sents_f = 0.2

[initialize]
vectors = null
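
For orientation: a config like this is normally launched through spaCy's training entry point, with the null values filled in as overrides. A minimal sketch follows; the output directory, language, corpus paths, and transformer name are illustrative assumptions, not part of this PR.

from spacy.cli.train import train

# All concrete names below are placeholders for illustration.
train(
    "projects/biaffine_parser/configs/base-config-transition.cfg",
    "training/en_ewt",
    use_gpu=0,
    overrides={
        "nlp.lang": "en",
        "paths.train": "corpus/en_ewt/train.spacy",
        "paths.dev": "corpus/en_ewt/dev.spacy",
        "components.transformer.model.name": "xlm-roberta-base",
    },
)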
6 changes: 3 additions & 3 deletions projects/biaffine_parser/configs/base-config.cfg
@@ -34,6 +34,7 @@ pooling = {"@layers":"reduce_mean.v1"}

[components.arc_predicter]
factory = "experimental_arc_predicter"
+senter = "senter"

[components.arc_predicter.model]
@architectures = "spacy-experimental.PairwiseBilinear.v1"
@@ -78,6 +79,7 @@ pooling = {"@layers":"reduce_mean.v1"}

[components.senter]
factory = "senter"
+save_activations = true

[components.senter.model]
@architectures = "spacy.Tagger.v1"
@@ -133,7 +135,7 @@ max_steps = 20000
eval_frequency = 1000
frozen_components = []
before_to_disk = null
-annotating_components = ["senter"]
+annotating_components = ["transformer", "senter"]

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
@@ -170,8 +172,6 @@ morph_per_feat = null
dep_uas = 0.0
dep_las = 0.2
dep_las_per_type = null
-bound_dep_uas = 0.0
-bound_dep_las = 0.0
sents_p = null
sents_r = null
sents_f = 0.2
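
The annotating_components change is the functional core of this config edit: components listed there also run set_annotations over the training batches, so the arc_predicter trains against the senter's predicted sentence boundaries (and reuses the transformer's annotations) rather than gold ones. At inference the coupling looks roughly like this sketch, where the model path is a hypothetical placeholder:

import spacy

nlp = spacy.load("training/en_ewt/model-best")  # hypothetical path
doc = nlp("The senter predicts boundaries. The parser splits on them.")
# The biaffine parser parses each senter-predicted sentence separately.
for sent in doc.sents:
    print(sent.text, [(t.text, t.head.text) for t in sent])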
17 changes: 13 additions & 4 deletions projects/biaffine_parser/project.yml
@@ -30,11 +30,11 @@ commands:
help: "Convert data to spaCy format"
script:
- "mkdir -p corpus/${vars.treebank}"
- "python -m spacy convert assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/ --n-sents 10"
- "python -m spacy convert assets/${vars.treebank}/${vars.train_name}.conllu corpus/${vars.treebank}/ --n-sents 10 --merge-subtokens"
- "mv corpus/${vars.treebank}/${vars.train_name}.spacy corpus/${vars.treebank}/train.spacy"
- "python -m spacy convert assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/ --n-sents 10"
- "python -m spacy convert assets/${vars.treebank}/${vars.dev_name}.conllu corpus/${vars.treebank}/ --n-sents 10 --merge-subtokens"
- "mv corpus/${vars.treebank}/${vars.dev_name}.spacy corpus/${vars.treebank}/dev.spacy"
- "python -m spacy convert assets/${vars.treebank}/${vars.test_name}.conllu corpus/${vars.treebank}/ --n-sents 10"
- "python -m spacy convert assets/${vars.treebank}/${vars.test_name}.conllu corpus/${vars.treebank}/ --n-sents 10 --merge-subtokens"
- "mv corpus/${vars.treebank}/${vars.test_name}.spacy corpus/${vars.treebank}/test.spacy"
deps:
- "assets/${vars.treebank}/"
@@ -63,6 +63,15 @@ commands:
- name: "evaluate"
help: "Evaluate the parser model on the test corpus."
script:
- "python -m spacy evaluate training/${vars.treebank}/model-best corpus/${vars.treebank}/test.spacy"
- "python -m spacy evaluate --gpu-id ${vars.gpu} training/${vars.treebank}/model-best corpus/${vars.treebank}/test.spacy"
deps:
- "training/${vars.treebank}/model-best"
- "corpus/${vars.treebank}/test.spacy"

- name: "evaluate-dev"
help: "Evaluate the parser model on the test corpus."
script:
- "python -m spacy evaluate --gpu-id ${vars.gpu} training/${vars.treebank}/model-best corpus/${vars.treebank}/dev.spacy"
deps:
- "training/${vars.treebank}/model-best"
- "corpus/${vars.treebank}/dev.spacy"
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -3,6 +3,6 @@ requires = [
"setuptools",
"wheel",
"Cython<3.0",
"spacy>=3.3.0,<3.6.0",
"spacy>=4.0.0.dev0,<4.1.0",
"numpy",
]
2 changes: 1 addition & 1 deletion requirements.txt
@@ -1,4 +1,4 @@
-spacy>=3.3.0,<3.6.0
+spacy>=4.0.0.dev0,<4.1.0

# Development dependencies
cython>=0.25,<3.0
Expand Down
3 changes: 1 addition & 2 deletions setup.cfg
@@ -13,7 +13,7 @@ long_description_content_type = text/markdown
zip_safe = false
python_requires = >=3.6
install_requires =
-    spacy>=3.3.0,<3.6.0
+    spacy>=4.0.0.dev0,<4.1.0

[options.entry_points]
spacy_architectures =
@@ -35,7 +35,6 @@ spacy_scorers =
    spacy-experimental.tokenizer_scorer.v1 = spacy_experimental.char_tokenizer.scorers:make_tokenizer_scorer
    spacy-experimental.tokenizer_senter_scorer.v1 = spacy_experimental.char_tokenizer.scorers:make_tokenizer_senter_scorer
    spacy-experimental.span_finder_scorer.v1 = spacy_experimental.span_finder.span_finder_component:make_span_finder_scorer
-    spacy-experimental.biaffine_parser_scorer.v1 = spacy_experimental.biaffine_parser.eval:make_parser_scorer
    spacy-experimental.coref_scorer.v1 = spacy_experimental.coref.coref_component:make_coref_scorer
    spacy-experimental.span_resolver_scorer.v1 = spacy_experimental.coref.span_resolver_component:make_span_resolver_scorer
spacy_tokenizers =
20 changes: 8 additions & 12 deletions spacy_experimental/biaffine_parser/arc_labeler.pyx
@@ -6,6 +6,7 @@ from typing import Callable, Dict, Iterable, Optional
import spacy
from spacy import Language, Vocab
from spacy.errors import Errors
+from spacy.pipeline.dep_parser import parser_score
from spacy.pipeline.trainable_pipe cimport TrainablePipe
from spacy.tokens.token cimport Token
from spacy.tokens.doc cimport Doc
@@ -16,7 +17,6 @@ from thinc.api import Config, Model, Ops, Optimizer
from thinc.api import to_numpy
from thinc.types import Floats2d, Ints1d, Tuple

-from .eval import parser_score

default_model_config = """
[model]
@@ -40,7 +40,7 @@ DEFAULT_ARC_LABELER_MODEL = Config().from_str(default_model_config)["model"]
    assigns=["token.dep"],
    default_config={
        "model": DEFAULT_ARC_LABELER_MODEL,
-        "scorer": {"@scorers": "spacy-experimental.biaffine_parser_scorer.v1"}
+        "scorer": {"@scorers": "spacy.parser_scorer.v1"}
    },
)
def make_arc_labeler(
@@ -60,7 +60,7 @@ class ArcLabeler(TrainablePipe):
        name: str = "arc_labeler",
        *,
        overwrite=False,
-        scorer=parser_score
+        scorer=parser_score,
    ):
        self.name = name
        self.model = model
@@ -85,10 +85,7 @@ class ArcLabeler(TrainablePipe):
        offset = 0
        for eg in examples:
            aligned_heads, aligned_labels = eg.get_aligned_parse(projectivize=False)
-            for token in eg.predicted:
-                gold_head = aligned_heads[token.i]
-                gold_label = aligned_labels[token.i]
-
+            for gold_head, gold_label in zip(aligned_heads, aligned_labels):
                # Do not learn from misaligned tokens, since we could not use
                # their correct head representations.
                if gold_head is not None and gold_label is not None:
@@ -170,11 +167,10 @@ class ArcLabeler(TrainablePipe):

        offset = 0
        for doc in docs:
-            for sent in doc.sents:
-                for token in sent:
-                    label = self.cfg["labels"][predictions[offset]]
-                    doc.c[token.i].dep = self.vocab.strings[label]
-                    offset += 1
+            for token in doc:
+                label = self.cfg["labels"][predictions[offset]]
+                doc.c[token.i].dep = self.vocab.strings[label]
+                offset += 1

            for i in range(doc.length):
                if doc.c[i].head == 0:
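
The rewritten loss loop above zips the aligned heads and labels directly instead of indexing them per predicted token. A toy, self-contained sketch of that alignment pattern, with invented data for illustration:

# Misaligned tokens come back from the alignment as None and are skipped.
aligned_heads = [1, None, 0]
aligned_labels = ["nsubj", None, "ROOT"]
for gold_head, gold_label in zip(aligned_heads, aligned_labels):
    if gold_head is not None and gold_label is not None:
        print(gold_head, gold_label)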