Skip to content

Commit

Permalink
Merge remote-tracking branch 'upstream/v4' into store-activations
Browse files Browse the repository at this point in the history
  • Loading branch information
danieldk committed Jun 27, 2022
2 parents 3b13f17 + 3f76bc1 commit 944b0e3
Show file tree
Hide file tree
Showing 9 changed files with 126 additions and 74 deletions.
7 changes: 4 additions & 3 deletions spacy/matcher/matcher.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -256,6 +256,10 @@ cdef class Matcher:
# non-overlapping ones this `match` can be either (start, end) or
# (start, end, alignments) depending on `with_alignments=` option.
for key, *match in matches:
# Adjust span matches to doc offsets
if isinstance(doclike, Span):
match[0] += doclike.start
match[1] += doclike.start
span_filter = self._filter.get(key)
if span_filter is not None:
pairs = pairs_by_id.get(key, [])
Expand Down Expand Up @@ -286,9 +290,6 @@ cdef class Matcher:
if as_spans:
final_results = []
for key, start, end, *_ in final_matches:
if isinstance(doclike, Span):
start += doclike.start
end += doclike.start
final_results.append(Span(doc, start, end, label=key))
elif with_alignments:
# convert alignments List[Dict[str, int]] --> List[int]
Expand Down
28 changes: 15 additions & 13 deletions spacy/pipeline/_parser_internals/ner.pyx
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
import os
import random
from libc.stdint cimport int32_t
from libcpp.memory cimport shared_ptr
from libcpp.vector cimport vector
from cymem.cymem cimport Pool

from collections import Counter
Expand Down Expand Up @@ -43,9 +45,7 @@ MOVE_NAMES[OUT] = 'O'

cdef struct GoldNERStateC:
Transition* ner
SpanC* negs
int32_t length
int32_t nr_neg
vector[shared_ptr[SpanC]] negs


cdef class BiluoGold:
Expand Down Expand Up @@ -78,8 +78,6 @@ cdef GoldNERStateC create_gold_state(
negs = []
assert example.x.length > 0
gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
gs.nr_neg = len(negs)
ner_ents, ner_tags = example.get_aligned_ents_and_ner()
for i, ner_tag in enumerate(ner_tags):
gs.ner[i] = moves.lookup_transition(ner_tag)
Expand All @@ -93,8 +91,8 @@ cdef GoldNERStateC create_gold_state(
# In order to handle negative samples, we need to maintain the full
# (start, end, label) triple. If we break it down to the "isn't B-LOC"
# thing, we'll get blocked if there's an incorrect prefix.
for i, neg in enumerate(negs):
gs.negs[i] = neg.c
for neg in negs:
gs.negs.push_back(neg.c)
return gs


Expand Down Expand Up @@ -411,6 +409,8 @@ cdef class Begin:
cdef int g_act = gold.ner[b0].move
cdef attr_t g_tag = gold.ner[b0].label

cdef shared_ptr[SpanC] span

if g_act == MISSING:
pass
elif g_act == BEGIN:
Expand All @@ -428,8 +428,8 @@ cdef class Begin:
# be correct or not. However, we can at least tell whether we're
# going to be opening an entity where there's only one possible
# L.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0:
for span in gold.negs:
if span.get().label == label and span.get().start == b0:
cost += 1
break
return cost
Expand Down Expand Up @@ -574,8 +574,9 @@ cdef class Last:
# If we have negative-example entities, integrate them into the objective,
# by marking actions that close an entity that we know is incorrect
# as costly.
for span in gold.negs[:gold.nr_neg]:
if span.label == label and (span.end-1) == b0 and span.start == ent_start:
cdef shared_ptr[SpanC] span
for span in gold.negs:
if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
cost += 1
break
return cost
Expand Down Expand Up @@ -639,8 +640,9 @@ cdef class Unit:
# This is fairly straight-forward for U- entities, as we have a single
# action
cdef int b0 = s.B(0)
for span in gold.negs[:gold.nr_neg]:
if span.label == label and span.start == b0 and span.end == (b0+1):
cdef shared_ptr[SpanC] span
for span in gold.negs:
if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
cost += 1
break
return cost
Expand Down
14 changes: 13 additions & 1 deletion spacy/tests/doc/test_span.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

from spacy.attrs import ORTH, LENGTH
from spacy.lang.en import English
from spacy.tokens import Doc, Span, Token
from spacy.tokens import Doc, Span, SpanGroup, Token
from spacy.vocab import Vocab
from spacy.util import filter_spans
from thinc.api import get_current_ops
Expand Down Expand Up @@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
assert span.text == text


@pytest.mark.issue(9556)
def test_modify_span_group(doc):
    """Edits made to spans obtained by iterating a SpanGroup persist in it.

    Regression test for issue 9556: every span yielded while iterating the
    group has its start and label overwritten, and the first element is then
    re-read from the group to confirm the new values are visible there.
    """
    test_label = doc.vocab.strings["TEST"]
    spans = SpanGroup(doc, spans=doc.ents)
    for member in spans:
        member.start = 0
        member.label = test_label

    # Span changes must be reflected in the span group
    first = spans[0]
    assert first.start == 0
    assert first.label == test_label


def test_spans_sent_spans(doc):
sents = list(doc.sents)
assert sents[0].start == 0
Expand Down
6 changes: 4 additions & 2 deletions spacy/tests/doc/test_span_group.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,8 +99,10 @@ def test_span_group_set_item(doc, other_doc):
span.label_ = "NEW LABEL"
span.kb_id = doc.vocab.strings["KB_ID"]

assert span_group[index].label != span.label
assert span_group[index].kb_id != span.kb_id
# Indexing a span group returns a span whose C data is shared with the
# group, so the attribute changes made above are visible here.
assert span_group[index].label == span.label
assert span_group[index].kb_id == span.kb_id

span_group[index] = span
assert span_group[index].start == span.start
Expand Down
13 changes: 10 additions & 3 deletions spacy/tests/matcher/test_matcher_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -602,9 +602,16 @@ def test_matcher_span(matcher):
doc = Doc(matcher.vocab, words=text.split())
span_js = doc[:3]
span_java = doc[4:]
assert len(matcher(doc)) == 2
assert len(matcher(span_js)) == 1
assert len(matcher(span_java)) == 1
doc_matches = matcher(doc)
span_js_matches = matcher(span_js)
span_java_matches = matcher(span_java)
assert len(doc_matches) == 2
assert len(span_js_matches) == 1
assert len(span_java_matches) == 1

# match offsets always refer to the doc
assert doc_matches[0] == span_js_matches[0]
assert doc_matches[1] == span_java_matches[0]


def test_matcher_as_spans(matcher):
Expand Down
11 changes: 7 additions & 4 deletions spacy/tokens/span.pxd
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from libcpp.memory cimport shared_ptr
cimport numpy as np

from .doc cimport Doc
Expand All @@ -7,19 +8,21 @@ from ..structs cimport SpanC

cdef class Span:
cdef readonly Doc doc
cdef SpanC c
cdef shared_ptr[SpanC] c
cdef public _vector
cdef public _vector_norm

@staticmethod
cdef inline Span cinit(Doc doc, SpanC span):
cdef inline Span cinit(Doc doc, const shared_ptr[SpanC] &span):
cdef Span self = Span.__new__(
Span,
doc,
start=span.start,
end=span.end
start=span.get().start,
end=span.get().end
)
self.c = span
return self

cpdef np.ndarray to_array(self, object features)

cdef SpanC* span_c(self)
Loading

0 comments on commit 944b0e3

Please sign in to comment.