Merge remote-tracking branch 'upstream/v4' into store-activations

explosion · Jun 27, 2022 · 944b0e3 · 944b0e3
2 parents 3b13f17 + 3f76bc1
commit 944b0e3
Show file tree

Hide file tree

Showing 9 changed files with 126 additions and 74 deletions.
diff --git a/spacy/matcher/matcher.pyx b/spacy/matcher/matcher.pyx
@@ -256,6 +256,10 @@ cdef class Matcher:
         # non-overlapping ones. This `match` can be either (start, end) or
         # (start, end, alignments), depending on the `with_alignments=` option.
         for key, *match in matches:
+            # Adjust span matches to doc offsets
+            if isinstance(doclike, Span):
+                match[0] += doclike.start
+                match[1] += doclike.start
             span_filter = self._filter.get(key)
             if span_filter is not None:
                 pairs = pairs_by_id.get(key, [])
@@ -286,9 +290,6 @@ cdef class Matcher:
         if as_spans:
             final_results = []
             for key, start, end, *_ in final_matches:
-                if isinstance(doclike, Span):
-                    start += doclike.start
-                    end += doclike.start
                 final_results.append(Span(doc, start, end, label=key))
         elif with_alignments:
             # convert alignments List[Dict[str, int]] --> List[int]

diff --git a/spacy/pipeline/_parser_internals/ner.pyx b/spacy/pipeline/_parser_internals/ner.pyx
@@ -1,6 +1,8 @@
 import os
 import random
 from libc.stdint cimport int32_t
+from libcpp.memory cimport shared_ptr
+from libcpp.vector cimport vector
 from cymem.cymem cimport Pool
 
 from collections import Counter
@@ -43,9 +45,7 @@ MOVE_NAMES[OUT] = 'O'
 
 cdef struct GoldNERStateC:
     Transition* ner
-    SpanC* negs
-    int32_t length
-    int32_t nr_neg
+    vector[shared_ptr[SpanC]] negs
 
 
 cdef class BiluoGold:
@@ -78,8 +78,6 @@ cdef GoldNERStateC create_gold_state(
         negs = []
     assert example.x.length > 0
     gs.ner = <Transition*>mem.alloc(example.x.length, sizeof(Transition))
-    gs.negs = <SpanC*>mem.alloc(len(negs), sizeof(SpanC))
-    gs.nr_neg = len(negs)
     ner_ents, ner_tags = example.get_aligned_ents_and_ner()
     for i, ner_tag in enumerate(ner_tags):
         gs.ner[i] = moves.lookup_transition(ner_tag)
@@ -93,8 +91,8 @@ cdef GoldNERStateC create_gold_state(
     # In order to handle negative samples, we need to maintain the full
     # (start, end, label) triple. If we break it down to the "isn't B-LOC"
     # thing, we'll get blocked if there's an incorrect prefix.
-    for i, neg in enumerate(negs):
-        gs.negs[i] = neg.c
+    for neg in negs:
+        gs.negs.push_back(neg.c)
     return gs
 
 
@@ -411,6 +409,8 @@ cdef class Begin:
         cdef int g_act = gold.ner[b0].move
         cdef attr_t g_tag = gold.ner[b0].label
 
+        cdef shared_ptr[SpanC] span
+
         if g_act == MISSING:
             pass
         elif g_act == BEGIN:
@@ -428,8 +428,8 @@ cdef class Begin:
             # be correct or not. However, we can at least tell whether we're
             # going to be opening an entity where there's only one possible
             # L.
-            for span in gold.negs[:gold.nr_neg]:
-                if span.label == label and span.start == b0:
+            for span in gold.negs:
+                if span.get().label == label and span.get().start == b0:
                     cost += 1
                     break
         return cost
@@ -574,8 +574,9 @@ cdef class Last:
         # If we have negative-example entities, integrate them into the objective,
         # by marking actions that close an entity that we know is incorrect
         # as costly.
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and (span.end-1) == b0 and span.start == ent_start:
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and (span.get().end-1) == b0 and span.get().start == ent_start:
                 cost += 1
                 break
         return cost
@@ -639,8 +640,9 @@ cdef class Unit:
         # This is fairly straight-forward for U- entities, as we have a single
         # action
         cdef int b0 = s.B(0)
-        for span in gold.negs[:gold.nr_neg]:
-            if span.label == label and span.start == b0 and span.end == (b0+1):
+        cdef shared_ptr[SpanC] span
+        for span in gold.negs:
+            if span.get().label == label and span.get().start == b0 and span.get().end == (b0+1):
                 cost += 1
                 break
         return cost

diff --git a/spacy/tests/doc/test_span.py b/spacy/tests/doc/test_span.py
@@ -4,7 +4,7 @@
 
 from spacy.attrs import ORTH, LENGTH
 from spacy.lang.en import English
-from spacy.tokens import Doc, Span, Token
+from spacy.tokens import Doc, Span, SpanGroup, Token
 from spacy.vocab import Vocab
 from spacy.util import filter_spans
 from thinc.api import get_current_ops
@@ -163,6 +163,18 @@ def test_char_span(doc, i_sent, i, j, text):
         assert span.text == text
 
 
+@pytest.mark.issue(9556)
+def test_modify_span_group(doc):
+    group = SpanGroup(doc, spans=doc.ents)
+    for span in group:
+        span.start = 0
+        span.label = doc.vocab.strings["TEST"]
+
+    # Span changes must be reflected in the span group
+    assert group[0].start == 0
+    assert group[0].label == doc.vocab.strings["TEST"]
+
+
 def test_spans_sent_spans(doc):
     sents = list(doc.sents)
     assert sents[0].start == 0

diff --git a/spacy/tests/doc/test_span_group.py b/spacy/tests/doc/test_span_group.py
@@ -99,8 +99,10 @@ def test_span_group_set_item(doc, other_doc):
     span.label_ = "NEW LABEL"
     span.kb_id = doc.vocab.strings["KB_ID"]
 
-    assert span_group[index].label != span.label
-    assert span_group[index].kb_id != span.kb_id
+    # Indexing a span group returns a span in which C
+    # data is shared.
+    assert span_group[index].label == span.label
+    assert span_group[index].kb_id == span.kb_id
 
     span_group[index] = span
     assert span_group[index].start == span.start

diff --git a/spacy/tests/matcher/test_matcher_api.py b/spacy/tests/matcher/test_matcher_api.py
@@ -602,9 +602,16 @@ def test_matcher_span(matcher):
     doc = Doc(matcher.vocab, words=text.split())
     span_js = doc[:3]
     span_java = doc[4:]
-    assert len(matcher(doc)) == 2
-    assert len(matcher(span_js)) == 1
-    assert len(matcher(span_java)) == 1
+    doc_matches = matcher(doc)
+    span_js_matches = matcher(span_js)
+    span_java_matches = matcher(span_java)
+    assert len(doc_matches) == 2
+    assert len(span_js_matches) == 1
+    assert len(span_java_matches) == 1
+
+    # match offsets always refer to the doc
+    assert doc_matches[0] == span_js_matches[0]
+    assert doc_matches[1] == span_java_matches[0]
 
 
 def test_matcher_as_spans(matcher):

diff --git a/spacy/tokens/span.pxd b/spacy/tokens/span.pxd
@@ -1,3 +1,4 @@
+from libcpp.memory cimport shared_ptr
 cimport numpy as np
 
 from .doc cimport Doc
@@ -7,19 +8,21 @@ from ..structs cimport SpanC
 
 cdef class Span:
     cdef readonly Doc doc
-    cdef SpanC c
+    cdef shared_ptr[SpanC] c
     cdef public _vector
     cdef public _vector_norm
 
     @staticmethod
-    cdef inline Span cinit(Doc doc, SpanC span):
+    cdef inline Span cinit(Doc doc, const shared_ptr[SpanC] &span):
         cdef Span self = Span.__new__(
             Span,
             doc,
-            start=span.start,
-            end=span.end
+            start=span.get().start,
+            end=span.get().end
         )
         self.c = span
         return self
 
     cpdef np.ndarray to_array(self, object features)
+
+    cdef SpanC* span_c(self)