Skip to content

Commit

Permalink
Cleanup Cython structs (explosion#11337)
Browse files Browse the repository at this point in the history
* clean up Tokenizer fields

* remove unused object from vocab

* remove IS_OOV_DEPRECATED

* add back in as FLAG13

* FLAG18 instead

* import fix

* fix clumsy fingers

* revert symbol changes in favor of explosion#11352

* bint instead of bool
  • Loading branch information
Sofie Van Landeghem authored and jordankanter committed Mar 14, 2024
1 parent 5bf29f7 commit 459a9ce
Show file tree
Hide file tree
Showing 5 changed files with 12 additions and 14 deletions.
6 changes: 1 addition & 5 deletions spacy/tokenizer.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -23,11 +23,7 @@ cdef class Tokenizer:
cdef object _infix_finditer
cdef object _rules
cdef PhraseMatcher _special_matcher
# TODO convert to bool in v4
cdef int _faster_heuristics
# TODO next one is unused and should be removed in v4
# https://github.com/explosion/spaCy/pull/9150
cdef int _unused_int2
cdef bint _faster_heuristics

cdef Doc _tokenize_affixes(self, str string, bint with_special_cases)
cdef int _apply_special_cases(self, Doc doc) except -1
Expand Down
11 changes: 9 additions & 2 deletions spacy/tokenizer.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,18 @@ from libcpp.set cimport set as stdset
from preshed.maps cimport PreshMap

import re

from .tokens.doc cimport Doc
from .strings cimport hash_string
from .lexeme cimport EMPTY_LEXEME
from .strings cimport hash_string
from .tokens.doc cimport Doc

from .attrs import intify_attrs
from .symbols import ORTH, NORM
from .errors import Errors
from . import util
from .util import get_words_and_spaces
from .attrs import intify_attrs
from .errors import Errors
from .scorer import Scorer
Expand Down Expand Up @@ -124,10 +131,10 @@ cdef class Tokenizer:

property faster_heuristics:
def __get__(self):
return bool(self._faster_heuristics)
return self._faster_heuristics

def __set__(self, faster_heuristics):
self._faster_heuristics = bool(faster_heuristics)
self._faster_heuristics = faster_heuristics
self._reload_special_cases()

def __reduce__(self):
Expand Down
1 change: 0 additions & 1 deletion spacy/vocab.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,6 @@ cdef class Vocab:
cdef public object writing_system
cdef public object get_noun_chunks
cdef readonly int length
cdef public object _unused_object # TODO remove in v4, see #9150
cdef public object lex_attr_getters
cdef public object cfg

Expand Down
1 change: 0 additions & 1 deletion spacy/vocab.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,6 @@ def unpickle_vocab(
sstore: StringStore,
vectors: Any,
morphology: Any,
_unused_object: Any,
lex_attr_getters: Any,
lookups: Any,
get_noun_chunks: Any,
Expand Down
7 changes: 2 additions & 5 deletions spacy/vocab.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -579,21 +579,18 @@ def pickle_vocab(vocab):
sstore = vocab.strings
vectors = vocab.vectors
morph = vocab.morphology
_unused_object = vocab._unused_object
lex_attr_getters = srsly.pickle_dumps(vocab.lex_attr_getters)
lookups = vocab.lookups
get_noun_chunks = vocab.get_noun_chunks
return (unpickle_vocab,
(sstore, vectors, morph, _unused_object, lex_attr_getters, lookups, get_noun_chunks))
(sstore, vectors, morph, lex_attr_getters, lookups, get_noun_chunks))


def unpickle_vocab(sstore, vectors, morphology, _unused_object,
lex_attr_getters, lookups, get_noun_chunks):
def unpickle_vocab(sstore, vectors, morphology, lex_attr_getters, lookups, get_noun_chunks):
cdef Vocab vocab = Vocab()
vocab.vectors = vectors
vocab.strings = sstore
vocab.morphology = morphology
vocab._unused_object = _unused_object
vocab.lex_attr_getters = srsly.pickle_loads(lex_attr_getters)
vocab.lookups = lookups
vocab.get_noun_chunks = get_noun_chunks
Expand Down

0 comments on commit 459a9ce

Please sign in to comment.