Skip to content

Commit

Permalink
Consolidate and freeze symbols (explosion#11352)
Browse files Browse the repository at this point in the history
* Consolidate and freeze symbols

Instead of having symbol values defined in three potentially conflicting
places (`spacy.attrs`, `spacy.parts_of_speech`, `spacy.symbols`), define
all symbols in `spacy.symbols` and reference those values in
`spacy.attrs` and `spacy.parts_of_speech`.

Remove deprecated and placeholder symbols from `spacy.attrs.IDS`.

Make `spacy.attrs.NAMES` and `spacy.symbols.NAMES` reverse dicts rather
than lists in order to support future use of hash values in `attr_id_t`.

Minor changes:

* Use `uint64_t` for attrs in `Doc.to_array` to support future use of
hash values
* Remove unneeded attrs filter for error message in `Doc.to_array`
* Remove unused attr `SENT_END`

* Handle dynamic size of attr_id_t in Doc.to_array

* Undo added warnings

* Refactor to make Doc.to_array more similar to Doc.from_array

* Improve refactoring
  • Loading branch information
adrianeboyd authored and jordankanter committed Mar 14, 2024
1 parent 53689bc commit 3c6b11d
Show file tree
Hide file tree
Showing 9 changed files with 551 additions and 179 deletions.
129 changes: 40 additions & 89 deletions spacy/attrs.pxd
Original file line number Diff line number Diff line change
@@ -1,99 +1,50 @@
# Reserve 64 values for flag features
from . cimport symbols


cdef enum attr_id_t:
NULL_ATTR
IS_ALPHA
IS_ASCII
IS_DIGIT
IS_LOWER
IS_PUNCT
IS_SPACE
IS_TITLE
IS_UPPER
LIKE_URL
LIKE_NUM
LIKE_EMAIL
IS_STOP
IS_OOV_DEPRECATED
IS_BRACKET
IS_QUOTE
IS_LEFT_PUNCT
IS_RIGHT_PUNCT
IS_CURRENCY
NULL_ATTR = 0
IS_ALPHA = symbols.IS_ALPHA
IS_ASCII = symbols.IS_ASCII
IS_DIGIT = symbols.IS_DIGIT
IS_LOWER = symbols.IS_LOWER
IS_PUNCT = symbols.IS_PUNCT
IS_SPACE = symbols.IS_SPACE
IS_TITLE = symbols.IS_TITLE
IS_UPPER = symbols.IS_UPPER
LIKE_URL = symbols.LIKE_URL
LIKE_NUM = symbols.LIKE_NUM
LIKE_EMAIL = symbols.LIKE_EMAIL
IS_STOP = symbols.IS_STOP
IS_BRACKET = symbols.IS_BRACKET
IS_QUOTE = symbols.IS_QUOTE
IS_LEFT_PUNCT = symbols.IS_LEFT_PUNCT
IS_RIGHT_PUNCT = symbols.IS_RIGHT_PUNCT
IS_CURRENCY = symbols.IS_CURRENCY

FLAG19 = 19
FLAG20
FLAG21
FLAG22
FLAG23
FLAG24
FLAG25
FLAG26
FLAG27
FLAG28
FLAG29
FLAG30
FLAG31
FLAG32
FLAG33
FLAG34
FLAG35
FLAG36
FLAG37
FLAG38
FLAG39
FLAG40
FLAG41
FLAG42
FLAG43
FLAG44
FLAG45
FLAG46
FLAG47
FLAG48
FLAG49
FLAG50
FLAG51
FLAG52
FLAG53
FLAG54
FLAG55
FLAG56
FLAG57
FLAG58
FLAG59
FLAG60
FLAG61
FLAG62
FLAG63
ID = symbols.ID
ORTH = symbols.ORTH
LOWER = symbols.LOWER
NORM = symbols.NORM
SHAPE = symbols.SHAPE
PREFIX = symbols.PREFIX
SUFFIX = symbols.SUFFIX

ID
ORTH
LOWER
NORM
SHAPE
PREFIX
SUFFIX
LENGTH = symbols.LENGTH
CLUSTER = symbols.CLUSTER
LEMMA = symbols.LEMMA
POS = symbols.POS
TAG = symbols.TAG
DEP = symbols.DEP
ENT_IOB = symbols.ENT_IOB
ENT_TYPE = symbols.ENT_TYPE
HEAD = symbols.HEAD
SENT_START = symbols.SENT_START
SPACY = symbols.SPACY
PROB = symbols.PROB

LENGTH
CLUSTER
LEMMA
POS
TAG
DEP
ENT_IOB
ENT_TYPE
HEAD
SENT_START
SPACY
PROB

LANG
LANG = symbols.LANG
ENT_KB_ID = symbols.ENT_KB_ID
MORPH
MORPH = symbols.MORPH
ENT_ID = symbols.ENT_ID

IDX
SENT_END
IDX = symbols.IDX
49 changes: 1 addition & 48 deletions spacy/attrs.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -17,57 +17,11 @@ IDS = {
"LIKE_NUM": LIKE_NUM,
"LIKE_EMAIL": LIKE_EMAIL,
"IS_STOP": IS_STOP,
"IS_OOV_DEPRECATED": IS_OOV_DEPRECATED,
"IS_BRACKET": IS_BRACKET,
"IS_QUOTE": IS_QUOTE,
"IS_LEFT_PUNCT": IS_LEFT_PUNCT,
"IS_RIGHT_PUNCT": IS_RIGHT_PUNCT,
"IS_CURRENCY": IS_CURRENCY,
"FLAG19": FLAG19,
"FLAG20": FLAG20,
"FLAG21": FLAG21,
"FLAG22": FLAG22,
"FLAG23": FLAG23,
"FLAG24": FLAG24,
"FLAG25": FLAG25,
"FLAG26": FLAG26,
"FLAG27": FLAG27,
"FLAG28": FLAG28,
"FLAG29": FLAG29,
"FLAG30": FLAG30,
"FLAG31": FLAG31,
"FLAG32": FLAG32,
"FLAG33": FLAG33,
"FLAG34": FLAG34,
"FLAG35": FLAG35,
"FLAG36": FLAG36,
"FLAG37": FLAG37,
"FLAG38": FLAG38,
"FLAG39": FLAG39,
"FLAG40": FLAG40,
"FLAG41": FLAG41,
"FLAG42": FLAG42,
"FLAG43": FLAG43,
"FLAG44": FLAG44,
"FLAG45": FLAG45,
"FLAG46": FLAG46,
"FLAG47": FLAG47,
"FLAG48": FLAG48,
"FLAG49": FLAG49,
"FLAG50": FLAG50,
"FLAG51": FLAG51,
"FLAG52": FLAG52,
"FLAG53": FLAG53,
"FLAG54": FLAG54,
"FLAG55": FLAG55,
"FLAG56": FLAG56,
"FLAG57": FLAG57,
"FLAG58": FLAG58,
"FLAG59": FLAG59,
"FLAG60": FLAG60,
"FLAG61": FLAG61,
"FLAG62": FLAG62,
"FLAG63": FLAG63,
"ID": ID,
"ORTH": ORTH,
"LOWER": LOWER,
Expand All @@ -93,8 +47,7 @@ IDS = {
}


# ATTR IDs, in order of the symbol
NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
NAMES = {v: k for k, v in IDS.items()}
locals().update(IDS)


Expand Down
38 changes: 19 additions & 19 deletions spacy/parts_of_speech.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -4,22 +4,22 @@ from . cimport symbols
cpdef enum univ_pos_t:
NO_TAG = 0
ADJ = symbols.ADJ
ADP
ADV
AUX
CONJ
CCONJ # U20
DET
INTJ
NOUN
NUM
PART
PRON
PROPN
PUNCT
SCONJ
SYM
VERB
X
EOL
SPACE
ADP = symbols.ADP
ADV = symbols.ADV
AUX = symbols.AUX
CONJ = symbols.CONJ
CCONJ = symbols.CCONJ # U20
DET = symbols.DET
INTJ = symbols.INTJ
NOUN = symbols.NOUN
NUM = symbols.NUM
PART = symbols.PART
PRON = symbols.PRON
PROPN = symbols.PROPN
PUNCT = symbols.PUNCT
SCONJ = symbols.SCONJ
SYM = symbols.SYM
VERB = symbols.VERB
X = symbols.X
EOL = symbols.EOL
SPACE = symbols.SPACE
2 changes: 1 addition & 1 deletion spacy/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,7 @@ def validate_init_settings(

def validate_token_pattern(obj: list) -> List[str]:
# Try to convert non-string keys (e.g. {ORTH: "foo"} -> {"ORTH": "foo"})
get_key = lambda k: NAMES[k] if isinstance(k, int) and k < len(NAMES) else k
get_key = lambda k: NAMES[k] if isinstance(k, int) and k in NAMES else k
if isinstance(obj, list):
converted = []
for pattern in obj:
Expand Down
4 changes: 2 additions & 2 deletions spacy/strings.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ cdef class StringStore:
elif _try_coerce_to_hash(string_or_id, &str_hash):
if str_hash == 0:
return ""
elif str_hash < len(SYMBOLS_BY_INT):
elif str_hash in SYMBOLS_BY_INT:
return SYMBOLS_BY_INT[str_hash]
else:
utf8str = <Utf8Str*>self._map.get(str_hash)
Expand Down Expand Up @@ -224,7 +224,7 @@ cdef class StringStore:
# TODO: Raise an error instead
return self._map.get(string_or_id) is not NULL

if str_hash < len(SYMBOLS_BY_INT):
if str_hash in SYMBOLS_BY_INT:
return True
else:
return self._map.get(str_hash) is not NULL
Expand Down
15 changes: 8 additions & 7 deletions spacy/symbols.pxd
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# DO NOT EDIT! The symbols are frozen as of spaCy v3.0.0.
cdef enum symbol_t:
NIL
NIL = 0
IS_ALPHA
IS_ASCII
IS_DIGIT
Expand Down Expand Up @@ -65,7 +66,7 @@ cdef enum symbol_t:
FLAG62
FLAG63

ID
ID = 64
ORTH
LOWER
NORM
Expand Down Expand Up @@ -385,7 +386,7 @@ cdef enum symbol_t:
DEPRECATED275
DEPRECATED276

PERSON
PERSON = 380
NORP
FACILITY
ORG
Expand All @@ -405,7 +406,7 @@ cdef enum symbol_t:
ORDINAL
CARDINAL

acomp
acomp = 398
advcl
advmod
agent
Expand Down Expand Up @@ -458,12 +459,12 @@ cdef enum symbol_t:
rcmod
root
xcomp

acl

ENT_KB_ID
ENT_KB_ID = 452
MORPH
ENT_ID

IDX
_
_ = 456
# DO NOT ADD ANY NEW SYMBOLS!
6 changes: 1 addition & 5 deletions spacy/symbols.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -470,11 +470,7 @@ IDS = {
}


def sort_nums(x):
return x[1]


NAMES = [it[0] for it in sorted(IDS.items(), key=sort_nums)]
NAMES = {v: k for k, v in IDS.items()}
# Unfortunate hack here, to work around problem with long cpdef enum
# (which is generating an enormous amount of C++ in Cython 0.24+)
# We keep the enum cdef, and just make sure the names are available to Python
Expand Down
Loading

0 comments on commit 3c6b11d

Please sign in to comment.