Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[3.8] closes bpo-37966: Fully implement the UAX #15 quick-check algorithm. (GH-15558) #15671

Merged
merged 1 commit into from
Sep 4, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions Doc/whatsnew/3.8.rst
Original file line number Diff line number Diff line change
Expand Up @@ -1090,8 +1090,9 @@ unicodedata
<http://blog.unicode.org/2019/05/unicode-12-1-en.html>`_ release.

* New function :func:`~unicodedata.is_normalized` can be used to verify a string
is in a specific normal form. (Contributed by Max Belanger and David Euresti in
:issue:`32285`).
is in a specific normal form, often much faster than by actually normalizing
the string. (Contributed by Max Belanger, David Euresti, and Greg Price in
:issue:`32285` and :issue:`37966`).


unittest
Expand Down
2 changes: 2 additions & 0 deletions Lib/test/test_unicodedata.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,6 +220,8 @@ def test_issue29456(self):
self.assertEqual(self.db.normalize('NFC', u11a7_str_a), u11a7_str_b)
self.assertEqual(self.db.normalize('NFC', u11c3_str_a), u11c3_str_b)

# For tests of unicodedata.is_normalized / self.db.is_normalized ,
# see test_normalization.py .

def test_east_asian_width(self):
eaw = self.db.east_asian_width
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
The implementation of :func:`~unicodedata.is_normalized` has been greatly
sped up on strings that aren't normalized, by implementing the full
normalization-quick-check algorithm from the Unicode standard.
75 changes: 51 additions & 24 deletions Modules/unicodedata.c
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include "ucnhash.h"
#include "structmember.h"

#include <stdbool.h>

_Py_IDENTIFIER(NFC);
_Py_IDENTIFIER(NFD);
_Py_IDENTIFIER(NFKC);
Expand Down Expand Up @@ -775,25 +777,40 @@ nfc_nfkc(PyObject *self, PyObject *input, int k)
return result;
}

typedef enum {YES, NO, MAYBE} NormalMode;

/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */
static NormalMode
is_normalized(PyObject *self, PyObject *input, int nfc, int k)
// Outcome of the Unicode normalization quick check.
//
// The numeric values are significant: they are compared directly against
// the two-bit field extracted from a record's normalization_quick_check
// byte, so they must not be renumbered or reordered.
//
// This needs to match the logic in makeunicodedata.py
// which constructs the quickcheck data.
typedef enum {YES = 0, MAYBE = 1, NO = 2} QuickcheckResult;

/* Run the Unicode normalization "quickcheck" algorithm.
*
* Return YES or NO if quickcheck determines the input is certainly
* normalized or certainly not, and MAYBE if quickcheck is unable to
* tell.
*
* If `yes_only` is true, then return MAYBE as soon as we determine
* the answer is not YES.
*
* For background and details on the algorithm, see UAX #15:
* https://www.unicode.org/reports/tr15/#Detecting_Normalization_Forms
*/
static QuickcheckResult
is_normalized_quickcheck(PyObject *self, PyObject *input,
int nfc, int k, bool yes_only)
{
Py_ssize_t i, len;
int kind;
void *data;
unsigned char prev_combining = 0, quickcheck_mask;

/* An older version of the database is requested, quickchecks must be
disabled. */
if (self && UCD_Check(self))
return NO;

/* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No,
as described in http://unicode.org/reports/tr15/#Annex8. */
quickcheck_mask = 3 << ((nfc ? 4 : 0) + (k ? 2 : 0));
Py_ssize_t i, len;
int kind;
void *data;
unsigned char prev_combining = 0;

/* The two quickcheck bits at this shift have type QuickcheckResult. */
int quickcheck_shift = (nfc ? 4 : 0) + (k ? 2 : 0);

QuickcheckResult result = YES; /* certainly normalized, unless we find something */

i = 0;
kind = PyUnicode_KIND(input);
Expand All @@ -802,16 +819,26 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k)
while (i < len) {
Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
const _PyUnicode_DatabaseRecord *record = _getrecord_ex(ch);
unsigned char combining = record->combining;
unsigned char quickcheck = record->normalization_quick_check;

if (quickcheck & quickcheck_mask)
return MAYBE; /* this string might need normalization */
unsigned char combining = record->combining;
if (combining && prev_combining > combining)
return NO; /* non-canonical sort order, not normalized */
prev_combining = combining;

unsigned char quickcheck_whole = record->normalization_quick_check;
if (yes_only) {
if (quickcheck_whole & (3 << quickcheck_shift))
return MAYBE;
} else {
switch ((quickcheck_whole >> quickcheck_shift) & 3) {
case NO:
return NO;
case MAYBE:
result = MAYBE; /* this string might need normalization */
}
}
}
return YES; /* certainly normalized */
return result;
}

/*[clinic input]
Expand Down Expand Up @@ -844,7 +871,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
PyObject *result;
int nfc = 0;
int k = 0;
NormalMode m;
QuickcheckResult m;

PyObject *cmp;
int match = 0;
Expand All @@ -867,7 +894,7 @@ unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form,
return NULL;
}

m = is_normalized(self, input, nfc, k);
m = is_normalized_quickcheck(self, input, nfc, k, false);

if (m == MAYBE) {
cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k);
Expand Down Expand Up @@ -913,28 +940,28 @@ unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form,
}

if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) {
if (is_normalized(self, input, 1, 0) == YES) {
if (is_normalized_quickcheck(self, input, 1, 0, true) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 0);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) {
if (is_normalized(self, input, 1, 1) == YES) {
if (is_normalized_quickcheck(self, input, 1, 1, true) == YES) {
Py_INCREF(input);
return input;
}
return nfc_nfkc(self, input, 1);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) {
if (is_normalized(self, input, 0, 0) == YES) {
if (is_normalized_quickcheck(self, input, 0, 0, true) == YES) {
Py_INCREF(input);
return input;
}
return nfd_nfkd(self, input, 0);
}
if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) {
if (is_normalized(self, input, 0, 1) == YES) {
if (is_normalized_quickcheck(self, input, 0, 1, true) == YES) {
Py_INCREF(input);
return input;
}
Expand Down