diff options
-rw-r--r-- | Doc/library/unicodedata.rst | 7 | ||||
-rw-r--r-- | Doc/whatsnew/3.8.rst | 7 | ||||
-rw-r--r-- | Lib/test/test_normalization.py | 11 | ||||
-rw-r--r-- | Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst | 2 | ||||
-rw-r--r-- | Modules/clinic/unicodedata.c.h | 40 | ||||
-rw-r--r-- | Modules/unicodedata.c | 115 |
6 files changed, 160 insertions, 22 deletions
diff --git a/Doc/library/unicodedata.rst b/Doc/library/unicodedata.rst index 59548f3e8b4..17e848bf552 100644 --- a/Doc/library/unicodedata.rst +++ b/Doc/library/unicodedata.rst @@ -133,6 +133,13 @@ following functions: a human reader, if one has combining characters and the other doesn't, they may not compare equal. +.. function:: is_normalized(form, unistr) + + Return whether the Unicode string *unistr* is in the normal form *form*. Valid + values for *form* are 'NFC', 'NFKC', 'NFD', and 'NFKD'. + + .. versionadded:: 3.8 + In addition, the module exposes the following constant: diff --git a/Doc/whatsnew/3.8.rst b/Doc/whatsnew/3.8.rst index 5397206030f..566c369c85b 100644 --- a/Doc/whatsnew/3.8.rst +++ b/Doc/whatsnew/3.8.rst @@ -204,6 +204,13 @@ Added method :meth:`~tkinter.Canvas.moveto` in the :class:`tkinter.Canvas` class. (Contributed by Juliette Monsel in :issue:`23831`.) +unicodedata +----------- + +* New function :func:`~unicodedata.is_normalized` can be used to verify a string + is in a specific normal form. (Contributed by Max Belanger and David Euresti in + :issue:`32285`.) 
+ venv ---- diff --git a/Lib/test/test_normalization.py b/Lib/test/test_normalization.py index 30424564450..ba877e73f7d 100644 --- a/Lib/test/test_normalization.py +++ b/Lib/test/test_normalization.py @@ -3,7 +3,7 @@ import unittest from http.client import HTTPException import sys -from unicodedata import normalize, unidata_version +from unicodedata import normalize, is_normalized, unidata_version TESTDATAFILE = "NormalizationTest.txt" TESTDATAURL = "http://www.pythontest.net/unicode/" + unidata_version + "/" + TESTDATAFILE @@ -88,6 +88,15 @@ class NormalizationTest(unittest.TestCase): NFKD(c3) == NFKD(c4) == NFKD(c5), line) + self.assertTrue(is_normalized("NFC", c2)) + self.assertTrue(is_normalized("NFC", c4)) + + self.assertTrue(is_normalized("NFD", c3)) + self.assertTrue(is_normalized("NFD", c5)) + + self.assertTrue(is_normalized("NFKC", c4)) + self.assertTrue(is_normalized("NFKD", c5)) + # Record part 1 data if part == "@Part1": part1_data[c1] = 1 diff --git a/Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst b/Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst new file mode 100644 index 00000000000..87f84b02eb8 --- /dev/null +++ b/Misc/NEWS.d/next/Core and Builtins/2017-12-12-13-43-13.bpo-32285.LzKSwz.rst @@ -0,0 +1,2 @@ +New function unicodedata.is_normalized, which can check whether a string is +in a specific normal form. 
diff --git a/Modules/clinic/unicodedata.c.h b/Modules/clinic/unicodedata.c.h index 72e3f654577..54021fedba4 100644 --- a/Modules/clinic/unicodedata.c.h +++ b/Modules/clinic/unicodedata.c.h @@ -284,6 +284,38 @@ exit: return return_value; } +PyDoc_STRVAR(unicodedata_UCD_is_normalized__doc__, +"is_normalized($self, form, unistr, /)\n" +"--\n" +"\n" +"Return whether the Unicode string unistr is in the normal form \'form\'.\n" +"\n" +"Valid values for form are \'NFC\', \'NFKC\', \'NFD\', and \'NFKD\'."); + +#define UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF \ + {"is_normalized", (PyCFunction)unicodedata_UCD_is_normalized, METH_FASTCALL, unicodedata_UCD_is_normalized__doc__}, + +static PyObject * +unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, + PyObject *input); + +static PyObject * +unicodedata_UCD_is_normalized(PyObject *self, PyObject *const *args, Py_ssize_t nargs) +{ + PyObject *return_value = NULL; + PyObject *form; + PyObject *input; + + if (!_PyArg_ParseStack(args, nargs, "UU:is_normalized", + &form, &input)) { + goto exit; + } + return_value = unicodedata_UCD_is_normalized_impl(self, form, input); + +exit: + return return_value; +} + PyDoc_STRVAR(unicodedata_UCD_normalize__doc__, "normalize($self, form, unistr, /)\n" "--\n" @@ -296,17 +328,17 @@ PyDoc_STRVAR(unicodedata_UCD_normalize__doc__, {"normalize", (PyCFunction)unicodedata_UCD_normalize, METH_FASTCALL, unicodedata_UCD_normalize__doc__}, static PyObject * -unicodedata_UCD_normalize_impl(PyObject *self, const char *form, +unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, PyObject *input); static PyObject * unicodedata_UCD_normalize(PyObject *self, PyObject *const *args, Py_ssize_t nargs) { PyObject *return_value = NULL; - const char *form; + PyObject *form; PyObject *input; - if (!_PyArg_ParseStack(args, nargs, "sU:normalize", + if (!_PyArg_ParseStack(args, nargs, "UU:normalize", &form, &input)) { goto exit; } @@ -379,4 +411,4 @@ unicodedata_UCD_lookup(PyObject *self, PyObject 
*arg) exit: return return_value; } -/*[clinic end generated code: output=dc899bff0ecd14c1 input=a9049054013a1b77]*/ +/*[clinic end generated code: output=2c5fbf597c18f6b8 input=a9049054013a1b77]*/ diff --git a/Modules/unicodedata.c b/Modules/unicodedata.c index e8788f5036d..9ceab1b3db4 100644 --- a/Modules/unicodedata.c +++ b/Modules/unicodedata.c @@ -19,6 +19,11 @@ #include "ucnhash.h" #include "structmember.h" +_Py_IDENTIFIER(NFC); +_Py_IDENTIFIER(NFD); +_Py_IDENTIFIER(NFKC); +_Py_IDENTIFIER(NFKD); + /*[clinic input] module unicodedata class unicodedata.UCD 'PreviousDBVersion *' '&UCD_Type' @@ -770,8 +775,10 @@ nfc_nfkc(PyObject *self, PyObject *input, int k) return result; } -/* Return 1 if the input is certainly normalized, 0 if it might not be. */ -static int +typedef enum {YES, NO, MAYBE} NormalMode; + +/* Return YES if the input is certainly normalized, NO or MAYBE if it might not be. */ +static NormalMode is_normalized(PyObject *self, PyObject *input, int nfc, int k) { Py_ssize_t i, len; @@ -782,7 +789,7 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) /* An older version of the database is requested, quickchecks must be disabled. */ if (self && UCD_Check(self)) - return 0; + return NO; /* The two quickcheck bits at this shift mean 0=Yes, 1=Maybe, 2=No, as described in http://unicode.org/reports/tr15/#Annex8. 
*/ @@ -799,19 +806,92 @@ is_normalized(PyObject *self, PyObject *input, int nfc, int k) unsigned char quickcheck = record->normalization_quick_check; if (quickcheck & quickcheck_mask) - return 0; /* this string might need normalization */ + return MAYBE; /* this string might need normalization */ if (combining && prev_combining > combining) - return 0; /* non-canonical sort order, not normalized */ + return NO; /* non-canonical sort order, not normalized */ prev_combining = combining; } - return 1; /* certainly normalized */ + return YES; /* certainly normalized */ +} + +/*[clinic input] +unicodedata.UCD.is_normalized + + self: self + form: unicode + unistr as input: unicode + / + +Return whether the Unicode string unistr is in the normal form 'form'. + +Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. +[clinic start generated code]*/ + +static PyObject * +unicodedata_UCD_is_normalized_impl(PyObject *self, PyObject *form, + PyObject *input) +/*[clinic end generated code: output=11e5a3694e723ca5 input=a544f14cea79e508]*/ +{ + if (PyUnicode_READY(input) == -1) { + return NULL; + } + + if (PyUnicode_GET_LENGTH(input) == 0) { + /* special case empty input strings. */ + Py_RETURN_TRUE; + } + + PyObject *result; + int nfc = 0; + int k = 0; + NormalMode m; + + PyObject *cmp; + int match = 0; + + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { + nfc = 1; + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { + nfc = 1; + k = 1; + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { + /* matches default values for `nfc` and `k` */ + } + else if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { + k = 1; + } + else { + PyErr_SetString(PyExc_ValueError, "invalid normalization form"); + return NULL; + } + + m = is_normalized(self, input, nfc, k); + + if (m == MAYBE) { + cmp = (nfc ? nfc_nfkc : nfd_nfkd)(self, input, k); + if (cmp == NULL) { + return NULL; + } + match = PyUnicode_Compare(input, cmp); + Py_DECREF(cmp); + result = (match == 0) ? 
Py_True : Py_False; + } + else { + result = (m == YES) ? Py_True : Py_False; + } + + Py_INCREF(result); + return result; } + /*[clinic input] unicodedata.UCD.normalize self: self - form: str + form: unicode unistr as input: unicode / @@ -821,9 +901,9 @@ Valid values for form are 'NFC', 'NFKC', 'NFD', and 'NFKD'. [clinic start generated code]*/ static PyObject * -unicodedata_UCD_normalize_impl(PyObject *self, const char *form, +unicodedata_UCD_normalize_impl(PyObject *self, PyObject *form, PyObject *input) -/*[clinic end generated code: output=62d1f8870027efdc input=1744c55f4ab79bf0]*/ +/*[clinic end generated code: output=05ca4385a2ad6983 input=3a5206c0ad2833fb]*/ { if (PyUnicode_GET_LENGTH(input) == 0) { /* Special case empty input strings, since resizing @@ -832,29 +912,29 @@ unicodedata_UCD_normalize_impl(PyObject *self, const char *form, return input; } - if (strcmp(form, "NFC") == 0) { - if (is_normalized(self, input, 1, 0)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFC)) { + if (is_normalized(self, input, 1, 0) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 0); } - if (strcmp(form, "NFKC") == 0) { - if (is_normalized(self, input, 1, 1)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKC)) { + if (is_normalized(self, input, 1, 1) == YES) { Py_INCREF(input); return input; } return nfc_nfkc(self, input, 1); } - if (strcmp(form, "NFD") == 0) { - if (is_normalized(self, input, 0, 0)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFD)) { + if (is_normalized(self, input, 0, 0) == YES) { Py_INCREF(input); return input; } return nfd_nfkd(self, input, 0); } - if (strcmp(form, "NFKD") == 0) { - if (is_normalized(self, input, 0, 1)) { + if (_PyUnicode_EqualToASCIIId(form, &PyId_NFKD)) { + if (is_normalized(self, input, 0, 1) == YES) { Py_INCREF(input); return input; } @@ -1271,6 +1351,7 @@ static PyMethodDef unicodedata_functions[] = { UNICODEDATA_UCD_DECOMPOSITION_METHODDEF UNICODEDATA_UCD_NAME_METHODDEF UNICODEDATA_UCD_LOOKUP_METHODDEF + 
UNICODEDATA_UCD_IS_NORMALIZED_METHODDEF UNICODEDATA_UCD_NORMALIZE_METHODDEF {NULL, NULL} /* sentinel */ }; |