Implement limited surrogate pairs support for platforms with 16-bit wchar_t.

This commit is contained in:
bruvzg
2021-11-05 12:45:47 +02:00
parent e6613ca9d4
commit 262ec9033a
5 changed files with 122 additions and 47 deletions

View File

@ -1480,10 +1480,15 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
skip = 2;
} else if ((c & 0xF8) == 0xF0) {
skip = 3;
if (sizeof(wchar_t) == 2) {
str_size++; // encode as surrogate pair.
}
} else if ((c & 0xFC) == 0xF8) {
skip = 4;
// invalid character, too long to encode as surrogates.
} else if ((c & 0xFE) == 0xFC) {
skip = 5;
// invalid character, too long to encode as surrogates.
} else {
_UNICERROR("invalid skip");
return true; //invalid utf8
@ -1575,12 +1580,14 @@ bool String::parse_utf8(const char *p_utf8, int p_len) {
}
}
//printf("char %i, len %i\n",unichar,len);
if (sizeof(wchar_t) == 2 && unichar > 0xFFFF) {
unichar = ' '; //too long for windows
if (sizeof(wchar_t) == 2 && unichar > 0x10FFFF) {
unichar = ' '; // invalid character, too long to encode as surrogates.
} else if (sizeof(wchar_t) == 2 && unichar > 0xFFFF) {
*(dst++) = uint32_t((unichar >> 10) + 0xD7C0); // lead surrogate.
*(dst++) = uint32_t((unichar & 0x3FF) | 0xDC00); // trail surrogate.
} else {
*(dst++) = unichar;
}
*(dst++) = unichar;
cstr_size -= len;
p_utf8 += len;
}
@ -1598,6 +1605,18 @@ CharString String::utf8() const {
int fl = 0;
for (int i = 0; i < l; i++) {
uint32_t c = d[i];
if ((c & 0xfffffc00) == 0xd800) { // decode surrogate pair.
if ((i < l - 1) && (d[i + 1] & 0xfffffc00) == 0xdc00) {
c = (c << 10UL) + d[i + 1] - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
i++; // skip trail surrogate.
} else {
fl += 1;
continue;
}
} else if ((c & 0xfffffc00) == 0xdc00) {
fl += 1;
continue;
}
if (c <= 0x7f) { // 7 bits.
fl += 1;
} else if (c <= 0x7ff) { // 11 bits
@ -1606,7 +1625,6 @@ CharString String::utf8() const {
fl += 3;
} else if (c <= 0x001fffff) { // 21 bits
fl += 4;
} else if (c <= 0x03ffffff) { // 26 bits
fl += 5;
} else if (c <= 0x7fffffff) { // 31 bits
@ -1626,6 +1644,18 @@ CharString String::utf8() const {
for (int i = 0; i < l; i++) {
uint32_t c = d[i];
if ((c & 0xfffffc00) == 0xd800) { // decode surrogate pair.
if ((i < l - 1) && (d[i + 1] & 0xfffffc00) == 0xdc00) {
c = (c << 10UL) + d[i + 1] - ((0xd800 << 10UL) + 0xdc00 - 0x10000);
i++; // skip trail surrogate.
} else {
APPEND_CHAR(' ');
continue;
}
} else if ((c & 0xfffffc00) == 0xdc00) {
APPEND_CHAR(' ');
continue;
}
if (c <= 0x7f) { // 7 bits.
APPEND_CHAR(c);