diff options
Diffstat (limited to 'core/string')
-rw-r--r-- | core/string/ustring.cpp | 90 | ||||
-rw-r--r-- | core/string/ustring.h | 1 |
2 files changed, 56 insertions, 35 deletions
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp index c276f20f99..12e6423724 100644 --- a/core/string/ustring.cpp +++ b/core/string/ustring.cpp @@ -62,6 +62,7 @@ static _FORCE_INLINE_ char32_t lower_case(char32_t c) { const char CharString::_null = 0; const char16_t Char16String::_null = 0; const char32_t String::_null = 0; +const char32_t String::_replacement_char = 0xfffd; bool select_word(const String &p_s, int p_col, int &r_beg, int &r_end) { const String &s = p_s; @@ -307,7 +308,7 @@ void String::copy_from(const char *p_cstr) { uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]); if (c == 0 && i < len) { print_unicode_error("NUL character", true); - dst[i] = 0x20; + dst[i] = _replacement_char; } else { dst[i] = c; } @@ -340,7 +341,7 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) { uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]); if (c == 0) { print_unicode_error("NUL character", true); - dst[i] = 0x20; + dst[i] = _replacement_char; } else { dst[i] = c; } @@ -373,17 +374,21 @@ void String::copy_from(const char32_t &p_char) { print_unicode_error("NUL character", true); return; } + + resize(2); + + char32_t *dst = ptrw(); + if ((p_char & 0xfffff800) == 0xd800) { print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char)); - } - if (p_char > 0x10ffff) { + dst[0] = _replacement_char; + } else if (p_char > 0x10ffff) { print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char)); + dst[0] = _replacement_char; + } else { + dst[0] = p_char; } - resize(2); - - char32_t *dst = ptrw(); - dst[0] = p_char; dst[1] = 0; } @@ -439,14 +444,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) { for (int i = 0; i < p_length; i++) { if (p_char[i] == 0) { print_unicode_error("NUL character", true); - dst[i] = 0x20; + dst[i] = _replacement_char; continue; } if ((p_char[i] & 0xfffff800) == 0xd800) { print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i])); + dst[i] = _replacement_char; + continue; } if (p_char[i] > 0x10ffff) { print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i])); + dst[i] = _replacement_char; + continue; } dst[i] = p_char[i]; } @@ -538,7 +547,7 @@ String &String::operator+=(const char *p_str) { uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]); if (c == 0 && i < rhs_len) { print_unicode_error("NUL character", true); - dst[i] = 0x20; + dst[i] = _replacement_char; } else { dst[i] = c; } @@ -568,17 +577,21 @@ String &String::operator+=(char32_t p_char) { print_unicode_error("NUL character", true); return *this; } + + const int lhs_len = length(); + resize(lhs_len + 2); + char32_t *dst = ptrw(); + if ((p_char & 0xfffff800) == 0xd800) { print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char)); - } - if (p_char > 0x10ffff) { + dst[lhs_len] = _replacement_char; + } else if (p_char > 0x10ffff) { print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char)); + dst[lhs_len] = _replacement_char; + } else { + dst[lhs_len] = p_char; } - const int lhs_len = length(); - resize(lhs_len + 2); - char32_t *dst = ptrw(); - dst[lhs_len] = p_char; dst[lhs_len + 1] = 0; return *this; @@ -1737,7 +1750,7 @@ Vector<uint8_t> String::hex_decode() const { void String::print_unicode_error(const String &p_message, bool p_critical) const { if (p_critical) { - print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message)); + print_error(vformat("Unicode parsing error, some characters were replaced with � (U+FFFD): %s", p_message)); } else { print_error(vformat("Unicode parsing error: %s", p_message)); } @@ -1757,7 +1770,7 @@ CharString String::ascii(bool p_allow_extended) const { cs[i] = c; } else { print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c)); - cs[i] = 0x20; + cs[i] = 0x20; // ascii doesn't have a replacement character like unicode, 0x1a is sometimes used but is kinda arcane } } @@ -1897,13 +1910,13 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) { unichar = (0xff >> 7) & c; skip = 5; } else { - *(dst++) = 0x20; + *(dst++) = _replacement_char; unichar = 0; skip = 0; } } else { if (c < 0x80 || c > 0xbf) { - *(dst++) = 0x20; + *(dst++) = _replacement_char; skip = 0; } else { unichar = (unichar << 6) | (c & 0x3f); @@ -1912,15 +1925,15 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) { if (unichar == 0) { print_unicode_error("NUL character", true); decode_failed = true; - unichar = 0x20; - } - if ((unichar & 0xfffff800) == 0xd800) { - print_unicode_error(vformat("Unpaired surrogate (%x)", unichar)); - decode_error = true; - } - if (unichar > 0x10ffff) { - print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar)); - decode_error = true; + unichar = _replacement_char; + } else if ((unichar & 0xfffff800) == 0xd800) { + print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true); + decode_failed = true; + unichar = _replacement_char; + } else if (unichar > 0x10ffff) { + print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true); + decode_failed = true; + unichar = _replacement_char; } *(dst++) = unichar; } @@ -2014,7 +2027,11 @@ CharString String::utf8() const { APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits. APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits. } else { - APPEND_CHAR(0x20); + // the string is a valid UTF32, so it should never happen ... + print_unicode_error(vformat("Non scalar value (%x)", c), true); + APPEND_CHAR(uint32_t(0xe0 | ((_replacement_char >> 12) & 0x0f))); // Top 4 bits. + APPEND_CHAR(uint32_t(0x80 | ((_replacement_char >> 6) & 0x3f))); // Middle 6 bits. + APPEND_CHAR(uint32_t(0x80 | (_replacement_char & 0x3f))); // Bottom 6 bits. } } #undef APPEND_CHAR @@ -2187,7 +2204,9 @@ Char16String String::utf16() const { APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate. APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate. } else { - APPEND_CHAR(0x20); + // the string is a valid UTF32, so it should never happen ... + APPEND_CHAR(uint32_t((_replacement_char >> 10) + 0xd7c0)); + APPEND_CHAR(uint32_t((_replacement_char & 0x3ff) | 0xdc00)); } } #undef APPEND_CHAR @@ -4262,12 +4281,13 @@ String String::pad_zeros(int p_digits) const { begin++; } - if (begin >= end) { + int zeros_to_add = p_digits - (end - begin); + + if (zeros_to_add <= 0) { return s; + } else { + return s.insert(begin, String("0").repeat(zeros_to_add)); } - - int zeros_to_add = p_digits - (end - begin); - return s.insert(begin, String("0").repeat(zeros_to_add)); } String String::trim_prefix(const String &p_prefix) const { diff --git a/core/string/ustring.h b/core/string/ustring.h index 782ca47507..295625395d 100644 --- a/core/string/ustring.h +++ b/core/string/ustring.h @@ -183,6 +183,7 @@ struct StrRange { class String { CowData<char32_t> _cowdata; static const char32_t _null; + static const char32_t _replacement_char; void copy_from(const char *p_cstr); void copy_from(const char *p_cstr, const int p_clip_to); |