summaryrefslogtreecommitdiffstats
path: root/core/string
diff options
context:
space:
mode:
Diffstat (limited to 'core/string')
-rw-r--r--core/string/ustring.cpp90
-rw-r--r--core/string/ustring.h1
2 files changed, 56 insertions, 35 deletions
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index c276f20f99..12e6423724 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -62,6 +62,7 @@ static _FORCE_INLINE_ char32_t lower_case(char32_t c) {
const char CharString::_null = 0;
const char16_t Char16String::_null = 0;
const char32_t String::_null = 0;
+const char32_t String::_replacement_char = 0xfffd;
bool select_word(const String &p_s, int p_col, int &r_beg, int &r_end) {
const String &s = p_s;
@@ -307,7 +308,7 @@ void String::copy_from(const char *p_cstr) {
uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
if (c == 0 && i < len) {
print_unicode_error("NUL character", true);
- dst[i] = 0x20;
+ dst[i] = _replacement_char;
} else {
dst[i] = c;
}
@@ -340,7 +341,7 @@ void String::copy_from(const char *p_cstr, const int p_clip_to) {
uint8_t c = p_cstr[i] >= 0 ? p_cstr[i] : uint8_t(256 + p_cstr[i]);
if (c == 0) {
print_unicode_error("NUL character", true);
- dst[i] = 0x20;
+ dst[i] = _replacement_char;
} else {
dst[i] = c;
}
@@ -373,17 +374,21 @@ void String::copy_from(const char32_t &p_char) {
print_unicode_error("NUL character", true);
return;
}
+
+ resize(2);
+
+ char32_t *dst = ptrw();
+
if ((p_char & 0xfffff800) == 0xd800) {
print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
- }
- if (p_char > 0x10ffff) {
+ dst[0] = _replacement_char;
+ } else if (p_char > 0x10ffff) {
print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
+ dst[0] = _replacement_char;
+ } else {
+ dst[0] = p_char;
}
- resize(2);
-
- char32_t *dst = ptrw();
- dst[0] = p_char;
dst[1] = 0;
}
@@ -439,14 +444,18 @@ void String::copy_from_unchecked(const char32_t *p_char, const int p_length) {
for (int i = 0; i < p_length; i++) {
if (p_char[i] == 0) {
print_unicode_error("NUL character", true);
- dst[i] = 0x20;
+ dst[i] = _replacement_char;
continue;
}
if ((p_char[i] & 0xfffff800) == 0xd800) {
print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char[i]));
+ dst[i] = _replacement_char;
+ continue;
}
if (p_char[i] > 0x10ffff) {
print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char[i]));
+ dst[i] = _replacement_char;
+ continue;
}
dst[i] = p_char[i];
}
@@ -538,7 +547,7 @@ String &String::operator+=(const char *p_str) {
uint8_t c = p_str[i] >= 0 ? p_str[i] : uint8_t(256 + p_str[i]);
if (c == 0 && i < rhs_len) {
print_unicode_error("NUL character", true);
- dst[i] = 0x20;
+ dst[i] = _replacement_char;
} else {
dst[i] = c;
}
@@ -568,17 +577,21 @@ String &String::operator+=(char32_t p_char) {
print_unicode_error("NUL character", true);
return *this;
}
+
+ const int lhs_len = length();
+ resize(lhs_len + 2);
+ char32_t *dst = ptrw();
+
if ((p_char & 0xfffff800) == 0xd800) {
print_unicode_error(vformat("Unpaired surrogate (%x)", (uint32_t)p_char));
- }
- if (p_char > 0x10ffff) {
+ dst[lhs_len] = _replacement_char;
+ } else if (p_char > 0x10ffff) {
print_unicode_error(vformat("Invalid unicode codepoint (%x)", (uint32_t)p_char));
+ dst[lhs_len] = _replacement_char;
+ } else {
+ dst[lhs_len] = p_char;
}
- const int lhs_len = length();
- resize(lhs_len + 2);
- char32_t *dst = ptrw();
- dst[lhs_len] = p_char;
dst[lhs_len + 1] = 0;
return *this;
@@ -1737,7 +1750,7 @@ Vector<uint8_t> String::hex_decode() const {
void String::print_unicode_error(const String &p_message, bool p_critical) const {
if (p_critical) {
- print_error(vformat("Unicode parsing error, some characters were replaced with spaces: %s", p_message));
+ print_error(vformat("Unicode parsing error, some characters were replaced with � (U+FFFD): %s", p_message));
} else {
print_error(vformat("Unicode parsing error: %s", p_message));
}
@@ -1757,7 +1770,7 @@ CharString String::ascii(bool p_allow_extended) const {
cs[i] = c;
} else {
print_unicode_error(vformat("Invalid unicode codepoint (%x), cannot represent as ASCII/Latin-1", (uint32_t)c));
- cs[i] = 0x20;
+ cs[i] = 0x20; // ascii doesn't have a replacement character like unicode, 0x1a is sometimes used but is kinda arcane
}
}
@@ -1897,13 +1910,13 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
unichar = (0xff >> 7) & c;
skip = 5;
} else {
- *(dst++) = 0x20;
+ *(dst++) = _replacement_char;
unichar = 0;
skip = 0;
}
} else {
if (c < 0x80 || c > 0xbf) {
- *(dst++) = 0x20;
+ *(dst++) = _replacement_char;
skip = 0;
} else {
unichar = (unichar << 6) | (c & 0x3f);
@@ -1912,15 +1925,15 @@ Error String::parse_utf8(const char *p_utf8, int p_len, bool p_skip_cr) {
if (unichar == 0) {
print_unicode_error("NUL character", true);
decode_failed = true;
- unichar = 0x20;
- }
- if ((unichar & 0xfffff800) == 0xd800) {
- print_unicode_error(vformat("Unpaired surrogate (%x)", unichar));
- decode_error = true;
- }
- if (unichar > 0x10ffff) {
- print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar));
- decode_error = true;
+ unichar = _replacement_char;
+ } else if ((unichar & 0xfffff800) == 0xd800) {
+ print_unicode_error(vformat("Unpaired surrogate (%x)", unichar), true);
+ decode_failed = true;
+ unichar = _replacement_char;
+ } else if (unichar > 0x10ffff) {
+ print_unicode_error(vformat("Invalid unicode codepoint (%x)", unichar), true);
+ decode_failed = true;
+ unichar = _replacement_char;
}
*(dst++) = unichar;
}
@@ -2014,7 +2027,11 @@ CharString String::utf8() const {
APPEND_CHAR(uint32_t(0x80 | ((c >> 6) & 0x3f))); // Lower lower middle 6 bits.
APPEND_CHAR(uint32_t(0x80 | (c & 0x3f))); // Bottom 6 bits.
} else {
- APPEND_CHAR(0x20);
+ // the string is a valid UTF32, so it should never happen ...
+ print_unicode_error(vformat("Non scalar value (%x)", c), true);
+ APPEND_CHAR(uint32_t(0xe0 | ((_replacement_char >> 12) & 0x0f))); // Top 4 bits.
+ APPEND_CHAR(uint32_t(0x80 | ((_replacement_char >> 6) & 0x3f))); // Middle 6 bits.
+ APPEND_CHAR(uint32_t(0x80 | (_replacement_char & 0x3f))); // Bottom 6 bits.
}
}
#undef APPEND_CHAR
@@ -2187,7 +2204,9 @@ Char16String String::utf16() const {
APPEND_CHAR(uint32_t((c >> 10) + 0xd7c0)); // lead surrogate.
APPEND_CHAR(uint32_t((c & 0x3ff) | 0xdc00)); // trail surrogate.
} else {
- APPEND_CHAR(0x20);
+ // the string is a valid UTF32, so it should never happen ...
+ APPEND_CHAR(uint32_t((_replacement_char >> 10) + 0xd7c0));
+ APPEND_CHAR(uint32_t((_replacement_char & 0x3ff) | 0xdc00));
}
}
#undef APPEND_CHAR
@@ -4262,12 +4281,13 @@ String String::pad_zeros(int p_digits) const {
begin++;
}
- if (begin >= end) {
+ int zeros_to_add = p_digits - (end - begin);
+
+ if (zeros_to_add <= 0) {
return s;
+ } else {
+ return s.insert(begin, String("0").repeat(zeros_to_add));
}
-
- int zeros_to_add = p_digits - (end - begin);
- return s.insert(begin, String("0").repeat(zeros_to_add));
}
String String::trim_prefix(const String &p_prefix) const {
diff --git a/core/string/ustring.h b/core/string/ustring.h
index 782ca47507..295625395d 100644
--- a/core/string/ustring.h
+++ b/core/string/ustring.h
@@ -183,6 +183,7 @@ struct StrRange {
class String {
CowData<char32_t> _cowdata;
static const char32_t _null;
+ static const char32_t _replacement_char;
void copy_from(const char *p_cstr);
void copy_from(const char *p_cstr, const int p_clip_to);