summaryrefslogtreecommitdiffstats
path: root/thirdparty/icu4c/common/loclikely.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'thirdparty/icu4c/common/loclikely.cpp')
-rw-r--r--thirdparty/icu4c/common/loclikely.cpp102
1 files changed, 83 insertions, 19 deletions
diff --git a/thirdparty/icu4c/common/loclikely.cpp b/thirdparty/icu4c/common/loclikely.cpp
index 5aa929dc4b..ccbcbfa7a5 100644
--- a/thirdparty/icu4c/common/loclikely.cpp
+++ b/thirdparty/icu4c/common/loclikely.cpp
@@ -19,6 +19,7 @@
* that then do not depend on resource bundle code and likely-subtags data.
*/
+#include <string_view>
#include <utility>
#include "unicode/bytestream.h"
@@ -170,7 +171,7 @@ _uloc_addLikelySubtags(const char* localeID,
return;
}
- int32_t trailingLength = (int32_t)uprv_strlen(trailing);
+ int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
if (U_FAILURE(err)) {
@@ -193,11 +194,11 @@ _uloc_addLikelySubtags(const char* localeID,
}
createTagStringWithAlternates(
language,
- (int32_t)uprv_strlen(language),
+ static_cast<int32_t>(uprv_strlen(language)),
lsr.script,
- (int32_t)uprv_strlen(lsr.script),
+ static_cast<int32_t>(uprv_strlen(lsr.script)),
lsr.region,
- (int32_t)uprv_strlen(lsr.region),
+ static_cast<int32_t>(uprv_strlen(lsr.region)),
variant.data(),
variant.length(),
trailing,
@@ -235,7 +236,7 @@ _uloc_minimizeSubtags(const char* localeID,
return;
}
- int32_t trailingLength = (int32_t)uprv_strlen(trailing);
+ int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing));
const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err);
if (U_FAILURE(err)) {
@@ -256,11 +257,11 @@ _uloc_minimizeSubtags(const char* localeID,
}
createTagStringWithAlternates(
language,
- (int32_t)uprv_strlen(language),
+ static_cast<int32_t>(uprv_strlen(language)),
lsr.script,
- (int32_t)uprv_strlen(lsr.script),
+ static_cast<int32_t>(uprv_strlen(lsr.script)),
lsr.region,
- (int32_t)uprv_strlen(lsr.region),
+ static_cast<int32_t>(uprv_strlen(lsr.region)),
variant.data(),
variant.length(),
trailing,
@@ -388,21 +389,32 @@ U_NAMESPACE_END
namespace {
icu::CharString
-GetRegionFromKey(const char* localeID, const char* key, UErrorCode& status) {
+GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) {
icu::CharString result;
-
// First check for keyword value
icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status);
int32_t len = kw.length();
- if (U_SUCCESS(status) && len >= 3 && len <= 7) {
- // chop off the subdivision code (which will generally be "zzzz" anyway)
- const char* const data = kw.data();
- if (uprv_isASCIILetter(data[0])) {
- result.append(uprv_toupper(data[0]), status);
- result.append(uprv_toupper(data[1]), status);
- } else {
- // assume three-digit region code
- result.append(data, 3, status);
+ // In UTS35
+ // type = alphanum{3,8} (sep alphanum{3,8})* ;
+ // so we know the subdivision must fit the type already.
+ //
+ // unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ;
+ // unicode_region_subtag = (alpha{2} | digit{3}) ;
+ // unicode_subdivision_suffix = alphanum{1,4} ;
+ // But we also know that no id starts with digit{3} in
+ // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml
+ // Therefore we can simplify as
+ // unicode_subdivision_id = alpha{2} alphanum{1,4}
+ //
+ // and only need to accept/reject the code based on the alpha{2} and the length.
+ if (U_SUCCESS(status) && len >= 3 && len <= 6 &&
+ uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) {
+ // Additional Check
+ static icu::RegionValidateMap valid;
+ const char region[] = {kw[0], kw[1], '\0'};
+ if (valid.isSet(region)) {
+ result.append(uprv_toupper(kw[0]), status);
+ result.append(uprv_toupper(kw[1]), status);
}
}
return result;
@@ -435,3 +447,55 @@ ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion,
return rgBuf;
}
+
+namespace {
+
+// Bitmap of valid two-letter region codes: RegionValidateMap's constructor copies it, and bit
+// (first-'A')*26 + (second-'A') is the one isSet() tests. The data is generated by unit test
+// code inside test/intltest/regiontst.cpp from the resource data when that test fails.
+const uint32_t gValidRegionMap[] = {  // 22 words: 704 bits cover the 26*26 = 676 indices
+ 0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580,
+ 0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f,
+ 0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b,
+ 0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7,
+ 0x0410419a, 0x00408557, 0x00004002, 0x00100001,
+ 0x00400408, 0x00000001,
+};
+
+} // namespace
+ //
+U_NAMESPACE_BEGIN
+// Initializes the bitmap by copying the generated gValidRegionMap table into `map`.
+RegionValidateMap::RegionValidateMap() {
+    // uprv_memcpy copies sizeof(map) bytes: fail the build instead of over-reading the
+    // table or leaving part of `map` unset if the two ever differ in size.
+    static_assert(sizeof(map) == sizeof(gValidRegionMap),
+                  "gValidRegionMap must be exactly as large as RegionValidateMap::map");
+    uprv_memcpy(map, gValidRegionMap, sizeof(map));
+}
+
+// Nothing to release: the destructor body is empty.
+RegionValidateMap::~RegionValidateMap() = default;
+
+// Returns true when value(region) yields a non-negative index whose bit is set in
+// `map`; returns false when value() rejects `region` (negative index) or the bit is clear.
+bool RegionValidateMap::isSet(const char* region) const {
+    const int32_t index = value(region);
+    if (index < 0) {
+        return false;
+    }
+    // Build the mask as uint32_t: with a signed `1L`, bit 31 would be shifted into the
+    // sign bit wherever long is 32 bits wide, and the AND would mix signed and unsigned.
+    const uint32_t mask = static_cast<uint32_t>(1) << (index % 32);
+    return (map[index / 32] & mask) != 0;
+}
+
+// Two maps are equal when their bitmaps compare byte-for-byte identical.
+bool RegionValidateMap::equals(const RegionValidateMap& that) const {
+    const bool sameBits = (0 == uprv_memcmp(map, that.map, sizeof(map)));
+    return sameBits;
+}
+
+// Converts a two-letter region code into a bit index for the bitmap in `map`.
+// Returns -1 unless `region` is exactly two ASCII letters followed by '\0'.
+// Otherwise each letter is upper-cased with uprv_toupper and the result is
+// (first - 'A') * 26 + (second - 'A'), a value from 0 to 675 (26x26 = 676 values).
+// Since 32x21 = 672 < 676 <= 704 = 32x22, the 676 bits fit in 22 32-bit words.
+int32_t RegionValidateMap::value(const char* region) const {
+    // Evaluation order matters: region[2] is read only once region[0] and region[1]
+    // are known to be letters, so a shorter string is never read past its terminator.
+    const bool isTwoLetterCode = uprv_isASCIILetter(region[0]) &&
+                                 uprv_isASCIILetter(region[1]) &&
+                                 region[2] == '\0';
+    if (!isTwoLetterCode) {
+        return -1;
+    }
+    const int32_t first = uprv_toupper(region[0]) - 'A';
+    const int32_t second = uprv_toupper(region[1]) - 'A';
+    return first * 26 + second;
+}
+
+U_NAMESPACE_END