diff options
Diffstat (limited to 'thirdparty/icu4c/common/loclikely.cpp')
-rw-r--r-- | thirdparty/icu4c/common/loclikely.cpp | 102 |
1 files changed, 83 insertions, 19 deletions
diff --git a/thirdparty/icu4c/common/loclikely.cpp b/thirdparty/icu4c/common/loclikely.cpp index 5aa929dc4b..ccbcbfa7a5 100644 --- a/thirdparty/icu4c/common/loclikely.cpp +++ b/thirdparty/icu4c/common/loclikely.cpp @@ -19,6 +19,7 @@ * that then do not depend on resource bundle code and likely-subtags data. */ +#include <string_view> #include <utility> #include "unicode/bytestream.h" @@ -170,7 +171,7 @@ _uloc_addLikelySubtags(const char* localeID, return; } - int32_t trailingLength = (int32_t)uprv_strlen(trailing); + int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing)); const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err); if (U_FAILURE(err)) { @@ -193,11 +194,11 @@ _uloc_addLikelySubtags(const char* localeID, } createTagStringWithAlternates( language, - (int32_t)uprv_strlen(language), + static_cast<int32_t>(uprv_strlen(language)), lsr.script, - (int32_t)uprv_strlen(lsr.script), + static_cast<int32_t>(uprv_strlen(lsr.script)), lsr.region, - (int32_t)uprv_strlen(lsr.region), + static_cast<int32_t>(uprv_strlen(lsr.region)), variant.data(), variant.length(), trailing, @@ -235,7 +236,7 @@ _uloc_minimizeSubtags(const char* localeID, return; } - int32_t trailingLength = (int32_t)uprv_strlen(trailing); + int32_t trailingLength = static_cast<int32_t>(uprv_strlen(trailing)); const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err); if (U_FAILURE(err)) { @@ -256,11 +257,11 @@ _uloc_minimizeSubtags(const char* localeID, } createTagStringWithAlternates( language, - (int32_t)uprv_strlen(language), + static_cast<int32_t>(uprv_strlen(language)), lsr.script, - (int32_t)uprv_strlen(lsr.script), + static_cast<int32_t>(uprv_strlen(lsr.script)), lsr.region, - (int32_t)uprv_strlen(lsr.region), + static_cast<int32_t>(uprv_strlen(lsr.region)), variant.data(), variant.length(), trailing, @@ -388,21 +389,32 @@ U_NAMESPACE_END namespace { icu::CharString -GetRegionFromKey(const char* localeID, const char* key, UErrorCode& status) { +GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) { icu::CharString result; - // First check for keyword value icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status); int32_t len = kw.length(); - if (U_SUCCESS(status) && len >= 3 && len <= 7) { - // chop off the subdivision code (which will generally be "zzzz" anyway) - const char* const data = kw.data(); - if (uprv_isASCIILetter(data[0])) { - result.append(uprv_toupper(data[0]), status); - result.append(uprv_toupper(data[1]), status); - } else { - // assume three-digit region code - result.append(data, 3, status); + // In UTS35 + // type = alphanum{3,8} (sep alphanum{3,8})* ; + // so we know the subdivision must fit the type already. + // + // unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ; + // unicode_region_subtag = (alpha{2} | digit{3}) ; + // unicode_subdivision_suffix = alphanum{1,4} ; + // But we also know there are no id in start with digit{3} in + // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml + // Therefore we can simplify as + // unicode_subdivision_id = alpha{2} alphanum{1,4} + // + // and only need to accept/reject the code based on the alpha{2} and the length. + if (U_SUCCESS(status) && len >= 3 && len <= 6 && + uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) { + // Additional Check + static icu::RegionValidateMap valid; + const char region[] = {kw[0], kw[1], '\0'}; + if (valid.isSet(region)) { + result.append(uprv_toupper(kw[0]), status); + result.append(uprv_toupper(kw[1]), status); } } return result; @@ -435,3 +447,55 @@ ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion, return rgBuf; } + +namespace { + +// The following data is generated by unit test code inside +// test/intltest/regiontst.cpp from the resource data while +// the test failed. +const uint32_t gValidRegionMap[] = { + 0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580, + 0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f, + 0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b, + 0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7, + 0x0410419a, 0x00408557, 0x00004002, 0x00100001, + 0x00400408, 0x00000001, +}; + +} // namespace + // +U_NAMESPACE_BEGIN +RegionValidateMap::RegionValidateMap() { + uprv_memcpy(map, gValidRegionMap, sizeof(map)); +} + +RegionValidateMap::~RegionValidateMap() { +} + +bool RegionValidateMap::isSet(const char* region) const { + int32_t index = value(region); + if (index < 0) { + return false; + } + return 0 != (map[index / 32] & (1L << (index % 32))); +} + +bool RegionValidateMap::equals(const RegionValidateMap& that) const { + return uprv_memcmp(map, that.map, sizeof(map)) == 0; +} + +// The code transform two letter a-z to a integer valued between -1, 26x26. +// -1 indicate the region is outside the range of two letter a-z +// the rest of value is between 0 and 676 (= 26x26) and used as an index +// the the bigmap in map. The map is an array of 22 int32_t. +// since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t. +int32_t RegionValidateMap::value(const char* region) const { + if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) && + region[2] == '\0') { + return (uprv_toupper(region[0])-'A') * 26 + + (uprv_toupper(region[1])-'A'); + } + return -1; +} + +U_NAMESPACE_END |