// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ******************************************************************************* * * Copyright (C) 1997-2016, International Business Machines * Corporation and others. All Rights Reserved. * ******************************************************************************* * file name: loclikely.cpp * encoding: UTF-8 * tab size: 8 (not used) * indentation:4 * * created on: 2010feb25 * created by: Markus W. Scherer * * Code for likely and minimized locale subtags, separated out from other .cpp files * that then do not depend on resource bundle code and likely-subtags data. */ #include #include #include "unicode/bytestream.h" #include "unicode/utypes.h" #include "unicode/locid.h" #include "unicode/putil.h" #include "unicode/uchar.h" #include "unicode/uloc.h" #include "unicode/ures.h" #include "unicode/uscript.h" #include "bytesinkutil.h" #include "charstr.h" #include "cmemory.h" #include "cstring.h" #include "loclikelysubtags.h" #include "ulocimp.h" namespace { /** * Create a tag string from the supplied parameters. The lang, script and region * parameters may be nullptr pointers. If they are, their corresponding length parameters * must be less than or equal to 0. * * If an illegal argument is provided, the function returns the error * U_ILLEGAL_ARGUMENT_ERROR. * * @param lang The language tag to use. * @param langLength The length of the language tag. * @param script The script tag to use. * @param scriptLength The length of the script tag. * @param region The region tag to use. * @param regionLength The length of the region tag. * @param variant The region tag to use. * @param variantLength The length of the region tag. * @param trailing Any trailing data to append to the new tag. * @param trailingLength The length of the trailing data. * @param sink The output sink receiving the tag string. * @param err A pointer to a UErrorCode for error reporting. **/ void U_CALLCONV createTagStringWithAlternates( const char* lang, int32_t langLength, const char* script, int32_t scriptLength, const char* region, int32_t regionLength, const char* variant, int32_t variantLength, const char* trailing, int32_t trailingLength, icu::ByteSink& sink, UErrorCode& err) { if (U_FAILURE(err)) { return; } if (langLength >= ULOC_LANG_CAPACITY || scriptLength >= ULOC_SCRIPT_CAPACITY || regionLength >= ULOC_COUNTRY_CAPACITY) { err = U_ILLEGAL_ARGUMENT_ERROR; return; } if (langLength > 0) { sink.Append(lang, langLength); } if (scriptLength > 0) { sink.Append("_", 1); sink.Append(script, scriptLength); } if (regionLength > 0) { sink.Append("_", 1); sink.Append(region, regionLength); } if (variantLength > 0) { if (regionLength == 0) { /* extra separator is required */ sink.Append("_", 1); } sink.Append("_", 1); sink.Append(variant, variantLength); } if (trailingLength > 0) { /* * Copy the trailing data into the supplied buffer. */ sink.Append(trailing, trailingLength); } } bool CHECK_TRAILING_VARIANT_SIZE(const char* variant, int32_t variantLength) { int32_t count = 0; for (int32_t i = 0; i < variantLength; i++) { if (_isIDSeparator(variant[i])) { count = 0; } else if (count == 8) { return false; } else { count++; } } return true; } void _uloc_addLikelySubtags(const char* localeID, icu::ByteSink& sink, UErrorCode& err) { if (U_FAILURE(err)) { return; } if (localeID == nullptr) { err = U_ILLEGAL_ARGUMENT_ERROR; return; } icu::CharString lang; icu::CharString script; icu::CharString region; icu::CharString variant; const char* trailing = nullptr; ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err); if (U_FAILURE(err)) { return; } if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) { err = U_ILLEGAL_ARGUMENT_ERROR; return; } if (lang.length() == 4) { if (script.isEmpty()) { script = std::move(lang); lang.clear(); } else { err = U_ILLEGAL_ARGUMENT_ERROR; return; } } else if (lang.length() > 8) { err = U_ILLEGAL_ARGUMENT_ERROR; return; } int32_t trailingLength = static_cast(uprv_strlen(trailing)); const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err); if (U_FAILURE(err)) { return; } // We need to keep l on the stack because lsr may point into internal // memory of l. icu::Locale l = icu::Locale::createFromName(localeID); if (l.isBogus()) { err = U_ILLEGAL_ARGUMENT_ERROR; return; } icu::LSR lsr = likelySubtags->makeMaximizedLsrFrom(l, true, err); if (U_FAILURE(err)) { return; } const char* language = lsr.language; if (uprv_strcmp(language, "und") == 0) { language = ""; } createTagStringWithAlternates( language, static_cast(uprv_strlen(language)), lsr.script, static_cast(uprv_strlen(lsr.script)), lsr.region, static_cast(uprv_strlen(lsr.region)), variant.data(), variant.length(), trailing, trailingLength, sink, err); } void _uloc_minimizeSubtags(const char* localeID, icu::ByteSink& sink, bool favorScript, UErrorCode& err) { if (U_FAILURE(err)) { return; } if (localeID == nullptr) { err = U_ILLEGAL_ARGUMENT_ERROR; return; } icu::CharString lang; icu::CharString script; icu::CharString region; icu::CharString variant; const char* trailing = nullptr; ulocimp_getSubtags(localeID, &lang, &script, ®ion, &variant, &trailing, err); if (U_FAILURE(err)) { return; } if (!CHECK_TRAILING_VARIANT_SIZE(variant.data(), variant.length())) { err = U_ILLEGAL_ARGUMENT_ERROR; return; } int32_t trailingLength = static_cast(uprv_strlen(trailing)); const icu::LikelySubtags* likelySubtags = icu::LikelySubtags::getSingleton(err); if (U_FAILURE(err)) { return; } icu::LSR lsr = likelySubtags->minimizeSubtags( lang.toStringPiece(), script.toStringPiece(), region.toStringPiece(), favorScript, err); if (U_FAILURE(err)) { return; } const char* language = lsr.language; if (uprv_strcmp(language, "und") == 0) { language = ""; } createTagStringWithAlternates( language, static_cast(uprv_strlen(language)), lsr.script, static_cast(uprv_strlen(lsr.script)), lsr.region, static_cast(uprv_strlen(lsr.region)), variant.data(), variant.length(), trailing, trailingLength, sink, err); } } // namespace U_CAPI int32_t U_EXPORT2 uloc_addLikelySubtags(const char* localeID, char* maximizedLocaleID, int32_t maximizedLocaleIDCapacity, UErrorCode* status) { return icu::ByteSinkUtil::viaByteSinkToTerminatedChars( maximizedLocaleID, maximizedLocaleIDCapacity, [&](icu::ByteSink& sink, UErrorCode& status) { ulocimp_addLikelySubtags(localeID, sink, status); }, *status); } U_EXPORT icu::CharString ulocimp_addLikelySubtags(const char* localeID, UErrorCode& status) { return icu::ByteSinkUtil::viaByteSinkToCharString( [&](icu::ByteSink& sink, UErrorCode& status) { ulocimp_addLikelySubtags(localeID, sink, status); }, status); } U_EXPORT void ulocimp_addLikelySubtags(const char* localeID, icu::ByteSink& sink, UErrorCode& status) { if (U_FAILURE(status)) { return; } icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status); _uloc_addLikelySubtags(localeBuffer.data(), sink, status); } U_CAPI int32_t U_EXPORT2 uloc_minimizeSubtags(const char* localeID, char* minimizedLocaleID, int32_t minimizedLocaleIDCapacity, UErrorCode* status) { return icu::ByteSinkUtil::viaByteSinkToTerminatedChars( minimizedLocaleID, minimizedLocaleIDCapacity, [&](icu::ByteSink& sink, UErrorCode& status) { ulocimp_minimizeSubtags(localeID, sink, false, status); }, *status); } U_EXPORT icu::CharString ulocimp_minimizeSubtags(const char* localeID, bool favorScript, UErrorCode& status) { return icu::ByteSinkUtil::viaByteSinkToCharString( [&](icu::ByteSink& sink, UErrorCode& status) { ulocimp_minimizeSubtags(localeID, sink, favorScript, status); }, status); } U_EXPORT void ulocimp_minimizeSubtags(const char* localeID, icu::ByteSink& sink, bool favorScript, UErrorCode& status) { if (U_FAILURE(status)) { return; } icu::CharString localeBuffer = ulocimp_canonicalize(localeID, status); _uloc_minimizeSubtags(localeBuffer.data(), sink, favorScript, status); } // Pairs of (language subtag, + or -) for finding out fast if common languages // are LTR (minus) or RTL (plus). static const char LANG_DIR_STRING[] = "root-en-es-pt-zh-ja-ko-de-fr-it-ar+he+fa+ru-nl-pl-th-tr-"; // Implemented here because this calls ulocimp_addLikelySubtags(). U_CAPI UBool U_EXPORT2 uloc_isRightToLeft(const char *locale) { UErrorCode errorCode = U_ZERO_ERROR; icu::CharString lang; icu::CharString script; ulocimp_getSubtags(locale, &lang, &script, nullptr, nullptr, nullptr, errorCode); if (U_FAILURE(errorCode) || script.isEmpty()) { // Fastpath: We know the likely scripts and their writing direction // for some common languages. if (!lang.isEmpty()) { const char* langPtr = uprv_strstr(LANG_DIR_STRING, lang.data()); if (langPtr != nullptr) { switch (langPtr[lang.length()]) { case '-': return false; case '+': return true; default: break; // partial match of a longer code } } } // Otherwise, find the likely script. errorCode = U_ZERO_ERROR; icu::CharString likely = ulocimp_addLikelySubtags(locale, errorCode); if (U_FAILURE(errorCode)) { return false; } ulocimp_getSubtags(likely.data(), nullptr, &script, nullptr, nullptr, nullptr, errorCode); if (U_FAILURE(errorCode) || script.isEmpty()) { return false; } } UScriptCode scriptCode = (UScriptCode)u_getPropertyValueEnum(UCHAR_SCRIPT, script.data()); return uscript_isRightToLeft(scriptCode); } U_NAMESPACE_BEGIN UBool Locale::isRightToLeft() const { return uloc_isRightToLeft(getBaseName()); } U_NAMESPACE_END namespace { icu::CharString GetRegionFromKey(const char* localeID, std::string_view key, UErrorCode& status) { icu::CharString result; // First check for keyword value icu::CharString kw = ulocimp_getKeywordValue(localeID, key, status); int32_t len = kw.length(); // In UTS35 // type = alphanum{3,8} (sep alphanum{3,8})* ; // so we know the subdivision must fit the type already. // // unicode_subdivision_id = unicode_region_subtag unicode_subdivision_suffix ; // unicode_region_subtag = (alpha{2} | digit{3}) ; // unicode_subdivision_suffix = alphanum{1,4} ; // But we also know there are no id in start with digit{3} in // https://github.com/unicode-org/cldr/blob/main/common/validity/subdivision.xml // Therefore we can simplify as // unicode_subdivision_id = alpha{2} alphanum{1,4} // // and only need to accept/reject the code based on the alpha{2} and the length. if (U_SUCCESS(status) && len >= 3 && len <= 6 && uprv_isASCIILetter(kw[0]) && uprv_isASCIILetter(kw[1])) { // Additional Check static icu::RegionValidateMap valid; const char region[] = {kw[0], kw[1], '\0'}; if (valid.isSet(region)) { result.append(uprv_toupper(kw[0]), status); result.append(uprv_toupper(kw[1]), status); } } return result; } } // namespace U_EXPORT icu::CharString ulocimp_getRegionForSupplementalData(const char *localeID, bool inferRegion, UErrorCode& status) { if (U_FAILURE(status)) { return {}; } icu::CharString rgBuf = GetRegionFromKey(localeID, "rg", status); if (U_SUCCESS(status) && rgBuf.isEmpty()) { // No valid rg keyword value, try for unicode_region_subtag rgBuf = ulocimp_getRegion(localeID, status); if (U_SUCCESS(status) && rgBuf.isEmpty() && inferRegion) { // Second check for sd keyword value rgBuf = GetRegionFromKey(localeID, "sd", status); if (U_SUCCESS(status) && rgBuf.isEmpty()) { // no unicode_region_subtag but inferRegion true, try likely subtags UErrorCode rgStatus = U_ZERO_ERROR; icu::CharString locBuf = ulocimp_addLikelySubtags(localeID, rgStatus); if (U_SUCCESS(rgStatus)) { rgBuf = ulocimp_getRegion(locBuf.data(), status); } } } } return rgBuf; } namespace { // The following data is generated by unit test code inside // test/intltest/regiontst.cpp from the resource data while // the test failed. const uint32_t gValidRegionMap[] = { 0xeedf597c, 0xdeddbdef, 0x15943f3f, 0x0e00d580, 0xb0095c00, 0x0015fb9f, 0x781c068d, 0x0340400f, 0xf42b1d00, 0xfd4f8141, 0x25d7fffc, 0x0100084b, 0x538f3c40, 0x40000001, 0xfdf15100, 0x9fbb7ae7, 0x0410419a, 0x00408557, 0x00004002, 0x00100001, 0x00400408, 0x00000001, }; } // namespace // U_NAMESPACE_BEGIN RegionValidateMap::RegionValidateMap() { uprv_memcpy(map, gValidRegionMap, sizeof(map)); } RegionValidateMap::~RegionValidateMap() { } bool RegionValidateMap::isSet(const char* region) const { int32_t index = value(region); if (index < 0) { return false; } return 0 != (map[index / 32] & (1L << (index % 32))); } bool RegionValidateMap::equals(const RegionValidateMap& that) const { return uprv_memcmp(map, that.map, sizeof(map)) == 0; } // The code transform two letter a-z to a integer valued between -1, 26x26. // -1 indicate the region is outside the range of two letter a-z // the rest of value is between 0 and 676 (= 26x26) and used as an index // the the bigmap in map. The map is an array of 22 int32_t. // since 32x21 < 676/32 < 32x22 we store this 676 bits bitmap into 22 int32_t. int32_t RegionValidateMap::value(const char* region) const { if (uprv_isASCIILetter(region[0]) && uprv_isASCIILetter(region[1]) && region[2] == '\0') { return (uprv_toupper(region[0])-'A') * 26 + (uprv_toupper(region[1])-'A'); } return -1; } U_NAMESPACE_END