// © 2016 and later: Unicode, Inc. and others. // License & terms of use: http://www.unicode.org/copyright.html /* ********************************************************************** * Copyright (C) 2009-2015, International Business Machines * Corporation and others. All Rights Reserved. ********************************************************************** */ #include #include #include #include "unicode/bytestream.h" #include "unicode/utypes.h" #include "unicode/ures.h" #include "unicode/localpointer.h" #include "unicode/putil.h" #include "unicode/uenum.h" #include "unicode/uloc.h" #include "bytesinkutil.h" #include "charstr.h" #include "cmemory.h" #include "cstring.h" #include "putilimp.h" #include "uinvchar.h" #include "ulocimp.h" #include "uassert.h" namespace { /* struct holding a single variant */ typedef struct VariantListEntry : public icu::UMemory { const char *variant; struct VariantListEntry *next; } VariantListEntry; /* struct holding a single attribute value */ struct AttributeListEntry : public icu::UMemory { const char *attribute; struct AttributeListEntry *next; }; /* struct holding a single extension */ struct ExtensionListEntry : public icu::UMemory { const char *key; const char *value; struct ExtensionListEntry *next; }; #define MAXEXTLANG 3 typedef struct ULanguageTag { char *buf; /* holding parsed subtags */ const char *language; const char *extlang[MAXEXTLANG]; const char *script; const char *region; VariantListEntry *variants; ExtensionListEntry *extensions; const char *privateuse; const char *legacy; } ULanguageTag; #define MINLEN 2 #define SEP '-' #define PRIVATEUSE 'x' #define LDMLEXT 'u' #define LOCALE_SEP '_' #define LOCALE_EXT_SEP '@' #define LOCALE_KEYWORD_SEP ';' #define LOCALE_KEY_TYPE_SEP '=' constexpr auto ISALPHA = uprv_isASCIILetter; inline bool ISNUMERIC(char c) { return c >= '0' && c <= '9'; } constexpr char EMPTY[] = ""; constexpr char LANG_UND[] = "und"; constexpr char PRIVATEUSE_KEY[] = "x"; constexpr char _POSIX[] = 
"_POSIX"; constexpr char POSIX_KEY[] = "va"; constexpr char POSIX_VALUE[] = "posix"; constexpr char LOCALE_ATTRIBUTE_KEY[] = "attribute"; constexpr char PRIVUSE_VARIANT_PREFIX[] = "lvariant"; constexpr char LOCALE_TYPE_YES[] = "yes"; #define LANG_UND_LEN 3 /* Updated on 2018-09-12 from https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . This table has 2 parts. The part for legacy language tags (marked as “Type: grandfathered” in BCP 47) is generated by the following scripts from the IANA language tag registry. curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\ egrep -A 7 'Type: grandfathered' | \ egrep 'Tag|Prefe' | grep -B1 'Preferred' | grep -v '^--' | \ awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' |\ tr 'A-Z' 'a-z' The 2nd part is made of five ICU-specific entries. They're kept for the backward compatibility for now, even though there are no preferred values. They may have to be removed for the strict BCP 47 compliance. */ constexpr const char* LEGACY[] = { /* legacy preferred */ "art-lojban", "jbo", "en-gb-oed", "en-gb-oxendict", "i-ami", "ami", "i-bnn", "bnn", "i-hak", "hak", "i-klingon", "tlh", "i-lux", "lb", "i-navajo", "nv", "i-pwn", "pwn", "i-tao", "tao", "i-tay", "tay", "i-tsu", "tsu", "no-bok", "nb", "no-nyn", "nn", "sgn-be-fr", "sfb", "sgn-be-nl", "vgt", "sgn-ch-de", "sgg", "zh-guoyu", "cmn", "zh-hakka", "hak", "zh-min-nan", "nan", "zh-xiang", "hsn", // Legacy tags with no preferred value in the IANA // registry. Kept for now for the backward compatibility // because ICU has mapped them this way. "i-default", "en-x-i-default", "i-enochian", "und-x-i-enochian", "i-mingo", "see-x-i-mingo", "zh-min", "nan-x-zh-min", }; /* Updated on 2018-09-12 from https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . The table lists redundant tags with preferred value in the IANA language tag registry. 
It's generated with the following command: curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry |\ grep 'Type: redundant' -A 5 | egrep '^(Tag:|Prefer)' | grep -B1 'Preferred' | \ awk -n '/Tag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' | \ tr 'A-Z' 'a-z' In addition, ja-latn-hepburn-heploc is mapped to ja-latn-alalc97 because a variant tag 'hepburn-heploc' has the preferred subtag, 'alalc97'. */ constexpr const char* REDUNDANT[] = { // redundant preferred "sgn-br", "bzs", "sgn-co", "csn", "sgn-de", "gsg", "sgn-dk", "dsl", "sgn-es", "ssp", "sgn-fr", "fsl", "sgn-gb", "bfi", "sgn-gr", "gss", "sgn-ie", "isg", "sgn-it", "ise", "sgn-jp", "jsl", "sgn-mx", "mfs", "sgn-ni", "ncs", "sgn-nl", "dse", "sgn-no", "nsl", "sgn-pt", "psr", "sgn-se", "swl", "sgn-us", "ase", "sgn-za", "sfs", "zh-cmn", "cmn", "zh-cmn-hans", "cmn-hans", "zh-cmn-hant", "cmn-hant", "zh-gan", "gan", "zh-wuu", "wuu", "zh-yue", "yue", // variant tag with preferred value "ja-latn-hepburn-heploc", "ja-latn-alalc97", }; /* Updated on 2018-09-12 from https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry . grep 'Type: language' -A 7 language-subtag-registry | egrep 'Subtag|Prefe' | \ grep -B1 'Preferred' | grep -v '^--' | \ awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' Make sure that 2-letter language subtags come before 3-letter subtags. 
*/ constexpr char DEPRECATEDLANGS[][4] = { /* deprecated new */ "in", "id", "iw", "he", "ji", "yi", "jw", "jv", "mo", "ro", "aam", "aas", "adp", "dz", "aue", "ktz", "ayx", "nun", "bgm", "bcg", "bjd", "drl", "ccq", "rki", "cjr", "mom", "cka", "cmr", "cmk", "xch", "coy", "pij", "cqu", "quh", "drh", "khk", "drw", "prs", "gav", "dev", "gfx", "vaj", "ggn", "gvr", "gti", "nyc", "guv", "duz", "hrr", "jal", "ibi", "opa", "ilw", "gal", "jeg", "oyb", "kgc", "tdf", "kgh", "kml", "koj", "kwv", "krm", "bmf", "ktr", "dtp", "kvs", "gdj", "kwq", "yam", "kxe", "tvd", "kzj", "dtp", "kzt", "dtp", "lii", "raq", "lmm", "rmx", "meg", "cir", "mst", "mry", "mwj", "vaj", "myt", "mry", "nad", "xny", "ncp", "kdz", "nnx", "ngv", "nts", "pij", "oun", "vaj", "pcr", "adx", "pmc", "huw", "pmu", "phr", "ppa", "bfy", "ppr", "lcq", "pry", "prt", "puz", "pub", "sca", "hle", "skk", "oyb", "tdu", "dtp", "thc", "tpo", "thx", "oyb", "tie", "ras", "tkk", "twm", "tlw", "weo", "tmp", "tyj", "tne", "kak", "tnf", "prs", "tsf", "taj", "uok", "ema", "xba", "cax", "xia", "acn", "xkh", "waw", "xsj", "suj", "ybd", "rki", "yma", "lrr", "ymt", "mtm", "yos", "zom", "yuu", "yug", }; /* Updated on 2018-04-24 from curl https://www.iana.org/assignments/language-subtag-registry/language-subtag-registry | \ grep 'Type: region' -A 7 | egrep 'Subtag|Prefe' | \ grep -B1 'Preferred' | \ awk -n '/Subtag/ {printf(" \"%s\", ", $2);} /Preferred/ {printf("\"%s\",\n", $2);}' */ constexpr char DEPRECATEDREGIONS[][3] = { /* deprecated new */ "BU", "MM", "DD", "DE", "FX", "FR", "TP", "TL", "YD", "YE", "ZR", "CD", }; /* * ------------------------------------------------- * * These ultag_ functions may be exposed as APIs later * * ------------------------------------------------- */ ULanguageTag* ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode& status); void ultag_close(ULanguageTag* langtag); const char* ultag_getLanguage(const ULanguageTag* langtag); #if 0 const char* ultag_getJDKLanguage(const ULanguageTag* 
langtag); #endif const char* ultag_getExtlang(const ULanguageTag* langtag, int32_t idx); int32_t ultag_getExtlangSize(const ULanguageTag* langtag); const char* ultag_getScript(const ULanguageTag* langtag); const char* ultag_getRegion(const ULanguageTag* langtag); const char* ultag_getVariant(const ULanguageTag* langtag, int32_t idx); int32_t ultag_getVariantsSize(const ULanguageTag* langtag); const char* ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx); const char* ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx); int32_t ultag_getExtensionsSize(const ULanguageTag* langtag); const char* ultag_getPrivateUse(const ULanguageTag* langtag); #if 0 const char* ultag_getLegacy(const ULanguageTag* langtag); #endif } // namespace U_NAMESPACE_BEGIN /** * \class LocalULanguageTagPointer * "Smart pointer" class, closes a ULanguageTag via ultag_close(). * For most methods see the LocalPointerBase base class. * * @see LocalPointerBase * @see LocalPointer * @internal */ U_DEFINE_LOCAL_OPEN_POINTER(LocalULanguageTagPointer, ULanguageTag, ultag_close); U_NAMESPACE_END /* * ------------------------------------------------- * * Language subtag syntax validation functions * * ------------------------------------------------- */ namespace { bool _isAlphaString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISALPHA(*(s + i))) { return false; } } return true; } bool _isNumericString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISNUMERIC(*(s + i))) { return false; } } return true; } bool _isAlphaNumericString(const char* s, int32_t len) { int32_t i; for (i = 0; i < len; i++) { if (!ISALPHA(*(s + i)) && !ISNUMERIC(*(s + i))) { return false; } } return true; } bool _isAlphaNumericStringLimitedLength(const char* s, int32_t len, int32_t min, int32_t max) { if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len >= min && len <= max && _isAlphaNumericString(s, len)) { return true; } return false; } } // 
namespace bool ultag_isLanguageSubtag(const char* s, int32_t len) { /* * unicode_language_subtag = alpha{2,3} | alpha{5,8}; * NOTE: Per ICUTC 2019/01/23- accepting alpha 4 * See ICU-20372 */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len >= 2 && len <= 8 && _isAlphaString(s, len)) { return true; } return false; } namespace { bool _isExtlangSubtag(const char* s, int32_t len) { /* * extlang = 3ALPHA ; selected ISO 639 codes * *2("-" 3ALPHA) ; permanently reserved */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len == 3 && _isAlphaString(s, len)) { return true; } return false; } } // namespace bool ultag_isScriptSubtag(const char* s, int32_t len) { /* * script = 4ALPHA ; ISO 15924 code */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len == 4 && _isAlphaString(s, len)) { return true; } return false; } bool ultag_isRegionSubtag(const char* s, int32_t len) { /* * region = 2ALPHA ; ISO 3166-1 code * / 3DIGIT ; UN M.49 code */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len == 2 && _isAlphaString(s, len)) { return true; } if (len == 3 && _isNumericString(s, len)) { return true; } return false; } namespace { bool _isVariantSubtag(const char* s, int32_t len) { /* * variant = 5*8alphanum ; registered variants * / (DIGIT 3alphanum) */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (_isAlphaNumericStringLimitedLength(s, len, 5, 8)) { return true; } if (len == 4 && ISNUMERIC(*s) && _isAlphaNumericString(s + 1, 3)) { return true; } return false; } bool _isSepListOf(bool (*test)(const char*, int32_t), const char* s, int32_t len) { const char *p = s; const char *pSubtag = nullptr; if (len < 0) { len = static_cast(uprv_strlen(s)); } while ((p - s) < len) { if (*p == SEP) { if (pSubtag == nullptr) { return false; } if (!test(pSubtag, static_cast(p - pSubtag))) { return false; } pSubtag = nullptr; } else if (pSubtag == nullptr) { pSubtag = p; } p++; } if (pSubtag == nullptr) { return false; } return test(pSubtag, static_cast(p - 
pSubtag)); } } // namespace bool ultag_isVariantSubtags(const char* s, int32_t len) { return _isSepListOf(&_isVariantSubtag, s, len); } namespace { // This is for the ICU-specific "lvariant" handling. bool _isPrivateuseVariantSubtag(const char* s, int32_t len) { /* * variant = 1*8alphanum ; registered variants * / (DIGIT 3alphanum) */ return _isAlphaNumericStringLimitedLength(s, len , 1, 8); } bool _isExtensionSingleton(const char* s, int32_t len) { /* * extension = singleton 1*("-" (2*8alphanum)) * * singleton = DIGIT ; 0 - 9 * / %x41-57 ; A - W * / %x59-5A ; Y - Z * / %x61-77 ; a - w * / %x79-7A ; y - z */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len == 1 && (ISALPHA(*s) || ISNUMERIC(*s)) && (uprv_tolower(*s) != PRIVATEUSE)) { return true; } return false; } bool _isExtensionSubtag(const char* s, int32_t len) { /* * extension = singleton 1*("-" (2*8alphanum)) */ return _isAlphaNumericStringLimitedLength(s, len, 2, 8); } } // namespace bool ultag_isExtensionSubtags(const char* s, int32_t len) { return _isSepListOf(&_isExtensionSubtag, s, len); } namespace { bool _isPrivateuseValueSubtag(const char* s, int32_t len) { /* * privateuse = "x" 1*("-" (1*8alphanum)) */ return _isAlphaNumericStringLimitedLength(s, len, 1, 8); } } // namespace bool ultag_isPrivateuseValueSubtags(const char* s, int32_t len) { return _isSepListOf(&_isPrivateuseValueSubtag, s, len); } bool ultag_isUnicodeLocaleAttribute(const char* s, int32_t len) { /* * attribute = alphanum{3,8} ; */ return _isAlphaNumericStringLimitedLength(s, len , 3, 8); } bool ultag_isUnicodeLocaleAttributes(const char* s, int32_t len) { return _isSepListOf(&ultag_isUnicodeLocaleAttribute, s, len); } bool ultag_isUnicodeLocaleKey(const char* s, int32_t len) { /* * key = alphanum alpha ; */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len == 2 && (ISALPHA(*s) || ISNUMERIC(*s)) && ISALPHA(s[1])) { return true; } return false; } bool _isUnicodeLocaleTypeSubtag(const char*s, int32_t len) { /* * 
alphanum{3,8} */ return _isAlphaNumericStringLimitedLength(s, len , 3, 8); } bool ultag_isUnicodeLocaleType(const char*s, int32_t len) { /* * type = alphanum{3,8} (sep alphanum{3,8})* ; */ return _isSepListOf(&_isUnicodeLocaleTypeSubtag, s, len); } namespace { bool _isTKey(const char* s, int32_t len) { /* * tkey = alpha digit ; */ if (len < 0) { len = static_cast(uprv_strlen(s)); } if (len == 2 && ISALPHA(*s) && ISNUMERIC(*(s + 1))) { return true; } return false; } } // namespace const char* ultag_getTKeyStart(const char *localeID) { const char *result = localeID; const char *sep; while((sep = uprv_strchr(result, SEP)) != nullptr) { if (_isTKey(result, static_cast(sep - result))) { return result; } result = ++sep; } if (_isTKey(result, -1)) { return result; } return nullptr; } namespace { bool _isTValue(const char* s, int32_t len) { /* * tvalue = (sep alphanum{3,8})+ ; */ return _isAlphaNumericStringLimitedLength(s, len , 3, 8); } bool _isTransformedExtensionSubtag(int32_t& state, const char* s, int32_t len) { const int32_t kStart = 0; // Start, wait for unicode_language_subtag, tkey or end const int32_t kGotLanguage = 1; // Got unicode_language_subtag, wait for unicode_script_subtag, // unicode_region_subtag, unicode_variant_subtag, tkey or end const int32_t kGotScript = 2; // Got unicode_script_subtag, wait for unicode_region_subtag, // unicode_variant_subtag, tkey, or end const int32_t kGotRegion = 3; // Got unicode_region_subtag, wait for unicode_variant_subtag, // tkey, or end. const int32_t kGotVariant = 4; // Got unicode_variant_subtag, wait for unicode_variant_subtag // tkey or end. const int32_t kGotTKey = -1; // Got tkey, wait for tvalue. ERROR if stop here. 
const int32_t kGotTValue = 6; // Got tvalue, wait for tkey, tvalue or end if (len < 0) { len = static_cast(uprv_strlen(s)); } switch (state) { case kStart: if (ultag_isLanguageSubtag(s, len) && len != 4) { state = kGotLanguage; return true; } if (_isTKey(s, len)) { state = kGotTKey; return true; } return false; case kGotLanguage: if (ultag_isScriptSubtag(s, len)) { state = kGotScript; return true; } U_FALLTHROUGH; case kGotScript: if (ultag_isRegionSubtag(s, len)) { state = kGotRegion; return true; } U_FALLTHROUGH; case kGotRegion: U_FALLTHROUGH; case kGotVariant: if (_isVariantSubtag(s, len)) { state = kGotVariant; return true; } if (_isTKey(s, len)) { state = kGotTKey; return true; } return false; case kGotTKey: if (_isTValue(s, len)) { state = kGotTValue; return true; } return false; case kGotTValue: if (_isTKey(s, len)) { state = kGotTKey; return true; } if (_isTValue(s, len)) { return true; } return false; } return false; } bool _isUnicodeExtensionSubtag(int32_t& state, const char* s, int32_t len) { const int32_t kStart = 0; // Start, wait for a key or attribute or end const int32_t kGotKey = 1; // Got a key, wait for type or key or end const int32_t kGotType = 2; // Got a type, wait for key or end switch (state) { case kStart: if (ultag_isUnicodeLocaleKey(s, len)) { state = kGotKey; return true; } if (ultag_isUnicodeLocaleAttribute(s, len)) { return true; } return false; case kGotKey: if (ultag_isUnicodeLocaleKey(s, len)) { return true; } if (_isUnicodeLocaleTypeSubtag(s, len)) { state = kGotType; return true; } return false; case kGotType: if (ultag_isUnicodeLocaleKey(s, len)) { state = kGotKey; return true; } if (_isUnicodeLocaleTypeSubtag(s, len)) { return true; } return false; } return false; } bool _isStatefulSepListOf(bool (*test)(int32_t&, const char*, int32_t), const char* s, int32_t len) { int32_t state = 0; const char* p; const char* start = s; int32_t subtagLen = 0; if (len < 0) { len = static_cast(uprv_strlen(s)); } for (p = s; len > 0; p++, 
len--) { if (*p == SEP) { if (!test(state, start, subtagLen)) { return false; } subtagLen = 0; start = p + 1; } else { subtagLen++; } } if (test(state, start, subtagLen) && state >= 0) { return true; } return false; } } // namespace bool ultag_isTransformedExtensionSubtags(const char* s, int32_t len) { return _isStatefulSepListOf(&_isTransformedExtensionSubtag, s, len); } bool ultag_isUnicodeExtensionSubtags(const char* s, int32_t len) { return _isStatefulSepListOf(&_isUnicodeExtensionSubtag, s, len); } namespace { /* * ------------------------------------------------- * * Helper functions * * ------------------------------------------------- */ bool _addVariantToList(VariantListEntry **first, icu::LocalPointer var) { if (*first == nullptr) { var->next = nullptr; *first = var.orphan(); } else { VariantListEntry *prev, *cur; int32_t cmp; /* variants order should be preserved */ prev = nullptr; cur = *first; while (true) { if (cur == nullptr) { var->next = nullptr; prev->next = var.orphan(); break; } /* Checking for duplicate variant */ cmp = uprv_compareInvCharsAsAscii(var->variant, cur->variant); if (cmp == 0) { /* duplicated variant */ return false; } prev = cur; cur = cur->next; } } return true; } bool _addAttributeToList(AttributeListEntry **first, AttributeListEntry *attr) { bool bAdded = true; if (*first == nullptr) { attr->next = nullptr; *first = attr; } else { AttributeListEntry *prev, *cur; int32_t cmp; /* reorder variants in alphabetical order */ prev = nullptr; cur = *first; while (true) { if (cur == nullptr) { prev->next = attr; attr->next = nullptr; break; } cmp = uprv_compareInvCharsAsAscii(attr->attribute, cur->attribute); if (cmp < 0) { if (prev == nullptr) { *first = attr; } else { prev->next = attr; } attr->next = cur; break; } if (cmp == 0) { /* duplicated variant */ bAdded = false; break; } prev = cur; cur = cur->next; } } return bAdded; } bool _addExtensionToList(ExtensionListEntry **first, ExtensionListEntry *ext, bool localeToBCP) { bool 
bAdded = true; if (*first == nullptr) { ext->next = nullptr; *first = ext; } else { ExtensionListEntry *prev, *cur; int32_t cmp; /* reorder variants in alphabetical order */ prev = nullptr; cur = *first; while (true) { if (cur == nullptr) { prev->next = ext; ext->next = nullptr; break; } if (localeToBCP) { /* special handling for locale to bcp conversion */ int32_t len, curlen; len = static_cast(uprv_strlen(ext->key)); curlen = static_cast(uprv_strlen(cur->key)); if (len == 1 && curlen == 1) { if (*(ext->key) == *(cur->key)) { cmp = 0; } else if (*(ext->key) == PRIVATEUSE) { cmp = 1; } else if (*(cur->key) == PRIVATEUSE) { cmp = -1; } else { cmp = *(ext->key) - *(cur->key); } } else if (len == 1) { cmp = *(ext->key) - LDMLEXT; } else if (curlen == 1) { cmp = LDMLEXT - *(cur->key); } else { cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); /* Both are u extension keys - we need special handling for 'attribute' */ if (cmp != 0) { if (uprv_strcmp(cur->key, LOCALE_ATTRIBUTE_KEY) == 0) { cmp = 1; } else if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) { cmp = -1; } } } } else { cmp = uprv_compareInvCharsAsAscii(ext->key, cur->key); } if (cmp < 0) { if (prev == nullptr) { *first = ext; } else { prev->next = ext; } ext->next = cur; break; } if (cmp == 0) { /* duplicated extension key */ bAdded = false; break; } prev = cur; cur = cur->next; } } return bAdded; } void _initializeULanguageTag(ULanguageTag* langtag) { int32_t i; langtag->buf = nullptr; langtag->language = EMPTY; for (i = 0; i < MAXEXTLANG; i++) { langtag->extlang[i] = nullptr; } langtag->script = EMPTY; langtag->region = EMPTY; langtag->variants = nullptr; langtag->extensions = nullptr; langtag->legacy = EMPTY; langtag->privateuse = EMPTY; } void _appendLanguageToLanguageTag(const char* localeID, icu::ByteSink& sink, bool strict, UErrorCode& status) { UErrorCode tmpStatus = U_ZERO_ERROR; if (U_FAILURE(status)) { return; } icu::CharString buf = ulocimp_getLanguage(localeID, tmpStatus); if 
(U_FAILURE(tmpStatus)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } /* Note: returned language code is in lower case letters */ if (buf.isEmpty()) { sink.Append(LANG_UND, LANG_UND_LEN); } else if (!ultag_isLanguageSubtag(buf.data(), buf.length())) { /* invalid language code */ if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } sink.Append(LANG_UND, LANG_UND_LEN); } else { /* resolve deprecated */ for (int32_t i = 0; i < UPRV_LENGTHOF(DEPRECATEDLANGS); i += 2) { // 2-letter deprecated subtags are listede before 3-letter // ones in DEPRECATEDLANGS[]. Get out of loop on coming // across the 1st 3-letter subtag, if the input is a 2-letter code. // to avoid continuing to try when there's no match. if (buf.length() < static_cast(uprv_strlen(DEPRECATEDLANGS[i]))) break; if (uprv_compareInvCharsAsAscii(buf.data(), DEPRECATEDLANGS[i]) == 0) { const char* const resolved = DEPRECATEDLANGS[i + 1]; sink.Append(resolved, static_cast(uprv_strlen(resolved))); return; } } sink.Append(buf.data(), buf.length()); } } void _appendScriptToLanguageTag(const char* localeID, icu::ByteSink& sink, bool strict, UErrorCode& status) { UErrorCode tmpStatus = U_ZERO_ERROR; if (U_FAILURE(status)) { return; } icu::CharString buf = ulocimp_getScript(localeID, tmpStatus); if (U_FAILURE(tmpStatus)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; } return; } if (!buf.isEmpty()) { if (!ultag_isScriptSubtag(buf.data(), buf.length())) { /* invalid script code */ if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; } return; } else { sink.Append("-", 1); sink.Append(buf.data(), buf.length()); } } } void _appendRegionToLanguageTag(const char* localeID, icu::ByteSink& sink, bool strict, UErrorCode& status) { UErrorCode tmpStatus = U_ZERO_ERROR; if (U_FAILURE(status)) { return; } icu::CharString buf = ulocimp_getRegion(localeID, tmpStatus); if (U_FAILURE(tmpStatus)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; } return; } if (!buf.isEmpty()) { if 
(!ultag_isRegionSubtag(buf.data(), buf.length())) { /* invalid region code */ if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; } return; } else { sink.Append("-", 1); /* resolve deprecated */ for (int32_t i = 0; i < UPRV_LENGTHOF(DEPRECATEDREGIONS); i += 2) { if (uprv_compareInvCharsAsAscii(buf.data(), DEPRECATEDREGIONS[i]) == 0) { const char* const resolved = DEPRECATEDREGIONS[i + 1]; sink.Append(resolved, static_cast(uprv_strlen(resolved))); return; } } sink.Append(buf.data(), buf.length()); } } } void _sortVariants(VariantListEntry* first) { for (VariantListEntry* var1 = first; var1 != nullptr; var1 = var1->next) { for (VariantListEntry* var2 = var1->next; var2 != nullptr; var2 = var2->next) { // Swap var1->variant and var2->variant. if (uprv_compareInvCharsAsAscii(var1->variant, var2->variant) > 0) { const char* temp = var1->variant; var1->variant = var2->variant; var2->variant = temp; } } } } void _appendVariantsToLanguageTag(const char* localeID, icu::ByteSink& sink, bool strict, bool& hadPosix, UErrorCode& status) { if (U_FAILURE(status)) { return; } UErrorCode tmpStatus = U_ZERO_ERROR; icu::CharString buf = ulocimp_getVariant(localeID, tmpStatus); if (U_FAILURE(tmpStatus) || tmpStatus == U_STRING_NOT_TERMINATED_WARNING) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; } return; } if (!buf.isEmpty()) { char *p, *pVar; bool bNext = true; VariantListEntry *varFirst = nullptr; pVar = nullptr; p = buf.data(); while (bNext) { if (*p == SEP || *p == LOCALE_SEP || *p == 0) { if (*p == 0) { bNext = false; } else { *p = 0; /* terminate */ } if (pVar == nullptr) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } /* ignore empty variant */ } else { /* ICU uses upper case letters for variants, but the canonical format is lowercase in BCP47 */ for (int32_t i = 0; *(pVar + i) != 0; i++) { *(pVar + i) = uprv_tolower(*(pVar + i)); } /* validate */ if (_isVariantSubtag(pVar, -1)) { if (uprv_strcmp(pVar, POSIX_VALUE) || buf.length() != 
static_cast(uprv_strlen(POSIX_VALUE))) { /* emit the variant to the list */ icu::LocalPointer var(new VariantListEntry, status); if (U_FAILURE(status)) { break; } var->variant = pVar; if (!_addVariantToList(&varFirst, std::move(var))) { /* duplicated variant */ if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } else { /* Special handling for POSIX variant, need to remember that we had it and then */ /* treat it like an extension later. */ hadPosix = true; } } else if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } else if (_isPrivateuseValueSubtag(pVar, -1)) { /* Handle private use subtags separately */ break; } } /* reset variant starting position */ pVar = nullptr; } else if (pVar == nullptr) { pVar = p; } p++; } if (U_SUCCESS(status)) { if (varFirst != nullptr) { int32_t varLen; /* per UTS35, we should sort the variants */ _sortVariants(varFirst); /* write out validated/normalized variants to the target */ VariantListEntry* var = varFirst; while (var != nullptr) { sink.Append("-", 1); varLen = static_cast(uprv_strlen(var->variant)); sink.Append(var->variant, varLen); var = var->next; } } } /* clean up */ for (VariantListEntry* var = varFirst; var != nullptr; ) { VariantListEntry *tmpVar = var->next; delete var; var = tmpVar; } if (U_FAILURE(status)) { return; } } } void _appendKeywordsToLanguageTag(const char* localeID, icu::ByteSink& sink, bool strict, bool hadPosix, UErrorCode& status) { if (U_FAILURE(status)) { return; } icu::MemoryPool attrPool; icu::MemoryPool extPool; icu::MemoryPool strPool; icu::LocalUEnumerationPointer keywordEnum(uloc_openKeywords(localeID, &status)); if (U_FAILURE(status) && !hadPosix) { return; } if (keywordEnum.isValid() || hadPosix) { /* reorder extensions */ int32_t len; const char *key; ExtensionListEntry *firstExt = nullptr; ExtensionListEntry *ext; AttributeListEntry *firstAttr = nullptr; AttributeListEntry *attr; icu::MemoryPool extBufPool; const char *bcpKey=nullptr, *bcpValue=nullptr; UErrorCode tmpStatus = 
U_ZERO_ERROR; int32_t keylen; bool isBcpUExt; while (true) { key = uenum_next(keywordEnum.getAlias(), nullptr, &status); if (key == nullptr) { break; } icu::CharString buf = ulocimp_getKeywordValue(localeID, key, tmpStatus); len = buf.length(); if (U_FAILURE(tmpStatus)) { if (tmpStatus == U_MEMORY_ALLOCATION_ERROR) { status = U_MEMORY_ALLOCATION_ERROR; break; } if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } /* ignore this keyword */ tmpStatus = U_ZERO_ERROR; continue; } keylen = static_cast(uprv_strlen(key)); isBcpUExt = (keylen > 1); /* special keyword used for representing Unicode locale attributes */ if (uprv_strcmp(key, LOCALE_ATTRIBUTE_KEY) == 0) { if (len > 0) { int32_t i = 0; while (true) { icu::CharString attrBuf; for (; i < len; i++) { if (buf[i] != '-') { attrBuf.append(buf[i], status); } else { i++; break; } } if (U_FAILURE(status)) { return; } if (attrBuf.isEmpty() && i >= len) { break; } /* create AttributeListEntry */ attr = attrPool.create(); if (attr == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } if (icu::CharString* str = strPool.create(std::move(attrBuf), status)) { if (U_FAILURE(status)) { break; } attr->attribute = str->data(); } else { status = U_MEMORY_ALLOCATION_ERROR; break; } if (!_addAttributeToList(&firstAttr, attr)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } /* for a place holder ExtensionListEntry */ bcpKey = LOCALE_ATTRIBUTE_KEY; bcpValue = nullptr; } } else if (isBcpUExt) { std::optional optBcpKey = ulocimp_toBcpKeyWithFallback(key); if (!optBcpKey.has_value()) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } bcpKey = optBcpKey->data(); std::optional optBcpValue = ulocimp_toBcpTypeWithFallback(key, buf.toStringPiece()); if (!optBcpValue.has_value()) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } if (optBcpValue->data() == buf.data()) { /* When ulocimp_toBcpTypeWithFallback(key, buf) returns the input value as is, the value is well-formed, but 
has no known mapping. This implementation normalizes the value to lower case */ icu::CharString* extBuf = extBufPool.create(buf, tmpStatus); if (extBuf == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } if (U_FAILURE(tmpStatus)) { status = tmpStatus; break; } T_CString_toLowerCase(extBuf->data()); bcpValue = extBuf->data(); } else { bcpValue = optBcpValue->data(); } } else { if (*key == PRIVATEUSE) { if (!ultag_isPrivateuseValueSubtags(buf.data(), len)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } } else { if (!_isExtensionSingleton(key, keylen) || !ultag_isExtensionSubtags(buf.data(), len)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } continue; } } bcpKey = key; icu::CharString* extBuf = extBufPool.create(buf.data(), len, tmpStatus); if (extBuf == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } if (U_FAILURE(tmpStatus)) { status = tmpStatus; break; } bcpValue = extBuf->data(); } /* create ExtensionListEntry */ ext = extPool.create(); if (ext == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } ext->key = bcpKey; ext->value = bcpValue; if (!_addExtensionToList(&firstExt, ext, true)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } /* Special handling for POSIX variant - add the keywords for POSIX */ if (hadPosix) { /* create ExtensionListEntry for POSIX */ ext = extPool.create(); if (ext == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } ext->key = POSIX_KEY; ext->value = POSIX_VALUE; if (!_addExtensionToList(&firstExt, ext, true)) { // Silently ignore errors. 
} } if (U_SUCCESS(status) && (firstExt != nullptr || firstAttr != nullptr)) { bool startLDMLExtension = false; for (ext = firstExt; ext; ext = ext->next) { if (!startLDMLExtension && uprv_strlen(ext->key) > 1) { /* first LDML u singlton extension */ sink.Append("-u", 2); startLDMLExtension = true; } /* write out the sorted BCP47 attributes, extensions and private use */ if (uprv_strcmp(ext->key, LOCALE_ATTRIBUTE_KEY) == 0) { /* write the value for the attributes */ for (attr = firstAttr; attr; attr = attr->next) { sink.Append("-", 1); sink.Append( attr->attribute, static_cast(uprv_strlen(attr->attribute))); } } else { sink.Append("-", 1); sink.Append(ext->key, static_cast(uprv_strlen(ext->key))); if (uprv_strcmp(ext->value, "true") != 0 && uprv_strcmp(ext->value, "yes") != 0) { sink.Append("-", 1); sink.Append(ext->value, static_cast(uprv_strlen(ext->value))); } } } } } } /** * Append keywords parsed from LDML extension value * e.g. "u-ca-gregory-co-trad" -> {calendar = gregorian} {collation = traditional} * Note: char* buf is used for storing keywords */ void _appendLDMLExtensionAsKeywords(const char* ldmlext, ExtensionListEntry** appendTo, icu::MemoryPool& extPool, icu::MemoryPool& kwdBuf, bool& posixVariant, UErrorCode& status) { if (U_FAILURE(status)) { return; } const char *pTag; /* beginning of current subtag */ const char *pKwds; /* beginning of key-type pairs */ bool variantExists = posixVariant; ExtensionListEntry *kwdFirst = nullptr; /* first LDML keyword */ ExtensionListEntry *kwd, *nextKwd; int32_t len; /* Reset the posixVariant value */ posixVariant = false; pTag = ldmlext; pKwds = nullptr; { AttributeListEntry *attrFirst = nullptr; /* first attribute */ AttributeListEntry *attr, *nextAttr; icu::MemoryPool strPool; icu::MemoryPool attrPool; /* Iterate through u extension attributes */ while (*pTag) { /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); if (ultag_isUnicodeLocaleKey(pTag, len)) { pKwds = pTag; 
break; } /* add this attribute to the list */ attr = attrPool.create(); if (attr == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (icu::CharString* str = strPool.create(pTag, len, status)) { if (U_FAILURE(status)) { return; } attr->attribute = str->data(); } else { status = U_MEMORY_ALLOCATION_ERROR; return; } // duplicate attribute is ignored, causes no error. _addAttributeToList(&attrFirst, attr); /* next tag */ pTag += len; if (*pTag) { /* next to the separator */ pTag++; } } if (attrFirst) { /* emit attributes as an LDML keyword, e.g. attribute=attr1-attr2 */ kwd = extPool.create(); if (kwd == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } icu::CharString* value = kwdBuf.create(); if (value == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } /* attribute subtags sorted in alphabetical order as type */ attr = attrFirst; while (attr != nullptr) { nextAttr = attr->next; if (attr != attrFirst) { value->append('-', status); } value->append(attr->attribute, status); attr = nextAttr; } if (U_FAILURE(status)) { return; } kwd->key = LOCALE_ATTRIBUTE_KEY; kwd->value = value->data(); if (!_addExtensionToList(&kwdFirst, kwd, false)) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } } } if (pKwds) { const char *pBcpKey = nullptr; /* u extension key subtag */ const char *pBcpType = nullptr; /* beginning of u extension type subtag(s) */ int32_t bcpKeyLen = 0; int32_t bcpTypeLen = 0; bool isDone = false; pTag = pKwds; /* BCP47 representation of LDML key/type pairs */ while (!isDone) { const char *pNextBcpKey = nullptr; int32_t nextBcpKeyLen = 0; bool emitKeyword = false; if (*pTag) { /* locate next separator char */ for (len = 0; *(pTag + len) && *(pTag + len) != SEP; len++); if (ultag_isUnicodeLocaleKey(pTag, len)) { if (pBcpKey) { emitKeyword = true; pNextBcpKey = pTag; nextBcpKeyLen = len; } else { pBcpKey = pTag; bcpKeyLen = len; } } else { U_ASSERT(pBcpKey != nullptr); /* within LDML type subtags */ if (pBcpType) { bcpTypeLen += (len + 1); } 
else { pBcpType = pTag; bcpTypeLen = len; } } /* next tag */ pTag += len; if (*pTag) { /* next to the separator */ pTag++; } } else { /* processing last one */ emitKeyword = true; isDone = true; } if (emitKeyword) { const char *pKey = nullptr; /* LDML key */ const char *pType = nullptr; /* LDML type */ U_ASSERT(pBcpKey != nullptr); /* BCP key length is always 2 for now */ if (bcpKeyLen != 2) { /* the BCP key is invalid */ status = U_ILLEGAL_ARGUMENT_ERROR; return; } /* u extension key to LDML key */ std::optional legacyKey = ulocimp_toLegacyKeyWithFallback( {pBcpKey, static_cast(bcpKeyLen)}); if (!legacyKey.has_value()) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (legacyKey->data() == pBcpKey) { /* The key returned by toLegacyKey points to the input buffer. We normalize the result key to lower case. */ icu::CharString* key = kwdBuf.create(pBcpKey, bcpKeyLen, status); if (key == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (U_FAILURE(status)) { return; } T_CString_toLowerCase(key->data()); pKey = key->data(); } else { pKey = legacyKey->data(); } if (pBcpType) { /* BCP type to locale type */ std::optional legacyType = ulocimp_toLegacyTypeWithFallback( pKey, {pBcpType, static_cast(bcpTypeLen)}); if (!legacyType.has_value()) { status = U_ILLEGAL_ARGUMENT_ERROR; return; } if (legacyType->data() == pBcpType) { /* The type returned by toLegacyType points to the input buffer. We normalize the result type to lower case. 
*/ icu::CharString* type = kwdBuf.create(pBcpType, bcpTypeLen, status); if (type == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } if (U_FAILURE(status)) { return; } T_CString_toLowerCase(type->data()); pType = type->data(); } else { pType = legacyType->data(); } } else { /* typeless - default type value is "yes" */ pType = LOCALE_TYPE_YES; } /* Special handling for u-va-posix, since we want to treat this as a variant, not as a keyword */ if (!variantExists && !uprv_strcmp(pKey, POSIX_KEY) && !uprv_strcmp(pType, POSIX_VALUE) ) { posixVariant = true; } else { /* create an ExtensionListEntry for this keyword */ kwd = extPool.create(); if (kwd == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return; } kwd->key = pKey; kwd->value = pType; if (!_addExtensionToList(&kwdFirst, kwd, false)) { // duplicate keyword is allowed, Only the first // is honored. } } pBcpKey = pNextBcpKey; bcpKeyLen = pNextBcpKey != nullptr ? nextBcpKeyLen : 0; pBcpType = nullptr; bcpTypeLen = 0; } } } kwd = kwdFirst; while (kwd != nullptr) { nextKwd = kwd->next; _addExtensionToList(appendTo, kwd, false); kwd = nextKwd; } } void _appendKeywords(ULanguageTag* langtag, icu::ByteSink& sink, UErrorCode& status) { if (U_FAILURE(status)) { return; } int32_t i, n; int32_t len; ExtensionListEntry *kwdFirst = nullptr; ExtensionListEntry *kwd; const char *key, *type; icu::MemoryPool extPool; icu::MemoryPool kwdBuf; bool posixVariant = false; n = ultag_getExtensionsSize(langtag); /* resolve locale keywords and reordering keys */ for (i = 0; i < n; i++) { key = ultag_getExtensionKey(langtag, i); type = ultag_getExtensionValue(langtag, i); if (*key == LDMLEXT) { /* Determine if variants already exists */ if (ultag_getVariantsSize(langtag)) { posixVariant = true; } _appendLDMLExtensionAsKeywords(type, &kwdFirst, extPool, kwdBuf, posixVariant, status); if (U_FAILURE(status)) { break; } } else { kwd = extPool.create(); if (kwd == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; break; } kwd->key = key; 
kwd->value = type; if (!_addExtensionToList(&kwdFirst, kwd, false)) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } } } if (U_SUCCESS(status)) { type = ultag_getPrivateUse(langtag); if (static_cast(uprv_strlen(type)) > 0) { /* add private use as a keyword */ kwd = extPool.create(); if (kwd == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; } else { kwd->key = PRIVATEUSE_KEY; kwd->value = type; if (!_addExtensionToList(&kwdFirst, kwd, false)) { status = U_ILLEGAL_ARGUMENT_ERROR; } } } } /* If a POSIX variant was in the extensions, write it out before writing the keywords. */ if (U_SUCCESS(status) && posixVariant) { len = static_cast(uprv_strlen(_POSIX)); sink.Append(_POSIX, len); } if (U_SUCCESS(status) && kwdFirst != nullptr) { /* write out the sorted keywords */ bool firstValue = true; kwd = kwdFirst; do { if (firstValue) { sink.Append("@", 1); firstValue = false; } else { sink.Append(";", 1); } /* key */ len = static_cast(uprv_strlen(kwd->key)); sink.Append(kwd->key, len); sink.Append("=", 1); /* type */ len = static_cast(uprv_strlen(kwd->value)); sink.Append(kwd->value, len); kwd = kwd->next; } while (kwd); } } void _appendPrivateuseToLanguageTag(const char* localeID, icu::ByteSink& sink, bool strict, bool /*hadPosix*/, UErrorCode& status) { if (U_FAILURE(status)) { return; } UErrorCode tmpStatus = U_ZERO_ERROR; icu::CharString buf = ulocimp_getVariant(localeID, tmpStatus); if (U_FAILURE(tmpStatus)) { if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; } return; } if (!buf.isEmpty()) { char *p, *pPriv; bool bNext = true; bool firstValue = true; bool writeValue; pPriv = nullptr; p = buf.data(); while (bNext) { writeValue = false; if (*p == SEP || *p == LOCALE_SEP || *p == 0) { if (*p == 0) { bNext = false; } else { *p = 0; /* terminate */ } if (pPriv != nullptr) { /* Private use in the canonical format is lowercase in BCP47 */ for (int32_t i = 0; *(pPriv + i) != 0; i++) { *(pPriv + i) = uprv_tolower(*(pPriv + i)); } /* validate */ if (_isPrivateuseValueSubtag(pPriv, 
-1)) { if (firstValue) { if (!_isVariantSubtag(pPriv, -1)) { writeValue = true; } } else { writeValue = true; } } else if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; break; } else { break; } if (writeValue) { sink.Append("-", 1); if (firstValue) { sink.Append(PRIVATEUSE_KEY, UPRV_LENGTHOF(PRIVATEUSE_KEY) - 1); sink.Append("-", 1); sink.Append(PRIVUSE_VARIANT_PREFIX, UPRV_LENGTHOF(PRIVUSE_VARIANT_PREFIX) - 1); sink.Append("-", 1); firstValue = false; } int32_t len = static_cast(uprv_strlen(pPriv)); sink.Append(pPriv, len); } } /* reset private use starting position */ pPriv = nullptr; } else if (pPriv == nullptr) { pPriv = p; } p++; } } } /* * ------------------------------------------------- * * ultag_ functions * * ------------------------------------------------- */ /* Bit flags used by the parser */ #define LANG 0x0001 #define EXTL 0x0002 #define SCRT 0x0004 #define REGN 0x0008 #define VART 0x0010 #define EXTS 0x0020 #define EXTV 0x0040 #define PRIV 0x0080 /** * Ticket #12705 - The optimizer in Visual Studio 2015 Update 3 has problems optimizing this function. * As a work-around, optimization is disabled for this function on VS2015 and VS2017. * This work-around should be removed once the following versions of Visual Studio are no * longer supported: All versions of VS2015/VS2017, and versions of VS2019 below 16.4. 
*/ #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924) #pragma optimize( "", off ) #endif ULanguageTag* ultag_parse(const char* tag, int32_t tagLen, int32_t* parsedLen, UErrorCode& status) { if (U_FAILURE(status)) { return nullptr; } char *tagBuf; int16_t next; char *pSubtag, *pNext, *pLastGoodPosition; int32_t subtagLen; int32_t extlangIdx; icu::LocalPointer pExtension; char *pExtValueSubtag, *pExtValueSubtagEnd; int32_t i; bool privateuseVar = false; int32_t legacyLen = 0; if (parsedLen != nullptr) { *parsedLen = 0; } if (tagLen < 0) { tagLen = static_cast(uprv_strlen(tag)); } /* copy the entire string */ tagBuf = static_cast(uprv_malloc(tagLen + 1)); if (tagBuf == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } if (tagLen > 0) { uprv_memcpy(tagBuf, tag, tagLen); } *(tagBuf + tagLen) = 0; /* create a ULanguageTag */ icu::LocalULanguageTagPointer t( static_cast(uprv_malloc(sizeof(ULanguageTag)))); if (t.isNull()) { uprv_free(tagBuf); status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } _initializeULanguageTag(t.getAlias()); t->buf = tagBuf; if (tagLen < MINLEN) { /* the input tag is too short - return empty ULanguageTag */ return t.orphan(); } size_t parsedLenDelta = 0; // Legacy tag will be consider together. Legacy tag with intervening // script and region such as art-DE-lojban or art-Latn-lojban won't be // matched. /* check if the tag is legacy */ for (i = 0; i < UPRV_LENGTHOF(LEGACY); i += 2) { int32_t checkLegacyLen = static_cast(uprv_strlen(LEGACY[i])); if (tagLen < checkLegacyLen) { continue; } if (tagLen > checkLegacyLen && tagBuf[checkLegacyLen] != '-') { // make sure next char is '-'. 
continue; } if (uprv_strnicmp(LEGACY[i], tagBuf, checkLegacyLen) == 0) { int32_t newTagLength; legacyLen = checkLegacyLen; /* back up for output parsedLen */ int32_t replacementLen = static_cast(uprv_strlen(LEGACY[i+1])); newTagLength = replacementLen + tagLen - checkLegacyLen; int32_t oldTagLength = tagLen; if (tagLen < newTagLength) { uprv_free(tagBuf); // Change t->buf after the free and before return to avoid the second double free in // the destructor of t when t is out of scope. t->buf = tagBuf = static_cast(uprv_malloc(newTagLength + 1)); if (tagBuf == nullptr) { status = U_MEMORY_ALLOCATION_ERROR; return nullptr; } tagLen = newTagLength; } parsedLenDelta = checkLegacyLen - replacementLen; uprv_strcpy(t->buf, LEGACY[i + 1]); if (checkLegacyLen != tagLen) { uprv_memcpy(t->buf + replacementLen, tag + checkLegacyLen, oldTagLength - checkLegacyLen); // NUL-terminate after memcpy(). t->buf[replacementLen + oldTagLength - checkLegacyLen] = 0; } break; } } if (legacyLen == 0) { for (i = 0; i < UPRV_LENGTHOF(REDUNDANT); i += 2) { const char* redundantTag = REDUNDANT[i]; size_t redundantTagLen = uprv_strlen(redundantTag); // The preferred tag for a redundant tag is always shorter than redundant // tag. A redundant tag may or may not be followed by other subtags. // (i.e. "zh-yue" or "zh-yue-u-co-pinyin"). if (uprv_strnicmp(redundantTag, tagBuf, static_cast(redundantTagLen)) == 0) { const char* redundantTagEnd = tagBuf + redundantTagLen; if (*redundantTagEnd == '\0' || *redundantTagEnd == SEP) { const char* preferredTag = REDUNDANT[i + 1]; size_t preferredTagLen = uprv_strlen(preferredTag); uprv_memcpy(t->buf, preferredTag, preferredTagLen); if (*redundantTagEnd == SEP) { uprv_memmove(tagBuf + preferredTagLen, redundantTagEnd, tagLen - redundantTagLen + 1); } else { tagBuf[preferredTagLen] = '\0'; } // parsedLen should be the length of the input // before redundantTag is replaced by preferredTag. // Save the delta to add it back later. 
parsedLenDelta = redundantTagLen - preferredTagLen; break; } } } } /* * langtag = language * ["-" script] * ["-" region] * *("-" variant) * *("-" extension) * ["-" privateuse] */ next = LANG | PRIV; pNext = pLastGoodPosition = tagBuf; extlangIdx = 0; pExtValueSubtag = nullptr; pExtValueSubtagEnd = nullptr; while (pNext) { char *pSep; pSubtag = pNext; /* locate next separator char */ pSep = pSubtag; while (*pSep) { if (*pSep == SEP) { break; } pSep++; } if (*pSep == 0) { /* last subtag */ pNext = nullptr; } else { pNext = pSep + 1; } subtagLen = static_cast(pSep - pSubtag); if (next & LANG) { if (ultag_isLanguageSubtag(pSubtag, subtagLen)) { *pSep = 0; /* terminate */ // TODO: move deprecated language code handling here. t->language = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; next = SCRT | REGN | VART | EXTS | PRIV; if (subtagLen <= 3) next |= EXTL; continue; } } if (next & EXTL) { if (_isExtlangSubtag(pSubtag, subtagLen)) { *pSep = 0; t->extlang[extlangIdx++] = T_CString_toLowerCase(pSubtag); pLastGoodPosition = pSep; if (extlangIdx < 3) { next = EXTL | SCRT | REGN | VART | EXTS | PRIV; } else { next = SCRT | REGN | VART | EXTS | PRIV; } continue; } } if (next & SCRT) { if (ultag_isScriptSubtag(pSubtag, subtagLen)) { char *p = pSubtag; *pSep = 0; /* to title case */ *p = uprv_toupper(*p); p++; for (; *p; p++) { *p = uprv_tolower(*p); } t->script = pSubtag; pLastGoodPosition = pSep; next = REGN | VART | EXTS | PRIV; continue; } } if (next & REGN) { if (ultag_isRegionSubtag(pSubtag, subtagLen)) { *pSep = 0; // TODO: move deprecated region code handling here. 
t->region = T_CString_toUpperCase(pSubtag); pLastGoodPosition = pSep; next = VART | EXTS | PRIV; continue; } } if (next & VART) { if (_isVariantSubtag(pSubtag, subtagLen) || (privateuseVar && _isPrivateuseVariantSubtag(pSubtag, subtagLen))) { icu::LocalPointer var(new VariantListEntry, status); if (U_FAILURE(status)) { return nullptr; } *pSep = 0; var->variant = T_CString_toUpperCase(pSubtag); if (!_addVariantToList(&(t->variants), std::move(var))) { /* duplicated variant entry */ break; } pLastGoodPosition = pSep; next = VART | EXTS | PRIV; continue; } } if (next & EXTS) { if (_isExtensionSingleton(pSubtag, subtagLen)) { if (pExtension.isValid()) { if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) { /* the previous extension is incomplete */ delete pExtension.orphan(); break; } /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension.getAlias(), false)) { pExtension.orphan(); pLastGoodPosition = pExtValueSubtagEnd; } else { /* stop parsing here */ delete pExtension.orphan(); break; } } /* create a new extension */ pExtension.adoptInsteadAndCheckErrorCode(new ExtensionListEntry, status); if (U_FAILURE(status)) { return nullptr; } *pSep = 0; pExtension->key = T_CString_toLowerCase(pSubtag); pExtension->value = nullptr; /* will be set later */ /* * reset the start and the end location of extension value * subtags for this extension */ pExtValueSubtag = nullptr; pExtValueSubtagEnd = nullptr; next = EXTV; continue; } } if (next & EXTV) { if (_isExtensionSubtag(pSubtag, subtagLen)) { if (pExtValueSubtag == nullptr) { /* if the start position of this extension's value is not yet, this one is the first value subtag */ pExtValueSubtag = pSubtag; } /* Mark the end of this subtag */ pExtValueSubtagEnd = pSep; next = EXTS | EXTV | PRIV; continue; } } if (next & PRIV) { if (uprv_tolower(*pSubtag) == 
PRIVATEUSE && subtagLen == 1) { char *pPrivuseVal; if (pExtension.isValid()) { /* Process the last extension */ if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) { /* the previous extension is incomplete */ delete pExtension.orphan(); break; } else { /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension.getAlias(), false)) { pExtension.orphan(); pLastGoodPosition = pExtValueSubtagEnd; } else { /* stop parsing here */ delete pExtension.orphan(); break; } } } /* The rest of part will be private use value subtags */ if (pNext == nullptr) { /* empty private use subtag */ break; } /* back up the private use value start position */ pPrivuseVal = pNext; /* validate private use value subtags */ while (pNext) { pSubtag = pNext; pSep = pSubtag; while (*pSep) { if (*pSep == SEP) { break; } pSep++; } if (*pSep == 0) { /* last subtag */ pNext = nullptr; } else { pNext = pSep + 1; } subtagLen = static_cast(pSep - pSubtag); if (uprv_strncmp(pSubtag, PRIVUSE_VARIANT_PREFIX, uprv_strlen(PRIVUSE_VARIANT_PREFIX)) == 0) { *pSep = 0; next = VART; privateuseVar = true; break; } else if (_isPrivateuseValueSubtag(pSubtag, subtagLen)) { pLastGoodPosition = pSep; } else { break; } } if (next == VART) { continue; } if (pLastGoodPosition - pPrivuseVal > 0) { *pLastGoodPosition = 0; t->privateuse = T_CString_toLowerCase(pPrivuseVal); } /* No more subtags, exiting the parse loop */ break; } break; } /* If we fell through here, it means this subtag is illegal - quit parsing */ break; } if (pExtension.isValid()) { /* Process the last extension */ if (pExtValueSubtag == nullptr || pExtValueSubtagEnd == nullptr) { /* the previous extension is incomplete */ delete pExtension.orphan(); } else { /* terminate the previous extension value */ *pExtValueSubtagEnd = 0; pExtension->value = 
T_CString_toLowerCase(pExtValueSubtag); /* insert the extension to the list */ if (_addExtensionToList(&(t->extensions), pExtension.getAlias(), false)) { pExtension.orphan(); pLastGoodPosition = pExtValueSubtagEnd; } else { delete pExtension.orphan(); } } } if (parsedLen != nullptr) { *parsedLen = static_cast(pLastGoodPosition - t->buf + parsedLenDelta); } return t.orphan(); } // Ticket #12705 - Turn optimization back on. #if defined(_MSC_VER) && (_MSC_VER >= 1900) && (_MSC_VER < 1924) #pragma optimize( "", on ) #endif void ultag_close(ULanguageTag* langtag) { if (langtag == nullptr) { return; } uprv_free(langtag->buf); if (langtag->variants) { VariantListEntry *curVar = langtag->variants; while (curVar) { VariantListEntry *nextVar = curVar->next; delete curVar; curVar = nextVar; } } if (langtag->extensions) { ExtensionListEntry *curExt = langtag->extensions; while (curExt) { ExtensionListEntry *nextExt = curExt->next; delete curExt; curExt = nextExt; } } uprv_free(langtag); } const char* ultag_getLanguage(const ULanguageTag* langtag) { return langtag->language; } #if 0 const char* ultag_getJDKLanguage(const ULanguageTag* langtag) { int32_t i; for (i = 0; DEPRECATEDLANGS[i] != nullptr; i += 2) { if (uprv_compareInvCharsAsAscii(DEPRECATEDLANGS[i], langtag->language) == 0) { return DEPRECATEDLANGS[i + 1]; } } return langtag->language; } #endif const char* ultag_getExtlang(const ULanguageTag* langtag, int32_t idx) { if (idx >= 0 && idx < MAXEXTLANG) { return langtag->extlang[idx]; } return nullptr; } int32_t ultag_getExtlangSize(const ULanguageTag* langtag) { int32_t size = 0; int32_t i; for (i = 0; i < MAXEXTLANG; i++) { if (langtag->extlang[i]) { size++; } } return size; } const char* ultag_getScript(const ULanguageTag* langtag) { return langtag->script; } const char* ultag_getRegion(const ULanguageTag* langtag) { return langtag->region; } const char* ultag_getVariant(const ULanguageTag* langtag, int32_t idx) { const char *var = nullptr; VariantListEntry *cur = 
langtag->variants; int32_t i = 0; while (cur) { if (i == idx) { var = cur->variant; break; } cur = cur->next; i++; } return var; } int32_t ultag_getVariantsSize(const ULanguageTag* langtag) { int32_t size = 0; VariantListEntry *cur = langtag->variants; while (true) { if (cur == nullptr) { break; } size++; cur = cur->next; } return size; } const char* ultag_getExtensionKey(const ULanguageTag* langtag, int32_t idx) { const char *key = nullptr; ExtensionListEntry *cur = langtag->extensions; int32_t i = 0; while (cur) { if (i == idx) { key = cur->key; break; } cur = cur->next; i++; } return key; } const char* ultag_getExtensionValue(const ULanguageTag* langtag, int32_t idx) { const char *val = nullptr; ExtensionListEntry *cur = langtag->extensions; int32_t i = 0; while (cur) { if (i == idx) { val = cur->value; break; } cur = cur->next; i++; } return val; } int32_t ultag_getExtensionsSize(const ULanguageTag* langtag) { int32_t size = 0; ExtensionListEntry *cur = langtag->extensions; while (true) { if (cur == nullptr) { break; } size++; cur = cur->next; } return size; } const char* ultag_getPrivateUse(const ULanguageTag* langtag) { return langtag->privateuse; } #if 0 const char* ultag_getLegacy(const ULanguageTag* langtag) { return langtag->legacy; } #endif } // namespace /* * ------------------------------------------------- * * Locale/BCP47 conversion APIs, exposed as uloc_* * * ------------------------------------------------- */ U_CAPI int32_t U_EXPORT2 uloc_toLanguageTag(const char* localeID, char* langtag, int32_t langtagCapacity, UBool strict, UErrorCode* status) { return icu::ByteSinkUtil::viaByteSinkToTerminatedChars( langtag, langtagCapacity, [&](icu::ByteSink& sink, UErrorCode& status) { ulocimp_toLanguageTag(localeID, sink, strict, status); }, *status); } U_EXPORT icu::CharString ulocimp_toLanguageTag(const char* localeID, bool strict, UErrorCode& status) { return icu::ByteSinkUtil::viaByteSinkToCharString( [&](icu::ByteSink& sink, UErrorCode& status) { 
ulocimp_toLanguageTag(localeID, sink, strict, status); }, status); } U_EXPORT void ulocimp_toLanguageTag(const char* localeID, icu::ByteSink& sink, bool strict, UErrorCode& status) { if (U_FAILURE(status)) { return; } UErrorCode tmpStatus = U_ZERO_ERROR; bool hadPosix = false; const char* pKeywordStart; /* Note: uloc_canonicalize returns "en_US_POSIX" for input locale ID "". See #6835 */ icu::CharString canonical = ulocimp_canonicalize(localeID, tmpStatus); if (U_FAILURE(tmpStatus)) { status = tmpStatus; return; } /* For handling special case - private use only tag */ pKeywordStart = locale_getKeywordsStart(canonical.data()); if (pKeywordStart == canonical.data()) { int kwdCnt = 0; bool done = false; icu::LocalUEnumerationPointer kwdEnum(uloc_openKeywords(canonical.data(), &tmpStatus)); if (U_SUCCESS(tmpStatus)) { kwdCnt = uenum_count(kwdEnum.getAlias(), &tmpStatus); if (kwdCnt == 1) { const char *key; int32_t len = 0; key = uenum_next(kwdEnum.getAlias(), &len, &tmpStatus); if (len == 1 && *key == PRIVATEUSE) { icu::CharString buf = ulocimp_getKeywordValue(localeID, key, tmpStatus); if (U_SUCCESS(tmpStatus)) { if (ultag_isPrivateuseValueSubtags(buf.data(), buf.length())) { /* return private use only tag */ sink.Append("und-x-", 6); sink.Append(buf.data(), buf.length()); done = true; } else if (strict) { status = U_ILLEGAL_ARGUMENT_ERROR; done = true; } /* if not strict mode, then "und" will be returned */ } else { status = U_ILLEGAL_ARGUMENT_ERROR; done = true; } } } if (done) { return; } } } _appendLanguageToLanguageTag(canonical.data(), sink, strict, status); _appendScriptToLanguageTag(canonical.data(), sink, strict, status); _appendRegionToLanguageTag(canonical.data(), sink, strict, status); _appendVariantsToLanguageTag(canonical.data(), sink, strict, hadPosix, status); _appendKeywordsToLanguageTag(canonical.data(), sink, strict, hadPosix, status); _appendPrivateuseToLanguageTag(canonical.data(), sink, strict, hadPosix, status); } U_CAPI int32_t U_EXPORT2 
uloc_forLanguageTag(const char* langtag, char* localeID, int32_t localeIDCapacity, int32_t* parsedLength, UErrorCode* status) { return icu::ByteSinkUtil::viaByteSinkToTerminatedChars( localeID, localeIDCapacity, [&](icu::ByteSink& sink, UErrorCode& status) { ulocimp_forLanguageTag(langtag, -1, sink, parsedLength, status); }, *status); } U_EXPORT icu::CharString ulocimp_forLanguageTag(const char* langtag, int32_t tagLen, int32_t* parsedLength, UErrorCode& status) { return icu::ByteSinkUtil::viaByteSinkToCharString( [&](icu::ByteSink& sink, UErrorCode& status) { ulocimp_forLanguageTag(langtag, tagLen, sink, parsedLength, status); }, status); } U_EXPORT void ulocimp_forLanguageTag(const char* langtag, int32_t tagLen, icu::ByteSink& sink, int32_t* parsedLength, UErrorCode& status) { if (U_FAILURE(status)) { return; } bool isEmpty = true; const char *subtag, *p; int32_t len; int32_t i, n; bool noRegion = true; icu::LocalULanguageTagPointer lt(ultag_parse(langtag, tagLen, parsedLength, status)); if (U_FAILURE(status)) { return; } /* language */ subtag = ultag_getExtlangSize(lt.getAlias()) > 0 ? 
ultag_getExtlang(lt.getAlias(), 0) : ultag_getLanguage(lt.getAlias()); if (uprv_compareInvCharsAsAscii(subtag, LANG_UND) != 0) { len = static_cast(uprv_strlen(subtag)); if (len > 0) { sink.Append(subtag, len); isEmpty = false; } } /* script */ subtag = ultag_getScript(lt.getAlias()); len = static_cast(uprv_strlen(subtag)); if (len > 0) { sink.Append("_", 1); isEmpty = false; /* write out the script in title case */ char c = uprv_toupper(*subtag); sink.Append(&c, 1); sink.Append(subtag + 1, len - 1); } /* region */ subtag = ultag_getRegion(lt.getAlias()); len = static_cast(uprv_strlen(subtag)); if (len > 0) { sink.Append("_", 1); isEmpty = false; /* write out the region in upper case */ p = subtag; while (*p) { char c = uprv_toupper(*p); sink.Append(&c, 1); p++; } noRegion = false; } /* variants */ _sortVariants(lt.getAlias()->variants); n = ultag_getVariantsSize(lt.getAlias()); if (n > 0) { if (noRegion) { sink.Append("_", 1); isEmpty = false; } for (i = 0; i < n; i++) { subtag = ultag_getVariant(lt.getAlias(), i); sink.Append("_", 1); /* write out the variant in upper case */ p = subtag; while (*p) { char c = uprv_toupper(*p); sink.Append(&c, 1); p++; } } } /* keywords */ n = ultag_getExtensionsSize(lt.getAlias()); subtag = ultag_getPrivateUse(lt.getAlias()); if (n > 0 || uprv_strlen(subtag) > 0) { if (isEmpty && n > 0) { /* need a language */ sink.Append(LANG_UND, LANG_UND_LEN); } _appendKeywords(lt.getAlias(), sink, status); } }