diff options
Diffstat (limited to 'thirdparty/icu4c/common/normalizer2impl.cpp')
-rw-r--r-- | thirdparty/icu4c/common/normalizer2impl.cpp | 280 |
1 files changed, 155 insertions, 125 deletions
diff --git a/thirdparty/icu4c/common/normalizer2impl.cpp b/thirdparty/icu4c/common/normalizer2impl.cpp index 9c715ac3ef..758a84c110 100644 --- a/thirdparty/icu4c/common/normalizer2impl.cpp +++ b/thirdparty/icu4c/common/normalizer2impl.cpp @@ -53,9 +53,9 @@ namespace { */ inline uint8_t leadByteForCP(UChar32 c) { if (c <= 0x7f) { - return (uint8_t)c; + return static_cast<uint8_t>(c); } else if (c <= 0x7ff) { - return (uint8_t)(0xc0+(c>>6)); + return static_cast<uint8_t>(0xc0 + (c >> 6)); } else { // Should not occur because ccc(U+0300)!=0. return 0xe0; @@ -82,7 +82,7 @@ UChar32 codePointFromValidUTF8(const uint8_t *cpStart, const uint8_t *cpLimit) { return ((c&0x1f)<<6) | (cpStart[1]&0x3f); case 3: // no need for (c&0xf) because the upper bits are truncated after <<12 in the cast to (char16_t) - return (char16_t)((c<<12) | ((cpStart[1]&0x3f)<<6) | (cpStart[2]&0x3f)); + return static_cast<char16_t>((c << 12) | ((cpStart[1] & 0x3f) << 6) | (cpStart[2] & 0x3f)); case 4: return ((c&7)<<18) | ((cpStart[1]&0x3f)<<12) | ((cpStart[2]&0x3f)<<6) | (cpStart[3]&0x3f); default: @@ -100,8 +100,8 @@ UChar32 previousHangulOrJamo(const uint8_t *start, const uint8_t *p) { uint8_t l = *p; uint8_t t1, t2; if (0xe1 <= l && l <= 0xed && - (t1 = (uint8_t)(p[1] - 0x80)) <= 0x3f && - (t2 = (uint8_t)(p[2] - 0x80)) <= 0x3f && + (t1 = static_cast<uint8_t>(p[1] - 0x80)) <= 0x3f && + (t2 = static_cast<uint8_t>(p[2] - 0x80)) <= 0x3f && (l < 0xed || t1 <= 0x1f)) { return ((l & 0xf) << 12) | (t1 << 6) | t2; } @@ -125,7 +125,7 @@ int32_t getJamoTMinusBase(const uint8_t *src, const uint8_t *limit) { } } else if (src[1] == 0x87) { uint8_t t = src[2]; - if ((int8_t)t <= (int8_t)0x82u) { + if (static_cast<int8_t>(t) <= static_cast<int8_t>(0x82u)) { return t - (0xa7 - 0x40); } } @@ -138,10 +138,10 @@ appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t del ByteSink &sink, Edits *edits) { char buffer[U8_MAX_LENGTH]; int32_t length; - int32_t cpLength = (int32_t)(cpLimit - cpStart); + int32_t cpLength = static_cast<int32_t>(cpLimit - cpStart); if (cpLength == 1) { // The builder makes ASCII map to ASCII. - buffer[0] = (uint8_t)(*cpStart + delta); + buffer[0] = static_cast<uint8_t>(*cpStart + delta); length = 1; } else { int32_t trail = *(cpLimit-1) + delta; @@ -150,7 +150,7 @@ appendCodePointDelta(const uint8_t *cpStart, const uint8_t *cpLimit, int32_t del --cpLimit; length = 0; do { buffer[length++] = *cpStart++; } while (cpStart < cpLimit); - buffer[length++] = (uint8_t)trail; + buffer[length++] = static_cast<uint8_t>(trail); } else { // Decode the code point, add the delta, re-encode. UChar32 c = codePointFromValidUTF8(cpStart, cpLimit) + delta; @@ -205,16 +205,16 @@ UBool ReorderingBuffer::init(int32_t destCapacity, UErrorCode &errorCode) { } UBool ReorderingBuffer::equals(const char16_t *otherStart, const char16_t *otherLimit) const { - int32_t length=(int32_t)(limit-start); + int32_t length = static_cast<int32_t>(limit - start); return - length==(int32_t)(otherLimit-otherStart) && + length == static_cast<int32_t>(otherLimit - otherStart) && 0==u_memcmp(start, otherStart, length); } UBool ReorderingBuffer::equals(const uint8_t *otherStart, const uint8_t *otherLimit) const { U_ASSERT((otherLimit - otherStart) <= INT32_MAX); // ensured by caller - int32_t length = (int32_t)(limit - start); - int32_t otherLength = (int32_t)(otherLimit - otherStart); + int32_t length = static_cast<int32_t>(limit - start); + int32_t otherLength = static_cast<int32_t>(otherLimit - otherStart); // For equal strings, UTF-8 is at least as long as UTF-16, and at most three times as long. if (otherLength < length || (otherLength / 3) > length) { return false; @@ -284,7 +284,7 @@ UBool ReorderingBuffer::append(const char16_t *s, int32_t length, UBool isNFD, U16_NEXT(s, i, length, c); if(i<length) { if (isNFD) { - leadCC = Normalizer2Impl::getCCFromYesOrMaybe(impl.getRawNorm16(c)); + leadCC = Normalizer2Impl::getCCFromYesOrMaybeYes(impl.getRawNorm16(c)); } else { leadCC = impl.getCC(impl.getNorm16(c)); } @@ -304,7 +304,7 @@ UBool ReorderingBuffer::appendZeroCC(UChar32 c, UErrorCode &errorCode) { } remainingCapacity-=cpLength; if(cpLength==1) { - *limit++=(char16_t)c; + *limit++ = static_cast<char16_t>(c); } else { limit[0]=U16_LEAD(c); limit[1]=U16_TRAIL(c); @@ -319,7 +319,7 @@ UBool ReorderingBuffer::appendZeroCC(const char16_t *s, const char16_t *sLimit, if(s==sLimit) { return true; } - int32_t length=(int32_t)(sLimit-s); + int32_t length = static_cast<int32_t>(sLimit - s); if(remainingCapacity<length && !resize(length, errorCode)) { return false; } @@ -350,8 +350,8 @@ void ReorderingBuffer::removeSuffix(int32_t suffixLength) { } UBool ReorderingBuffer::resize(int32_t appendLength, UErrorCode &errorCode) { - int32_t reorderStartIndex=(int32_t)(reorderStart-start); - int32_t length=(int32_t)(limit-start); + int32_t reorderStartIndex = static_cast<int32_t>(reorderStart - start); + int32_t length = static_cast<int32_t>(limit - start); str.releaseBuffer(length); int32_t newCapacity=length+appendLength; int32_t doubleCapacity=2*str.getCapacity(); @@ -392,7 +392,7 @@ uint8_t ReorderingBuffer::previousCC() { --codePointStart; c=U16_GET_SUPPLEMENTARY(c2, c); } - return impl.getCCFromYesOrMaybeCP(c); + return impl.getCCFromYesOrMaybeYesCP(c); } // Inserts c somewhere before the last character. @@ -440,15 +440,14 @@ Normalizer2Impl::init(const int32_t *inIndexes, const UCPTrie *inTrie, minNoNoCompNoMaybeCC = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_COMP_NO_MAYBE_CC]); minNoNoEmpty = static_cast<uint16_t>(inIndexes[IX_MIN_NO_NO_EMPTY]); limitNoNo = static_cast<uint16_t>(inIndexes[IX_LIMIT_NO_NO]); + minMaybeNo = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO]); + minMaybeNoCombinesFwd = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_NO_COMBINES_FWD]); minMaybeYes = static_cast<uint16_t>(inIndexes[IX_MIN_MAYBE_YES]); - U_ASSERT((minMaybeYes & 7) == 0); // 8-aligned for noNoDelta bit fields - centerNoNoDelta = (minMaybeYes >> DELTA_SHIFT) - MAX_DELTA - 1; + U_ASSERT((minMaybeNo & 7) == 0); // 8-aligned for noNoDelta bit fields + centerNoNoDelta = (minMaybeNo >> DELTA_SHIFT) - MAX_DELTA - 1; normTrie=inTrie; - - maybeYesCompositions=inExtraData; - extraData=maybeYesCompositions+((MIN_NORMAL_MAYBE_YES-minMaybeYes)>>OFFSET_SHIFT); - + extraData=inExtraData; smallFCD=inSmallFCD; } @@ -486,7 +485,7 @@ Normalizer2Impl::addPropertyStarts(const USetAdder *sa, UErrorCode & /*errorCode while ((end = ucptrie_getRange(normTrie, start, UCPMAP_RANGE_FIXED_LEAD_SURROGATES, INERT, nullptr, nullptr, &value)) >= 0) { sa->add(sa->set, start); - if (start != end && isAlgorithmicNoNo((uint16_t)value) && + if (start != end && isAlgorithmicNoNo(static_cast<uint16_t>(value)) && (value & Normalizer2Impl::DELTA_TCCC_MASK) > Normalizer2Impl::DELTA_TCCC_1) { // Range of code points with same-norm16-value algorithmic decompositions. // They might have different non-zero FCD16 values. @@ -570,7 +569,7 @@ Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit, int32_t destLengthEstimate, UErrorCode &errorCode) const { if(destLengthEstimate<0 && limit!=nullptr) { - destLengthEstimate=(int32_t)(limit-src); + destLengthEstimate = static_cast<int32_t>(limit - src); } dest.remove(); ReorderingBuffer buffer(*this, dest); @@ -650,7 +649,7 @@ Normalizer2Impl::decompose(const char16_t *src, const char16_t *limit, } } else { if(isDecompYes(norm16)) { - uint8_t cc=getCCFromYesOrMaybe(norm16); + uint8_t cc=getCCFromYesOrMaybeYes(norm16); if(prevCC<=cc || cc==0) { prevCC=cc; if(cc<=1) { @@ -702,12 +701,13 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, UErrorCode &errorCode) const { // get the decomposition and the lead and trail cc's if (norm16 >= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { - return buffer.append(c, getCCFromYesOrMaybe(norm16), errorCode); + if (isMaybeYesOrNonZeroCC(norm16)) { + return buffer.append(c, getCCFromYesOrMaybeYes(norm16), errorCode); + } else if (norm16 < minMaybeNo) { + // Maps to an isCompYesAndZeroCC. + c=mapAlgorithmic(c, norm16); + norm16=getRawNorm16(c); } - // Maps to an isCompYesAndZeroCC. - c=mapAlgorithmic(c, norm16); - norm16=getRawNorm16(c); } if (norm16 < minYesNo) { // c does not decompose @@ -718,17 +718,17 @@ UBool Normalizer2Impl::decompose(UChar32 c, uint16_t norm16, return buffer.appendZeroCC(jamos, jamos+Hangul::decompose(c, jamos), errorCode); } // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16); + const uint16_t *mapping=getData(norm16); uint16_t firstUnit=*mapping; int32_t length=firstUnit&MAPPING_LENGTH_MASK; uint8_t leadCC, trailCC; - trailCC=(uint8_t)(firstUnit>>8); + trailCC = static_cast<uint8_t>(firstUnit >> 8); if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { - leadCC=(uint8_t)(*(mapping-1)>>8); + leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8); } else { leadCC=0; } - return buffer.append((const char16_t *)mapping+1, length, true, leadCC, trailCC, errorCode); + return buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode); } // Dual functionality: @@ -787,9 +787,9 @@ Normalizer2Impl::decomposeUTF8(uint32_t options, } // Medium-fast path: Quick check. - if (isMaybeOrNonZeroCC(norm16)) { + if (isMaybeYesOrNonZeroCC(norm16)) { // Does not decompose. - uint8_t cc = getCCFromYesOrMaybe(norm16); + uint8_t cc = getCCFromYesOrMaybeYes(norm16); if (prevCC <= cc || cc == 0) { prevCC = cc; if (cc <= 1) { @@ -836,7 +836,7 @@ Normalizer2Impl::decomposeUTF8(uint32_t options, } // We already know there was a change if the original character decomposed; // otherwise compare. - if (isMaybeOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) { + if (isMaybeYesOrNonZeroCC(norm16) && buffer.equals(prevBoundary, src)) { if (!ByteSinkUtil::appendUnchanged(prevBoundary, src, *sink, options, edits, errorCode)) { break; @@ -867,9 +867,9 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, // Get the decomposition and the lead and trail cc's. UChar32 c = U_SENTINEL; if (norm16 >= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { + if (isMaybeYesOrNonZeroCC(norm16)) { // No comp boundaries around this character. - uint8_t cc = getCCFromYesOrMaybe(norm16); + uint8_t cc = getCCFromYesOrMaybeYes(norm16); if (cc == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { return prevSrc; } @@ -881,14 +881,15 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, return src; } continue; + } else if (norm16 < minMaybeNo) { + // Maps to an isCompYesAndZeroCC. + if (stopAt != STOP_AT_LIMIT) { + return prevSrc; + } + c = codePointFromValidUTF8(prevSrc, src); + c = mapAlgorithmic(c, norm16); + norm16 = getRawNorm16(c); } - // Maps to an isCompYesAndZeroCC. - if (stopAt != STOP_AT_LIMIT) { - return prevSrc; - } - c = codePointFromValidUTF8(prevSrc, src); - c = mapAlgorithmic(c, norm16); - norm16 = getRawNorm16(c); } else if (stopAt != STOP_AT_LIMIT && norm16 < minNoNoCompNoMaybeCC) { return prevSrc; } @@ -918,20 +919,20 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, } } else { // The character decomposes, get everything from the variable-length extra data. - const uint16_t *mapping = getMapping(norm16); + const uint16_t *mapping = getData(norm16); uint16_t firstUnit = *mapping; int32_t length = firstUnit & MAPPING_LENGTH_MASK; - uint8_t trailCC = (uint8_t)(firstUnit >> 8); + uint8_t trailCC = static_cast<uint8_t>(firstUnit >> 8); uint8_t leadCC; if (firstUnit & MAPPING_HAS_CCC_LCCC_WORD) { - leadCC = (uint8_t)(*(mapping-1) >> 8); + leadCC = static_cast<uint8_t>(*(mapping - 1) >> 8); } else { leadCC = 0; } if (leadCC == 0 && stopAt == STOP_AT_DECOMP_BOUNDARY) { return prevSrc; } - if (!buffer.append((const char16_t *)mapping+1, length, true, leadCC, trailCC, errorCode)) { + if (!buffer.append(reinterpret_cast<const char16_t*>(mapping) + 1, length, true, leadCC, trailCC, errorCode)) { return nullptr; } } @@ -946,7 +947,7 @@ Normalizer2Impl::decomposeShort(const uint8_t *src, const uint8_t *limit, const char16_t * Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length) const { uint16_t norm16; - if(c<minDecompNoCP || isMaybeOrNonZeroCC(norm16=getNorm16(c))) { + if(c<minDecompNoCP || isMaybeYesOrNonZeroCC(norm16=getNorm16(c))) { // c does not decompose return nullptr; } @@ -968,9 +969,9 @@ Normalizer2Impl::getDecomposition(UChar32 c, char16_t buffer[4], int32_t &length return buffer; } // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16); + const uint16_t *mapping=getData(norm16); length=*mapping&MAPPING_LENGTH_MASK; - return (const char16_t *)mapping+1; + return reinterpret_cast<const char16_t*>(mapping) + 1; } // The capacity of the buffer must be 30=MAPPING_LENGTH_MASK-1 @@ -995,7 +996,7 @@ Normalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &le return buffer; } // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16); + const uint16_t *mapping=getData(norm16); uint16_t firstUnit=*mapping; int32_t mLength=firstUnit&MAPPING_LENGTH_MASK; // length of normal mapping if(firstUnit&MAPPING_HAS_RAW_MAPPING) { @@ -1005,17 +1006,17 @@ Normalizer2Impl::getRawDecomposition(UChar32 c, char16_t buffer[30], int32_t &le uint16_t rm0=*rawMapping; if(rm0<=MAPPING_LENGTH_MASK) { length=rm0; - return (const char16_t *)rawMapping-rm0; + return reinterpret_cast<const char16_t*>(rawMapping) - rm0; } else { // Copy the normal mapping and replace its first two code units with rm0. - buffer[0]=(char16_t)rm0; - u_memcpy(buffer+1, (const char16_t *)mapping+1+2, mLength-2); + buffer[0] = static_cast<char16_t>(rm0); + u_memcpy(buffer + 1, reinterpret_cast<const char16_t*>(mapping) + 1 + 2, mLength - 2); length=mLength-1; return buffer; } } else { length=mLength; - return (const char16_t *)mapping+1; + return reinterpret_cast<const char16_t*>(mapping) + 1; } } @@ -1052,7 +1053,7 @@ void Normalizer2Impl::decomposeAndAppend(const char16_t *src, const char16_t *li limit=u_strchr(p, 0); } - if (buffer.append(src, (int32_t)(p - src), false, firstCC, prevCC, errorCode)) { + if (buffer.append(src, static_cast<int32_t>(p - src), false, firstCC, prevCC, errorCode)) { buffer.appendZeroCC(p, limit, errorCode); } } @@ -1070,7 +1071,7 @@ UBool Normalizer2Impl::norm16HasDecompBoundaryBefore(uint16_t norm16) const { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; } // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16); + const uint16_t *mapping=getDataForYesOrNo(norm16); uint16_t firstUnit=*mapping; // true if leadCC==0 (hasFCDBoundaryBefore()) return (firstUnit&MAPPING_HAS_CCC_LCCC_WORD)==0 || (*(mapping-1)&0xff00)==0; @@ -1091,14 +1092,15 @@ UBool Normalizer2Impl::norm16HasDecompBoundaryAfter(uint16_t norm16) const { return true; } if (norm16 >= limitNoNo) { - if (isMaybeOrNonZeroCC(norm16)) { + if (isMaybeYesOrNonZeroCC(norm16)) { return norm16 <= MIN_NORMAL_MAYBE_YES || norm16 == JAMO_VT; + } else if (norm16 < minMaybeNo) { + // Maps to an isCompYesAndZeroCC. + return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; } - // Maps to an isCompYesAndZeroCC. - return (norm16 & DELTA_TCCC_MASK) <= DELTA_TCCC_1; } // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16); + const uint16_t *mapping=getData(norm16); uint16_t firstUnit=*mapping; // decomp after-boundary: same as hasFCDBoundaryAfter(), // fcd16<=1 || trailCC==0 @@ -1140,13 +1142,13 @@ int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { if(trail<COMP_1_TRAIL_LIMIT) { // trail character is 0..33FF // result entry may have 2 or 3 units - key1=(uint16_t)(trail<<1); + key1 = static_cast<uint16_t>(trail << 1); while(key1>(firstUnit=*list)) { list+=2+(firstUnit&COMP_1_TRIPLE); } if(key1==(firstUnit&COMP_1_TRAIL_MASK)) { if(firstUnit&COMP_1_TRIPLE) { - return ((int32_t)list[1]<<16)|list[2]; + return (static_cast<int32_t>(list[1]) << 16) | list[2]; } else { return list[1]; } @@ -1154,10 +1156,10 @@ int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { } else { // trail character is 3400..10FFFF // result entry has 3 units - key1=(uint16_t)(COMP_1_TRAIL_LIMIT+ + key1 = static_cast<uint16_t>(COMP_1_TRAIL_LIMIT + (((trail>>COMP_1_TRAIL_SHIFT))& ~COMP_1_TRIPLE)); - uint16_t key2=(uint16_t)(trail<<COMP_2_TRAIL_SHIFT); + uint16_t key2 = static_cast<uint16_t>(trail << COMP_2_TRAIL_SHIFT); uint16_t secondUnit; for(;;) { if(key1>(firstUnit=*list)) { @@ -1170,7 +1172,7 @@ int32_t Normalizer2Impl::combine(const uint16_t *list, UChar32 trail) { list+=3; } } else if(key2==(secondUnit&COMP_2_TRAIL_MASK)) { - return ((int32_t)(secondUnit&~COMP_2_TRAIL_MASK)<<16)|list[2]; + return (static_cast<int32_t>(secondUnit & ~COMP_2_TRAIL_MASK) << 16) | list[2]; } else { break; } @@ -1195,7 +1197,7 @@ void Normalizer2Impl::addComposites(const uint16_t *list, UnicodeSet &set) const compositeAndFwd=list[1]; list+=2; } else { - compositeAndFwd=(((int32_t)list[1]&~COMP_2_TRAIL_MASK)<<16)|list[2]; + compositeAndFwd = ((static_cast<int32_t>(list[1]) & ~COMP_2_TRAIL_MASK) << 16) | list[2]; list+=3; } UChar32 composite=compositeAndFwd>>1; @@ -1240,7 +1242,7 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart for(;;) { UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, p, limit, c, norm16); - cc=getCCFromYesOrMaybe(norm16); + cc=getCCFromYesOrMaybeYes(norm16); if( // this character combines backward and isMaybe(norm16) && // we have seen a starter that combines forward and @@ -1252,15 +1254,15 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart // c is a Jamo V/T, see if we can compose it with the previous character. if(c<Hangul::JAMO_T_BASE) { // c is a Jamo Vowel, compose with previous Jamo L and following Jamo T. - char16_t prev=(char16_t)(*starter-Hangul::JAMO_L_BASE); + char16_t prev = static_cast<char16_t>(*starter - Hangul::JAMO_L_BASE); if(prev<Hangul::JAMO_L_COUNT) { pRemove=p-1; - char16_t syllable=(char16_t) - (Hangul::HANGUL_BASE+ + char16_t syllable = static_cast<char16_t>( + Hangul::HANGUL_BASE + (prev*Hangul::JAMO_V_COUNT+(c-Hangul::JAMO_V_BASE))* Hangul::JAMO_T_COUNT); char16_t t; - if(p!=limit && (t=(char16_t)(*p-Hangul::JAMO_T_BASE))<Hangul::JAMO_T_COUNT) { + if (p != limit && (t = static_cast<char16_t>(*p - Hangul::JAMO_T_BASE)) < Hangul::JAMO_T_COUNT) { ++p; syllable+=t; // The next character was a Jamo T. } @@ -1298,7 +1300,7 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart starter[0]=U16_LEAD(composite); starter[1]=U16_TRAIL(composite); } else { - *starter=(char16_t)composite; + *starter = static_cast<char16_t>(composite); // The composite is shorter than the starter, // move the intermediate characters forward one. starterIsSupplementary=false; @@ -1323,7 +1325,7 @@ void Normalizer2Impl::recompose(ReorderingBuffer &buffer, int32_t recomposeStart *--starter=U16_LEAD(composite); // undo the temporary increment } else { // both are on the BMP - *starter=(char16_t)composite; + *starter = static_cast<char16_t>(composite); } /* remove the combining mark by moving the following text over it */ @@ -1414,17 +1416,22 @@ Normalizer2Impl::composePair(UChar32 a, UChar32 b) const { } } else { // 'a' has a compositions list in extraData - list=getMapping(norm16); + list=getDataForYesOrNo(norm16); if(norm16>minYesNo) { // composite 'a' has both mapping & compositions list list+= // mapping pointer 1+ // +1 to skip the first unit with the mapping length (*list&MAPPING_LENGTH_MASK); // + mapping length } } - } else if(norm16<minMaybeYes || MIN_NORMAL_MAYBE_YES<=norm16) { + } else if(norm16<minMaybeNoCombinesFwd || MIN_NORMAL_MAYBE_YES<=norm16) { return U_SENTINEL; } else { - list=getCompositionsListForMaybe(norm16); + list=getDataForMaybe(norm16); + if(norm16<minMaybeYes) { // composite 'a' has both mapping & compositions list + list+= // mapping pointer + 1+ // +1 to skip the first unit with the mapping length + (*list&MAPPING_LENGTH_MASK); // + mapping length + } } if(b<0 || 0x10ffff<b) { // combine(list, b) requires a valid code point b return U_SENTINEL; @@ -1502,12 +1509,12 @@ Normalizer2Impl::compose(const char16_t *src, const char16_t *limit, } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) + // or a "maybeYes" / "maybeNo" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. // Medium-fast path: Handle cases that do not require full decomposition and recomposition. - if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes + if (norm16 < minMaybeNo) { // minNoNo <= norm16 < minMaybeNo if (!doCompose) { return false; } @@ -1534,7 +1541,7 @@ Normalizer2Impl::compose(const char16_t *src, const char16_t *limit, if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { break; } - const char16_t *mapping = reinterpret_cast<const char16_t *>(getMapping(norm16)); + const char16_t *mapping = reinterpret_cast<const char16_t *>(getDataForYesOrNo(norm16)); int32_t length = *mapping++ & MAPPING_LENGTH_MASK; if(!buffer.appendZeroCC(mapping, mapping + length, errorCode)) { break; @@ -1562,14 +1569,14 @@ Normalizer2Impl::compose(const char16_t *src, const char16_t *limit, if(c<Hangul::JAMO_T_BASE) { // The current character is a Jamo Vowel, // compose with previous Jamo L and following Jamo T. - char16_t l = (char16_t)(prev-Hangul::JAMO_L_BASE); + char16_t l = static_cast<char16_t>(prev - Hangul::JAMO_L_BASE); if(l<Hangul::JAMO_L_COUNT) { if (!doCompose) { return false; } int32_t t; if (src != limit && - 0 < (t = ((int32_t)*src - Hangul::JAMO_T_BASE)) && + 0 < (t = (static_cast<int32_t>(*src) - Hangul::JAMO_T_BASE)) && t < Hangul::JAMO_T_COUNT) { // The next character is a Jamo T. ++src; @@ -1587,7 +1594,7 @@ Normalizer2Impl::compose(const char16_t *src, const char16_t *limit, if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { break; } - if(!buffer.appendBMP((char16_t)syllable, 0, errorCode)) { + if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) { break; } prevBoundary = src; @@ -1612,7 +1619,7 @@ Normalizer2Impl::compose(const char16_t *src, const char16_t *limit, if (prevBoundary != prevSrc && !buffer.appendZeroCC(prevBoundary, prevSrc, errorCode)) { break; } - if(!buffer.appendBMP((char16_t)syllable, 0, errorCode)) { + if (!buffer.appendBMP(static_cast<char16_t>(syllable), 0, errorCode)) { break; } prevBoundary = src; @@ -1763,7 +1770,7 @@ Normalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit, } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) + // or a "maybeYes" / "maybeNo" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. @@ -1784,8 +1791,9 @@ Normalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit, } } - if(isMaybeOrNonZeroCC(norm16)) { - uint8_t cc=getCCFromYesOrMaybe(norm16); + if (norm16 >= minMaybeNo) { + uint16_t fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16); + uint8_t cc = fcd16 >> 8; if (onlyContiguous /* FCC */ && cc != 0 && getTrailCCFromCompYesAndZeroCC(prevNorm16) > cc) { // The [prevBoundary..prevSrc[ character @@ -1806,11 +1814,12 @@ Normalizer2Impl::composeQuickCheck(const char16_t *src, const char16_t *limit, if (src == limit) { return src; } - uint8_t prevCC = cc; + uint8_t prevCC = fcd16; nextSrc = src; UCPTRIE_FAST_U16_NEXT(normTrie, UCPTRIE_16, nextSrc, limit, c, norm16); - if (isMaybeOrNonZeroCC(norm16)) { - cc = getCCFromYesOrMaybe(norm16); + if (norm16 >= minMaybeNo) { + fcd16 = getFCD16FromMaybeOrNonZeroCC(norm16); + cc = fcd16 >> 8; if (!(prevCC <= cc || cc == 0)) { break; } @@ -1845,11 +1854,11 @@ void Normalizer2Impl::composeAndAppend(const char16_t *src, const char16_t *limi if(src!=firstStarterInSrc) { const char16_t *lastStarterInDest=findPreviousCompBoundary(buffer.getStart(), buffer.getLimit(), onlyContiguous); - int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastStarterInDest); + int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastStarterInDest); UnicodeString middle(lastStarterInDest, destSuffixLength); buffer.removeSuffix(destSuffixLength); safeMiddle=middle; - middle.append(src, (int32_t)(firstStarterInSrc-src)); + middle.append(src, static_cast<int32_t>(firstStarterInSrc - src)); const char16_t *middleStart=middle.getBuffer(); compose(middleStart, middleStart+middle.length(), onlyContiguous, true, buffer, errorCode); @@ -1903,12 +1912,12 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, } // isCompYesAndZeroCC(norm16) is false, that is, norm16>=minNoNo. // The current character is either a "noNo" (has a mapping) - // or a "maybeYes" (combines backward) + // or a "maybeYes" / "maybeNo" (combines backward) // or a "yesYes" with ccc!=0. // It is not a Hangul syllable or Jamo L because those have "yes" properties. // Medium-fast path: Handle cases that do not require full decomposition and recomposition. - if (!isMaybeOrNonZeroCC(norm16)) { // minNoNo <= norm16 < minMaybeYes + if (norm16 < minMaybeNo) { // minNoNo <= norm16 < minMaybeNo if (sink == nullptr) { return false; } @@ -1937,9 +1946,9 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, *sink, options, edits, errorCode)) { break; } - const uint16_t *mapping = getMapping(norm16); + const uint16_t *mapping = getDataForYesOrNo(norm16); int32_t length = *mapping++ & MAPPING_LENGTH_MASK; - if (!ByteSinkUtil::appendChange(prevSrc, src, (const char16_t *)mapping, length, + if (!ByteSinkUtil::appendChange(prevSrc, src, reinterpret_cast<const char16_t*>(mapping), length, *sink, edits, errorCode)) { break; } @@ -1958,7 +1967,7 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, break; } if (edits != nullptr) { - edits->addReplace((int32_t)(src - prevSrc), 0); + edits->addReplace(static_cast<int32_t>(src - prevSrc), 0); } prevBoundary = src; continue; @@ -1976,7 +1985,7 @@ Normalizer2Impl::composeUTF8(uint32_t options, UBool onlyContiguous, // The current character is a Jamo Vowel, // compose with previous Jamo L and following Jamo T. UChar32 l = prev - Hangul::JAMO_L_BASE; - if ((uint32_t)l < Hangul::JAMO_L_COUNT) { + if (static_cast<uint32_t>(l) < Hangul::JAMO_L_COUNT) { if (sink == nullptr) { return false; } @@ -2204,20 +2213,20 @@ uint8_t Normalizer2Impl::getPreviousTrailCC(const char16_t *start, const char16_ if (start == p) { return 0; } - int32_t i = (int32_t)(p - start); + int32_t i = static_cast<int32_t>(p - start); UChar32 c; U16_PREV(start, 0, i, c); - return (uint8_t)getFCD16(c); + return static_cast<uint8_t>(getFCD16(c)); } uint8_t Normalizer2Impl::getPreviousTrailCC(const uint8_t *start, const uint8_t *p) const { if (start == p) { return 0; } - int32_t i = (int32_t)(p - start); + int32_t i = static_cast<int32_t>(p - start); UChar32 c; U8_PREV(start, 0, i, c); - return (uint8_t)getFCD16(c); + return static_cast<uint8_t>(getFCD16(c)); } // Note: normalizer2impl.cpp r30982 (2011-nov-27) @@ -2245,7 +2254,7 @@ uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { return norm16|(norm16<<8); } else if(norm16>=minMaybeYes) { return 0; - } else { // isDecompNoAlgorithmic(norm16) + } else if(norm16<minMaybeNo) { // isDecompNoAlgorithmic(norm16) uint16_t deltaTrailCC = norm16 & DELTA_TCCC_MASK; if (deltaTrailCC <= DELTA_TCCC_1) { return deltaTrailCC >> OFFSET_SHIFT; @@ -2260,7 +2269,7 @@ uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { return 0; } // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16); + const uint16_t *mapping=getData(norm16); uint16_t firstUnit=*mapping; norm16=firstUnit>>8; // tccc if(firstUnit&MAPPING_HAS_CCC_LCCC_WORD) { @@ -2272,6 +2281,23 @@ uint16_t Normalizer2Impl::getFCD16FromNormData(UChar32 c) const { #pragma optimize( "", on ) #endif +uint16_t Normalizer2Impl::getFCD16FromMaybeOrNonZeroCC(uint16_t norm16) const { + U_ASSERT(norm16 >= minMaybeNo); + if (norm16 >= MIN_NORMAL_MAYBE_YES) { + // combining mark + norm16 = getCCFromNormalYesOrMaybe(norm16); + return norm16 | (norm16<<8); + } else if (norm16 >= minMaybeYes) { + return 0; + } + // c decomposes, get everything from the variable-length extra data + const uint16_t *mapping = getDataForMaybe(norm16); + uint16_t firstUnit = *mapping; + // maybeNo has lccc = 0 + U_ASSERT((firstUnit & MAPPING_HAS_CCC_LCCC_WORD) == 0 || (*(mapping - 1) & 0xff00) == 0); + return firstUnit >> 8; // tccc +} + // Dual functionality: // buffer!=nullptr: normalize // buffer==nullptr: isNormalized/quickCheck/spanQuickCheckYes @@ -2395,7 +2421,7 @@ Normalizer2Impl::makeFCD(const char16_t *src, const char16_t *limit, * already but is now going to be decomposed. * prevSrc is set to after what was copied/appended. */ - buffer->removeSuffix((int32_t)(prevSrc-prevBoundary)); + buffer->removeSuffix(static_cast<int32_t>(prevSrc - prevBoundary)); /* * Find the part of the source that needs to be decomposed, * up to the next safe boundary. @@ -2426,11 +2452,11 @@ void Normalizer2Impl::makeFCDAndAppend(const char16_t *src, const char16_t *limi if(src!=firstBoundaryInSrc) { const char16_t *lastBoundaryInDest=findPreviousFCDBoundary(buffer.getStart(), buffer.getLimit()); - int32_t destSuffixLength=(int32_t)(buffer.getLimit()-lastBoundaryInDest); + int32_t destSuffixLength = static_cast<int32_t>(buffer.getLimit() - lastBoundaryInDest); UnicodeString middle(lastBoundaryInDest, destSuffixLength); buffer.removeSuffix(destSuffixLength); safeMiddle=middle; - middle.append(src, (int32_t)(firstBoundaryInSrc-src)); + middle.append(src, static_cast<int32_t>(firstBoundaryInSrc - src)); const char16_t *middleStart=middle.getBuffer(); makeFCD(middleStart, middleStart+middle.length(), &buffer, errorCode); if(U_FAILURE(errorCode)) { @@ -2507,8 +2533,8 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode if(U_FAILURE(errorCode)) { return; } - UChar32 firstOrigin=(UChar32)(canonValue&CANON_VALUE_MASK); - canonValue=(canonValue&~CANON_VALUE_MASK)|CANON_HAS_SET|(uint32_t)canonStartSets.size(); + UChar32 firstOrigin = static_cast<UChar32>(canonValue & CANON_VALUE_MASK); + canonValue = (canonValue & ~CANON_VALUE_MASK) | CANON_HAS_SET | static_cast<uint32_t>(canonStartSets.size()); umutablecptrie_set(mutableTrie, decompLead, canonValue, &errorCode); canonStartSets.adoptElement(lpSet.orphan(), errorCode); if (U_FAILURE(errorCode)) { @@ -2518,7 +2544,7 @@ void CanonIterData::addToStartSet(UChar32 origin, UChar32 decompLead, UErrorCode set->add(firstOrigin); } } else { - set=(UnicodeSet *)canonStartSets[(int32_t)(canonValue&CANON_VALUE_MASK)]; + set = static_cast<UnicodeSet*>(canonStartSets[static_cast<int32_t>(canonValue & CANON_VALUE_MASK)]); } set->add(origin); } @@ -2575,9 +2601,11 @@ void InitCanonIterData::doInit(Normalizer2Impl *impl, UErrorCode &errorCode) { void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, const uint16_t norm16, CanonIterData &newData, UErrorCode &errorCode) const { - if(isInert(norm16) || (minYesNo<=norm16 && norm16<minNoNo)) { + if(isInert(norm16) || + (minYesNo<=norm16 && norm16<minNoNo) || + (minMaybeNo<=norm16 && norm16<minMaybeYes)) { // Inert, or 2-way mapping (including Hangul syllable). - // We do not write a canonStartSet for any yesNo character. + // We do not write a canonStartSet for any yesNo/maybeNo character. // Composites from 2-way mappings are added at runtime from the // starter's compositions list, and the other characters in // 2-way mappings get CANON_NOT_SEGMENT_STARTER set because they are @@ -2587,7 +2615,7 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, co for(UChar32 c=start; c<=end; ++c) { uint32_t oldValue = umutablecptrie_get(newData.mutableTrie, c); uint32_t newValue=oldValue; - if(isMaybeOrNonZeroCC(norm16)) { + if(isMaybeYesOrNonZeroCC(norm16)) { // not a segment starter if it occurs in a decomposition or has cc!=0 newValue|=CANON_NOT_SEGMENT_STARTER; if(norm16<MIN_NORMAL_MAYBE_YES) { @@ -2609,7 +2637,7 @@ void Normalizer2Impl::makeCanonIterDataFromNorm16(UChar32 start, UChar32 end, co } if (norm16_2 > minYesNo) { // c decomposes, get everything from the variable-length extra data - const uint16_t *mapping=getMapping(norm16_2); + const uint16_t *mapping=getDataForYesOrNo(norm16_2); uint16_t firstUnit=*mapping; int32_t length=firstUnit&MAPPING_LENGTH_MASK; if((firstUnit&MAPPING_HAS_CCC_LCCC_WORD)!=0) { @@ -2657,11 +2685,11 @@ UBool Normalizer2Impl::ensureCanonIterData(UErrorCode &errorCode) const { } int32_t Normalizer2Impl::getCanonValue(UChar32 c) const { - return (int32_t)ucptrie_get(fCanonIterData->trie, c); + return static_cast<int32_t>(ucptrie_get(fCanonIterData->trie, c)); } const UnicodeSet &Normalizer2Impl::getCanonStartSet(int32_t n) const { - return *(const UnicodeSet *)fCanonIterData->canonStartSets[n]; + return *static_cast<const UnicodeSet*>(fCanonIterData->canonStartSets[n]); } UBool Normalizer2Impl::isCanonSegmentStarter(UChar32 c) const { @@ -2684,7 +2712,7 @@ UBool Normalizer2Impl::getCanonStartSet(UChar32 c, UnicodeSet &set) const { uint16_t norm16=getRawNorm16(c); if(norm16==JAMO_L) { UChar32 syllable= - (UChar32)(Hangul::HANGUL_BASE+(c-Hangul::JAMO_L_BASE)*Hangul::JAMO_VT_COUNT); + static_cast<UChar32>(Hangul::HANGUL_BASE + (c - Hangul::JAMO_L_BASE) * Hangul::JAMO_VT_COUNT); set.add(syllable, syllable+Hangul::JAMO_VT_COUNT-1); } else { addComposites(getCompositionsList(norm16), set); @@ -2728,7 +2756,7 @@ unorm2_swap(const UDataSwapper *ds, pInfo->dataFormat[1]==0x72 && pInfo->dataFormat[2]==0x6d && pInfo->dataFormat[3]==0x32 && - (1<=formatVersion0 && formatVersion0<=4) + (1<=formatVersion0 && formatVersion0<=5) )) { udata_printError(ds, "unorm2_swap(): data format %02x.%02x.%02x.%02x (format version %02x) is not recognized as Normalizer2 data\n", pInfo->dataFormat[0], pInfo->dataFormat[1], @@ -2747,8 +2775,10 @@ unorm2_swap(const UDataSwapper *ds, minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_YES+1; } else if(formatVersion0==2) { minIndexesLength=Normalizer2Impl::IX_MIN_YES_NO_MAPPINGS_ONLY+1; - } else { + } else if(formatVersion0<=4) { minIndexesLength=Normalizer2Impl::IX_MIN_LCCC_CP+1; + } else { + minIndexesLength=Normalizer2Impl::IX_MIN_MAYBE_NO_COMBINES_FWD+1; } if(length>=0) { |