#include <mbgl/text/bidi.hpp>
#include <mbgl/util/traits.hpp>

#include <unicode/ubidi.h>
#include <unicode/ushape.h>

#include <memory>
#include <stdexcept>
#include <utility>
| 9 | |
| 10 | namespace mbgl { |
| 11 | |
| 12 | class BiDiImpl { |
| 13 | public: |
| 14 | BiDiImpl() : bidiText(ubidi_open()), bidiLine(ubidi_open()) { |
| 15 | } |
| 16 | ~BiDiImpl() { |
| 17 | ubidi_close(pBiDi: bidiText); |
| 18 | ubidi_close(pBiDi: bidiLine); |
| 19 | } |
| 20 | |
| 21 | UBiDi* bidiText = nullptr; |
| 22 | UBiDi* bidiLine = nullptr; |
| 23 | }; |
| 24 | |
| 25 | BiDi::BiDi() : impl(std::make_unique<BiDiImpl>()) {} |
| 26 | BiDi::~BiDi() = default; |
| 27 | |
| 28 | // Takes UTF16 input in logical order and applies Arabic shaping to the input while maintaining |
| 29 | // logical order. Output won't be intelligible until the bidirectional algorithm is applied |
| 30 | std::u16string applyArabicShaping(const std::u16string& input) { |
| 31 | UErrorCode errorCode = U_ZERO_ERROR; |
| 32 | |
| 33 | const int32_t outputLength = |
| 34 | u_shapeArabic(source: mbgl::utf16char_cast<const UChar*>(in: input.c_str()), sourceLength: static_cast<int32_t>(input.size()), dest: nullptr, destSize: 0, |
| 35 | options: (U_SHAPE_LETTERS_SHAPE & U_SHAPE_LETTERS_MASK) | |
| 36 | (U_SHAPE_TEXT_DIRECTION_LOGICAL & U_SHAPE_TEXT_DIRECTION_MASK), |
| 37 | pErrorCode: &errorCode); |
| 38 | |
| 39 | // Pre-flighting will always set U_BUFFER_OVERFLOW_ERROR |
| 40 | errorCode = U_ZERO_ERROR; |
| 41 | |
| 42 | std::u16string outputText(outputLength, 0); |
| 43 | |
| 44 | u_shapeArabic(source: mbgl::utf16char_cast<const UChar*>(in: input.c_str()), sourceLength: static_cast<int32_t>(input.size()), dest: mbgl::utf16char_cast<UChar*>(in: &outputText[0]), destSize: outputLength, |
| 45 | options: (U_SHAPE_LETTERS_SHAPE & U_SHAPE_LETTERS_MASK) | |
| 46 | (U_SHAPE_TEXT_DIRECTION_LOGICAL & U_SHAPE_TEXT_DIRECTION_MASK), |
| 47 | pErrorCode: &errorCode); |
| 48 | |
| 49 | // If the algorithm fails for any reason, fall back to non-transformed text |
| 50 | if (U_FAILURE(code: errorCode)) |
| 51 | return input; |
| 52 | |
| 53 | return outputText; |
| 54 | } |
| 55 | |
| 56 | void BiDi::mergeParagraphLineBreaks(std::set<size_t>& lineBreakPoints) { |
| 57 | int32_t paragraphCount = ubidi_countParagraphs(pBiDi: impl->bidiText); |
| 58 | for (int32_t i = 0; i < paragraphCount; i++) { |
| 59 | UErrorCode errorCode = U_ZERO_ERROR; |
| 60 | int32_t paragraphEndIndex; |
| 61 | ubidi_getParagraphByIndex(pBiDi: impl->bidiText, paraIndex: i, pParaStart: nullptr, pParaLimit: ¶graphEndIndex, pParaLevel: nullptr, pErrorCode: &errorCode); |
| 62 | |
| 63 | if (U_FAILURE(code: errorCode)) { |
| 64 | throw std::runtime_error(std::string("ProcessedBiDiText::mergeParagraphLineBreaks: " ) + |
| 65 | u_errorName(code: errorCode)); |
| 66 | } |
| 67 | |
| 68 | lineBreakPoints.insert(x: static_cast<std::size_t>(paragraphEndIndex)); |
| 69 | } |
| 70 | } |
| 71 | |
| 72 | std::vector<std::u16string> BiDi::applyLineBreaking(std::set<std::size_t> lineBreakPoints) { |
| 73 | // BiDi::getLine will error if called across a paragraph boundary, so we need to ensure that all |
| 74 | // paragraph boundaries are included in the set of line break points. The calling code might not |
| 75 | // include the line break because it didn't need to wrap at that point, or because the text was |
| 76 | // separated with a more exotic code point such as (U+001C) |
| 77 | mergeParagraphLineBreaks(lineBreakPoints); |
| 78 | |
| 79 | std::vector<std::u16string> transformedLines; |
| 80 | transformedLines.reserve(n: lineBreakPoints.size()); |
| 81 | |
| 82 | std::size_t start = 0; |
| 83 | for (std::size_t lineBreakPoint : lineBreakPoints) { |
| 84 | transformedLines.push_back(x: getLine(start, end: lineBreakPoint)); |
| 85 | start = lineBreakPoint; |
| 86 | } |
| 87 | |
| 88 | return transformedLines; |
| 89 | } |
| 90 | |
| 91 | std::vector<std::u16string> BiDi::processText(const std::u16string& input, |
| 92 | std::set<std::size_t> lineBreakPoints) { |
| 93 | UErrorCode errorCode = U_ZERO_ERROR; |
| 94 | |
| 95 | ubidi_setPara(pBiDi: impl->bidiText, text: mbgl::utf16char_cast<const UChar*>(in: input.c_str()), length: static_cast<int32_t>(input.size()), |
| 96 | UBIDI_DEFAULT_LTR, embeddingLevels: nullptr, pErrorCode: &errorCode); |
| 97 | |
| 98 | if (U_FAILURE(code: errorCode)) { |
| 99 | throw std::runtime_error(std::string("BiDi::processText: " ) + u_errorName(code: errorCode)); |
| 100 | } |
| 101 | |
| 102 | return applyLineBreaking(lineBreakPoints); |
| 103 | } |
| 104 | |
| 105 | std::u16string BiDi::getLine(std::size_t start, std::size_t end) { |
| 106 | UErrorCode errorCode = U_ZERO_ERROR; |
| 107 | ubidi_setLine(pParaBiDi: impl->bidiText, start: static_cast<int32_t>(start), limit: static_cast<int32_t>(end), pLineBiDi: impl->bidiLine, pErrorCode: &errorCode); |
| 108 | |
| 109 | if (U_FAILURE(code: errorCode)) { |
| 110 | throw std::runtime_error(std::string("BiDi::getLine (setLine): " ) + u_errorName(code: errorCode)); |
| 111 | } |
| 112 | |
| 113 | // Because we set UBIDI_REMOVE_BIDI_CONTROLS, the output may be smaller than what we reserve |
| 114 | // Setting UBIDI_INSERT_LRM_FOR_NUMERIC would require |
| 115 | // ubidi_getLength(pBiDi)+2*ubidi_countRuns(pBiDi) |
| 116 | const int32_t outputLength = ubidi_getProcessedLength(pBiDi: impl->bidiLine); |
| 117 | std::u16string outputText(outputLength, 0); |
| 118 | |
| 119 | // UBIDI_DO_MIRRORING: Apply unicode mirroring of characters like parentheses |
| 120 | // UBIDI_REMOVE_BIDI_CONTROLS: Now that all the lines are set, remove control characters so that |
| 121 | // they don't show up on screen (some fonts have glyphs representing them) |
| 122 | ubidi_writeReordered(pBiDi: impl->bidiLine, dest: mbgl::utf16char_cast<UChar*>(in: &outputText[0]), destSize: outputLength, |
| 123 | UBIDI_DO_MIRRORING | UBIDI_REMOVE_BIDI_CONTROLS, pErrorCode: &errorCode); |
| 124 | |
| 125 | if (U_FAILURE(code: errorCode)) { |
| 126 | throw std::runtime_error(std::string("BiDi::getLine (writeReordered): " ) + |
| 127 | u_errorName(code: errorCode)); |
| 128 | } |
| 129 | |
| 130 | return outputText; |
| 131 | } |
| 132 | |
| 133 | } // end namespace mbgl |
| 134 | |