Diffstat (limited to 'thirdparty/thorvg/src/renderer/sw_engine')
15 files changed, 9499 insertions, 0 deletions
diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwCommon.h b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwCommon.h new file mode 100644 index 0000000000..c0cd8863a2 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwCommon.h @@ -0,0 +1,564 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef _TVG_SW_COMMON_H_ +#define _TVG_SW_COMMON_H_ + +#include "tvgCommon.h" +#include "tvgRender.h" + +#include <algorithm> + +#if 0 +#include <sys/time.h> +static double timeStamp() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (tv.tv_sec + tv.tv_usec / 1000000.0); +} +#endif + +#define SW_CURVE_TYPE_POINT 0 +#define SW_CURVE_TYPE_CUBIC 1 +#define SW_ANGLE_PI (180L << 16) +#define SW_ANGLE_2PI (SW_ANGLE_PI << 1) +#define SW_ANGLE_PI2 (SW_ANGLE_PI >> 1) +#define SW_ANGLE_PI4 (SW_ANGLE_PI >> 2) + +using SwCoord = signed long; +using SwFixed = signed long long; + +struct SwPoint +{ + SwCoord x, y; + + SwPoint& operator+=(const SwPoint& rhs) + { + x += rhs.x; + y += rhs.y; + return *this; + } + + SwPoint operator+(const SwPoint& rhs) const + { + return {x + rhs.x, y + rhs.y}; + } + + SwPoint operator-(const SwPoint& rhs) const + { + return {x - rhs.x, y - rhs.y}; + } + + bool operator==(const SwPoint& rhs) const + { + return (x == rhs.x && y == rhs.y); + } + + bool operator!=(const SwPoint& rhs) const + { + return (x != rhs.x || y != rhs.y); + } + + bool zero() const + { + if (x == 0 && y == 0) return true; + else return false; + } + + bool small() const + { + //2 is epsilon... + if (abs(x) < 2 && abs(y) < 2) return true; + else return false; + } + +}; + +struct SwSize +{ + SwCoord w, h; +}; + +struct SwOutline +{ + Array<SwPoint> pts; //the outline's points + Array<uint32_t> cntrs; //the contour end points + Array<uint8_t> types; //curve type + Array<bool> closed; //opened or closed path? 
+ FillRule fillRule; +}; + +struct SwSpan +{ + uint16_t x, y; + uint16_t len; + uint8_t coverage; +}; + +struct SwRleData +{ + SwSpan *spans; + uint32_t alloc; + uint32_t size; +}; + +struct SwBBox +{ + SwPoint min, max; + + void reset() + { + min.x = min.y = max.x = max.y = 0; + } +}; + +struct SwFill +{ + struct SwLinear { + float dx, dy; + float len; + float offset; + }; + + struct SwRadial { + float a11, a12, a13; + float a21, a22, a23; + float fx, fy, fr; + float dx, dy, dr; + float invA, a; + }; + + union { + SwLinear linear; + SwRadial radial; + }; + + uint32_t* ctable; + FillSpread spread; + + bool translucent; +}; + +struct SwStrokeBorder +{ + uint32_t ptsCnt; + uint32_t maxPts; + SwPoint* pts; + uint8_t* tags; + int32_t start; //index of current sub-path start point + bool movable; //true: for ends of lineto borders +}; + +struct SwStroke +{ + SwFixed angleIn; + SwFixed angleOut; + SwPoint center; + SwFixed lineLength; + SwFixed subPathAngle; + SwPoint ptStartSubPath; + SwFixed subPathLineLength; + SwFixed width; + SwFixed miterlimit; + + StrokeCap cap; + StrokeJoin join; + StrokeJoin joinSaved; + SwFill* fill = nullptr; + + SwStrokeBorder borders[2]; + + float sx, sy; + + bool firstPt; + bool closedSubPath; + bool handleWideStrokes; +}; + +struct SwDashStroke +{ + SwOutline* outline = nullptr; + float curLen = 0; + int32_t curIdx = 0; + Point ptStart = {0, 0}; + Point ptCur = {0, 0}; + float* pattern = nullptr; + uint32_t cnt = 0; + bool curOpGap = false; +}; + +struct SwShape +{ + SwOutline* outline = nullptr; + SwStroke* stroke = nullptr; + SwFill* fill = nullptr; + SwRleData* rle = nullptr; + SwRleData* strokeRle = nullptr; + SwBBox bbox; //Keep it boundary without stroke region. Using for optimal filling. + + bool fastTrack = false; //Fast Track: axis-aligned rectangle without any clips? +}; + +struct SwImage +{ + SwOutline* outline = nullptr; + SwRleData* rle = nullptr; + union { + pixel_t* data; //system based data pointer + uint32_t* buf32; //for explicit 32bits channels + uint8_t* buf8; //for explicit 8bits grayscale + }; + uint32_t w, h, stride; + int32_t ox = 0; //offset x + int32_t oy = 0; //offset y + float scale; + uint8_t channelSize; + + bool direct = false; //draw image directly (with offset) + bool scaled = false; //draw scaled image +}; + +typedef uint8_t(*SwMask)(uint8_t s, uint8_t d, uint8_t a); //src, dst, alpha +typedef uint32_t(*SwBlender)(uint32_t s, uint32_t d, uint8_t a); //src, dst, alpha +typedef uint32_t(*SwJoin)(uint8_t r, uint8_t g, uint8_t b, uint8_t a); //color channel join +typedef uint8_t(*SwAlpha)(uint8_t*); //blending alpha + +struct SwCompositor; + +struct SwSurface : Surface +{ + SwJoin join; + SwAlpha alphas[4]; //Alpha:2, InvAlpha:3, Luma:4, InvLuma:5 + SwBlender blender = nullptr; //blender (optional) + SwCompositor* compositor = nullptr; //compositor (optional) + BlendMethod blendMethod; //blending method (uint8_t) + + SwAlpha alpha(CompositeMethod method) + { + auto idx = (int)(method) - 2; //0: None, 1: ClipPath + return alphas[idx > 3 ? 0 : idx]; //CompositeMethod has only four Matting methods. 
+ } +}; + +struct SwCompositor : Compositor +{ + SwSurface* recoverSfc; //Recover surface when composition is started + SwCompositor* recoverCmp; //Recover compositor when composition is done + SwImage image; + SwBBox bbox; + bool valid; +}; + +struct SwMpool +{ + SwOutline* outline; + SwOutline* strokeOutline; + SwOutline* dashOutline; + unsigned allocSize; +}; + +static inline SwCoord TO_SWCOORD(float val) +{ + return SwCoord(val * 64.0f); +} + +static inline uint32_t JOIN(uint8_t c0, uint8_t c1, uint8_t c2, uint8_t c3) +{ + return (c0 << 24 | c1 << 16 | c2 << 8 | c3); +} + +static inline uint32_t ALPHA_BLEND(uint32_t c, uint32_t a) +{ + return (((((c >> 8) & 0x00ff00ff) * a + 0x00ff00ff) & 0xff00ff00) + + ((((c & 0x00ff00ff) * a + 0x00ff00ff) >> 8) & 0x00ff00ff)); +} + +static inline uint32_t INTERPOLATE(uint32_t s, uint32_t d, uint8_t a) +{ + return (((((((s >> 8) & 0xff00ff) - ((d >> 8) & 0xff00ff)) * a) + (d & 0xff00ff00)) & 0xff00ff00) + ((((((s & 0xff00ff) - (d & 0xff00ff)) * a) >> 8) + (d & 0xff00ff)) & 0xff00ff)); +} + +static inline uint8_t INTERPOLATE8(uint8_t s, uint8_t d, uint8_t a) +{ + return (((s) * (a) + 0xff) >> 8) + (((d) * ~(a) + 0xff) >> 8); +} + +static inline SwCoord HALF_STROKE(float width) +{ + return TO_SWCOORD(width * 0.5f); +} + +static inline uint8_t A(uint32_t c) +{ + return ((c) >> 24); +} + +static inline uint8_t IA(uint32_t c) +{ + return (~(c) >> 24); +} + +static inline uint8_t C1(uint32_t c) +{ + return ((c) >> 16); +} + +static inline uint8_t C2(uint32_t c) +{ + return ((c) >> 8); +} + +static inline uint8_t C3(uint32_t c) +{ + return (c); +} + +static inline uint32_t opBlendInterp(uint32_t s, uint32_t d, uint8_t a) +{ + return INTERPOLATE(s, d, a); +} + +static inline uint32_t opBlendNormal(uint32_t s, uint32_t d, uint8_t a) +{ + auto t = ALPHA_BLEND(s, a); + return t + ALPHA_BLEND(d, IA(t)); +} + +static inline uint32_t opBlendPreNormal(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + return s + ALPHA_BLEND(d, IA(s)); +} + +static inline uint32_t opBlendSrcOver(uint32_t s, TVG_UNUSED uint32_t d, TVG_UNUSED uint8_t a) +{ + return s; +} + +//TODO: BlendMethod could remove the alpha parameter. +static inline uint32_t opBlendDifference(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + //if (s > d) => s - d + //else => d - s + auto c1 = (C1(s) > C1(d)) ? (C1(s) - C1(d)) : (C1(d) - C1(s)); + auto c2 = (C2(s) > C2(d)) ? (C2(s) - C2(d)) : (C2(d) - C2(s)); + auto c3 = (C3(s) > C3(d)) ? 
(C3(s) - C3(d)) : (C3(d) - C3(s)); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendExclusion(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + //A + B - 2AB + auto c1 = std::min(255, C1(s) + C1(d) - std::min(255, (C1(s) * C1(d)) << 1)); + auto c2 = std::min(255, C2(s) + C2(d) - std::min(255, (C2(s) * C2(d)) << 1)); + auto c3 = std::min(255, C3(s) + C3(d) - std::min(255, (C3(s) * C3(d)) << 1)); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendAdd(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // s + d + auto c1 = std::min(C1(s) + C1(d), 255); + auto c2 = std::min(C2(s) + C2(d), 255); + auto c3 = std::min(C3(s) + C3(d), 255); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendScreen(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // s + d - s * d + auto c1 = C1(s) + C1(d) - MULTIPLY(C1(s), C1(d)); + auto c2 = C2(s) + C2(d) - MULTIPLY(C2(s), C2(d)); + auto c3 = C3(s) + C3(d) - MULTIPLY(C3(s), C3(d)); + return JOIN(255, c1, c2, c3); +} + + +static inline uint32_t opBlendMultiply(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // s * d + auto c1 = MULTIPLY(C1(s), C1(d)); + auto c2 = MULTIPLY(C2(s), C2(d)); + auto c3 = MULTIPLY(C3(s), C3(d)); + return JOIN(255, c1, c2, c3); +} + + +static inline uint32_t opBlendOverlay(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // if (2 * d < da) => 2 * s * d, + // else => 1 - 2 * (1 - s) * (1 - d) + auto c1 = (C1(d) < 128) ? std::min(255, 2 * MULTIPLY(C1(s), C1(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C1(s), 255 - C1(d)))); + auto c2 = (C2(d) < 128) ? std::min(255, 2 * MULTIPLY(C2(s), C2(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C2(s), 255 - C2(d)))); + auto c3 = (C3(d) < 128) ? std::min(255, 2 * MULTIPLY(C3(s), C3(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C3(s), 255 - C3(d)))); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendDarken(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // min(s, d) + auto c1 = std::min(C1(s), C1(d)); + auto c2 = std::min(C2(s), C2(d)); + auto c3 = std::min(C3(s), C3(d)); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendLighten(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // max(s, d) + auto c1 = std::max(C1(s), C1(d)); + auto c2 = std::max(C2(s), C2(d)); + auto c3 = std::max(C3(s), C3(d)); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendColorDodge(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // d / (1 - s) + auto is = 0xffffffff - s; + auto c1 = (C1(is) > 0) ? (C1(d) / C1(is)) : C1(d); + auto c2 = (C2(is) > 0) ? (C2(d) / C2(is)) : C2(d); + auto c3 = (C3(is) > 0) ? (C3(d) / C3(is)) : C3(d); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendColorBurn(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + // 1 - (1 - d) / s + auto id = 0xffffffff - d; + auto c1 = 255 - ((C1(s) > 0) ? (C1(id) / C1(s)) : C1(id)); + auto c2 = 255 - ((C2(s) > 0) ? (C2(id) / C2(s)) : C2(id)); + auto c3 = 255 - ((C3(s) > 0) ? (C3(id) / C3(s)) : C3(id)); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendHardLight(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + auto c1 = (C1(s) < 128) ? std::min(255, 2 * MULTIPLY(C1(s), C1(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C1(s), 255 - C1(d)))); + auto c2 = (C2(s) < 128) ? std::min(255, 2 * MULTIPLY(C2(s), C2(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C2(s), 255 - C2(d)))); + auto c3 = (C3(s) < 128) ? 
std::min(255, 2 * MULTIPLY(C3(s), C3(d))) : (255 - std::min(255, 2 * MULTIPLY(255 - C3(s), 255 - C3(d)))); + return JOIN(255, c1, c2, c3); +} + +static inline uint32_t opBlendSoftLight(uint32_t s, uint32_t d, TVG_UNUSED uint8_t a) +{ + //(255 - 2 * s) * (d * d) + (2 * s * b) + auto c1 = std::min(255, MULTIPLY(255 - std::min(255, 2 * C1(s)), MULTIPLY(C1(d), C1(d))) + 2 * MULTIPLY(C1(s), C1(d))); + auto c2 = std::min(255, MULTIPLY(255 - std::min(255, 2 * C2(s)), MULTIPLY(C2(d), C2(d))) + 2 * MULTIPLY(C2(s), C2(d))); + auto c3 = std::min(255, MULTIPLY(255 - std::min(255, 2 * C3(s)), MULTIPLY(C3(d), C3(d))) + 2 * MULTIPLY(C3(s), C3(d))); + return JOIN(255, c1, c2, c3); +} + + +int64_t mathMultiply(int64_t a, int64_t b); +int64_t mathDivide(int64_t a, int64_t b); +int64_t mathMulDiv(int64_t a, int64_t b, int64_t c); +void mathRotate(SwPoint& pt, SwFixed angle); +SwFixed mathTan(SwFixed angle); +SwFixed mathAtan(const SwPoint& pt); +SwFixed mathCos(SwFixed angle); +SwFixed mathSin(SwFixed angle); +void mathSplitCubic(SwPoint* base); +SwFixed mathDiff(SwFixed angle1, SwFixed angle2); +SwFixed mathLength(const SwPoint& pt); +bool mathSmallCubic(const SwPoint* base, SwFixed& angleIn, SwFixed& angleMid, SwFixed& angleOut); +SwFixed mathMean(SwFixed angle1, SwFixed angle2); +SwPoint mathTransform(const Point* to, const Matrix* transform); +bool mathUpdateOutlineBBox(const SwOutline* outline, const SwBBox& clipRegion, SwBBox& renderRegion, bool fastTrack); +bool mathClipBBox(const SwBBox& clipper, SwBBox& clipee); + +void shapeReset(SwShape* shape); +bool shapePrepare(SwShape* shape, const RenderShape* rshape, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid, bool hasComposite); +bool shapePrepared(const SwShape* shape); +bool shapeGenRle(SwShape* shape, const RenderShape* rshape, bool antiAlias); +void shapeDelOutline(SwShape* shape, SwMpool* mpool, uint32_t tid); +void shapeResetStroke(SwShape* shape, const RenderShape* rshape, const Matrix* transform); +bool shapeGenStrokeRle(SwShape* shape, const RenderShape* rshape, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid); +void shapeFree(SwShape* shape); +void shapeDelStroke(SwShape* shape); +bool shapeGenFillColors(SwShape* shape, const Fill* fill, const Matrix* transform, SwSurface* surface, uint8_t opacity, bool ctable); +bool shapeGenStrokeFillColors(SwShape* shape, const Fill* fill, const Matrix* transform, SwSurface* surface, uint8_t opacity, bool ctable); +void shapeResetFill(SwShape* shape); +void shapeResetStrokeFill(SwShape* shape); +void shapeDelFill(SwShape* shape); +void shapeDelStrokeFill(SwShape* shape); + +void strokeReset(SwStroke* stroke, const RenderShape* shape, const Matrix* transform); +bool strokeParseOutline(SwStroke* stroke, const SwOutline& outline); +SwOutline* strokeExportOutline(SwStroke* stroke, SwMpool* mpool, unsigned tid); +void strokeFree(SwStroke* stroke); + +bool imagePrepare(SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid); +bool imageGenRle(SwImage* image, const SwBBox& renderRegion, bool antiAlias); +void imageDelOutline(SwImage* image, SwMpool* mpool, uint32_t tid); +void imageReset(SwImage* image); +void imageFree(SwImage* image); + +bool fillGenColorTable(SwFill* fill, const Fill* fdata, const Matrix* transform, SwSurface* surface, uint8_t opacity, bool ctable); +void fillReset(SwFill* fill); +void fillFree(SwFill* 
fill); + +//OPTIMIZE_ME: Skip the function pointer access +void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask maskOp, uint8_t opacity); //composite masking ver. +void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask maskOp, uint8_t opacity); //direct masking ver. +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a); //blending ver. +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a); //blending + BlendingMethod(op2) ver. +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity); //matting ver. + +void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask op, uint8_t a); //composite masking ver. +void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask op, uint8_t a) ; //direct masking ver. +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a); //blending ver. +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a); //blending + BlendingMethod(op2) ver. +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity); //matting ver. + +SwRleData* rleRender(SwRleData* rle, const SwOutline* outline, const SwBBox& renderRegion, bool antiAlias); +SwRleData* rleRender(const SwBBox* bbox); +void rleFree(SwRleData* rle); +void rleReset(SwRleData* rle); +void rleMerge(SwRleData* rle, SwRleData* clip1, SwRleData* clip2); +void rleClipPath(SwRleData* rle, const SwRleData* clip); +void rleClipRect(SwRleData* rle, const SwBBox* clip); + +SwMpool* mpoolInit(uint32_t threads); +bool mpoolTerm(SwMpool* mpool); +bool mpoolClear(SwMpool* mpool); +SwOutline* mpoolReqOutline(SwMpool* mpool, unsigned idx); +void mpoolRetOutline(SwMpool* mpool, unsigned idx); +SwOutline* mpoolReqStrokeOutline(SwMpool* mpool, unsigned idx); +void mpoolRetStrokeOutline(SwMpool* mpool, unsigned idx); +SwOutline* mpoolReqDashOutline(SwMpool* mpool, unsigned idx); +void mpoolRetDashOutline(SwMpool* mpool, unsigned idx); + +bool rasterCompositor(SwSurface* surface); +bool rasterGradientShape(SwSurface* surface, SwShape* shape, unsigned id); +bool rasterShape(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a); +bool rasterImage(SwSurface* surface, SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox& bbox, uint8_t opacity); +bool rasterStroke(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a); +bool rasterGradientStroke(SwSurface* surface, SwShape* shape, unsigned id); +bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_t h); +void rasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len); +void rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len); +void rasterUnpremultiply(Surface* surface); +void rasterPremultiply(Surface* surface); +bool rasterConvertCS(Surface* surface, ColorSpace to); + +#endif /* _TVG_SW_COMMON_H_ */ diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwFill.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwFill.cpp new file mode 
100644 index 0000000000..cede9e6eb7 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwFill.cpp @@ -0,0 +1,779 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "tvgMath.h" +#include "tvgSwCommon.h" +#include "tvgFill.h" + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ + +#define RADIAL_A_THRESHOLD 0.0005f +#define GRADIENT_STOP_SIZE 1024 +#define FIXPT_BITS 8 +#define FIXPT_SIZE (1<<FIXPT_BITS) + +/* + * quadratic equation with the following coefficients (rx and ry defined in the _calculateCoefficients()): + * A = a // fill->radial.a + * B = 2 * (dr * fr + rx * dx + ry * dy) + * C = fr^2 - rx^2 - ry^2 + * Derivatives are computed with respect to dx. + * This procedure aims to optimize and eliminate the need to calculate all values from the beginning + * for consecutive x values with a constant y. The Taylor series expansions are computed as long as + * its terms are non-zero. 
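 *
 * A sketch of the per-pixel update this enables along a span (mirroring the fillRadial()
 * loops below, which evaluate the gradient parameter as sqrt(det) - b for each pixel):
 *   b        += deltaB;
 *   det      += deltaDet;
 *   deltaDet += deltaDeltaDet;
 * so once the coefficients are computed at the span start, every following pixel costs
 * only additions and one square root.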
+ */ +static void _calculateCoefficients(const SwFill* fill, uint32_t x, uint32_t y, float& b, float& deltaB, float& det, float& deltaDet, float& deltaDeltaDet) +{ + auto radial = &fill->radial; + + auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx; + auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy; + + b = (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy) * radial->invA; + deltaB = (radial->a11 * radial->dx + radial->a21 * radial->dy) * radial->invA; + + auto rr = rx * rx + ry * ry; + auto deltaRr = 2.0f * (rx * radial->a11 + ry * radial->a21) * radial->invA; + auto deltaDeltaRr = 2.0f * (radial->a11 * radial->a11 + radial->a21 * radial->a21) * radial->invA; + + det = b * b + (rr - radial->fr * radial->fr) * radial->invA; + deltaDet = 2.0f * b * deltaB + deltaB * deltaB + deltaRr + deltaDeltaRr; + deltaDeltaDet = 2.0f * deltaB * deltaB + deltaDeltaRr; +} + + +static bool _updateColorTable(SwFill* fill, const Fill* fdata, const SwSurface* surface, uint8_t opacity) +{ + if (!fill->ctable) { + fill->ctable = static_cast<uint32_t*>(malloc(GRADIENT_STOP_SIZE * sizeof(uint32_t))); + if (!fill->ctable) return false; + } + + const Fill::ColorStop* colors; + auto cnt = fdata->colorStops(&colors); + if (cnt == 0 || !colors) return false; + + auto pColors = colors; + + auto a = MULTIPLY(pColors->a, opacity); + if (a < 255) fill->translucent = true; + + auto r = pColors->r; + auto g = pColors->g; + auto b = pColors->b; + auto rgba = surface->join(r, g, b, a); + + auto inc = 1.0f / static_cast<float>(GRADIENT_STOP_SIZE); + auto pos = 1.5f * inc; + uint32_t i = 0; + + fill->ctable[i++] = ALPHA_BLEND(rgba | 0xff000000, a); + + while (pos <= pColors->offset) { + fill->ctable[i] = fill->ctable[i - 1]; + ++i; + pos += inc; + } + + for (uint32_t j = 0; j < cnt - 1; ++j) { + auto curr = colors + j; + auto next = curr + 1; + auto delta = 1.0f / (next->offset - curr->offset); + auto a2 = MULTIPLY(next->a, opacity); + if (!fill->translucent && a2 < 255) fill->translucent = true; + + auto rgba2 = surface->join(next->r, next->g, next->b, a2); + + while (pos < next->offset && i < GRADIENT_STOP_SIZE) { + auto t = (pos - curr->offset) * delta; + auto dist = static_cast<int32_t>(255 * t); + auto dist2 = 255 - dist; + + auto color = INTERPOLATE(rgba, rgba2, dist2); + fill->ctable[i] = ALPHA_BLEND((color | 0xff000000), (color >> 24)); + + ++i; + pos += inc; + } + rgba = rgba2; + a = a2; + } + rgba = ALPHA_BLEND((rgba | 0xff000000), a); + + for (; i < GRADIENT_STOP_SIZE; ++i) + fill->ctable[i] = rgba; + + //Make sure the last color stop is represented at the end of the table + fill->ctable[GRADIENT_STOP_SIZE - 1] = rgba; + + return true; +} + + +bool _prepareLinear(SwFill* fill, const LinearGradient* linear, const Matrix* transform) +{ + float x1, x2, y1, y2; + if (linear->linear(&x1, &y1, &x2, &y2) != Result::Success) return false; + + fill->linear.dx = x2 - x1; + fill->linear.dy = y2 - y1; + fill->linear.len = fill->linear.dx * fill->linear.dx + fill->linear.dy * fill->linear.dy; + + if (fill->linear.len < FLT_EPSILON) return true; + + fill->linear.dx /= fill->linear.len; + fill->linear.dy /= fill->linear.len; + fill->linear.offset = -fill->linear.dx * x1 - fill->linear.dy * y1; + + auto gradTransform = linear->transform(); + bool isTransformation = !mathIdentity((const Matrix*)(&gradTransform)); + + if (isTransformation) { + if (transform) gradTransform = mathMultiply(transform, &gradTransform); + } else if (transform) { + 
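        //no gradient-local transform is set; fall back to the shape's transform alone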
gradTransform = *transform; + isTransformation = true; + } + + if (isTransformation) { + Matrix invTransform; + if (!mathInverse(&gradTransform, &invTransform)) return false; + + fill->linear.offset += fill->linear.dx * invTransform.e13 + fill->linear.dy * invTransform.e23; + + auto dx = fill->linear.dx; + fill->linear.dx = dx * invTransform.e11 + fill->linear.dy * invTransform.e21; + fill->linear.dy = dx * invTransform.e12 + fill->linear.dy * invTransform.e22; + + fill->linear.len = fill->linear.dx * fill->linear.dx + fill->linear.dy * fill->linear.dy; + if (fill->linear.len < FLT_EPSILON) return true; + } + + return true; +} + + +bool _prepareRadial(SwFill* fill, const RadialGradient* radial, const Matrix* transform) +{ + auto cx = P(radial)->cx; + auto cy = P(radial)->cy; + auto r = P(radial)->r; + auto fx = P(radial)->fx; + auto fy = P(radial)->fy; + auto fr = P(radial)->fr; + + if (r < FLT_EPSILON) return true; + + fill->radial.dr = r - fr; + fill->radial.dx = cx - fx; + fill->radial.dy = cy - fy; + fill->radial.fr = fr; + fill->radial.fx = fx; + fill->radial.fy = fy; + fill->radial.a = fill->radial.dr * fill->radial.dr - fill->radial.dx * fill->radial.dx - fill->radial.dy * fill->radial.dy; + + //This condition fulfills the SVG 1.1 std: + //the focal point, if outside the end circle, is moved to be on the end circle + //See: the SVG 2 std requirements: https://www.w3.org/TR/SVG2/pservers.html#RadialGradientNotes + if (fill->radial.a < 0) { + auto dist = sqrtf(fill->radial.dx * fill->radial.dx + fill->radial.dy * fill->radial.dy); + fill->radial.fx = cx + r * (fx - cx) / dist; + fill->radial.fy = cy + r * (fy - cy) / dist; + fill->radial.dx = cx - fill->radial.fx; + fill->radial.dy = cy - fill->radial.fy; + fill->radial.a = fill->radial.dr * fill->radial.dr - fill->radial.dx * fill->radial.dx - fill->radial.dy * fill->radial.dy; + } + + if (fill->radial.a > 0) fill->radial.invA = 1.0f / fill->radial.a; + + auto gradTransform = radial->transform(); + bool isTransformation = !mathIdentity((const Matrix*)(&gradTransform)); + + if (transform) { + if (isTransformation) gradTransform = mathMultiply(transform, &gradTransform); + else { + gradTransform = *transform; + isTransformation = true; + } + } + + if (isTransformation) { + Matrix invTransform; + if (!mathInverse(&gradTransform, &invTransform)) return false; + fill->radial.a11 = invTransform.e11; + fill->radial.a12 = invTransform.e12; + fill->radial.a13 = invTransform.e13; + fill->radial.a21 = invTransform.e21; + fill->radial.a22 = invTransform.e22; + fill->radial.a23 = invTransform.e23; + } else { + fill->radial.a11 = fill->radial.a22 = 1.0f; + fill->radial.a12 = fill->radial.a13 = 0.0f; + fill->radial.a21 = fill->radial.a23 = 0.0f; + } + return true; +} + + +static inline uint32_t _clamp(const SwFill* fill, int32_t pos) +{ + switch (fill->spread) { + case FillSpread::Pad: { + if (pos >= GRADIENT_STOP_SIZE) pos = GRADIENT_STOP_SIZE - 1; + else if (pos < 0) pos = 0; + break; + } + case FillSpread::Repeat: { + pos = pos % GRADIENT_STOP_SIZE; + if (pos < 0) pos = GRADIENT_STOP_SIZE + pos; + break; + } + case FillSpread::Reflect: { + auto limit = GRADIENT_STOP_SIZE * 2; + pos = pos % limit; + if (pos < 0) pos = limit + pos; + if (pos >= GRADIENT_STOP_SIZE) pos = (limit - pos - 1); + break; + } + } + return pos; +} + + +static inline uint32_t _fixedPixel(const SwFill* fill, int32_t pos) +{ + int32_t i = (pos + (FIXPT_SIZE / 2)) >> FIXPT_BITS; + return fill->ctable[_clamp(fill, i)]; +} + + +static inline uint32_t _pixel(const SwFill* fill, 
float pos) +{ + auto i = static_cast<int32_t>(pos * (GRADIENT_STOP_SIZE - 1) + 0.5f); + return fill->ctable[_clamp(fill, i)]; +} + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + + +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) +{ + //edge case + if (fill->radial.a < RADIAL_A_THRESHOLD) { + auto radial = &fill->radial; + auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx; + auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy; + + if (opacity == 255) { + for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) { + auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy); + *dst = opBlendNormal(_pixel(fill, x0), *dst, alpha(cmp)); + rx += radial->a11; + ry += radial->a21; + } + } else { + for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) { + auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy); + *dst = opBlendNormal(_pixel(fill, x0), *dst, MULTIPLY(opacity, alpha(cmp))); + rx += radial->a11; + ry += radial->a21; + } + } + } else { + float b, deltaB, det, deltaDet, deltaDeltaDet; + _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet); + + if (opacity == 255) { + for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) { + *dst = opBlendNormal(_pixel(fill, sqrtf(det) - b), *dst, alpha(cmp)); + det += deltaDet; + deltaDet += deltaDeltaDet; + b += deltaB; + } + } else { + for (uint32_t i = 0 ; i < len ; ++i, ++dst, cmp += csize) { + *dst = opBlendNormal(_pixel(fill, sqrtf(det) - b), *dst, MULTIPLY(opacity, alpha(cmp))); + det += deltaDet; + deltaDet += deltaDeltaDet; + b += deltaB; + } + } + } +} + + +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a) +{ + if (fill->radial.a < RADIAL_A_THRESHOLD) { + auto radial = &fill->radial; + auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx; + auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy; + for (uint32_t i = 0; i < len; ++i, ++dst) { + auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy); + *dst = op(_pixel(fill, x0), *dst, a); + rx += radial->a11; + ry += radial->a21; + } + } else { + float b, deltaB, det, deltaDet, deltaDeltaDet; + _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet); + + for (uint32_t i = 0; i < len; ++i, ++dst) { + *dst = op(_pixel(fill, sqrtf(det) - b), *dst, a); + det += deltaDet; + deltaDet += deltaDeltaDet; + b += deltaB; + } + } +} + + +void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask maskOp, uint8_t a) +{ + if (fill->radial.a < RADIAL_A_THRESHOLD) { + auto radial = &fill->radial; + auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx; + auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy; + for (uint32_t i = 0 ; i < len ; ++i, ++dst) { + auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy); + auto src = 
MULTIPLY(a, A(_pixel(fill, x0))); + *dst = maskOp(src, *dst, ~src); + rx += radial->a11; + ry += radial->a21; + } + } else { + float b, deltaB, det, deltaDet, deltaDeltaDet; + _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet); + + for (uint32_t i = 0 ; i < len ; ++i, ++dst) { + auto src = MULTIPLY(a, A(_pixel(fill, sqrtf(det) - b))); + *dst = maskOp(src, *dst, ~src); + det += deltaDet; + deltaDet += deltaDeltaDet; + b += deltaB; + } + } +} + + +void fillRadial(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask maskOp, uint8_t a) +{ + if (fill->radial.a < RADIAL_A_THRESHOLD) { + auto radial = &fill->radial; + auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx; + auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy; + for (uint32_t i = 0 ; i < len ; ++i, ++dst, ++cmp) { + auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy); + auto src = MULTIPLY(A(A(_pixel(fill, x0))), a); + auto tmp = maskOp(src, *cmp, 0); + *dst = tmp + MULTIPLY(*dst, ~tmp); + rx += radial->a11; + ry += radial->a21; + } + } else { + float b, deltaB, det, deltaDet, deltaDeltaDet; + _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet); + + for (uint32_t i = 0 ; i < len ; ++i, ++dst, ++cmp) { + auto src = MULTIPLY(A(_pixel(fill, sqrtf(det))), a); + auto tmp = maskOp(src, *cmp, 0); + *dst = tmp + MULTIPLY(*dst, ~tmp); + deltaDet += deltaDeltaDet; + b += deltaB; + } + } +} + + +void fillRadial(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a) +{ + if (fill->radial.a < RADIAL_A_THRESHOLD) { + auto radial = &fill->radial; + auto rx = (x + 0.5f) * radial->a11 + (y + 0.5f) * radial->a12 + radial->a13 - radial->fx; + auto ry = (x + 0.5f) * radial->a21 + (y + 0.5f) * radial->a22 + radial->a23 - radial->fy; + + if (a == 255) { + for (uint32_t i = 0; i < len; ++i, ++dst) { + auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy); + auto tmp = op(_pixel(fill, x0), *dst, 255); + *dst = op2(tmp, *dst, 255); + rx += radial->a11; + ry += radial->a21; + } + } else { + for (uint32_t i = 0; i < len; ++i, ++dst) { + auto x0 = 0.5f * (rx * rx + ry * ry - radial->fr * radial->fr) / (radial->dr * radial->fr + rx * radial->dx + ry * radial->dy); + auto tmp = op(_pixel(fill, x0), *dst, 255); + auto tmp2 = op2(tmp, *dst, 255); + *dst = INTERPOLATE(tmp2, *dst, a); + rx += radial->a11; + ry += radial->a21; + } + } + } else { + float b, deltaB, det, deltaDet, deltaDeltaDet; + _calculateCoefficients(fill, x, y, b, deltaB, det, deltaDet, deltaDeltaDet); + if (a == 255) { + for (uint32_t i = 0 ; i < len ; ++i, ++dst) { + auto tmp = op(_pixel(fill, sqrtf(det) - b), *dst, 255); + *dst = op2(tmp, *dst, 255); + det += deltaDet; + deltaDet += deltaDeltaDet; + b += deltaB; + } + } else { + for (uint32_t i = 0 ; i < len ; ++i, ++dst) { + auto tmp = op(_pixel(fill, sqrtf(det) - b), *dst, 255); + auto tmp2 = op2(tmp, *dst, 255); + *dst = INTERPOLATE(tmp2, *dst, a); + det += deltaDet; + deltaDet += deltaDeltaDet; + b += deltaB; + } + } + } +} + + +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) +{ + //Rotation + float rx = x + 0.5f; + float ry = y + 0.5f; + float t = (fill->linear.dx * rx + 
fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1); + float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1); + + if (opacity == 255) { + if (mathZero(inc)) { + auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE)); + for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) { + *dst = opBlendNormal(color, *dst, alpha(cmp)); + } + return; + } + + auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast<int32_t>(t * FIXPT_SIZE); + auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) { + *dst = opBlendNormal(_fixedPixel(fill, t2), *dst, alpha(cmp)); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + *dst = opBlendNormal(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, alpha(cmp)); + ++dst; + t += inc; + cmp += csize; + } + } + } else { + if (mathZero(inc)) { + auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE)); + for (uint32_t i = 0; i < len; ++i, ++dst, cmp += csize) { + *dst = opBlendNormal(color, *dst, MULTIPLY(alpha(cmp), opacity)); + } + return; + } + + auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast<int32_t>(t * FIXPT_SIZE); + auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst, cmp += csize) { + *dst = opBlendNormal(_fixedPixel(fill, t2), *dst, MULTIPLY(alpha(cmp), opacity)); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + *dst = opBlendNormal(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, MULTIPLY(opacity, alpha(cmp))); + ++dst; + t += inc; + cmp += csize; + } + } + } +} + + +void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask maskOp, uint8_t a) +{ + //Rotation + float rx = x + 0.5f; + float ry = y + 0.5f; + float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1); + float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1); + + if (mathZero(inc)) { + auto src = MULTIPLY(a, A(_fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE)))); + for (uint32_t i = 0; i < len; ++i, ++dst) { + *dst = maskOp(src, *dst, ~src); + } + return; + } + + auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast<int32_t>(t * FIXPT_SIZE); + auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst) { + auto src = MULTIPLY(_fixedPixel(fill, t2), a); + *dst = maskOp(src, *dst, ~src); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + auto src = MULTIPLY(_pixel(fill, t / GRADIENT_STOP_SIZE), a); + *dst = maskOp(src, *dst, ~src); + ++dst; + t += inc; + } + } +} + + +void fillLinear(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask maskOp, uint8_t a) +{ + //Rotation + float rx = x + 0.5f; + float ry = y + 0.5f; + float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1); + float inc = (fill->linear.dx) * 
(GRADIENT_STOP_SIZE - 1); + + if (mathZero(inc)) { + auto src = A(_fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE))); + src = MULTIPLY(src, a); + for (uint32_t i = 0; i < len; ++i, ++dst, ++cmp) { + auto tmp = maskOp(src, *cmp, 0); + *dst = tmp + MULTIPLY(*dst, ~tmp); + } + return; + } + + auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast<int32_t>(t * FIXPT_SIZE); + auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst, ++cmp) { + auto src = MULTIPLY(a, A(_fixedPixel(fill, t2))); + auto tmp = maskOp(src, *cmp, 0); + *dst = tmp + MULTIPLY(*dst, ~tmp); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + auto src = MULTIPLY(A(_pixel(fill, t / GRADIENT_STOP_SIZE)), a); + auto tmp = maskOp(src, *cmp, 0); + *dst = tmp + MULTIPLY(*dst, ~tmp); + ++dst; + ++cmp; + t += inc; + } + } +} + + +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a) +{ + //Rotation + float rx = x + 0.5f; + float ry = y + 0.5f; + float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1); + float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1); + + if (mathZero(inc)) { + auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE)); + for (uint32_t i = 0; i < len; ++i, ++dst) { + *dst = op(color, *dst, a); + } + return; + } + + auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast<int32_t>(t * FIXPT_SIZE); + auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst) { + *dst = op(_fixedPixel(fill, t2), *dst, a); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + *dst = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, a); + ++dst; + t += inc; + } + } +} + + +void fillLinear(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a) +{ + //Rotation + float rx = x + 0.5f; + float ry = y + 0.5f; + float t = (fill->linear.dx * rx + fill->linear.dy * ry + fill->linear.offset) * (GRADIENT_STOP_SIZE - 1); + float inc = (fill->linear.dx) * (GRADIENT_STOP_SIZE - 1); + + if (mathZero(inc)) { + auto color = _fixedPixel(fill, static_cast<int32_t>(t * FIXPT_SIZE)); + if (a == 255) { + for (uint32_t i = 0; i < len; ++i, ++dst) { + auto tmp = op(color, *dst, a); + *dst = op2(tmp, *dst, 255); + } + } else { + for (uint32_t i = 0; i < len; ++i, ++dst) { + auto tmp = op(color, *dst, a); + auto tmp2 = op2(tmp, *dst, 255); + *dst = INTERPOLATE(tmp2, *dst, a); + } + } + return; + } + + auto vMax = static_cast<float>(INT32_MAX >> (FIXPT_BITS + 1)); + auto vMin = -vMax; + auto v = t + (inc * len); + + if (a == 255) { + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast<int32_t>(t * FIXPT_SIZE); + auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst) { + auto tmp = op(_fixedPixel(fill, t2), *dst, 255); + *dst = op2(tmp, *dst, 255); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + auto tmp = op(_pixel(fill, t / 
GRADIENT_STOP_SIZE), *dst, 255); + *dst = op2(tmp, *dst, 255); + ++dst; + t += inc; + } + } + } else { + //we can use fixed point math + if (v < vMax && v > vMin) { + auto t2 = static_cast<int32_t>(t * FIXPT_SIZE); + auto inc2 = static_cast<int32_t>(inc * FIXPT_SIZE); + for (uint32_t j = 0; j < len; ++j, ++dst) { + auto tmp = op(_fixedPixel(fill, t2), *dst, 255); + auto tmp2 = op2(tmp, *dst, 255); + *dst = INTERPOLATE(tmp2, *dst, a); + t2 += inc2; + } + //we have to fallback to float math + } else { + uint32_t counter = 0; + while (counter++ < len) { + auto tmp = op(_pixel(fill, t / GRADIENT_STOP_SIZE), *dst, 255); + auto tmp2 = op2(tmp, *dst, 255); + *dst = INTERPOLATE(tmp2, *dst, a); + ++dst; + t += inc; + } + } + } +} + + +bool fillGenColorTable(SwFill* fill, const Fill* fdata, const Matrix* transform, SwSurface* surface, uint8_t opacity, bool ctable) +{ + if (!fill) return false; + + fill->spread = fdata->spread(); + + if (ctable) { + if (!_updateColorTable(fill, fdata, surface, opacity)) return false; + } + + if (fdata->identifier() == TVG_CLASS_ID_LINEAR) { + return _prepareLinear(fill, static_cast<const LinearGradient*>(fdata), transform); + } else if (fdata->identifier() == TVG_CLASS_ID_RADIAL) { + return _prepareRadial(fill, static_cast<const RadialGradient*>(fdata), transform); + } + + //LOG: What type of gradient?! + + return false; +} + + +void fillReset(SwFill* fill) +{ + if (fill->ctable) { + free(fill->ctable); + fill->ctable = nullptr; + } + fill->translucent = false; +} + + +void fillFree(SwFill* fill) +{ + if (!fill) return; + + if (fill->ctable) free(fill->ctable); + + free(fill); +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwImage.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwImage.cpp new file mode 100644 index 0000000000..b1624037bc --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwImage.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "tvgMath.h" +#include "tvgSwCommon.h" + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ + +static inline bool _onlyShifted(const Matrix* m) +{ + if (mathEqual(m->e11, 1.0f) && mathEqual(m->e22, 1.0f) && mathZero(m->e12) && mathZero(m->e21)) return true; + return false; +} + + +static bool _genOutline(SwImage* image, const RenderMesh* mesh, const Matrix* transform, SwMpool* mpool, unsigned tid) +{ + image->outline = mpoolReqOutline(mpool, tid); + auto outline = image->outline; + + outline->pts.reserve(5); + outline->types.reserve(5); + outline->cntrs.reserve(1); + outline->closed.reserve(1); + + Point to[4]; + if (mesh->triangleCnt > 0) { + // TODO: Optimise me. We appear to calculate this exact min/max bounding area in multiple + // places. We should be able to re-use one we have already done? Also see: + // tvgPicture.h --> bounds + // tvgSwRasterTexmap.h --> _rasterTexmapPolygonMesh + // + // TODO: Should we calculate the exact path(s) of the triangle mesh instead? + // i.e. copy tvgSwShape.capp -> _genOutline? + // + // TODO: Cntrs? + auto triangles = mesh->triangles; + auto min = triangles[0].vertex[0].pt; + auto max = triangles[0].vertex[0].pt; + + for (uint32_t i = 0; i < mesh->triangleCnt; ++i) { + if (triangles[i].vertex[0].pt.x < min.x) min.x = triangles[i].vertex[0].pt.x; + else if (triangles[i].vertex[0].pt.x > max.x) max.x = triangles[i].vertex[0].pt.x; + if (triangles[i].vertex[0].pt.y < min.y) min.y = triangles[i].vertex[0].pt.y; + else if (triangles[i].vertex[0].pt.y > max.y) max.y = triangles[i].vertex[0].pt.y; + + if (triangles[i].vertex[1].pt.x < min.x) min.x = triangles[i].vertex[1].pt.x; + else if (triangles[i].vertex[1].pt.x > max.x) max.x = triangles[i].vertex[1].pt.x; + if (triangles[i].vertex[1].pt.y < min.y) min.y = triangles[i].vertex[1].pt.y; + else if (triangles[i].vertex[1].pt.y > max.y) max.y = triangles[i].vertex[1].pt.y; + + if (triangles[i].vertex[2].pt.x < min.x) min.x = triangles[i].vertex[2].pt.x; + else if (triangles[i].vertex[2].pt.x > max.x) max.x = triangles[i].vertex[2].pt.x; + if (triangles[i].vertex[2].pt.y < min.y) min.y = triangles[i].vertex[2].pt.y; + else if (triangles[i].vertex[2].pt.y > max.y) max.y = triangles[i].vertex[2].pt.y; + } + to[0] = {min.x, min.y}; + to[1] = {max.x, min.y}; + to[2] = {max.x, max.y}; + to[3] = {min.x, max.y}; + } else { + auto w = static_cast<float>(image->w); + auto h = static_cast<float>(image->h); + to[0] = {0, 0}; + to[1] = {w, 0}; + to[2] = {w, h}; + to[3] = {0, h}; + } + + for (int i = 0; i < 4; i++) { + outline->pts.push(mathTransform(&to[i], transform)); + outline->types.push(SW_CURVE_TYPE_POINT); + } + + outline->pts.push(outline->pts[0]); + outline->types.push(SW_CURVE_TYPE_POINT); + outline->cntrs.push(outline->pts.count - 1); + outline->closed.push(true); + + image->outline = outline; + + return true; +} + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + +bool imagePrepare(SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid) +{ + image->direct = _onlyShifted(transform); + + //Fast track: Non-transformed image but just shifted. 
+ if (image->direct) { + image->ox = -static_cast<int32_t>(round(transform->e13)); + image->oy = -static_cast<int32_t>(round(transform->e23)); + //Figure out the scale factor by transform matrix + } else { + auto scaleX = sqrtf((transform->e11 * transform->e11) + (transform->e21 * transform->e21)); + auto scaleY = sqrtf((transform->e22 * transform->e22) + (transform->e12 * transform->e12)); + image->scale = (fabsf(scaleX - scaleY) > 0.01f) ? 1.0f : scaleX; + + if (mathZero(transform->e12) && mathZero(transform->e21)) image->scaled = true; + else image->scaled = false; + } + + if (!_genOutline(image, mesh, transform, mpool, tid)) return false; + return mathUpdateOutlineBBox(image->outline, clipRegion, renderRegion, image->direct); +} + + +bool imageGenRle(SwImage* image, const SwBBox& renderRegion, bool antiAlias) +{ + if ((image->rle = rleRender(image->rle, image->outline, renderRegion, antiAlias))) return true; + + return false; +} + + +void imageDelOutline(SwImage* image, SwMpool* mpool, uint32_t tid) +{ + mpoolRetOutline(mpool, tid); + image->outline = nullptr; +} + + +void imageReset(SwImage* image) +{ + rleReset(image->rle); +} + + +void imageFree(SwImage* image) +{ + rleFree(image->rle); +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMath.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMath.cpp new file mode 100644 index 0000000000..dbcfa754f3 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMath.cpp @@ -0,0 +1,501 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <math.h> +#include "tvgSwCommon.h" + + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ + +//clz: count leading zero’s +#if defined(_MSC_VER) && !defined(__clang__) + #include <intrin.h> + static uint32_t __inline _clz(uint32_t value) + { + unsigned long leadingZero = 0; + if (_BitScanReverse(&leadingZero, value)) return 31 - leadingZero; + else return 32; + } +#else + #define _clz(x) __builtin_clz((x)) +#endif + + +constexpr SwFixed CORDIC_FACTOR = 0xDBD95B16UL; //the Cordic shrink factor 0.858785336480436 * 2^32 + +//this table was generated for SW_FT_PI = 180L << 16, i.e. 
degrees +constexpr static auto ATAN_MAX = 23; +constexpr static SwFixed ATAN_TBL[] = { + 1740967L, 919879L, 466945L, 234379L, 117304L, 58666L, 29335L, + 14668L, 7334L, 3667L, 1833L, 917L, 458L, 229L, 115L, + 57L, 29L, 14L, 7L, 4L, 2L, 1L}; + +static inline SwCoord SATURATE(const SwCoord x) +{ + return (x >> (sizeof(SwCoord) * 8 - 1)); +} + + +static inline SwFixed PAD_ROUND(const SwFixed x, int32_t n) +{ + return (((x) + ((n)/2)) & ~((n)-1)); +} + + +static SwCoord _downscale(SwFixed x) +{ + //multiply a give value by the CORDIC shrink factor + auto s = abs(x); + int64_t t = (s * static_cast<int64_t>(CORDIC_FACTOR)) + 0x100000000UL; + s = static_cast<SwFixed>(t >> 32); + if (x < 0) s = -s; + return s; +} + + +static int32_t _normalize(SwPoint& pt) +{ + /* the highest bit in overflow-safe vector components + MSB of 0.858785336480436 * sqrt(0.5) * 2^30 */ + constexpr auto SAFE_MSB = 29; + + auto v = pt; + + //High order bit(MSB) + int32_t shift = 31 - _clz(abs(v.x) | abs(v.y)); + + if (shift <= SAFE_MSB) { + shift = SAFE_MSB - shift; + pt.x = static_cast<SwCoord>((unsigned long)v.x << shift); + pt.y = static_cast<SwCoord>((unsigned long)v.y << shift); + } else { + shift -= SAFE_MSB; + pt.x = v.x >> shift; + pt.y = v.y >> shift; + shift = -shift; + } + return shift; +} + + +static void _polarize(SwPoint& pt) +{ + auto v = pt; + SwFixed theta; + + //Get the vector into [-PI/4, PI/4] sector + if (v.y > v.x) { + if (v.y > -v.x) { + auto tmp = v.y; + v.y = -v.x; + v.x = tmp; + theta = SW_ANGLE_PI2; + } else { + theta = v.y > 0 ? SW_ANGLE_PI : -SW_ANGLE_PI; + v.x = -v.x; + v.y = -v.y; + } + } else { + if (v.y < -v.x) { + theta = -SW_ANGLE_PI2; + auto tmp = -v.y; + v.y = v.x; + v.x = tmp; + } else { + theta = 0; + } + } + + auto atan = ATAN_TBL; + uint32_t i; + SwFixed j; + + //Pseudorotations. 
with right shifts + for (i = 1, j = 1; i < ATAN_MAX; j <<= 1, ++i) { + if (v.y > 0) { + auto tmp = v.x + ((v.y + j) >> i); + v.y = v.y - ((v.x + j) >> i); + v.x = tmp; + theta += *atan++; + } else { + auto tmp = v.x - ((v.y + j) >> i); + v.y = v.y + ((v.x + j) >> i); + v.x = tmp; + theta -= *atan++; + } + } + + //round theta + if (theta >= 0) theta = PAD_ROUND(theta, 32); + else theta = -PAD_ROUND(-theta, 32); + + pt.x = v.x; + pt.y = theta; +} + + +static void _rotate(SwPoint& pt, SwFixed theta) +{ + SwFixed x = pt.x; + SwFixed y = pt.y; + + //Rotate inside [-PI/4, PI/4] sector + while (theta < -SW_ANGLE_PI4) { + auto tmp = y; + y = -x; + x = tmp; + theta += SW_ANGLE_PI2; + } + + while (theta > SW_ANGLE_PI4) { + auto tmp = -y; + y = x; + x = tmp; + theta -= SW_ANGLE_PI2; + } + + auto atan = ATAN_TBL; + uint32_t i; + SwFixed j; + + for (i = 1, j = 1; i < ATAN_MAX; j <<= 1, ++i) { + if (theta < 0) { + auto tmp = x + ((y + j) >> i); + y = y - ((x + j) >> i); + x = tmp; + theta += *atan++; + } else { + auto tmp = x - ((y + j) >> i); + y = y + ((x + j) >> i); + x = tmp; + theta -= *atan++; + } + } + + pt.x = static_cast<SwCoord>(x); + pt.y = static_cast<SwCoord>(y); +} + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + +SwFixed mathMean(SwFixed angle1, SwFixed angle2) +{ + return angle1 + mathDiff(angle1, angle2) / 2; +} + + +bool mathSmallCubic(const SwPoint* base, SwFixed& angleIn, SwFixed& angleMid, SwFixed& angleOut) +{ + auto d1 = base[2] - base[3]; + auto d2 = base[1] - base[2]; + auto d3 = base[0] - base[1]; + + if (d1.small()) { + if (d2.small()) { + if (d3.small()) { + //basically a point. + //do nothing to retain original direction + } else { + angleIn = angleMid = angleOut = mathAtan(d3); + } + } else { + if (d3.small()) { + angleIn = angleMid = angleOut = mathAtan(d2); + } else { + angleIn = angleMid = mathAtan(d2); + angleOut = mathAtan(d3); + } + } + } else { + if (d2.small()) { + if (d3.small()) { + angleIn = angleMid = angleOut = mathAtan(d1); + } else { + angleIn = mathAtan(d1); + angleOut = mathAtan(d3); + angleMid = mathMean(angleIn, angleOut); + } + } else { + if (d3.small()) { + angleIn = mathAtan(d1); + angleMid = angleOut = mathAtan(d2); + } else { + angleIn = mathAtan(d1); + angleMid = mathAtan(d2); + angleOut = mathAtan(d3); + } + } + } + + auto theta1 = abs(mathDiff(angleIn, angleMid)); + auto theta2 = abs(mathDiff(angleMid, angleOut)); + + if ((theta1 < (SW_ANGLE_PI / 8)) && (theta2 < (SW_ANGLE_PI / 8))) return true; + return false; +} + + +int64_t mathMultiply(int64_t a, int64_t b) +{ + int32_t s = 1; + + //move sign + if (a < 0) { + a = -a; + s = -s; + } + if (b < 0) { + b = -b; + s = -s; + } + int64_t c = (a * b + 0x8000L) >> 16; + return (s > 0) ? c : -c; +} + + +int64_t mathDivide(int64_t a, int64_t b) +{ + int32_t s = 1; + + //move sign + if (a < 0) { + a = -a; + s = -s; + } + if (b < 0) { + b = -b; + s = -s; + } + int64_t q = b > 0 ? ((a << 16) + (b >> 1)) / b : 0x7FFFFFFFL; + return (s < 0 ? -q : q); +} + + +int64_t mathMulDiv(int64_t a, int64_t b, int64_t c) +{ + int32_t s = 1; + + //move sign + if (a < 0) { + a = -a; + s = -s; + } + if (b < 0) { + b = -b; + s = -s; + } + if (c < 0) { + c = -c; + s = -s; + } + int64_t d = c > 0 ? (a * b + (c >> 1)) / c : 0x7FFFFFFFL; + + return (s > 0 ? 
d : -d); +} + + +void mathRotate(SwPoint& pt, SwFixed angle) +{ + if (angle == 0 || (pt.x == 0 && pt.y == 0)) return; + + auto v = pt; + auto shift = _normalize(v); + + auto theta = angle; + _rotate(v, theta); + + v.x = _downscale(v.x); + v.y = _downscale(v.y); + + if (shift > 0) { + auto half = static_cast<int32_t>(1L << (shift - 1)); + pt.x = (v.x + half + SATURATE(v.x)) >> shift; + pt.y = (v.y + half + SATURATE(v.y)) >> shift; + } else { + shift = -shift; + pt.x = static_cast<SwCoord>((unsigned long)v.x << shift); + pt.y = static_cast<SwCoord>((unsigned long)v.y << shift); + } +} + +SwFixed mathTan(SwFixed angle) +{ + SwPoint v = {CORDIC_FACTOR >> 8, 0}; + _rotate(v, angle); + return mathDivide(v.y, v.x); +} + + +SwFixed mathAtan(const SwPoint& pt) +{ + if (pt.x == 0 && pt.y == 0) return 0; + + auto v = pt; + _normalize(v); + _polarize(v); + + return v.y; +} + + +SwFixed mathSin(SwFixed angle) +{ + return mathCos(SW_ANGLE_PI2 - angle); +} + + +SwFixed mathCos(SwFixed angle) +{ + SwPoint v = {CORDIC_FACTOR >> 8, 0}; + _rotate(v, angle); + return (v.x + 0x80L) >> 8; +} + + +SwFixed mathLength(const SwPoint& pt) +{ + auto v = pt; + + //trivial case + if (v.x == 0) return abs(v.y); + if (v.y == 0) return abs(v.x); + + //general case + auto shift = _normalize(v); + _polarize(v); + v.x = _downscale(v.x); + + if (shift > 0) return (v.x + (static_cast<SwFixed>(1) << (shift -1))) >> shift; + return static_cast<SwFixed>((uint32_t)v.x << -shift); +} + + +void mathSplitCubic(SwPoint* base) +{ + SwCoord a, b, c, d; + + base[6].x = base[3].x; + c = base[1].x; + d = base[2].x; + base[1].x = a = (base[0].x + c) / 2; + base[5].x = b = (base[3].x + d) / 2; + c = (c + d) / 2; + base[2].x = a = (a + c) / 2; + base[4].x = b = (b + c) / 2; + base[3].x = (a + b) / 2; + + base[6].y = base[3].y; + c = base[1].y; + d = base[2].y; + base[1].y = a = (base[0].y + c) / 2; + base[5].y = b = (base[3].y + d) / 2; + c = (c + d) / 2; + base[2].y = a = (a + c) / 2; + base[4].y = b = (b + c) / 2; + base[3].y = (a + b) / 2; +} + + +SwFixed mathDiff(SwFixed angle1, SwFixed angle2) +{ + auto delta = angle2 - angle1; + + delta %= SW_ANGLE_2PI; + if (delta < 0) delta += SW_ANGLE_2PI; + if (delta > SW_ANGLE_PI) delta -= SW_ANGLE_2PI; + + return delta; +} + + +SwPoint mathTransform(const Point* to, const Matrix* transform) +{ + if (!transform) return {TO_SWCOORD(to->x), TO_SWCOORD(to->y)}; + + auto tx = to->x * transform->e11 + to->y * transform->e12 + transform->e13; + auto ty = to->x * transform->e21 + to->y * transform->e22 + transform->e23; + + return {TO_SWCOORD(tx), TO_SWCOORD(ty)}; +} + + +bool mathClipBBox(const SwBBox& clipper, SwBBox& clipee) +{ + clipee.max.x = (clipee.max.x < clipper.max.x) ? clipee.max.x : clipper.max.x; + clipee.max.y = (clipee.max.y < clipper.max.y) ? clipee.max.y : clipper.max.y; + clipee.min.x = (clipee.min.x > clipper.min.x) ? clipee.min.x : clipper.min.x; + clipee.min.y = (clipee.min.y > clipper.min.y) ? 
clipee.min.y : clipper.min.y; + + //Check valid region + if (clipee.max.x - clipee.min.x < 1 && clipee.max.y - clipee.min.y < 1) return false; + + //Check boundary + if (clipee.min.x >= clipper.max.x || clipee.min.y >= clipper.max.y || + clipee.max.x <= clipper.min.x || clipee.max.y <= clipper.min.y) return false; + + return true; +} + + +bool mathUpdateOutlineBBox(const SwOutline* outline, const SwBBox& clipRegion, SwBBox& renderRegion, bool fastTrack) +{ + if (!outline) return false; + + auto pt = outline->pts.data; + + if (outline->pts.empty() || outline->cntrs.empty()) { + renderRegion.reset(); + return false; + } + + auto xMin = pt->x; + auto xMax = pt->x; + auto yMin = pt->y; + auto yMax = pt->y; + + for (++pt; pt < outline->pts.end(); ++pt) { + if (xMin > pt->x) xMin = pt->x; + if (xMax < pt->x) xMax = pt->x; + if (yMin > pt->y) yMin = pt->y; + if (yMax < pt->y) yMax = pt->y; + } + //Since no antialiasing is applied in the Fast Track case, + //the rasterization region has to be rearranged. + //https://github.com/Samsung/thorvg/issues/916 + if (fastTrack) { + renderRegion.min.x = static_cast<SwCoord>(round(xMin / 64.0f)); + renderRegion.max.x = static_cast<SwCoord>(round(xMax / 64.0f)); + renderRegion.min.y = static_cast<SwCoord>(round(yMin / 64.0f)); + renderRegion.max.y = static_cast<SwCoord>(round(yMax / 64.0f)); + } else { + renderRegion.min.x = xMin >> 6; + renderRegion.max.x = (xMax + 63) >> 6; + renderRegion.min.y = yMin >> 6; + renderRegion.max.y = (yMax + 63) >> 6; + } + return mathClipBBox(clipRegion, renderRegion); +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMemPool.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMemPool.cpp new file mode 100644 index 0000000000..54ae594bff --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwMemPool.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "tvgSwCommon.h" + + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + +SwOutline* mpoolReqOutline(SwMpool* mpool, unsigned idx) +{ + return &mpool->outline[idx]; +} + + +void mpoolRetOutline(SwMpool* mpool, unsigned idx) +{ + mpool->outline[idx].pts.clear(); + mpool->outline[idx].cntrs.clear(); + mpool->outline[idx].types.clear(); + mpool->outline[idx].closed.clear(); +} + + +SwOutline* mpoolReqStrokeOutline(SwMpool* mpool, unsigned idx) +{ + return &mpool->strokeOutline[idx]; +} + + +void mpoolRetStrokeOutline(SwMpool* mpool, unsigned idx) +{ + mpool->strokeOutline[idx].pts.clear(); + mpool->strokeOutline[idx].cntrs.clear(); + mpool->strokeOutline[idx].types.clear(); + mpool->strokeOutline[idx].closed.clear(); +} + + +SwOutline* mpoolReqDashOutline(SwMpool* mpool, unsigned idx) +{ + return &mpool->dashOutline[idx]; +} + + +void mpoolRetDashOutline(SwMpool* mpool, unsigned idx) +{ + mpool->dashOutline[idx].pts.clear(); + mpool->dashOutline[idx].cntrs.clear(); + mpool->dashOutline[idx].types.clear(); + mpool->dashOutline[idx].closed.clear(); +} + + +SwMpool* mpoolInit(unsigned threads) +{ + auto allocSize = threads + 1; + + auto mpool = static_cast<SwMpool*>(calloc(sizeof(SwMpool), 1)); + mpool->outline = static_cast<SwOutline*>(calloc(1, sizeof(SwOutline) * allocSize)); + mpool->strokeOutline = static_cast<SwOutline*>(calloc(1, sizeof(SwOutline) * allocSize)); + mpool->dashOutline = static_cast<SwOutline*>(calloc(1, sizeof(SwOutline) * allocSize)); + mpool->allocSize = allocSize; + + return mpool; +} + + +bool mpoolClear(SwMpool* mpool) +{ + for (unsigned i = 0; i < mpool->allocSize; ++i) { + mpool->outline[i].pts.reset(); + mpool->outline[i].cntrs.reset(); + mpool->outline[i].types.reset(); + mpool->outline[i].closed.reset(); + + mpool->strokeOutline[i].pts.reset(); + mpool->strokeOutline[i].cntrs.reset(); + mpool->strokeOutline[i].types.reset(); + mpool->strokeOutline[i].closed.reset(); + + mpool->dashOutline[i].pts.reset(); + mpool->dashOutline[i].cntrs.reset(); + mpool->dashOutline[i].types.reset(); + mpool->dashOutline[i].closed.reset(); + } + + return true; +} + + +bool mpoolTerm(SwMpool* mpool) +{ + if (!mpool) return false; + + mpoolClear(mpool); + + free(mpool->outline); + free(mpool->strokeOutline); + free(mpool->dashOutline); + free(mpool); + + return true; +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRaster.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRaster.cpp new file mode 100644 index 0000000000..4b1ba59100 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRaster.cpp @@ -0,0 +1,2037 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef _WIN32 + #include <malloc.h> +#elif defined(__linux__) + #include <alloca.h> +#else + #include <stdlib.h> +#endif + +#include "tvgMath.h" +#include "tvgRender.h" +#include "tvgSwCommon.h" + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ +constexpr auto DOWN_SCALE_TOLERANCE = 0.5f; + +struct FillLinear +{ + void operator()(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask op, uint8_t a) + { + fillLinear(fill, dst, y, x, len, op, a); + } + + void operator()(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask op, uint8_t a) + { + fillLinear(fill, dst, y, x, len, cmp, op, a); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a) + { + fillLinear(fill, dst, y, x, len, op, a); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) + { + fillLinear(fill, dst, y, x, len, cmp, alpha, csize, opacity); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a) + { + fillLinear(fill, dst, y, x, len, op, op2, a); + } + +}; + +struct FillRadial +{ + void operator()(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, SwMask op, uint8_t a) + { + fillRadial(fill, dst, y, x, len, op, a); + } + + void operator()(const SwFill* fill, uint8_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwMask op, uint8_t a) + { + fillRadial(fill, dst, y, x, len, cmp, op, a); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, uint8_t a) + { + fillRadial(fill, dst, y, x, len, op, a); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, uint8_t* cmp, SwAlpha alpha, uint8_t csize, uint8_t opacity) + { + fillRadial(fill, dst, y, x, len, cmp, alpha, csize, opacity); + } + + void operator()(const SwFill* fill, uint32_t* dst, uint32_t y, uint32_t x, uint32_t len, SwBlender op, SwBlender op2, uint8_t a) + { + fillRadial(fill, dst, y, x, len, op, op2, a); + } +}; + + +static inline uint8_t _alpha(uint8_t* a) +{ + return *a; +} + + +static inline uint8_t _ialpha(uint8_t* 
a) +{ + return ~(*a); +} + + +static inline uint8_t _abgrLuma(uint8_t* c) +{ + auto v = *(uint32_t*)c; + return ((((v&0xff)*54) + (((v>>8)&0xff)*183) + (((v>>16)&0xff)*19))) >> 8; //0.2125*R + 0.7154*G + 0.0721*B +} + + +static inline uint8_t _argbLuma(uint8_t* c) +{ + auto v = *(uint32_t*)c; + return ((((v&0xff)*19) + (((v>>8)&0xff)*183) + (((v>>16)&0xff)*54))) >> 8; //0.0721*B + 0.7154*G + 0.2125*R +} + + +static inline uint8_t _abgrInvLuma(uint8_t* c) +{ + return ~_abgrLuma(c); +} + + +static inline uint8_t _argbInvLuma(uint8_t* c) +{ + return ~_argbLuma(c); +} + + +static inline uint32_t _abgrJoin(uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + return (a << 24 | b << 16 | g << 8 | r); +} + + +static inline uint32_t _argbJoin(uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + return (a << 24 | r << 16 | g << 8 | b); +} + +static inline bool _blending(const SwSurface* surface) +{ + return (surface->blender) ? true : false; +} + + +/* OPTIMIZE_ME: Probably, we can separate masking(8bits) / composition(32bits) + This would help to enhance the performance by avoiding the unnecessary matting from the composition */ +static inline bool _compositing(const SwSurface* surface) +{ + if (!surface->compositor || (int)surface->compositor->method <= (int)CompositeMethod::ClipPath) return false; + return true; +} + + +static inline bool _matting(const SwSurface* surface) +{ + if ((int)surface->compositor->method < (int)CompositeMethod::AddMask) return true; + else return false; +} + +static inline uint8_t _opMaskNone(uint8_t s, TVG_UNUSED uint8_t d, TVG_UNUSED uint8_t a) +{ + return s; +} + +static inline uint8_t _opMaskAdd(uint8_t s, uint8_t d, uint8_t a) +{ + return s + MULTIPLY(d, a); +} + + +static inline uint8_t _opMaskSubtract(uint8_t s, uint8_t d, TVG_UNUSED uint8_t a) +{ + return MULTIPLY(s, 255 - d); +} + + +static inline uint8_t _opMaskIntersect(uint8_t s, uint8_t d, TVG_UNUSED uint8_t a) +{ + return MULTIPLY(s, d); +} + + +static inline uint8_t _opMaskDifference(uint8_t s, uint8_t d, uint8_t a) +{ + return MULTIPLY(s, 255 - d) + MULTIPLY(d, a); +} + + +static inline bool _direct(CompositeMethod method) +{ + //subtract & Intersect allows the direct composition + if (method == CompositeMethod::SubtractMask || method == CompositeMethod::IntersectMask) return true; + return false; +} + + +static inline SwMask _getMaskOp(CompositeMethod method) +{ + switch (method) { + case CompositeMethod::AddMask: return _opMaskAdd; + case CompositeMethod::SubtractMask: return _opMaskSubtract; + case CompositeMethod::DifferenceMask: return _opMaskDifference; + case CompositeMethod::IntersectMask: return _opMaskIntersect; + default: return nullptr; + } +} + + +static bool _compositeMaskImage(SwSurface* surface, const SwImage* image, const SwBBox& region) +{ + auto dbuffer = &surface->buf8[region.min.y * surface->stride + region.min.x]; + auto sbuffer = image->buf8 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); + + for (auto y = region.min.y; y < region.max.y; ++y) { + auto dst = dbuffer; + auto src = sbuffer; + for (auto x = region.min.x; x < region.max.x; x++, dst++, src++) { + *dst = *src + MULTIPLY(*dst, ~*src); + } + dbuffer += surface->stride; + sbuffer += image->stride; + } + return true; +} + + +#include "tvgSwRasterTexmap.h" +#include "tvgSwRasterC.h" +#include "tvgSwRasterAvx.h" +#include "tvgSwRasterNeon.h" + + +static inline uint32_t _sampleSize(float scale) +{ + auto sampleSize = static_cast<uint32_t>(0.5f / scale); + if (sampleSize == 0) sampleSize = 1; + return 
sampleSize; +} + + +//Bilinear Interpolation +//OPTIMIZE_ME: Skip the function pointer access +static uint32_t _interpUpScaler(const uint32_t *img, TVG_UNUSED uint32_t stride, uint32_t w, uint32_t h, float sx, float sy, TVG_UNUSED uint32_t n, TVG_UNUSED uint32_t n2) +{ + auto rx = (uint32_t)(sx); + auto ry = (uint32_t)(sy); + auto rx2 = rx + 1; + if (rx2 >= w) rx2 = w - 1; + auto ry2 = ry + 1; + if (ry2 >= h) ry2 = h - 1; + + auto dx = static_cast<uint32_t>((sx - rx) * 255.0f); + auto dy = static_cast<uint32_t>((sy - ry) * 255.0f); + + auto c1 = img[rx + ry * w]; + auto c2 = img[rx2 + ry * w]; + auto c3 = img[rx2 + ry2 * w]; + auto c4 = img[rx + ry2 * w]; + + return INTERPOLATE(INTERPOLATE(c3, c4, dx), INTERPOLATE(c2, c1, dx), dy); +} + + +//2n x 2n Mean Kernel +//OPTIMIZE_ME: Skip the function pointer access +static uint32_t _interpDownScaler(const uint32_t *img, uint32_t stride, uint32_t w, uint32_t h, float sx, float sy, uint32_t n, uint32_t n2) +{ + uint32_t rx = lroundf(sx); + uint32_t ry = lroundf(sy); + uint32_t c[4] = {0, 0, 0, 0}; + auto src = img + rx - n + (ry - n) * stride; + + for (auto y = ry - n; y < ry + n; ++y) { + if (y >= h) continue; + auto p = src; + for (auto x = rx - n; x < rx + n; ++x, ++p) { + if (x >= w) continue; + c[0] += *p >> 24; + c[1] += (*p >> 16) & 0xff; + c[2] += (*p >> 8) & 0xff; + c[3] += *p & 0xff; + } + src += stride; + } + for (auto i = 0; i < 4; ++i) { + c[i] = (c[i] >> 2) / n2; + } + return (c[0] << 24) | (c[1] << 16) | (c[2] << 8) | c[3]; +} + + +/************************************************************************/ +/* Rect */ +/************************************************************************/ + +static bool _rasterCompositeMaskedRect(SwSurface* surface, const SwBBox& region, SwMask maskOp, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * cstride + region.min.x); //compositor buffer + auto ialpha = 255 - a; + + for (uint32_t y = 0; y < h; ++y) { + auto cmp = cbuffer; + for (uint32_t x = 0; x < w; ++x, ++cmp) { + *cmp = maskOp(a, *cmp, ialpha); + } + cbuffer += cstride; + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterDirectMaskedRect(SwSurface* surface, const SwBBox& region, SwMask maskOp, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x); //compositor buffer + auto dbuffer = surface->buf8 + (region.min.y * surface->stride + region.min.x); //destination buffer + + for (uint32_t y = 0; y < h; ++y) { + auto cmp = cbuffer; + auto dst = dbuffer; + for (uint32_t x = 0; x < w; ++x, ++cmp, ++dst) { + auto tmp = maskOp(a, *cmp, 0); //not use alpha. 
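For reference, a self-contained sketch of the bilinear sampling scheme used by _interpUpScaler above. INTERPOLATE is a ThorVG macro defined outside this diff, so lerp8() below is only an assumed stand-in for it (a per-channel blend of two packed pixels with an 8-bit weight); this illustrates the technique rather than the engine's exact arithmetic.

#include <cstdint>
#include <cstdio>

//assumed stand-in for INTERPOLATE: result ~ a*t/255 + b*(255-t)/255, per 8-bit channel
static uint32_t lerp8(uint32_t a, uint32_t b, uint32_t t)
{
    uint32_t out = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        auto ca = (a >> shift) & 0xff;
        auto cb = (b >> shift) & 0xff;
        out |= (((ca * t + cb * (255 - t)) / 255) & 0xff) << shift;
    }
    return out;
}

//bilinear sample of a packed 32-bit image at the fractional position (sx, sy)
static uint32_t bilinear(const uint32_t* img, uint32_t w, uint32_t h, float sx, float sy)
{
    auto rx = (uint32_t)sx, ry = (uint32_t)sy;
    auto rx2 = rx + 1 >= w ? w - 1 : rx + 1;
    auto ry2 = ry + 1 >= h ? h - 1 : ry + 1;
    auto dx = (uint32_t)((sx - rx) * 255.0f);          //horizontal weight toward the right texel
    auto dy = (uint32_t)((sy - ry) * 255.0f);          //vertical weight toward the lower texel

    auto c1 = img[rx  + ry  * w];                      //top-left
    auto c2 = img[rx2 + ry  * w];                      //top-right
    auto c3 = img[rx2 + ry2 * w];                      //bottom-right
    auto c4 = img[rx  + ry2 * w];                      //bottom-left

    //blend each row horizontally, then the two rows vertically
    return lerp8(lerp8(c3, c4, dx), lerp8(c2, c1, dx), dy);
}

int main()
{
    uint32_t img[4] = {0xff000000, 0xffffffff, 0xffffffff, 0xff000000};  //2x2 checker, opaque
    printf("%08x\n", bilinear(img, 2, 2, 0.5f, 0.5f));                   //center sample -> ~50% gray
    return 0;
}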
+ *dst = tmp + MULTIPLY(*dst, ~tmp); + } + cbuffer += surface->compositor->image.stride; + dbuffer += surface->stride; + } + return true; +} + + +static bool _rasterMaskedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + //8bit masking channels composition + if (surface->channelSize != sizeof(uint8_t)) return false; + + TVGLOG("SW_ENGINE", "Masked(%d) Rect [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + + auto maskOp = _getMaskOp(surface->compositor->method); + if (_direct(surface->compositor->method)) return _rasterDirectMaskedRect(surface, region, maskOp, r, g, b, a); + else return _rasterCompositeMaskedRect(surface, region, maskOp, r, g, b, a); + return false; +} + + +static bool _rasterMattedRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto csize = surface->compositor->image.channelSize; + auto cbuffer = surface->compositor->image.buf8 + ((region.min.y * surface->compositor->image.stride + region.min.x) * csize); //compositor buffer + auto alpha = surface->alpha(surface->compositor->method); + + TVGLOG("SW_ENGINE", "Matted(%d) Rect [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h); + + //32bits channels + if (surface->channelSize == sizeof(uint32_t)) { + auto color = surface->join(r, g, b, a); + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + for (uint32_t y = 0; y < h; ++y) { + auto dst = &buffer[y * surface->stride]; + auto cmp = &cbuffer[y * surface->compositor->image.stride * csize]; + for (uint32_t x = 0; x < w; ++x, ++dst, cmp += csize) { + *dst = INTERPOLATE(color, *dst, alpha(cmp)); + } + } + //8bits grayscale + } else if (surface->channelSize == sizeof(uint8_t)) { + auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x; + for (uint32_t y = 0; y < h; ++y) { + auto dst = &buffer[y * surface->stride]; + auto cmp = &cbuffer[y * surface->compositor->image.stride * csize]; + for (uint32_t x = 0; x < w; ++x, ++dst, cmp += csize) { + *dst = INTERPOLATE8(a, *dst, alpha(cmp)); + } + } + } + return true; +} + + +static bool _rasterBlendingRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (surface->channelSize != sizeof(uint32_t)) return false; + + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto color = surface->join(r, g, b, a); + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto ialpha = 255 - a; + + for (uint32_t y = 0; y < h; ++y) { + auto dst = &buffer[y * surface->stride]; + for (uint32_t x = 0; x < w; ++x, ++dst) { + *dst = surface->blender(color, *dst, ialpha); + } + } + return true; +} + + +static bool _rasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ +#if defined(THORVG_AVX_VECTOR_SUPPORT) + return avxRasterTranslucentRect(surface, region, r, g, b, a); +#elif defined(THORVG_NEON_VECTOR_SUPPORT) + return neonRasterTranslucentRect(surface, region, r, g, b, a); +#else + return cRasterTranslucentRect(surface, region, r, g, b, a); +#endif +} + + +static bool _rasterSolidRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, 
uint8_t b) +{ + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + + //32bits channels + if (surface->channelSize == sizeof(uint32_t)) { + auto color = surface->join(r, g, b, 255); + auto buffer = surface->buf32 + (region.min.y * surface->stride); + for (uint32_t y = 0; y < h; ++y) { + rasterPixel32(buffer + y * surface->stride, color, region.min.x, w); + } + return true; + } + //8bits grayscale + if (surface->channelSize == sizeof(uint8_t)) { + for (uint32_t y = 0; y < h; ++y) { + rasterGrayscale8(surface->buf8, 255, (y + region.min.y) * surface->stride + region.min.x, w); + } + return true; + } + return false; +} + + +static bool _rasterRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (_compositing(surface)) { + if (_matting(surface)) return _rasterMattedRect(surface, region, r, g, b, a); + else return _rasterMaskedRect(surface, region, r, g, b, a); + } else if (_blending(surface)) { + return _rasterBlendingRect(surface, region, r, g, b, a); + } else { + if (a == 255) return _rasterSolidRect(surface, region, r, g, b); + else return _rasterTranslucentRect(surface, region, r, g, b, a); + } + return false; +} + + +/************************************************************************/ +/* Rle */ +/************************************************************************/ + +static bool _rasterCompositeMaskedRle(SwSurface* surface, SwRleData* rle, SwMask maskOp, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto span = rle->spans; + auto cbuffer = surface->compositor->image.buf8; + auto cstride = surface->compositor->image.stride; + uint8_t src; + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + if (span->coverage == 255) src = a; + else src = MULTIPLY(a, span->coverage); + auto ialpha = 255 - src; + for (auto x = 0; x < span->len; ++x, ++cmp) { + *cmp = maskOp(src, *cmp, ialpha); + } + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterDirectMaskedRle(SwSurface* surface, SwRleData* rle, SwMask maskOp, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto span = rle->spans; + auto cbuffer = surface->compositor->image.buf8; + auto cstride = surface->compositor->image.stride; + uint8_t src; + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + if (span->coverage == 255) src = a; + else src = MULTIPLY(a, span->coverage); + for (auto x = 0; x < span->len; ++x, ++cmp, ++dst) { + auto tmp = maskOp(src, *cmp, 0); //not use alpha + *dst = tmp + MULTIPLY(*dst, ~tmp); + } + } + return true; +} + + +static bool _rasterMaskedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + TVGLOG("SW_ENGINE", "Masked(%d) Rle", (int)surface->compositor->method); + + //8bit masking channels composition + if (surface->channelSize != sizeof(uint8_t)) return false; + + auto maskOp = _getMaskOp(surface->compositor->method); + if (_direct(surface->compositor->method)) return _rasterDirectMaskedRle(surface, rle, maskOp, r, g, b, a); + else return _rasterCompositeMaskedRle(surface, rle, maskOp, r, g, b, a); + return false; +} + + +static bool _rasterMattedRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + TVGLOG("SW_ENGINE", "Matted(%d) Rle", (int)surface->compositor->method); 
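A minimal sketch of how the RLE rasterizers above and below consume their spans: each SwSpan describes one horizontal run with a single coverage value, full coverage is written straight through, and partial coverage is folded into the write with an a*b/255 multiply. Span and mul255() are hypothetical stand-ins for SwSpan and the engine's MULTIPLY macro, which are defined elsewhere in ThorVG.

#include <cstdint>
#include <cstdio>
#include <vector>

struct Span { uint16_t x, y, len; uint8_t coverage; };   //mirrors SwSpan

//assumed stand-in for MULTIPLY: rounded a*b/255
static uint8_t mul255(uint8_t a, uint8_t b) { return (uint8_t)((a * b + 127) / 255); }

//8-bit grayscale solid fill, structured like the grayscale branch of _rasterSolidRle
static void fillSpans(uint8_t* buf, uint32_t stride, const std::vector<Span>& spans, uint8_t src)
{
    for (const auto& s : spans) {
        auto dst = buf + s.y * stride + s.x;
        if (s.coverage == 255) {                         //fully covered run: overwrite
            for (uint16_t i = 0; i < s.len; ++i) dst[i] = src;
        } else {                                         //partial coverage: blend toward src
            auto cov = mul255(src, s.coverage);
            auto icov = (uint8_t)(255 - s.coverage);
            for (uint16_t i = 0; i < s.len; ++i) dst[i] = cov + mul255(dst[i], icov);
        }
    }
}

int main()
{
    uint8_t buf[8] = {0};
    fillSpans(buf, 8, {{0, 0, 4, 255}, {4, 0, 4, 128}}, 255);
    for (auto v : buf) printf("%d ", v);                 //255 255 255 255 128 128 128 128
    printf("\n");
    return 0;
}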
+ + auto span = rle->spans; + auto cbuffer = surface->compositor->image.buf8; + auto csize = surface->compositor->image.channelSize; + auto alpha = surface->alpha(surface->compositor->method); + + //32bit channels + if (surface->channelSize == sizeof(uint32_t)) { + uint32_t src; + auto color = surface->join(r, g, b, a); + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; + if (span->coverage == 255) src = color; + else src = ALPHA_BLEND(color, span->coverage); + for (uint32_t x = 0; x < span->len; ++x, ++dst, cmp += csize) { + auto tmp = ALPHA_BLEND(src, alpha(cmp)); + *dst = tmp + ALPHA_BLEND(*dst, IA(tmp)); + } + } + return true; + } + //8bit grayscale + if (surface->channelSize == sizeof(uint8_t)) { + uint8_t src; + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; + if (span->coverage == 255) src = a; + else src = MULTIPLY(a, span->coverage); + for (uint32_t x = 0; x < span->len; ++x, ++dst, cmp += csize) { + *dst = INTERPOLATE8(src, *dst, alpha(cmp)); + } + } + return true; + } + return false; +} + + +static bool _rasterBlendingRle(SwSurface* surface, const SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (surface->channelSize != sizeof(uint32_t)) return false; + + auto span = rle->spans; + auto color = surface->join(r, g, b, a); + auto ialpha = 255 - a; + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + if (span->coverage == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++dst) { + *dst = surface->blender(color, *dst, ialpha); + } + } else { + for (uint32_t x = 0; x < span->len; ++x, ++dst) { + auto tmp = surface->blender(color, *dst, ialpha); + *dst = INTERPOLATE(tmp, *dst, span->coverage); + } + } + } + return true; +} + + +static bool _rasterTranslucentRle(SwSurface* surface, const SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ +#if defined(THORVG_AVX_VECTOR_SUPPORT) + return avxRasterTranslucentRle(surface, rle, r, g, b, a); +#elif defined(THORVG_NEON_VECTOR_SUPPORT) + return neonRasterTranslucentRle(surface, rle, r, g, b, a); +#else + return cRasterTranslucentRle(surface, rle, r, g, b, a); +#endif +} + + +static bool _rasterSolidRle(SwSurface* surface, const SwRleData* rle, uint8_t r, uint8_t g, uint8_t b) +{ + auto span = rle->spans; + + //32bit channels + if (surface->channelSize == sizeof(uint32_t)) { + auto color = surface->join(r, g, b, 255); + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + if (span->coverage == 255) { + rasterPixel32(surface->buf32 + span->y * surface->stride, color, span->x, span->len); + } else { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto src = ALPHA_BLEND(color, span->coverage); + auto ialpha = 255 - span->coverage; + for (uint32_t x = 0; x < span->len; ++x, ++dst) { + *dst = src + ALPHA_BLEND(*dst, ialpha); + } + } + } + //8bit grayscale + } else if (surface->channelSize == sizeof(uint8_t)) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + if (span->coverage == 255) { + rasterGrayscale8(surface->buf8, span->coverage, span->y * surface->stride + span->x, span->len); + } else { + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + auto ialpha = 255 - span->coverage; + for (uint32_t x 
= 0; x < span->len; ++x, ++dst) { + *dst = span->coverage + MULTIPLY(*dst, ialpha); + } + } + } + } + return true; +} + + +static bool _rasterRle(SwSurface* surface, SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (!rle) return false; + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterMattedRle(surface, rle, r, g, b, a); + else return _rasterMaskedRle(surface, rle, r, g, b, a); + } else if (_blending(surface)) { + return _rasterBlendingRle(surface, rle, r, g, b, a); + } else { + if (a == 255) return _rasterSolidRle(surface, rle, r, g, b); + else return _rasterTranslucentRle(surface, rle, r, g, b, a); + } + return false; +} + + +/************************************************************************/ +/* RLE Scaled Image */ +/************************************************************************/ + +#if 0 //Enable it when GRAYSCALE image is supported +static bool _rasterCompositeScaledMaskedRleImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, SwMask maskOp, uint8_t opacity) +{ + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + auto span = image->rle->spans; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto cmp = &surface->compositor->image.buf8[span->y * surface->compositor->image.stride + span->x]; + auto a = MULTIPLY(span->coverage, opacity); + if (a == 255) { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++cmp) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + *cmp = maskOp(src, *cmp, ~src); + } + } else { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++cmp) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = MULTIPLY(src, a); + *cmp = maskOp(tmp, *cmp, ~tmp); + } + } + } + return true; +} + + +static bool _rasterDirectScaledMaskedRleImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, SwMask maskOp, uint8_t opacity) +{ + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + auto span = image->rle->spans; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto cmp = &surface->compositor->image.buf8[span->y * surface->compositor->image.stride + span->x]; + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + auto a = MULTIPLY(span->coverage, opacity); + if (a == 255) { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++cmp, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = maskOp(src, *cmp, 0); //not use alpha + *dst = tmp + MULTIPLY(*dst, ~tmp); + } + } else { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++cmp, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = maskOp(MULTIPLY(src, a), *cmp, 0); //not use alpha + *dst = tmp + MULTIPLY(*dst, ~tmp); + } + } + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} +#endif + +static bool _rasterScaledMaskedRleImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint8_t opacity) +{ +#if 0 //Enable it when GRAYSCALE image is supported + TVGLOG("SW_ENGINE", "Scaled Masked(%d) Rle Image", (int)surface->compositor->method); + + //8bit masking channels composition + if (surface->channelSize != sizeof(uint8_t)) return false; + + auto maskOp = _getMaskOp(surface->compositor->method); + if (_direct(surface->compositor->method)) return _rasterDirectScaledMaskedRleImage(surface, image, itransform, region, maskOp, opacity); + else return _rasterCompositeScaledMaskedRleImage(surface, image, itransform, region, maskOp, opacity); +#endif + return false; +} + + +static bool _rasterScaledMattedRleImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint8_t opacity) +{ + TVGLOG("SW_ENGINE", "Scaled Matted(%d) Rle Image", (int)surface->compositor->method); + + auto span = image->rle->spans; + auto csize = surface->compositor->image.channelSize; + auto alpha = surface->alpha(surface->compositor->method); + + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto cmp = &surface->compositor->image.buf8[(span->y * surface->compositor->image.stride + span->x) * csize]; + auto a = MULTIPLY(span->coverage, opacity); + if (a == 255) { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++dst, cmp += csize) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto tmp = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2), alpha(cmp)); + *dst = tmp + ALPHA_BLEND(*dst, IA(tmp)); + } + } else { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++dst, cmp += csize) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = ALPHA_BLEND(src, MULTIPLY(alpha(cmp), a)); + *dst = tmp + ALPHA_BLEND(*dst, IA(tmp)); + } + } + } + + return true; +} + + +static bool _rasterScaledBlendingRleImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint8_t opacity) +{ + auto span = image->rle->spans; + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = surface->blender(src, *dst, 255); + *dst = INTERPOLATE(tmp, *dst, A(src)); + } + } else if (opacity == 255) { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = surface->blender(src, *dst, 255); + *dst = INTERPOLATE(tmp, *dst, MULTIPLY(span->coverage, A(src))); + } + } else { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2), opacity); + auto tmp = surface->blender(src, *dst, 255); + *dst = INTERPOLATE(tmp, *dst, MULTIPLY(span->coverage, A(src))); + } + } + } + return true; +} + + +static bool _rasterScaledRleImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, 
const SwBBox& region, uint8_t opacity) +{ + auto span = image->rle->spans; + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto sy = span->y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + *dst = src + ALPHA_BLEND(*dst, IA(src)); + } + } else { + for (uint32_t x = static_cast<uint32_t>(span->x); x < static_cast<uint32_t>(span->x) + span->len; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2), alpha); + *dst = src + ALPHA_BLEND(*dst, IA(src)); + } + } + } + return true; +} + + +static bool _scaledRleImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint8_t opacity) +{ + if (surface->channelSize == sizeof(uint8_t)) { + TVGERR("SW_ENGINE", "Not supported scaled rle image!"); + return false; + } + + Matrix itransform; + + if (transform) { + if (!mathInverse(transform, &itransform)) return false; + } else mathIdentity(&itransform); + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterScaledMattedRleImage(surface, image, &itransform, region, opacity); + else return _rasterScaledMaskedRleImage(surface, image, &itransform, region, opacity); + } else if (_blending(surface)) { + return _rasterScaledBlendingRleImage(surface, image, &itransform, region, opacity); + } else { + return _rasterScaledRleImage(surface, image, &itransform, region, opacity); + } + return false; +} + + +/************************************************************************/ +/* RLE Direct Image */ +/************************************************************************/ + +#if 0 //Enable it when GRAYSCALE image is supported +static bool _rasterCompositeDirectMaskedRleImage(SwSurface* surface, const SwImage* image, SwMask maskOp, uint8_t opacity) +{ + auto span = image->rle->spans; + auto cbuffer = surface->compositor->image.buf8; + auto ctride = surface->compositor->image.stride; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto src = image->buf8 + (span->y + image->oy) * image->stride + (span->x + image->ox); + auto cmp = &cbuffer[span->y * ctride + span->x]; + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp) { + *cmp = maskOp(*src, *cmp, ~*src); + } + } else { + for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp) { + auto tmp = MULTIPLY(*src, alpha); + *cmp = maskOp(*src, *cmp, ~tmp); + } + } + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterDirectDirectMaskedRleImage(SwSurface* surface, const SwImage* image, SwMask maskOp, uint8_t opacity) +{ + auto span = image->rle->spans; + auto cbuffer = surface->compositor->image.buf8; + auto 
ctride = surface->compositor->image.stride; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto src = image->buf8 + (span->y + image->oy) * image->stride + (span->x + image->ox); + auto cmp = &cbuffer[span->y * ctride + span->x]; + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp, ++dst) { + auto tmp = maskOp(*src, *cmp, 0); //not use alpha + *dst = INTERPOLATE8(tmp, *dst, (255 - tmp)); + } + } else { + for (uint32_t x = 0; x < span->len; ++x, ++src, ++cmp, ++dst) { + auto tmp = maskOp(MULTIPLY(*src, alpha), *cmp, 0); //not use alpha + *dst = INTERPOLATE8(tmp, *dst, (255 - tmp)); + } + } + } + return true; +} +#endif + +static bool _rasterDirectMaskedRleImage(SwSurface* surface, const SwImage* image, uint8_t opacity) +{ +#if 0 //Enable it when GRAYSCALE image is supported + TVGLOG("SW_ENGINE", "Direct Masked(%d) Rle Image", (int)surface->compositor->method); + + //8bit masking channels composition + if (surface->channelSize != sizeof(uint8_t)) return false; + + auto maskOp = _getMaskOp(surface->compositor->method); + if (_direct(surface->compositor->method)) _rasterDirectDirectMaskedRleImage(surface, image, maskOp, opacity); + else return _rasterCompositeDirectMaskedRleImage(surface, image, maskOp, opacity); +#endif + return false; +} + + +static bool _rasterDirectMattedRleImage(SwSurface* surface, const SwImage* image, uint8_t opacity) +{ + TVGLOG("SW_ENGINE", "Direct Matted(%d) Rle Image", (int)surface->compositor->method); + + auto span = image->rle->spans; + auto csize = surface->compositor->image.channelSize; + auto cbuffer = surface->compositor->image.buf8; + auto alpha = surface->alpha(surface->compositor->method); + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; + auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); + auto a = MULTIPLY(span->coverage, opacity); + if (a == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) { + auto tmp = ALPHA_BLEND(*img, alpha(cmp)); + *dst = tmp + ALPHA_BLEND(*dst, IA(tmp)); + } + } else { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img, cmp += csize) { + auto tmp = ALPHA_BLEND(*img, MULTIPLY(a, alpha(cmp))); + *dst = tmp + ALPHA_BLEND(*dst, IA(tmp)); + } + } + } + return true; +} + + +static bool _rasterDirectBlendingRleImage(SwSurface* surface, const SwImage* image, uint8_t opacity) +{ + auto span = image->rle->spans; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { + *dst = surface->blender(*img, *dst, IA(*img)); + } + } else if (opacity == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { + auto tmp = surface->blender(*img, *dst, 255); + *dst = INTERPOLATE(tmp, *dst, MULTIPLY(span->coverage, A(*img))); + } + } else { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { + auto src = ALPHA_BLEND(*img, opacity); + auto tmp = surface->blender(src, *dst, IA(src)); + *dst = INTERPOLATE(tmp, *dst, MULTIPLY(span->coverage, A(src))); + } + } + 
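The direct and scaled image paths above all reduce to a premultiplied source-over write, *dst = src + ALPHA_BLEND(*dst, IA(src)). A self-contained sketch of that single write, assuming ALPHA_BLEND(c, a) scales every channel of c by a/255 and IA(c) is 255 minus the alpha of c (both are ThorVG macros defined outside this diff):

#include <cstdint>
#include <cstdio>

//assumed stand-in for ALPHA_BLEND: scale every 8-bit channel of c by a/255
static uint32_t scaleByAlpha(uint32_t c, uint32_t a)
{
    uint32_t out = 0;
    for (int shift = 0; shift < 32; shift += 8) {
        out |= ((((c >> shift) & 0xff) * a / 255) & 0xff) << shift;
    }
    return out;
}

//premultiplied source-over: dst' = src + dst * (255 - srcAlpha) / 255
static uint32_t srcOver(uint32_t src, uint32_t dst)
{
    auto ia = 255 - (src >> 24);                       //~IA(src), the inverted source alpha
    return src + scaleByAlpha(dst, ia);                //no channel overflow: pixels are premultiplied
}

int main()
{
    uint32_t dst = 0xffff0000;                         //opaque red, packed ARGB, premultiplied
    uint32_t src = 0x80008000;                         //~50% translucent green, premultiplied
    printf("%08x\n", srcOver(src, dst));               //ff7f8000: red and green mixed, opaque result
    return 0;
}

Keeping the pixels premultiplied is what lets every per-channel sum stay within 8 bits, so the hot loops above can add the scaled destination back without a clamp.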
} + return true; +} + + +static bool _rasterDirectRleImage(SwSurface* surface, const SwImage* image, uint8_t opacity) +{ + auto span = image->rle->spans; + + for (uint32_t i = 0; i < image->rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto img = image->buf32 + (span->y + image->oy) * image->stride + (span->x + image->ox); + auto alpha = MULTIPLY(span->coverage, opacity); + if (alpha == 255) { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { + *dst = *img + ALPHA_BLEND(*dst, IA(*img)); + } + } else { + for (uint32_t x = 0; x < span->len; ++x, ++dst, ++img) { + auto src = ALPHA_BLEND(*img, alpha); + *dst = src + ALPHA_BLEND(*dst, IA(src)); + } + } + } + return true; +} + + +static bool _directRleImage(SwSurface* surface, const SwImage* image, uint8_t opacity) +{ + if (surface->channelSize == sizeof(uint8_t)) { + TVGERR("SW_ENGINE", "Not supported grayscale rle image!"); + return false; + } + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterDirectMattedRleImage(surface, image, opacity); + else return _rasterDirectMaskedRleImage(surface, image, opacity); + } else if (_blending(surface)) { + return _rasterDirectBlendingRleImage(surface, image, opacity); + } else { + return _rasterDirectRleImage(surface, image, opacity); + } + return false; +} + + +/************************************************************************/ +/*Scaled Image */ +/************************************************************************/ + +#if 0 //Enable it when GRAYSCALE image is supported +static bool _rasterCompositeScaledMaskedImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, SwMask maskOp, uint8_t opacity) +{ + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * cstride + region.min.x); + + for (auto y = region.min.y; y < region.max.y; ++y) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto cmp = cbuffer; + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; ++x, ++cmp) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + *cmp = maskOp(src, *cmp, ~src); + } + } else { + for (auto x = region.min.x; x < region.max.x; ++x, ++cmp) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = MULTIPLY(src, opacity); + *cmp = maskOp(tmp, *cmp, ~tmp); + } + } + cbuffer += cstride; + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterDirectScaledMaskedImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, SwMask maskOp, uint8_t opacity) +{ + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * cstride + region.min.x); + auto dbuffer = surface->buf8 + (region.min.y * surface->stride + region.min.x); + + for (auto y = region.min.y; y < region.max.y; ++y) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto cmp = cbuffer; + auto dst = dbuffer; + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; ++x, ++cmp, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = maskOp(src, *cmp, 0); //not use alpha + *dst = tmp + MULTIPLY(*dst, ~tmp); + } + } else { + for (auto x = region.min.x; x < region.max.x; ++x, ++cmp, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf8, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = MULTIPLY(src, opacity); + auto tmp2 = maskOp(tmp, *cmp, 0); //not use alpha + *dst = tmp2 + MULTIPLY(*dst, ~tmp2); + } + } + cbuffer += cstride; + dbuffer += surface->stride; + } + return true; +} +#endif + +static bool _rasterScaledMaskedImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint8_t opacity) +{ +#if 0 //Enable it when GRAYSCALE image is supported + TVGLOG("SW_ENGINE", "Scaled Masked(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + + auto maskOp = _getMaskOp(surface->compositor->method); + if (_direct(surface->compositor->method)) return _rasterDirectScaledMaskedImage(surface, image, itransform, region, maskOp, opacity); + else return _rasterCompositeScaledMaskedImage(surface, image, itransform, region, maskOp, opacity); +#endif + return false; +} + + +static bool _rasterScaledMattedImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint8_t opacity) +{ + auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x); + auto csize = surface->compositor->image.channelSize; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; + auto alpha = surface->alpha(surface->compositor->method); + + TVGLOG("SW_ENGINE", "Scaled Matted(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + + for (auto y = region.min.y; y < region.max.y; ++y) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = dbuffer; + auto cmp = cbuffer; + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto temp = ALPHA_BLEND(src, alpha(cmp)); + *dst = temp + ALPHA_BLEND(*dst, IA(temp)); + } + } else { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst, cmp += csize) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto temp = ALPHA_BLEND(src, MULTIPLY(opacity, alpha(cmp))); + *dst = temp + ALPHA_BLEND(*dst, IA(temp)); + } + } + dbuffer += surface->stride; + cbuffer += surface->compositor->image.stride * csize; + } + return true; +} + + +static bool _rasterScaledBlendingImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint8_t opacity) +{ + auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x); + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? _interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + + for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = dbuffer; + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + auto tmp = surface->blender(src, *dst, 255); + *dst = INTERPOLATE(tmp, *dst, A(src)); + } + } else { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2), opacity); + auto tmp = surface->blender(src, *dst, 255); + *dst = INTERPOLATE(tmp, *dst, A(src)); + } + } + } + return true; +} + + +static bool _rasterScaledImage(SwSurface* surface, const SwImage* image, const Matrix* itransform, const SwBBox& region, uint8_t opacity) +{ + auto dbuffer = surface->buf32 + (region.min.y * surface->stride + region.min.x); + auto scaleMethod = image->scale < DOWN_SCALE_TOLERANCE ? 
_interpDownScaler : _interpUpScaler; + auto sampleSize = _sampleSize(image->scale); + auto sampleSize2 = sampleSize * sampleSize; + + for (auto y = region.min.y; y < region.max.y; ++y, dbuffer += surface->stride) { + auto sy = y * itransform->e22 + itransform->e23; + if ((uint32_t)sy >= image->h) continue; + auto dst = dbuffer; + if (opacity == 255) { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2); + *dst = src + ALPHA_BLEND(*dst, IA(src)); + } + } else { + for (auto x = region.min.x; x < region.max.x; ++x, ++dst) { + auto sx = x * itransform->e11 + itransform->e13; + if ((uint32_t)sx >= image->w) continue; + auto src = ALPHA_BLEND(scaleMethod(image->buf32, image->stride, image->w, image->h, sx, sy, sampleSize, sampleSize2), opacity); + *dst = src + ALPHA_BLEND(*dst, IA(src)); + } + } + } + return true; +} + + +static bool _scaledImage(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox& region, uint8_t opacity) +{ + if (surface->channelSize == sizeof(uint8_t)) { + TVGERR("SW_ENGINE", "Not supported grayscale Textmap polygon mesh!"); + return false; + } + + Matrix itransform; + + if (transform) { + if (!mathInverse(transform, &itransform)) return false; + } else mathIdentity(&itransform); + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterScaledMattedImage(surface, image, &itransform, region, opacity); + else return _rasterScaledMaskedImage(surface, image, &itransform, region, opacity); + } else if (_blending(surface)) { + return _rasterScaledBlendingImage(surface, image, &itransform, region, opacity); + } else { + return _rasterScaledImage(surface, image, &itransform, region, opacity); + } + return false; +} + + +/************************************************************************/ +/* Direct Image */ +/************************************************************************/ + +#if 0 //Enable it when GRAYSCALE image is supported +static bool _rasterCompositeDirectMaskedImage(SwSurface* surface, const SwImage* image, const SwBBox& region, SwMask maskOp, uint8_t opacity) +{ + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * cstride + region.min.x); //compositor buffer + auto sbuffer = image->buf8 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); + + for (uint32_t y = 0; y < h; ++y) { + auto cmp = cbuffer; + auto src = sbuffer; + if (opacity == 255) { + for (uint32_t x = 0; x < w; ++x, ++src, ++cmp) { + *cmp = maskOp(*src, *cmp, ~*src); + } + } else { + for (uint32_t x = 0; x < w; ++x, ++src, ++cmp) { + auto tmp = MULTIPLY(*src, opacity); + *cmp = maskOp(tmp, *cmp, ~tmp); + } + } + cbuffer += cstride; + sbuffer += image->stride; + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +static bool _rasterDirectDirectMaskedImage(SwSurface* surface, const SwImage* image, const SwBBox& region, SwMask maskOp, uint8_t opacity) +{ + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + + auto cbuffer = surface->compositor->image.buf32 + 
(region.min.y * cstride + region.min.x); //compositor buffer + auto dbuffer = surface->buf8 + (region.min.y * surface->stride + region.min.x); //destination buffer + auto sbuffer = image->buf8 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); + + for (uint32_t y = 0; y < h; ++y) { + auto cmp = cbuffer; + auto dst = dbuffer; + auto src = sbuffer; + if (opacity == 255) { + for (uint32_t x = 0; x < w; ++x, ++src, ++cmp, ++dst) { + auto tmp = maskOp(*src, *cmp, 0); //not use alpha + *dst = tmp + MULTIPLY(*dst, ~tmp); + } + } else { + for (uint32_t x = 0; x < w; ++x, ++src, ++cmp, ++dst) { + auto tmp = maskOp(MULTIPLY(*src, opacity), *cmp, 0); //not use alpha + *dst = tmp + MULTIPLY(*dst, ~tmp); + } + } + cbuffer += cstride; + dbuffer += surface->stride; + sbuffer += image->stride; + } + return true; +} +#endif + +static bool _rasterDirectMaskedImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint8_t opacity) +{ + TVGERR("SW_ENGINE", "Not Supported: Direct Masked(%d) Image [Region: %lu %lu %lu %lu]", (int)surface->compositor->method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + +#if 0 //Enable it when GRAYSCALE image is supported + auto maskOp = _getMaskOp(surface->compositor->method); + if (_direct(surface->compositor->method)) return _rasterDirectDirectMaskedImage(surface, image, region, maskOp, opacity); + else return _rasterCompositeDirectMaskedImage(surface, image, region, maskOp, opacity); +#endif + return false; +} + + +static bool _rasterDirectMattedImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint8_t opacity) +{ + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto csize = surface->compositor->image.channelSize; + auto alpha = surface->alpha(surface->compositor->method); + auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox); + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; //compositor buffer + + TVGLOG("SW_ENGINE", "Direct Matted(%d) Image [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h); + + //32 bits + if (surface->channelSize == sizeof(uint32_t)) { + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + for (uint32_t y = 0; y < h; ++y) { + auto dst = buffer; + auto cmp = cbuffer; + auto src = sbuffer; + if (opacity == 255) { + for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { + auto tmp = ALPHA_BLEND(*src, alpha(cmp)); + *dst = tmp + ALPHA_BLEND(*dst, IA(tmp)); + } + } else { + for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { + auto tmp = ALPHA_BLEND(*src, MULTIPLY(opacity, alpha(cmp))); + *dst = tmp + ALPHA_BLEND(*dst, IA(tmp)); + } + } + buffer += surface->stride; + cbuffer += surface->compositor->image.stride * csize; + sbuffer += image->stride; + } + //8 bits + } else if (surface->channelSize == sizeof(uint8_t)) { + auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x; + for (uint32_t y = 0; y < h; ++y) { + auto dst = buffer; + auto cmp = cbuffer; + auto src = sbuffer; + if (opacity == 255) { + for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { + *dst = MULTIPLY(A(*src), alpha(cmp)); + } + } else { + for (uint32_t x = 0; x < w; ++x, ++dst, ++src, cmp += csize) { + *dst = MULTIPLY(A(*src), MULTIPLY(opacity, alpha(cmp))); + } + } 
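+ //advance the destination, compositor and source row pointers to the next scanline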
+ buffer += surface->stride;
+ cbuffer += surface->compositor->image.stride * csize;
+ sbuffer += image->stride;
+ }
+ }
+ return true;
+}
+
+
+static bool _rasterDirectBlendingImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint8_t opacity)
+{
+ if (surface->channelSize == sizeof(uint8_t)) {
+ TVGERR("SW_ENGINE", "Not supported grayscale image!");
+ return false;
+ }
+
+ auto dbuffer = &surface->buf32[region.min.y * surface->stride + region.min.x];
+ auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox);
+
+ for (auto y = region.min.y; y < region.max.y; ++y) {
+ auto dst = dbuffer;
+ auto src = sbuffer;
+ if (opacity == 255) {
+ for (auto x = region.min.x; x < region.max.x; x++, dst++, src++) {
+ auto tmp = surface->blender(*src, *dst, 255);
+ *dst = INTERPOLATE(tmp, *dst, A(*src));
+ }
+ } else {
+ for (auto x = region.min.x; x < region.max.x; ++x, ++dst, ++src) {
+ auto tmp = ALPHA_BLEND(*src, opacity);
+ auto tmp2 = surface->blender(tmp, *dst, 255);
+ *dst = INTERPOLATE(tmp2, *dst, A(tmp));
+ }
+ }
+ dbuffer += surface->stride;
+ sbuffer += image->stride;
+ }
+ return true;
+}
+
+
+static bool _rasterDirectImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint8_t opacity)
+{
+ if (surface->channelSize == sizeof(uint8_t)) {
+ TVGERR("SW_ENGINE", "Not supported grayscale image!");
+ return false;
+ }
+
+ auto dbuffer = &surface->buf32[region.min.y * surface->stride + region.min.x];
+ auto sbuffer = image->buf32 + (region.min.y + image->oy) * image->stride + (region.min.x + image->ox);
+
+ for (auto y = region.min.y; y < region.max.y; ++y) {
+ auto dst = dbuffer;
+ auto src = sbuffer;
+ if (opacity == 255) {
+ for (auto x = region.min.x; x < region.max.x; x++, dst++, src++) {
+ *dst = *src + ALPHA_BLEND(*dst, IA(*src));
+ }
+ } else {
+ for (auto x = region.min.x; x < region.max.x; ++x, ++dst, ++src) {
+ auto tmp = ALPHA_BLEND(*src, opacity);
+ *dst = tmp + ALPHA_BLEND(*dst, IA(tmp));
+ }
+ }
+ dbuffer += surface->stride;
+ sbuffer += image->stride;
+ }
+ return true;
+}
+
+
+//Blenders for the following scenarios: [Composition / Non-Composition] * [Opaque / Translucent]
+static bool _directImage(SwSurface* surface, const SwImage* image, const SwBBox& region, uint8_t opacity)
+{
+ if (_compositing(surface)) {
+ if (_matting(surface)) return _rasterDirectMattedImage(surface, image, region, opacity);
+ else return _rasterDirectMaskedImage(surface, image, region, opacity);
+ } else if (_blending(surface)) {
+ return _rasterDirectBlendingImage(surface, image, region, opacity);
+ } else {
+ return _rasterDirectImage(surface, image, region, opacity);
+ }
+ return false;
+}
+
+
+//Blenders for the following scenarios: [RLE / Whole] * [Direct / Scaled / Transformed]
+static bool _rasterImage(SwSurface* surface, SwImage* image, const Matrix* transform, const SwBBox& region, uint8_t opacity)
+{
+ //RLE Image
+ if (image->rle) {
+ if (image->direct) return _directRleImage(surface, image, opacity);
+ else if (image->scaled) return _scaledRleImage(surface, image, transform, region, opacity);
+ else return _rasterTexmapPolygon(surface, image, transform, nullptr, opacity);
+ //Whole Image
+ } else {
+ if (image->direct) return _directImage(surface, image, region, opacity);
+ else if (image->scaled) return _scaledImage(surface, image, transform, region, opacity);
+ else return _rasterTexmapPolygon(surface, image, transform, &region, opacity);
+ }
+}
+
+
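+//Note on the math used by the direct/scaled image blenders above (premultiplied ARGB):
+//ALPHA_BLEND(c, a) scales every channel of c by a/255 (approximated with 8-bit shifts) and
+//IA(c) is the inverted alpha 255 - A(c), so "*dst = *src + ALPHA_BLEND(*dst, IA(*src))" is,
+//up to rounding, the standard source-over operator: dst' = src + dst * (255 - A(src)) / 255.
+//When a custom blend method is set, the blended color is mixed back by the source alpha
+//instead: dst' = INTERPOLATE(blender(src, dst), dst, A(src)).
+
+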
+/************************************************************************/ +/* Rect Gradient */ +/************************************************************************/ + +template<typename fillMethod> +static bool _rasterCompositeGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill, SwMask maskOp) +{ + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * cstride + region.min.x); + + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, cbuffer, region.min.y + y, region.min.x, w, maskOp, 255); + cbuffer += surface->stride; + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +template<typename fillMethod> +static bool _rasterDirectGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill, SwMask maskOp) +{ + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * cstride + region.min.x); + auto dbuffer = surface->buf8 + (region.min.y * surface->stride + region.min.x); + + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, dbuffer, region.min.y + y, region.min.x, w, cbuffer, maskOp, 255); + cbuffer += cstride; + dbuffer += surface->stride; + } + return true; +} + + +template<typename fillMethod> +static bool _rasterGradientMaskedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ + auto method = surface->compositor->method; + + TVGLOG("SW_ENGINE", "Masked(%d) Gradient [Region: %lu %lu %lu %lu]", (int)method, region.min.x, region.min.y, region.max.x - region.min.x, region.max.y - region.min.y); + + auto maskOp = _getMaskOp(method); + + if (_direct(method)) return _rasterDirectGradientMaskedRect<fillMethod>(surface, region, fill, maskOp); + else return _rasterCompositeGradientMaskedRect<fillMethod>(surface, region, fill, maskOp); + + return false; +} + + +template<typename fillMethod> +static bool _rasterGradientMattedRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto csize = surface->compositor->image.channelSize; + auto cbuffer = surface->compositor->image.buf8 + (region.min.y * surface->compositor->image.stride + region.min.x) * csize; + auto alpha = surface->alpha(surface->compositor->method); + + TVGLOG("SW_ENGINE", "Matted(%d) Gradient [Region: %lu %lu %u %u]", (int)surface->compositor->method, region.min.x, region.min.y, w, h); + + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, buffer, region.min.y + y, region.min.x, w, cbuffer, alpha, csize, 255); + buffer += surface->stride; + cbuffer += surface->stride * csize; + } + return true; +} + + +template<typename fillMethod> +static bool _rasterBlendingGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + + if (fill->translucent) { + for (uint32_t y = 0; y < h; ++y) { + 
fillMethod()(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w, opBlendPreNormal, surface->blender, 255); + } + } else { + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w, opBlendSrcOver, surface->blender, 255); + } + } + return true; +} + +template<typename fillMethod> +static bool _rasterTranslucentGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, buffer, region.min.y + y, region.min.x, w, opBlendPreNormal, 255); + buffer += surface->stride; + } + return true; +} + + +template<typename fillMethod> +static bool _rasterSolidGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + + for (uint32_t y = 0; y < h; ++y) { + fillMethod()(fill, buffer + y * surface->stride, region.min.y + y, region.min.x, w, opBlendSrcOver, 255); + } + return true; +} + + +static bool _rasterLinearGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ + if (fill->linear.len < FLT_EPSILON) return false; + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterGradientMattedRect<FillLinear>(surface, region, fill); + else return _rasterGradientMaskedRect<FillLinear>(surface, region, fill); + } else if (_blending(surface)) { + return _rasterBlendingGradientRect<FillLinear>(surface, region, fill); + } else { + if (fill->translucent) return _rasterTranslucentGradientRect<FillLinear>(surface, region, fill); + else _rasterSolidGradientRect<FillLinear>(surface, region, fill); + } + return false; +} + + +static bool _rasterRadialGradientRect(SwSurface* surface, const SwBBox& region, const SwFill* fill) +{ + if (_compositing(surface)) { + if (_matting(surface)) return _rasterGradientMattedRect<FillRadial>(surface, region, fill); + else return _rasterGradientMaskedRect<FillRadial>(surface, region, fill); + } else if (_blending(surface)) { + return _rasterBlendingGradientRect<FillRadial>(surface, region, fill); + } else { + if (fill->translucent) return _rasterTranslucentGradientRect<FillRadial>(surface, region, fill); + else _rasterSolidGradientRect<FillRadial>(surface, region, fill); + } + return false; +} + + +/************************************************************************/ +/* Rle Gradient */ +/************************************************************************/ + +template<typename fillMethod> +static bool _rasterCompositeGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill, SwMask maskOp) +{ + auto span = rle->spans; + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf8; + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + fillMethod()(fill, cmp, span->y, span->x, span->len, maskOp, span->coverage); + } + return _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); +} + + +template<typename fillMethod> +static bool _rasterDirectGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* 
fill, SwMask maskOp) +{ + auto span = rle->spans; + auto cstride = surface->compositor->image.stride; + auto cbuffer = surface->compositor->image.buf8; + auto dbuffer = surface->buf8; + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto cmp = &cbuffer[span->y * cstride + span->x]; + auto dst = &dbuffer[span->y * surface->stride + span->x]; + fillMethod()(fill, dst, span->y, span->x, span->len, cmp, maskOp, span->coverage); + } + return true; +} + + +template<typename fillMethod> +static bool _rasterGradientMaskedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + auto method = surface->compositor->method; + + TVGLOG("SW_ENGINE", "Masked(%d) Rle Linear Gradient", (int)method); + + auto maskOp = _getMaskOp(method); + + if (_direct(method)) return _rasterDirectGradientMaskedRle<fillMethod>(surface, rle, fill, maskOp); + else return _rasterCompositeGradientMaskedRle<fillMethod>(surface, rle, fill, maskOp); + return false; +} + + +template<typename fillMethod> +static bool _rasterGradientMattedRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + TVGLOG("SW_ENGINE", "Matted(%d) Rle Linear Gradient", (int)surface->compositor->method); + + auto span = rle->spans; + auto csize = surface->compositor->image.channelSize; + auto cbuffer = surface->compositor->image.buf8; + auto alpha = surface->alpha(surface->compositor->method); + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto cmp = &cbuffer[(span->y * surface->compositor->image.stride + span->x) * csize]; + fillMethod()(fill, dst, span->y, span->x, span->len, cmp, alpha, csize, span->coverage); + } + return true; +} + + +template<typename fillMethod> +static bool _rasterBlendingGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + auto span = rle->spans; + + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + fillMethod()(fill, dst, span->y, span->x, span->len, opBlendPreNormal, surface->blender, span->coverage); + } + return true; +} + + +template<typename fillMethod> +static bool _rasterTranslucentGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + auto span = rle->spans; + + //32 bits + if (surface->channelSize == sizeof(uint32_t)) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + if (span->coverage == 255) fillMethod()(fill, dst, span->y, span->x, span->len, opBlendPreNormal, 255); + else fillMethod()(fill, dst, span->y, span->x, span->len, opBlendNormal, span->coverage); + } + //8 bits + } else if (surface->channelSize == sizeof(uint8_t)) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + fillMethod()(fill, dst, span->y, span->x, span->len, _opMaskAdd, 255); + } + } + return true; +} + + +template<typename fillMethod> +static bool _rasterSolidGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + auto span = rle->spans; + + //32 bits + if (surface->channelSize == sizeof(uint32_t)) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + if (span->coverage == 255) fillMethod()(fill, dst, span->y, span->x, span->len, opBlendSrcOver, 255); + else fillMethod()(fill, dst, span->y, span->x, span->len, opBlendInterp, span->coverage); + } + //8 bits + } else if 
(surface->channelSize == sizeof(uint8_t)) { + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + if (span->coverage == 255) fillMethod()(fill, dst, span->y, span->x, span->len, _opMaskNone, 255); + else fillMethod()(fill, dst, span->y, span->x, span->len, _opMaskAdd, span->coverage); + } + } + + return true; +} + + +static bool _rasterLinearGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + if (!rle || fill->linear.len < FLT_EPSILON) return false; + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterGradientMattedRle<FillLinear>(surface, rle, fill); + else return _rasterGradientMaskedRle<FillLinear>(surface, rle, fill); + } else if (_blending(surface)) { + return _rasterBlendingGradientRle<FillLinear>(surface, rle, fill); + } else { + if (fill->translucent) return _rasterTranslucentGradientRle<FillLinear>(surface, rle, fill); + else return _rasterSolidGradientRle<FillLinear>(surface, rle, fill); + } + return false; +} + + +static bool _rasterRadialGradientRle(SwSurface* surface, const SwRleData* rle, const SwFill* fill) +{ + if (!rle) return false; + + if (_compositing(surface)) { + if (_matting(surface)) return _rasterGradientMattedRle<FillRadial>(surface, rle, fill); + else return _rasterGradientMaskedRle<FillRadial>(surface, rle, fill); + } else if (_blending(surface)) { + _rasterBlendingGradientRle<FillRadial>(surface, rle, fill); + } else { + if (fill->translucent) _rasterTranslucentGradientRle<FillRadial>(surface, rle, fill); + else return _rasterSolidGradientRle<FillRadial>(surface, rle, fill); + } + return false; +} + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + + +void rasterGrayscale8(uint8_t *dst, uint8_t val, uint32_t offset, int32_t len) +{ + //OPTIMIZE_ME: Support SIMD + cRasterPixels(dst, val, offset, len); +} + + +void rasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len) +{ +#if defined(THORVG_AVX_VECTOR_SUPPORT) + avxRasterPixel32(dst, val, offset, len); +#elif defined(THORVG_NEON_VECTOR_SUPPORT) + neonRasterPixel32(dst, val, offset, len); +#else + cRasterPixels(dst, val, offset, len); +#endif +} + + +bool rasterCompositor(SwSurface* surface) +{ + //See CompositeMethod, Alpha:3, InvAlpha:4, Luma:5, InvLuma:6 + surface->alphas[0] = _alpha; + surface->alphas[1] = _ialpha; + + if (surface->cs == ColorSpace::ABGR8888 || surface->cs == ColorSpace::ABGR8888S) { + surface->join = _abgrJoin; + surface->alphas[2] = _abgrLuma; + surface->alphas[3] = _abgrInvLuma; + } else if (surface->cs == ColorSpace::ARGB8888 || surface->cs == ColorSpace::ARGB8888S) { + surface->join = _argbJoin; + surface->alphas[2] = _argbLuma; + surface->alphas[3] = _argbInvLuma; + } else { + TVGERR("SW_ENGINE", "Unsupported Colorspace(%d) is expected!", surface->cs); + return false; + } + return true; +} + + +bool rasterClear(SwSurface* surface, uint32_t x, uint32_t y, uint32_t w, uint32_t h) +{ + if (!surface || !surface->buf32 || surface->stride == 0 || surface->w == 0 || surface->h == 0) return false; + + //32 bits + if (surface->channelSize == sizeof(uint32_t)) { + //full clear + if (w == surface->stride) { + rasterPixel32(surface->buf32, 0x00000000, surface->stride * y, w * h); + //partial clear + } else { + for (uint32_t i = 0; i < h; i++) { + rasterPixel32(surface->buf32, 0x00000000, (surface->stride * y + x) + 
(surface->stride * i), w); + } + } + //8 bits + } else if (surface->channelSize == sizeof(uint8_t)) { + //full clear + if (w == surface->stride) { + rasterGrayscale8(surface->buf8, 0x00, surface->stride * y, w * h); + //partial clear + } else { + for (uint32_t i = 0; i < h; i++) { + rasterGrayscale8(surface->buf8, 0x00, (surface->stride * y + x) + (surface->stride * i), w); + } + } + } + return true; +} + + +void rasterUnpremultiply(Surface* surface) +{ + if (surface->channelSize != sizeof(uint32_t)) return; + + TVGLOG("SW_ENGINE", "Unpremultiply [Size: %d x %d]", surface->w, surface->h); + + //OPTIMIZE_ME: +SIMD + for (uint32_t y = 0; y < surface->h; y++) { + auto buffer = surface->buf32 + surface->stride * y; + for (uint32_t x = 0; x < surface->w; ++x) { + uint8_t a = buffer[x] >> 24; + if (a == 255) { + continue; + } else if (a == 0) { + buffer[x] = 0x00ffffff; + } else { + uint16_t r = ((buffer[x] >> 8) & 0xff00) / a; + uint16_t g = ((buffer[x]) & 0xff00) / a; + uint16_t b = ((buffer[x] << 8) & 0xff00) / a; + if (r > 0xff) r = 0xff; + if (g > 0xff) g = 0xff; + if (b > 0xff) b = 0xff; + buffer[x] = (a << 24) | (r << 16) | (g << 8) | (b); + } + } + } + surface->premultiplied = false; +} + + +void rasterPremultiply(Surface* surface) +{ + if (surface->channelSize != sizeof(uint32_t)) return; + + TVGLOG("SW_ENGINE", "Premultiply [Size: %d x %d]", surface->w, surface->h); + + //OPTIMIZE_ME: +SIMD + auto buffer = surface->buf32; + for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) { + auto dst = buffer; + for (uint32_t x = 0; x < surface->w; ++x, ++dst) { + auto c = *dst; + auto a = (c >> 24); + *dst = (c & 0xff000000) + ((((c >> 8) & 0xff) * a) & 0xff00) + ((((c & 0x00ff00ff) * a) >> 8) & 0x00ff00ff); + } + } + surface->premultiplied = true; +} + + +bool rasterGradientShape(SwSurface* surface, SwShape* shape, unsigned id) +{ + if (!shape->fill) return false; + + if (shape->fastTrack) { + if (id == TVG_CLASS_ID_LINEAR) return _rasterLinearGradientRect(surface, shape->bbox, shape->fill); + else if (id == TVG_CLASS_ID_RADIAL)return _rasterRadialGradientRect(surface, shape->bbox, shape->fill); + } else { + if (id == TVG_CLASS_ID_LINEAR) return _rasterLinearGradientRle(surface, shape->rle, shape->fill); + else if (id == TVG_CLASS_ID_RADIAL) return _rasterRadialGradientRle(surface, shape->rle, shape->fill); + } + return false; +} + + +bool rasterGradientStroke(SwSurface* surface, SwShape* shape, unsigned id) +{ + if (!shape->stroke || !shape->stroke->fill || !shape->strokeRle) return false; + + if (id == TVG_CLASS_ID_LINEAR) return _rasterLinearGradientRle(surface, shape->strokeRle, shape->stroke->fill); + else if (id == TVG_CLASS_ID_RADIAL) return _rasterRadialGradientRle(surface, shape->strokeRle, shape->stroke->fill); + + return false; +} + + +bool rasterShape(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (a < 255) { + r = MULTIPLY(r, a); + g = MULTIPLY(g, a); + b = MULTIPLY(b, a); + } + if (shape->fastTrack) return _rasterRect(surface, shape->bbox, r, g, b, a); + else return _rasterRle(surface, shape->rle, r, g, b, a); +} + + +bool rasterStroke(SwSurface* surface, SwShape* shape, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (a < 255) { + r = MULTIPLY(r, a); + g = MULTIPLY(g, a); + b = MULTIPLY(b, a); + } + + return _rasterRle(surface, shape->strokeRle, r, g, b, a); +} + + +bool rasterImage(SwSurface* surface, SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox& bbox, uint8_t opacity) +{ + //Verify 
Boundary + if (bbox.max.x < 0 || bbox.max.y < 0 || bbox.min.x >= static_cast<SwCoord>(surface->w) || bbox.min.y >= static_cast<SwCoord>(surface->h)) return false; + + if (mesh && mesh->triangleCnt > 0) return _rasterTexmapPolygonMesh(surface, image, mesh, transform, &bbox, opacity); + else return _rasterImage(surface, image, transform, bbox, opacity); +} + + +bool rasterConvertCS(Surface* surface, ColorSpace to) +{ + //TOOD: Support SIMD accelerations + auto from = surface->cs; + + if (((from == ColorSpace::ABGR8888) || (from == ColorSpace::ABGR8888S)) && ((to == ColorSpace::ARGB8888) || (to == ColorSpace::ARGB8888S))) { + surface->cs = to; + return cRasterABGRtoARGB(surface); + } + if (((from == ColorSpace::ARGB8888) || (from == ColorSpace::ARGB8888S)) && ((to == ColorSpace::ABGR8888) || (to == ColorSpace::ABGR8888S))) { + surface->cs = to; + return cRasterARGBtoABGR(surface); + } + + return false; +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterAvx.h b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterAvx.h new file mode 100644 index 0000000000..090fa29a7a --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterAvx.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2021 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef THORVG_AVX_VECTOR_SUPPORT + +#include <immintrin.h> + +#define N_32BITS_IN_128REG 4 +#define N_32BITS_IN_256REG 8 + +static inline __m128i ALPHA_BLEND(__m128i c, __m128i a) +{ + //1. set the masks for the A/G and R/B channels + auto AG = _mm_set1_epi32(0xff00ff00); + auto RB = _mm_set1_epi32(0x00ff00ff); + + //2. mask the alpha vector - originally quartet [a, a, a, a] + auto aAG = _mm_and_si128(a, AG); + auto aRB = _mm_and_si128(a, RB); + + //3. calculate the alpha blending of the 2nd and 4th channel + //- mask the color vector + //- multiply it by the masked alpha vector + //- add the correction to compensate bit shifting used instead of dividing by 255 + //- shift bits - corresponding to division by 256 + auto even = _mm_and_si128(c, RB); + even = _mm_mullo_epi16(even, aRB); + even =_mm_add_epi16(even, RB); + even = _mm_srli_epi16(even, 8); + + //4. 
calculate the alpha blending of the 1st and 3rd channel: + //- mask the color vector + //- multiply it by the corresponding masked alpha vector and store the high bits of the result + //- add the correction to compensate division by 256 instead of by 255 (next step) + //- remove the low 8 bits to mimic the division by 256 + auto odd = _mm_and_si128(c, AG); + odd = _mm_mulhi_epu16(odd, aAG); + odd = _mm_add_epi16(odd, RB); + odd = _mm_and_si128(odd, AG); + + //5. the final result + return _mm_or_si128(odd, even); +} + + +static void avxRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len) +{ + //1. calculate how many iterations we need to cover the length + uint32_t iterations = len / N_32BITS_IN_256REG; + uint32_t avxFilled = iterations * N_32BITS_IN_256REG; + + //2. set the beginning of the array + dst += offset; + + //3. fill the octets + for (uint32_t i = 0; i < iterations; ++i, dst += N_32BITS_IN_256REG) { + _mm256_storeu_si256((__m256i*)dst, _mm256_set1_epi32(val)); + } + + //4. fill leftovers (in the first step we have to set the pointer to the place where the avx job is done) + int32_t leftovers = len - avxFilled; + while (leftovers--) *dst++ = val; +} + + +static bool avxRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (surface->channelSize != sizeof(uint32_t)) { + TVGERR("SW_ENGINE", "Unsupported Channel Size = %d", surface->channelSize); + return false; + } + + auto color = surface->join(r, g, b, a); + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + + uint32_t ialpha = 255 - a; + + auto avxColor = _mm_set1_epi32(color); + auto avxIalpha = _mm_set1_epi8(ialpha); + + for (uint32_t y = 0; y < h; ++y) { + auto dst = &buffer[y * surface->stride]; + + //1. fill the not aligned memory (for 128-bit registers a 16-bytes alignment is required) + auto notAligned = ((uintptr_t)dst & 0xf) / 4; + if (notAligned) { + notAligned = (N_32BITS_IN_128REG - notAligned > w ? w : N_32BITS_IN_128REG - notAligned); + for (uint32_t x = 0; x < notAligned; ++x, ++dst) { + *dst = color + ALPHA_BLEND(*dst, ialpha); + } + } + + //2. fill the aligned memory - N_32BITS_IN_128REG pixels processed at once + uint32_t iterations = (w - notAligned) / N_32BITS_IN_128REG; + uint32_t avxFilled = iterations * N_32BITS_IN_128REG; + auto avxDst = (__m128i*)dst; + for (uint32_t x = 0; x < iterations; ++x, ++avxDst) { + *avxDst = _mm_add_epi32(avxColor, ALPHA_BLEND(*avxDst, avxIalpha)); + } + + //3. fill the remaining pixels + int32_t leftovers = w - notAligned - avxFilled; + dst += avxFilled; + while (leftovers--) { + *dst = color + ALPHA_BLEND(*dst, ialpha); + dst++; + } + } + return true; +} + + +static bool avxRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (surface->channelSize != sizeof(uint32_t)) { + TVGERR("SW_ENGINE", "Unsupported Channel Size = %d", surface->channelSize); + return false; + } + + auto color = surface->join(r, g, b, a); + auto span = rle->spans; + uint32_t src; + + for (uint32_t i = 0; i < rle->size; ++i) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + + if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage); + else src = color; + + auto ialpha = IA(src); + + //1. 
fill the not aligned memory (for 128-bit registers a 16-bytes alignment is required) + auto notAligned = ((uintptr_t)dst & 0xf) / 4; + if (notAligned) { + notAligned = (N_32BITS_IN_128REG - notAligned > span->len ? span->len : N_32BITS_IN_128REG - notAligned); + for (uint32_t x = 0; x < notAligned; ++x, ++dst) { + *dst = src + ALPHA_BLEND(*dst, ialpha); + } + } + + //2. fill the aligned memory using avx - N_32BITS_IN_128REG pixels processed at once + //In order to avoid unneccessary avx variables declarations a check is made whether there are any iterations at all + uint32_t iterations = (span->len - notAligned) / N_32BITS_IN_128REG; + uint32_t avxFilled = 0; + if (iterations > 0) { + auto avxSrc = _mm_set1_epi32(src); + auto avxIalpha = _mm_set1_epi8(ialpha); + + avxFilled = iterations * N_32BITS_IN_128REG; + auto avxDst = (__m128i*)dst; + for (uint32_t x = 0; x < iterations; ++x, ++avxDst) { + *avxDst = _mm_add_epi32(avxSrc, ALPHA_BLEND(*avxDst, avxIalpha)); + } + } + + //3. fill the remaining pixels + int32_t leftovers = span->len - notAligned - avxFilled; + dst += avxFilled; + while (leftovers--) { + *dst = src + ALPHA_BLEND(*dst, ialpha); + dst++; + } + + ++span; + } + return true; +} + + +#endif diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterC.h b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterC.h new file mode 100644 index 0000000000..eb377e78e3 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterC.h @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2021 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +template<typename PIXEL_T> +static void inline cRasterPixels(PIXEL_T* dst, PIXEL_T val, uint32_t offset, int32_t len) +{ + dst += offset; + + //fix the misaligned memory + auto alignOffset = (long long) dst % 8; + if (alignOffset > 0) { + if (sizeof(PIXEL_T) == 4) alignOffset /= 4; + else if (sizeof(PIXEL_T) == 1) alignOffset = 8 - alignOffset; + while (alignOffset > 0 && len > 0) { + *dst++ = val; + --len; + --alignOffset; + } + } + + //64bits faster clear + if ((sizeof(PIXEL_T) == 4)) { + auto val64 = (uint64_t(val) << 32) | uint64_t(val); + while (len > 1) { + *reinterpret_cast<uint64_t*>(dst) = val64; + len -= 2; + dst += 2; + } + } else if (sizeof(PIXEL_T) == 1) { + auto val32 = (uint32_t(val) << 24) | (uint32_t(val) << 16) | (uint32_t(val) << 8) | uint32_t(val); + auto val64 = (uint64_t(val32) << 32) | val32; + while (len > 7) { + *reinterpret_cast<uint64_t*>(dst) = val64; + len -= 8; + dst += 8; + } + } + + //leftovers + while (len--) *dst++ = val; +} + + +static bool inline cRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto span = rle->spans; + + //32bit channels + if (surface->channelSize == sizeof(uint32_t)) { + auto color = surface->join(r, g, b, a); + uint32_t src; + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage); + else src = color; + auto ialpha = IA(src); + for (uint32_t x = 0; x < span->len; ++x, ++dst) { + *dst = src + ALPHA_BLEND(*dst, ialpha); + } + } + //8bit grayscale + } else if (surface->channelSize == sizeof(uint8_t)) { + uint8_t src; + for (uint32_t i = 0; i < rle->size; ++i, ++span) { + auto dst = &surface->buf8[span->y * surface->stride + span->x]; + if (span->coverage < 255) src = MULTIPLY(span->coverage, a); + else src = a; + auto ialpha = ~a; + for (uint32_t x = 0; x < span->len; ++x, ++dst) { + *dst = src + MULTIPLY(*dst, ialpha); + } + } + } + return true; +} + + +static bool inline cRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + + //32bits channels + if (surface->channelSize == sizeof(uint32_t)) { + auto color = surface->join(r, g, b, 255); + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto ialpha = 255 - a; + for (uint32_t y = 0; y < h; ++y) { + auto dst = &buffer[y * surface->stride]; + for (uint32_t x = 0; x < w; ++x, ++dst) { + *dst = color + ALPHA_BLEND(*dst, ialpha); + } + } + //8bit grayscale + } else if (surface->channelSize == sizeof(uint8_t)) { + auto buffer = surface->buf8 + (region.min.y * surface->stride) + region.min.x; + auto ialpha = ~a; + for (uint32_t y = 0; y < h; ++y) { + auto dst = &buffer[y * surface->stride]; + for (uint32_t x = 0; x < w; ++x, ++dst) { + *dst = a + MULTIPLY(*dst, ialpha); + } + } + } + return true; +} + + +static bool inline cRasterABGRtoARGB(Surface* surface) +{ + TVGLOG("SW_ENGINE", "Convert ColorSpace ABGR - ARGB [Size: %d x %d]", surface->w, surface->h); + + //64bits faster converting + if (surface->w % 2 == 0) { + auto buffer = reinterpret_cast<uint64_t*>(surface->buf32); + for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride / 2) { + auto dst = buffer; + for (uint32_t x = 0; x < surface->w / 2; ++x, ++dst) { + auto c = *dst; + //flip Blue, Red channels + *dst = 
(c & 0xff000000ff000000) + ((c & 0x00ff000000ff0000) >> 16) + (c & 0x0000ff000000ff00) + ((c & 0x000000ff000000ff) << 16); + } + } + //default converting + } else { + auto buffer = surface->buf32; + for (uint32_t y = 0; y < surface->h; ++y, buffer += surface->stride) { + auto dst = buffer; + for (uint32_t x = 0; x < surface->w; ++x, ++dst) { + auto c = *dst; + //flip Blue, Red channels + *dst = (c & 0xff000000) + ((c & 0x00ff0000) >> 16) + (c & 0x0000ff00) + ((c & 0x000000ff) << 16); + } + } + } + return true; +} + + +static bool inline cRasterARGBtoABGR(Surface* surface) +{ + //exactly same with ABGRtoARGB + return cRasterABGRtoARGB(surface); +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterNeon.h b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterNeon.h new file mode 100644 index 0000000000..ba77ed53cf --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterNeon.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2021 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef THORVG_NEON_VECTOR_SUPPORT + +#include <arm_neon.h> + +static inline uint8x8_t ALPHA_BLEND(uint8x8_t c, uint8x8_t a) +{ + uint16x8_t t = vmull_u8(c, a); + return vshrn_n_u16(t, 8); +} + + +static void neonRasterPixel32(uint32_t *dst, uint32_t val, uint32_t offset, int32_t len) +{ + uint32_t iterations = len / 4; + uint32_t neonFilled = iterations * 4; + + dst += offset; + uint32x4_t vectorVal = {val, val, val, val}; + + for (uint32_t i = 0; i < iterations; ++i) { + vst1q_u32(dst, vectorVal); + dst += 4; + } + + int32_t leftovers = len - neonFilled; + while (leftovers--) *dst++ = val; +} + + +static bool neonRasterTranslucentRle(SwSurface* surface, const SwRleData* rle, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (surface->channelSize != sizeof(uint32_t)) { + TVGERR("SW_ENGINE", "Unsupported Channel Size = %d", surface->channelSize); + return false; + } + + auto color = surface->blender.join(r, g, b, a); + auto span = rle->spans; + uint32_t src; + uint8x8_t *vDst = nullptr; + uint16_t align; + + for (uint32_t i = 0; i < rle->size; ++i) { + if (span->coverage < 255) src = ALPHA_BLEND(color, span->coverage); + else src = color; + + auto dst = &surface->buf32[span->y * surface->stride + span->x]; + auto ialpha = IALPHA(src); + + if ((((uint32_t) dst) & 0x7) != 0) { + //fill not aligned byte + *dst = src + ALPHA_BLEND(*dst, ialpha); + vDst = (uint8x8_t*)(dst + 1); + align = 1; + } else { + vDst = (uint8x8_t*) dst; + align = 0; + } + + uint8x8_t vSrc = (uint8x8_t) vdup_n_u32(src); + uint8x8_t vIalpha = vdup_n_u8((uint8_t) ialpha); + + for (uint32_t x = 0; x < (span->len - align) / 2; ++x) + vDst[x] = vadd_u8(vSrc, ALPHA_BLEND(vDst[x], vIalpha)); + + auto leftovers = (span->len - align) % 2; + if (leftovers > 0) dst[span->len - 1] = src + ALPHA_BLEND(dst[span->len - 1], ialpha); + + ++span; + } + return true; +} + + +static bool neonRasterTranslucentRect(SwSurface* surface, const SwBBox& region, uint8_t r, uint8_t g, uint8_t b, uint8_t a) +{ + if (surface->channelSize != sizeof(uint32_t)) { + TVGERR("SW_ENGINE", "Unsupported Channel Size = %d", surface->channelSize); + return false; + } + + auto color = surface->blender.join(r, g, b, a); + auto buffer = surface->buf32 + (region.min.y * surface->stride) + region.min.x; + auto h = static_cast<uint32_t>(region.max.y - region.min.y); + auto w = static_cast<uint32_t>(region.max.x - region.min.x); + auto ialpha = 255 - a; + + auto vColor = vdup_n_u32(color); + auto vIalpha = vdup_n_u8((uint8_t) ialpha); + + uint8x8_t* vDst = nullptr; + uint32_t align; + + for (uint32_t y = 0; y < h; ++y) { + auto dst = &buffer[y * surface->stride]; + + if ((((uint32_t) dst) & 0x7) != 0) { + //fill not aligned byte + *dst = color + ALPHA_BLEND(*dst, ialpha); + vDst = (uint8x8_t*) (dst + 1); + align = 1; + } else { + vDst = (uint8x8_t*) dst; + align = 0; + } + + for (uint32_t x = 0; x < (w - align) / 2; ++x) + vDst[x] = vadd_u8((uint8x8_t)vColor, ALPHA_BLEND(vDst[x], vIalpha)); + + auto leftovers = (w - align) % 2; + if (leftovers > 0) dst[w - 1] = color + ALPHA_BLEND(dst[w - 1], ialpha); + } + return true; +} + +#endif diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterTexmap.h b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterTexmap.h new file mode 100644 index 0000000000..698ab37da2 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRasterTexmap.h @@ -0,0 +1,1207 @@ +/* + * Copyright (c) 2021 - 2023 the ThorVG project. All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +struct AALine +{ + int32_t x[2]; + int32_t coverage[2]; + int32_t length[2]; +}; + +struct AASpans +{ + AALine *lines; + int32_t yStart; + int32_t yEnd; +}; + +static inline void _swap(float& a, float& b, float& tmp) +{ + tmp = a; + a = b; + b = tmp; +} + + +//Careful! Shared resource, No support threading +static float dudx, dvdx; +static float dxdya, dxdyb, dudya, dvdya; +static float xa, xb, ua, va; + + +//Y Range exception handling +static bool _arrange(const SwImage* image, const SwBBox* region, int& yStart, int& yEnd) +{ + int32_t regionTop, regionBottom; + + if (region) { + regionTop = region->min.y; + regionBottom = region->max.y; + } else { + regionTop = image->rle->spans->y; + regionBottom = image->rle->spans[image->rle->size - 1].y; + } + + if (yStart >= regionBottom) return false; + + if (yStart < regionTop) yStart = regionTop; + if (yEnd > regionBottom) yEnd = regionBottom; + + return true; +} + + +static bool _rasterMaskedPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans, uint8_t opacity, uint8_t dirFlag = 0) +{ + return false; + +#if 0 //Enable it when GRAYSCALE image is supported + auto maskOp = _getMaskOp(surface->compositor->method); + auto direct = _direct(surface->compositor->method); + float _dudx = dudx, _dvdx = dvdx; + float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya; + float _xa = xa, _xb = xb, _ua = ua, _va = va; + auto sbuf = image->buf8; + int32_t sw = static_cast<int32_t>(image->stride); + int32_t sh = image->h; + int32_t x1, x2, x, y, ar, ab, iru, irv, px, ay; + int32_t vv = 0, uu = 0; + int32_t minx = INT32_MAX, maxx = INT32_MIN; + float dx, u, v, iptr; + SwSpan* span = nullptr; //used only when rle based. + + if (!_arrange(image, region, yStart, yEnd)) return false; + + //Loop through all lines in the segment + uint32_t spanIdx = 0; + + if (region) { + minx = region->min.x; + maxx = region->max.x; + } else { + span = image->rle->spans; + while (span->y < yStart) { + ++span; + ++spanIdx; + } + } + + y = yStart; + + while (y < yEnd) { + x1 = (int32_t)_xa; + x2 = (int32_t)_xb; + + if (!region) { + minx = INT32_MAX; + maxx = INT32_MIN; + //one single row, could be consisted of multiple spans. 
+ while (span->y == y && spanIdx < image->rle->size) { + if (minx > span->x) minx = span->x; + if (maxx < span->x + span->len) maxx = span->x + span->len; + ++span; + ++spanIdx; + } + } + if (x1 < minx) x1 = minx; + if (x2 > maxx) x2 = maxx; + + //Anti-Aliasing frames + ay = y - aaSpans->yStart; + if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1; + if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2; + + //Range allowed + if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) { + + //Perform subtexel pre-stepping on UV + dx = 1 - (_xa - x1); + u = _ua + dx * _dudx; + v = _va + dx * _dvdx; + + x = x1; + + auto cmp = &surface->compositor->image.buf8[y * surface->compositor->image.stride + x1]; + auto dst = &surface->buf8[y * surface->stride + x1]; + + if (opacity == 255) { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + if (uu >= sw) continue; + vv = (int) v; + if (vv >= sh) continue; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + if (direct) { + auto tmp = maskOp(px, *cmp, 0); //not use alpha + *dst = tmp + MULTIPLY(*dst, ~tmp); + ++dst; + } else { + *cmp = maskOp(px, *cmp, ~px); + } + ++cmp; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? + if ((uint32_t)v >= image->h) break; + } + } else { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + if (uu >= sw) continue; + vv = (int) v; + if (vv >= sh) continue; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + + if (direct) { + auto tmp = maskOp(MULTIPLY(px, opacity), *cmp, 0); + *dst = tmp + MULTIPLY(*dst, ~tmp); + ++dst; + } else { + auto tmp = MULTIPLY(px, opacity); + *cmp = maskOp(tmp, *cmp, ~px); + } + ++cmp; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? 
+ if ((uint32_t)v >= image->h) break; + } + } + } + + //Step along both edges + _xa += _dxdya; + _xb += _dxdyb; + _ua += _dudya; + _va += _dvdya; + + if (!region && spanIdx >= image->rle->size) break; + + ++y; + } + xa = _xa; + xb = _xb; + ua = _ua; + va = _va; + + return true; +#endif +} + + +static void _rasterBlendingPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans, uint8_t opacity) +{ + float _dudx = dudx, _dvdx = dvdx; + float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya; + float _xa = xa, _xb = xb, _ua = ua, _va = va; + auto sbuf = image->buf32; + auto dbuf = surface->buf32; + int32_t sw = static_cast<int32_t>(image->stride); + int32_t sh = image->h; + int32_t dw = surface->stride; + int32_t x1, x2, x, y, ar, ab, iru, irv, px, ay; + int32_t vv = 0, uu = 0; + int32_t minx = INT32_MAX, maxx = INT32_MIN; + float dx, u, v, iptr; + uint32_t* buf; + SwSpan* span = nullptr; //used only when rle based. + + if (!_arrange(image, region, yStart, yEnd)) return; + + //Loop through all lines in the segment + uint32_t spanIdx = 0; + + if (region) { + minx = region->min.x; + maxx = region->max.x; + } else { + span = image->rle->spans; + while (span->y < yStart) { + ++span; + ++spanIdx; + } + } + + y = yStart; + + while (y < yEnd) { + x1 = (int32_t)_xa; + x2 = (int32_t)_xb; + + if (!region) { + minx = INT32_MAX; + maxx = INT32_MIN; + //one single row, could be consisted of multiple spans. + while (span->y == y && spanIdx < image->rle->size) { + if (minx > span->x) minx = span->x; + if (maxx < span->x + span->len) maxx = span->x + span->len; + ++span; + ++spanIdx; + } + } + if (x1 < minx) x1 = minx; + if (x2 > maxx) x2 = maxx; + + //Anti-Aliasing frames + ay = y - aaSpans->yStart; + if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1; + if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2; + + //Range allowed + if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) { + + //Perform subtexel pre-stepping on UV + dx = 1 - (_xa - x1); + u = _ua + dx * _dudx; + v = _va + dx * _dvdx; + + buf = dbuf + ((y * dw) + x1); + + x = x1; + + if (opacity == 255) { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + if (uu >= sw) continue; + vv = (int) v; + if (vv >= sh) continue; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + *buf = surface->blender(px, *buf, IA(px)); + ++buf; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? 
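+ //v is outside the source image (negative values also wrap past image->h via the unsigned cast), so stop sampling this scanline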
+ if ((uint32_t)v >= image->h) break; + } + } else { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + if (uu >= sw) continue; + vv = (int) v; + if (vv >= sh) continue; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + auto src = ALPHA_BLEND(px, opacity); + *buf = surface->blender(src, *buf, IA(src)); + ++buf; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? + if ((uint32_t)v >= image->h) break; + } + } + } + + //Step along both edges + _xa += _dxdya; + _xb += _dxdyb; + _ua += _dudya; + _va += _dvdya; + + if (!region && spanIdx >= image->rle->size) break; + + ++y; + } + xa = _xa; + xb = _xb; + ua = _ua; + va = _va; +} + + +static void _rasterPolygonImageSegment(SwSurface* surface, const SwImage* image, const SwBBox* region, int yStart, int yEnd, AASpans* aaSpans, uint8_t opacity, bool matting) +{ + float _dudx = dudx, _dvdx = dvdx; + float _dxdya = dxdya, _dxdyb = dxdyb, _dudya = dudya, _dvdya = dvdya; + float _xa = xa, _xb = xb, _ua = ua, _va = va; + auto sbuf = image->buf32; + auto dbuf = surface->buf32; + int32_t sw = static_cast<int32_t>(image->stride); + int32_t sh = image->h; + int32_t dw = surface->stride; + int32_t x1, x2, x, y, ar, ab, iru, irv, px, ay; + int32_t vv = 0, uu = 0; + int32_t minx = INT32_MAX, maxx = INT32_MIN; + float dx, u, v, iptr; + uint32_t* buf; + SwSpan* span = nullptr; //used only when rle based. + + //for matting(composition) + auto csize = matting ? surface->compositor->image.channelSize: 0; + auto alpha = matting ? surface->alpha(surface->compositor->method) : nullptr; + uint8_t* cmp = nullptr; + + if (!_arrange(image, region, yStart, yEnd)) return; + + //Loop through all lines in the segment + uint32_t spanIdx = 0; + + if (region) { + minx = region->min.x; + maxx = region->max.x; + } else { + span = image->rle->spans; + while (span->y < yStart) { + ++span; + ++spanIdx; + } + } + + y = yStart; + + while (y < yEnd) { + x1 = (int32_t)_xa; + x2 = (int32_t)_xb; + + if (!region) { + minx = INT32_MAX; + maxx = INT32_MIN; + //one single row, could be consisted of multiple spans. 
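+ //walk every span that lies on this scanline to find the covered horizontal extent [minx, maxx)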
+ while (span->y == y && spanIdx < image->rle->size) { + if (minx > span->x) minx = span->x; + if (maxx < span->x + span->len) maxx = span->x + span->len; + ++span; + ++spanIdx; + } + } + if (x1 < minx) x1 = minx; + if (x2 > maxx) x2 = maxx; + + //Anti-Aliasing frames + ay = y - aaSpans->yStart; + if (aaSpans->lines[ay].x[0] > x1) aaSpans->lines[ay].x[0] = x1; + if (aaSpans->lines[ay].x[1] < x2) aaSpans->lines[ay].x[1] = x2; + + //Range allowed + if ((x2 - x1) >= 1 && (x1 < maxx) && (x2 > minx)) { + + //Perform subtexel pre-stepping on UV + dx = 1 - (_xa - x1); + u = _ua + dx * _dudx; + v = _va + dx * _dvdx; + + buf = dbuf + ((y * dw) + x1); + + x = x1; + + if (matting) cmp = &surface->compositor->image.buf8[(y * surface->compositor->image.stride + x1) * csize]; + + if (opacity == 255) { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + if (uu >= sw) continue; + vv = (int) v; + if (vv >= sh) continue; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + uint32_t src; + if (matting) { + src = ALPHA_BLEND(px, alpha(cmp)); + cmp += csize; + } else { + src = px; + } + *buf = src + ALPHA_BLEND(*buf, IA(src)); + ++buf; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? + if ((uint32_t)v >= image->h) break; + } + } else { + //Draw horizontal line + while (x++ < x2) { + uu = (int) u; + vv = (int) v; + + ar = (int)(255 * (1 - modff(u, &iptr))); + ab = (int)(255 * (1 - modff(v, &iptr))); + iru = uu + 1; + irv = vv + 1; + + if (vv >= sh) continue; + + px = *(sbuf + (vv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* right pixel */ + int px2 = *(sbuf + (vv * sw) + iru); + px = INTERPOLATE(px, px2, ar); + } + /* vertical interpolate */ + if (irv < sh) { + /* bottom pixel */ + int px2 = *(sbuf + (irv * sw) + uu); + + /* horizontal interpolate */ + if (iru < sw) { + /* bottom right pixel */ + int px3 = *(sbuf + (irv * sw) + iru); + px2 = INTERPOLATE(px2, px3, ar); + } + px = INTERPOLATE(px, px2, ab); + } + uint32_t src; + if (matting) { + src = ALPHA_BLEND(px, MULTIPLY(opacity, alpha(cmp))); + cmp += csize; + } else { + src = ALPHA_BLEND(px, opacity); + } + *buf = src + ALPHA_BLEND(*buf, IA(src)); + ++buf; + + //Step UV horizontally + u += _dudx; + v += _dvdx; + //range over? + if ((uint32_t)v >= image->h) break; + } + } + } + + //Step along both edges + _xa += _dxdya; + _xb += _dxdyb; + _ua += _dudya; + _va += _dvdya; + + if (!region && spanIdx >= image->rle->size) break; + + ++y; + } + xa = _xa; + xb = _xb; + ua = _ua; + va = _va; +} + + +/* This mapping algorithm is based on Mikael Kalms's. 
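   The triangle vertices are sorted by ascending Y and the triangle is rasterized in two parts,
   split at the middle vertex: the X extents are stepped along both edges per scanline
   (dxdya/dxdyb) and UV along the left edge (dudya/dvdya), with subpixel/subtexel pre-stepping,
   and each scanline is then filled by one of the *PolygonImageSegment() helpers above.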
*/ +static void _rasterPolygonImage(SwSurface* surface, const SwImage* image, const SwBBox* region, Polygon& polygon, AASpans* aaSpans, uint8_t opacity) +{ + float x[3] = {polygon.vertex[0].pt.x, polygon.vertex[1].pt.x, polygon.vertex[2].pt.x}; + float y[3] = {polygon.vertex[0].pt.y, polygon.vertex[1].pt.y, polygon.vertex[2].pt.y}; + float u[3] = {polygon.vertex[0].uv.x, polygon.vertex[1].uv.x, polygon.vertex[2].uv.x}; + float v[3] = {polygon.vertex[0].uv.y, polygon.vertex[1].uv.y, polygon.vertex[2].uv.y}; + + float off_y; + float dxdy[3] = {0.0f, 0.0f, 0.0f}; + float tmp; + + auto upper = false; + + //Sort the vertices in ascending Y order + if (y[0] > y[1]) { + _swap(x[0], x[1], tmp); + _swap(y[0], y[1], tmp); + _swap(u[0], u[1], tmp); + _swap(v[0], v[1], tmp); + } + if (y[0] > y[2]) { + _swap(x[0], x[2], tmp); + _swap(y[0], y[2], tmp); + _swap(u[0], u[2], tmp); + _swap(v[0], v[2], tmp); + } + if (y[1] > y[2]) { + _swap(x[1], x[2], tmp); + _swap(y[1], y[2], tmp); + _swap(u[1], u[2], tmp); + _swap(v[1], v[2], tmp); + } + + //Y indexes + int yi[3] = {(int)y[0], (int)y[1], (int)y[2]}; + + //Skip drawing if it's too thin to cover any pixels at all. + if ((yi[0] == yi[1] && yi[0] == yi[2]) || ((int) x[0] == (int) x[1] && (int) x[0] == (int) x[2])) return; + + //Calculate horizontal and vertical increments for UV axes (these calcs are certainly not optimal, although they're stable (handles any dy being 0) + auto denom = ((x[2] - x[0]) * (y[1] - y[0]) - (x[1] - x[0]) * (y[2] - y[0])); + + //Skip poly if it's an infinitely thin line + if (mathZero(denom)) return; + + denom = 1 / denom; //Reciprocal for speeding up + dudx = ((u[2] - u[0]) * (y[1] - y[0]) - (u[1] - u[0]) * (y[2] - y[0])) * denom; + dvdx = ((v[2] - v[0]) * (y[1] - y[0]) - (v[1] - v[0]) * (y[2] - y[0])) * denom; + auto dudy = ((u[1] - u[0]) * (x[2] - x[0]) - (u[2] - u[0]) * (x[1] - x[0])) * denom; + auto dvdy = ((v[1] - v[0]) * (x[2] - x[0]) - (v[2] - v[0]) * (x[1] - x[0])) * denom; + + //Calculate X-slopes along the edges + if (y[1] > y[0]) dxdy[0] = (x[1] - x[0]) / (y[1] - y[0]); + if (y[2] > y[0]) dxdy[1] = (x[2] - x[0]) / (y[2] - y[0]); + if (y[2] > y[1]) dxdy[2] = (x[2] - x[1]) / (y[2] - y[1]); + + //Determine which side of the polygon the longer edge is on + auto side = (dxdy[1] > dxdy[0]) ? true : false; + + if (mathEqual(y[0], y[1])) side = x[0] > x[1]; + if (mathEqual(y[1], y[2])) side = x[2] > x[1]; + + auto regionTop = region ? region->min.y : image->rle->spans->y; //Normal Image or Rle Image? + auto compositing = _compositing(surface); //Composition required + auto blending = _blending(surface); //Blending required + + //Longer edge is on the left side + if (!side) { + //Calculate slopes along left edge + dxdya = dxdy[1]; + dudya = dxdya * dudx + dudy; + dvdya = dxdya * dvdx + dvdy; + + //Perform subpixel pre-stepping along left edge + auto dy = 1.0f - (y[0] - yi[0]); + xa = x[0] + dy * dxdya; + ua = u[0] + dy * dudya; + va = v[0] + dy * dvdya; + + //Draw upper segment if possibly visible + if (yi[0] < yi[1]) { + off_y = y[0] < regionTop ? 
(regionTop - y[0]) : 0; + xa += (off_y * dxdya); + ua += (off_y * dudya); + va += (off_y * dvdya); + + // Set right edge X-slope and perform subpixel pre-stepping + dxdyb = dxdy[0]; + xb = x[0] + dy * dxdyb + (off_y * dxdyb); + + if (compositing) { + if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, true); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, 1); + } else if (blending) { + _rasterBlendingPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity); + } else { + _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, false); + } + upper = true; + } + //Draw lower segment if possibly visible + if (yi[1] < yi[2]) { + off_y = y[1] < regionTop ? (regionTop - y[1]) : 0; + if (!upper) { + xa += (off_y * dxdya); + ua += (off_y * dudya); + va += (off_y * dvdya); + } + // Set right edge X-slope and perform subpixel pre-stepping + dxdyb = dxdy[2]; + xb = x[1] + (1 - (y[1] - yi[1])) * dxdyb + (off_y * dxdyb); + if (compositing) { + if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, true); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, 2); + } else if (blending) { + _rasterBlendingPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity); + } else { + _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, false); + } + } + //Longer edge is on the right side + } else { + //Set right edge X-slope and perform subpixel pre-stepping + dxdyb = dxdy[1]; + auto dy = 1.0f - (y[0] - yi[0]); + xb = x[0] + dy * dxdyb; + + //Draw upper segment if possibly visible + if (yi[0] < yi[1]) { + off_y = y[0] < regionTop ? (regionTop - y[0]) : 0; + xb += (off_y *dxdyb); + + // Set slopes along left edge and perform subpixel pre-stepping + dxdya = dxdy[0]; + dudya = dxdya * dudx + dudy; + dvdya = dxdya * dvdx + dvdy; + + xa = x[0] + dy * dxdya + (off_y * dxdya); + ua = u[0] + dy * dudya + (off_y * dudya); + va = v[0] + dy * dvdya + (off_y * dvdya); + + if (compositing) { + if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, true); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, 3); + } else if (blending) { + _rasterBlendingPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity); + } else { + _rasterPolygonImageSegment(surface, image, region, yi[0], yi[1], aaSpans, opacity, false); + } + upper = true; + } + //Draw lower segment if possibly visible + if (yi[1] < yi[2]) { + off_y = y[1] < regionTop ? 
(regionTop - y[1]) : 0; + if (!upper) xb += (off_y *dxdyb); + + // Set slopes along left edge and perform subpixel pre-stepping + dxdya = dxdy[2]; + dudya = dxdya * dudx + dudy; + dvdya = dxdya * dvdx + dvdy; + dy = 1 - (y[1] - yi[1]); + xa = x[1] + dy * dxdya + (off_y * dxdya); + ua = u[1] + dy * dudya + (off_y * dudya); + va = v[1] + dy * dvdya + (off_y * dvdya); + + if (compositing) { + if (_matting(surface)) _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, true); + else _rasterMaskedPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, 4); + } else if (blending) { + _rasterBlendingPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity); + } else { + _rasterPolygonImageSegment(surface, image, region, yi[1], yi[2], aaSpans, opacity, false); + } + } + } +} + + +static AASpans* _AASpans(float ymin, float ymax, const SwImage* image, const SwBBox* region) +{ + auto yStart = static_cast<int32_t>(ymin); + auto yEnd = static_cast<int32_t>(ymax); + + if (!_arrange(image, region, yStart, yEnd)) return nullptr; + + auto aaSpans = static_cast<AASpans*>(malloc(sizeof(AASpans))); + aaSpans->yStart = yStart; + aaSpans->yEnd = yEnd; + + //Initialize X range + auto height = yEnd - yStart; + + aaSpans->lines = static_cast<AALine*>(calloc(height, sizeof(AALine))); + + for (int32_t i = 0; i < height; i++) { + aaSpans->lines[i].x[0] = INT32_MAX; + aaSpans->lines[i].x[1] = INT32_MIN; + } + return aaSpans; +} + + +static void _calcIrregularCoverage(AALine* lines, int32_t eidx, int32_t y, int32_t diagonal, int32_t edgeDist, bool reverse) +{ + if (eidx == 1) reverse = !reverse; + int32_t coverage = (255 / (diagonal + 2)); + int32_t tmp; + for (int32_t ry = 0; ry < (diagonal + 2); ry++) { + tmp = y - ry - edgeDist; + if (tmp < 0) return; + lines[tmp].length[eidx] = 1; + if (reverse) lines[tmp].coverage[eidx] = 255 - (coverage * ry); + else lines[tmp].coverage[eidx] = (coverage * ry); + } +} + + +static void _calcVertCoverage(AALine *lines, int32_t eidx, int32_t y, int32_t rewind, bool reverse) +{ + if (eidx == 1) reverse = !reverse; + int32_t coverage = (255 / (rewind + 1)); + int32_t tmp; + for (int ry = 1; ry < (rewind + 1); ry++) { + tmp = y - ry; + if (tmp < 0) return; + lines[tmp].length[eidx] = 1; + if (reverse) lines[tmp].coverage[eidx] = (255 - (coverage * ry)); + else lines[tmp].coverage[eidx] = (coverage * ry); + } +} + + +static void _calcHorizCoverage(AALine *lines, int32_t eidx, int32_t y, int32_t x, int32_t x2) +{ + if (lines[y].length[eidx] < abs(x - x2)) { + lines[y].length[eidx] = abs(x - x2); + lines[y].coverage[eidx] = (255 / (lines[y].length[eidx] + 1)); + } +} + + +/* + * This Anti-Aliasing mechanism is originated from Hermet Park's idea. + * To understand this AA logic, you can refer this page: + * www.hermet.pe.kr/122 (hermetpark@gmail.com) +*/ +static void _calcAAEdge(AASpans *aaSpans, int32_t eidx) +{ +//Previous edge direction: +#define DirOutHor 0x0011 +#define DirOutVer 0x0001 +#define DirInHor 0x0010 +#define DirInVer 0x0000 +#define DirNone 0x1000 + +#define PUSH_VERTEX() \ + do { \ + pEdge.x = lines[y].x[eidx]; \ + pEdge.y = y; \ + ptx[0] = tx[0]; \ + ptx[1] = tx[1]; \ + } while (0) + + int32_t y = 0; + SwPoint pEdge = {-1, -1}; //previous edge point + SwPoint edgeDiff = {0, 0}; //temporary used for point distance + + /* store bigger to tx[0] between prev and current edge's x positions. 
*/ + int32_t tx[2] = {0, 0}; + /* back up prev tx values */ + int32_t ptx[2] = {0, 0}; + int32_t diagonal = 0; //straight diagonal pixels count + + auto yStart = aaSpans->yStart; + auto yEnd = aaSpans->yEnd; + auto lines = aaSpans->lines; + + int32_t prevDir = DirNone; + int32_t curDir = DirNone; + + yEnd -= yStart; + + //Start Edge + if (y < yEnd) { + pEdge.x = lines[y].x[eidx]; + pEdge.y = y; + } + + //Calculates AA Edges + for (y++; y < yEnd; y++) { + //Ready tx + if (eidx == 0) { + tx[0] = pEdge.x; + tx[1] = lines[y].x[0]; + } else { + tx[0] = lines[y].x[1]; + tx[1] = pEdge.x; + } + edgeDiff.x = (tx[0] - tx[1]); + edgeDiff.y = (y - pEdge.y); + + //Confirm current edge direction + if (edgeDiff.x > 0) { + if (edgeDiff.y == 1) curDir = DirOutHor; + else curDir = DirOutVer; + } else if (edgeDiff.x < 0) { + if (edgeDiff.y == 1) curDir = DirInHor; + else curDir = DirInVer; + } else curDir = DirNone; + + //straight diagonal increase + if ((curDir == prevDir) && (y < yEnd)) { + if ((abs(edgeDiff.x) == 1) && (edgeDiff.y == 1)) { + ++diagonal; + PUSH_VERTEX(); + continue; + } + } + + switch (curDir) { + case DirOutHor: { + _calcHorizCoverage(lines, eidx, y, tx[0], tx[1]); + if (diagonal > 0) { + _calcIrregularCoverage(lines, eidx, y, diagonal, 0, true); + diagonal = 0; + } + /* Increment direction is changed: Outside Vertical -> Outside Horizontal */ + if (prevDir == DirOutVer) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]); + + //Trick, but fine-tunning! + if (y == 1) _calcHorizCoverage(lines, eidx, pEdge.y, tx[0], tx[1]); + PUSH_VERTEX(); + } + break; + case DirOutVer: { + _calcVertCoverage(lines, eidx, y, edgeDiff.y, true); + if (diagonal > 0) { + _calcIrregularCoverage(lines, eidx, y, diagonal, edgeDiff.y, false); + diagonal = 0; + } + /* Increment direction is changed: Outside Horizontal -> Outside Vertical */ + if (prevDir == DirOutHor) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]); + PUSH_VERTEX(); + } + break; + case DirInHor: { + _calcHorizCoverage(lines, eidx, (y - 1), tx[0], tx[1]); + if (diagonal > 0) { + _calcIrregularCoverage(lines, eidx, y, diagonal, 0, false); + diagonal = 0; + } + /* Increment direction is changed: Outside Horizontal -> Inside Horizontal */ + if (prevDir == DirOutHor) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]); + PUSH_VERTEX(); + } + break; + case DirInVer: { + _calcVertCoverage(lines, eidx, y, edgeDiff.y, false); + if (prevDir == DirOutHor) edgeDiff.y -= 1; //Weird, fine tuning????????????????????? + if (diagonal > 0) { + _calcIrregularCoverage(lines, eidx, y, diagonal, edgeDiff.y, true); + diagonal = 0; + } + /* Increment direction is changed: Outside Horizontal -> Inside Vertical */ + if (prevDir == DirOutHor) _calcHorizCoverage(lines, eidx, pEdge.y, ptx[0], ptx[1]); + PUSH_VERTEX(); + } + break; + } + if (curDir != DirNone) prevDir = curDir; + } + + //leftovers...? 
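+ //Flush the last edge that the loop above did not commit: a single-scanline horizontal step gets horizontal coverage, anything else vertical coverage.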
+ if ((edgeDiff.y == 1) && (edgeDiff.x != 0)) { + if (y >= yEnd) y = (yEnd - 1); + _calcHorizCoverage(lines, eidx, y - 1, ptx[0], ptx[1]); + _calcHorizCoverage(lines, eidx, y, tx[0], tx[1]); + } else { + ++y; + if (y > yEnd) y = yEnd; + _calcVertCoverage(lines, eidx, y, (edgeDiff.y + 1), (prevDir & 0x00000001)); + } +} + + +static bool _apply(SwSurface* surface, AASpans* aaSpans) +{ + auto y = aaSpans->yStart; + uint32_t pixel; + uint32_t* dst; + int32_t pos; + + //left side + _calcAAEdge(aaSpans, 0); + //right side + _calcAAEdge(aaSpans, 1); + + while (y < aaSpans->yEnd) { + auto line = &aaSpans->lines[y - aaSpans->yStart]; + auto width = line->x[1] - line->x[0]; + if (width > 0) { + auto offset = y * surface->stride; + + //Left edge + dst = surface->buf32 + (offset + line->x[0]); + if (line->x[0] > 1) pixel = *(dst - 1); + else pixel = *dst; + + pos = 1; + while (pos <= line->length[0]) { + *dst = INTERPOLATE(*dst, pixel, line->coverage[0] * pos); + ++dst; + ++pos; + } + + //Right edge + dst = surface->buf32 + (offset + line->x[1] - 1); + if (line->x[1] < (int32_t)(surface->w - 1)) pixel = *(dst + 1); + else pixel = *dst; + + pos = width; + while ((int32_t)(width - line->length[1]) < pos) { + *dst = INTERPOLATE(*dst, pixel, 255 - (line->coverage[1] * (line->length[1] - (width - pos)))); + --dst; + --pos; + } + } + y++; + } + + free(aaSpans->lines); + free(aaSpans); + + return true; +} + + +/* + 2 triangles constructs 1 mesh. + below figure illustrates vert[4] index info. + If you need better quality, please divide a mesh by more number of triangles. + + 0 -- 1 + | / | + | / | + 3 -- 2 +*/ +static bool _rasterTexmapPolygon(SwSurface* surface, const SwImage* image, const Matrix* transform, const SwBBox* region, uint8_t opacity) +{ + if (surface->channelSize == sizeof(uint8_t)) { + TVGERR("SW_ENGINE", "Not supported grayscale Textmap polygon!"); + return false; + } + + //Exceptions: No dedicated drawing area? + if ((!image->rle && !region) || (image->rle && image->rle->size == 0)) return false; + + /* Prepare vertices. + shift XY coordinates to match the sub-pixeling technique. */ + Vertex vertices[4]; + vertices[0] = {{0.0f, 0.0f}, {0.0f, 0.0f}}; + vertices[1] = {{float(image->w), 0.0f}, {float(image->w), 0.0f}}; + vertices[2] = {{float(image->w), float(image->h)}, {float(image->w), float(image->h)}}; + vertices[3] = {{0.0f, float(image->h)}, {0.0f, float(image->h)}}; + + float ys = FLT_MAX, ye = -1.0f; + for (int i = 0; i < 4; i++) { + mathMultiply(&vertices[i].pt, transform); + + if (vertices[i].pt.y < ys) ys = vertices[i].pt.y; + if (vertices[i].pt.y > ye) ye = vertices[i].pt.y; + } + + auto aaSpans = _AASpans(ys, ye, image, region); + if (!aaSpans) return true; + + Polygon polygon; + + //Draw the first polygon + polygon.vertex[0] = vertices[0]; + polygon.vertex[1] = vertices[1]; + polygon.vertex[2] = vertices[3]; + + _rasterPolygonImage(surface, image, region, polygon, aaSpans, opacity); + + //Draw the second polygon + polygon.vertex[0] = vertices[1]; + polygon.vertex[1] = vertices[2]; + polygon.vertex[2] = vertices[3]; + + _rasterPolygonImage(surface, image, region, polygon, aaSpans, opacity); + +#if 0 + if (_compositing(surface) && _masking(surface) && !_direct(surface->compositor->method)) { + _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); + } +#endif + return _apply(surface, aaSpans); +} + + +/* + Provide any number of triangles to draw a mesh using the supplied image. 
+ Indexes are not used, so each triangle (Polygon) vertex has to be defined, even if they copy the previous one. + Example: + + 0 -- 1 0 -- 1 0 + | / | --> | / / | + | / | | / / | + 2 -- 3 2 1 -- 2 + + Should provide two Polygons, one for each triangle. + // TODO: region? +*/ +static bool _rasterTexmapPolygonMesh(SwSurface* surface, const SwImage* image, const RenderMesh* mesh, const Matrix* transform, const SwBBox* region, uint8_t opacity) +{ + if (surface->channelSize == sizeof(uint8_t)) { + TVGERR("SW_ENGINE", "Not supported grayscale Textmap polygon mesh!"); + return false; + } + + //Exceptions: No dedicated drawing area? + if ((!image->rle && !region) || (image->rle && image->rle->size == 0)) return false; + + // Step polygons once to transform + auto transformedTris = (Polygon*)malloc(sizeof(Polygon) * mesh->triangleCnt); + float ys = FLT_MAX, ye = -1.0f; + for (uint32_t i = 0; i < mesh->triangleCnt; i++) { + transformedTris[i] = mesh->triangles[i]; + mathMultiply(&transformedTris[i].vertex[0].pt, transform); + mathMultiply(&transformedTris[i].vertex[1].pt, transform); + mathMultiply(&transformedTris[i].vertex[2].pt, transform); + + if (transformedTris[i].vertex[0].pt.y < ys) ys = transformedTris[i].vertex[0].pt.y; + else if (transformedTris[i].vertex[0].pt.y > ye) ye = transformedTris[i].vertex[0].pt.y; + if (transformedTris[i].vertex[1].pt.y < ys) ys = transformedTris[i].vertex[1].pt.y; + else if (transformedTris[i].vertex[1].pt.y > ye) ye = transformedTris[i].vertex[1].pt.y; + if (transformedTris[i].vertex[2].pt.y < ys) ys = transformedTris[i].vertex[2].pt.y; + else if (transformedTris[i].vertex[2].pt.y > ye) ye = transformedTris[i].vertex[2].pt.y; + + // Convert normalized UV coordinates to image coordinates + transformedTris[i].vertex[0].uv.x *= (float)image->w; + transformedTris[i].vertex[0].uv.y *= (float)image->h; + transformedTris[i].vertex[1].uv.x *= (float)image->w; + transformedTris[i].vertex[1].uv.y *= (float)image->h; + transformedTris[i].vertex[2].uv.x *= (float)image->w; + transformedTris[i].vertex[2].uv.y *= (float)image->h; + } + + // Get AA spans and step polygons again to draw + if (auto aaSpans = _AASpans(ys, ye, image, region)) { + for (uint32_t i = 0; i < mesh->triangleCnt; i++) { + _rasterPolygonImage(surface, image, region, transformedTris[i], aaSpans, opacity); + } +#if 0 + if (_compositing(surface) && _masking(surface) && !_direct(surface->compositor->method)) { + _compositeMaskImage(surface, &surface->compositor->image, surface->compositor->bbox); + } +#endif + _apply(surface, aaSpans); + } + free(transformedTris); + return true; +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.cpp new file mode 100644 index 0000000000..049aa3d1d0 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.cpp @@ -0,0 +1,856 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "tvgMath.h" +#include "tvgSwCommon.h" +#include "tvgTaskScheduler.h" +#include "tvgSwRenderer.h" + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ +static int32_t initEngineCnt = false; +static int32_t rendererCnt = 0; +static SwMpool* globalMpool = nullptr; +static uint32_t threadsCnt = 0; + +struct SwTask : Task +{ + SwSurface* surface = nullptr; + SwMpool* mpool = nullptr; + SwBBox bbox = {{0, 0}, {0, 0}}; //Whole Rendering Region + Matrix* transform = nullptr; + Array<RenderData> clips; + RenderUpdateFlag flags = RenderUpdateFlag::None; + uint8_t opacity; + bool pushed = false; //Pushed into task list? + bool disposed = false; //Disposed task? + + RenderRegion bounds() const + { + RenderRegion region; + + //Range over? + region.x = bbox.min.x > 0 ? bbox.min.x : 0; + region.y = bbox.min.y > 0 ? bbox.min.y : 0; + region.w = bbox.max.x - region.x; + region.h = bbox.max.y - region.y; + if (region.w < 0) region.w = 0; + if (region.h < 0) region.h = 0; + + return region; + } + + virtual bool dispose() = 0; + virtual bool clip(SwRleData* target) = 0; + virtual SwRleData* rle() = 0; + + virtual ~SwTask() + { + free(transform); + } +}; + + +struct SwShapeTask : SwTask +{ + SwShape shape; + const RenderShape* rshape = nullptr; + bool cmpStroking = false; + bool clipper = false; + + /* We assume that if the stroke width is greater than 2, + the shape's outline beneath the stroke could be adequately covered by the stroke drawing. + Therefore, antialiasing is disabled under this condition. + Additionally, the stroke style should not be dashed. 
*/ + bool antialiasing(float strokeWidth) + { + return strokeWidth < 2.0f || rshape->stroke->dashCnt > 0 || rshape->stroke->strokeFirst; + } + + float validStrokeWidth() + { + if (!rshape->stroke) return 0.0f; + + auto width = rshape->stroke->width; + if (mathZero(width)) return 0.0f; + + if (!rshape->stroke->fill && (MULTIPLY(rshape->stroke->color[3], opacity) == 0)) return 0.0f; + if (mathZero(rshape->stroke->trim.begin - rshape->stroke->trim.end)) return 0.0f; + + if (transform) return (width * sqrt(transform->e11 * transform->e11 + transform->e12 * transform->e12)); + else return width; + } + + + bool clip(SwRleData* target) override + { + if (shape.fastTrack) rleClipRect(target, &bbox); + else if (shape.rle) rleClipPath(target, shape.rle); + else return false; + + return true; + } + + SwRleData* rle() override + { + if (!shape.rle && shape.fastTrack) { + shape.rle = rleRender(&shape.bbox); + } + return shape.rle; + } + + void run(unsigned tid) override + { + if (opacity == 0 && !clipper) return; //Invisible + + auto strokeWidth = validStrokeWidth(); + bool visibleFill = false; + auto clipRegion = bbox; + + //This checks also for the case, if the invisible shape turned to visible by alpha. + auto prepareShape = false; + if (!shapePrepared(&shape) && (flags & RenderUpdateFlag::Color)) prepareShape = true; + + //Shape + if (flags & (RenderUpdateFlag::Path | RenderUpdateFlag::Transform) || prepareShape) { + uint8_t alpha = 0; + rshape->fillColor(nullptr, nullptr, nullptr, &alpha); + alpha = MULTIPLY(alpha, opacity); + visibleFill = (alpha > 0 || rshape->fill); + if (visibleFill || clipper) { + shapeReset(&shape); + if (!shapePrepare(&shape, rshape, transform, clipRegion, bbox, mpool, tid, clips.count > 0 ? true : false)) goto err; + } + } + //Fill + if (flags & (RenderUpdateFlag::Gradient | RenderUpdateFlag::Transform | RenderUpdateFlag::Color)) { + if (visibleFill || clipper) { + if (!shapeGenRle(&shape, rshape, antialiasing(strokeWidth))) goto err; + } + if (auto fill = rshape->fill) { + auto ctable = (flags & RenderUpdateFlag::Gradient) ? true : false; + if (ctable) shapeResetFill(&shape); + if (!shapeGenFillColors(&shape, fill, transform, surface, opacity, ctable)) goto err; + } else { + shapeDelFill(&shape); + } + } + //Stroke + if (flags & (RenderUpdateFlag::Stroke | RenderUpdateFlag::Transform)) { + if (strokeWidth > 0.0f) { + shapeResetStroke(&shape, rshape, transform); + if (!shapeGenStrokeRle(&shape, rshape, transform, clipRegion, bbox, mpool, tid)) goto err; + + if (auto fill = rshape->strokeFill()) { + auto ctable = (flags & RenderUpdateFlag::GradientStroke) ? 
true : false; + if (ctable) shapeResetStrokeFill(&shape); + if (!shapeGenStrokeFillColors(&shape, fill, transform, surface, opacity, ctable)) goto err; + } else { + shapeDelStrokeFill(&shape); + } + } else { + shapeDelStroke(&shape); + } + } + + //Clear current task memorypool here if the clippers would use the same memory pool + shapeDelOutline(&shape, mpool, tid); + + //Clip Path + for (auto clip = clips.data; clip < clips.end(); ++clip) { + auto clipper = static_cast<SwTask*>(*clip); + //Clip shape rle + if (shape.rle && !clipper->clip(shape.rle)) goto err; + //Clip stroke rle + if (shape.strokeRle && !clipper->clip(shape.strokeRle)) goto err; + } + return; + + err: + shapeReset(&shape); + shapeDelOutline(&shape, mpool, tid); + } + + bool dispose() override + { + shapeFree(&shape); + return true; + } +}; + + +struct SwSceneTask : SwTask +{ + Array<RenderData> scene; //list of paints render data (SwTask) + SwRleData* sceneRle = nullptr; + + bool clip(SwRleData* target) override + { + //Only one shape + if (scene.count == 1) { + return static_cast<SwTask*>(*scene.data)->clip(target); + } + + //More than one shapes + if (sceneRle) rleClipPath(target, sceneRle); + else TVGLOG("SW_ENGINE", "No clippers in a scene?"); + + return true; + } + + SwRleData* rle() override + { + return sceneRle; + } + + void run(unsigned tid) override + { + //TODO: Skip the run if the scene hans't changed. + if (!sceneRle) sceneRle = static_cast<SwRleData*>(calloc(1, sizeof(SwRleData))); + else rleReset(sceneRle); + + //Merge shapes if it has more than one shapes + if (scene.count > 1) { + //Merge first two clippers + auto clipper1 = static_cast<SwTask*>(*scene.data); + auto clipper2 = static_cast<SwTask*>(*(scene.data + 1)); + + rleMerge(sceneRle, clipper1->rle(), clipper2->rle()); + + //Unify the remained clippers + for (auto rd = scene.data + 2; rd < scene.end(); ++rd) { + auto clipper = static_cast<SwTask*>(*rd); + rleMerge(sceneRle, sceneRle, clipper->rle()); + } + } + } + + bool dispose() override + { + rleFree(sceneRle); + return true; + } +}; + + +struct SwImageTask : SwTask +{ + SwImage image; + Surface* source; //Image source + const RenderMesh* mesh = nullptr; //Should be valid ptr in action + + bool clip(SwRleData* target) override + { + TVGERR("SW_ENGINE", "Image is used as ClipPath?"); + return true; + } + + SwRleData* rle() override + { + TVGERR("SW_ENGINE", "Image is used as Scene ClipPath?"); + return nullptr; + } + + void run(unsigned tid) override + { + auto clipRegion = bbox; + + //Convert colorspace if it's not aligned. + if (source->owner) { + if (source->cs != surface->cs) rasterConvertCS(source, surface->cs); + if (!source->premultiplied) rasterPremultiply(source); + } + + image.data = source->data; + image.w = source->w; + image.h = source->h; + image.stride = source->stride; + image.channelSize = source->channelSize; + + //Invisible shape turned to visible by alpha. + if ((flags & (RenderUpdateFlag::Image | RenderUpdateFlag::Transform | RenderUpdateFlag::Color)) && (opacity > 0)) { + imageReset(&image); + if (!image.data || image.w == 0 || image.h == 0) goto end; + + if (!imagePrepare(&image, mesh, transform, clipRegion, bbox, mpool, tid)) goto end; + + // TODO: How do we clip the triangle mesh? 
Only clip non-meshed images for now + if (mesh->triangleCnt == 0 && clips.count > 0) { + if (!imageGenRle(&image, bbox, false)) goto end; + if (image.rle) { + //Clear current task memorypool here if the clippers would use the same memory pool + imageDelOutline(&image, mpool, tid); + for (auto clip = clips.data; clip < clips.end(); ++clip) { + auto clipper = static_cast<SwTask*>(*clip); + if (!clipper->clip(image.rle)) goto err; + } + return; + } + } + } + goto end; + err: + rleReset(image.rle); + end: + imageDelOutline(&image, mpool, tid); + } + + bool dispose() override + { + imageFree(&image); + return true; + } +}; + + +static void _termEngine() +{ + if (rendererCnt > 0) return; + + mpoolTerm(globalMpool); + globalMpool = nullptr; +} + + +static void _renderFill(SwShapeTask* task, SwSurface* surface, uint8_t opacity) +{ + uint8_t r, g, b, a; + if (auto fill = task->rshape->fill) { + rasterGradientShape(surface, &task->shape, fill->identifier()); + } else { + task->rshape->fillColor(&r, &g, &b, &a); + a = MULTIPLY(opacity, a); + if (a > 0) rasterShape(surface, &task->shape, r, g, b, a); + } +} + +static void _renderStroke(SwShapeTask* task, SwSurface* surface, uint8_t opacity) +{ + uint8_t r, g, b, a; + if (auto strokeFill = task->rshape->strokeFill()) { + rasterGradientStroke(surface, &task->shape, strokeFill->identifier()); + } else { + if (task->rshape->strokeColor(&r, &g, &b, &a)) { + a = MULTIPLY(opacity, a); + if (a > 0) rasterStroke(surface, &task->shape, r, g, b, a); + } + } +} + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + +SwRenderer::~SwRenderer() +{ + clearCompositors(); + + delete(surface); + + if (!sharedMpool) mpoolTerm(mpool); + + --rendererCnt; + + if (rendererCnt == 0 && initEngineCnt == 0) _termEngine(); +} + + +bool SwRenderer::clear() +{ + for (auto task = tasks.data; task < tasks.end(); ++task) { + if ((*task)->disposed) { + delete(*task); + } else { + (*task)->done(); + (*task)->pushed = false; + } + } + tasks.clear(); + + if (!sharedMpool) mpoolClear(mpool); + + if (surface) { + vport.x = vport.y = 0; + vport.w = surface->w; + vport.h = surface->h; + } + + return true; +} + + +bool SwRenderer::sync() +{ + return true; +} + + +RenderRegion SwRenderer::viewport() +{ + return vport; +} + + +bool SwRenderer::viewport(const RenderRegion& vp) +{ + vport = vp; + return true; +} + + +bool SwRenderer::target(pixel_t* data, uint32_t stride, uint32_t w, uint32_t h, ColorSpace cs) +{ + if (!data || stride == 0 || w == 0 || h == 0 || w > stride) return false; + + if (!surface) surface = new SwSurface; + + surface->data = data; + surface->stride = stride; + surface->w = w; + surface->h = h; + surface->cs = cs; + surface->channelSize = CHANNEL_SIZE(cs); + surface->premultiplied = true; + surface->owner = true; + + vport.x = vport.y = 0; + vport.w = surface->w; + vport.h = surface->h; + + return rasterCompositor(surface); +} + + +bool SwRenderer::preRender() +{ + return rasterClear(surface, 0, 0, surface->w, surface->h); +} + + +void SwRenderer::clearCompositors() +{ + //Free Composite Caches + for (auto comp = compositors.data; comp < compositors.end(); ++comp) { + free((*comp)->compositor->image.data); + delete((*comp)->compositor); + delete(*comp); + } + compositors.reset(); +} + + +bool SwRenderer::postRender() +{ + //Unmultiply alpha if needed + if (surface->cs == ColorSpace::ABGR8888S || surface->cs == ColorSpace::ARGB8888S) { + 
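+ //The 'S' colorspace variants expect straight (non-premultiplied) alpha, so the internally premultiplied buffer is converted back before being handed over.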
rasterUnpremultiply(surface); + } + + for (auto task = tasks.data; task < tasks.end(); ++task) { + if ((*task)->disposed) delete(*task); + else (*task)->pushed = false; + } + tasks.clear(); + + clearCompositors(); + return true; +} + + +bool SwRenderer::renderImage(RenderData data) +{ + auto task = static_cast<SwImageTask*>(data); + task->done(); + + if (task->opacity == 0) return true; + + return rasterImage(surface, &task->image, task->mesh, task->transform, task->bbox, task->opacity); +} + + +bool SwRenderer::renderShape(RenderData data) +{ + auto task = static_cast<SwShapeTask*>(data); + if (!task) return false; + + task->done(); + + if (task->opacity == 0) return true; + + //Main raster stage + if (task->rshape->stroke && task->rshape->stroke->strokeFirst) { + _renderStroke(task, surface, task->opacity); + _renderFill(task, surface, task->opacity); + } else { + _renderFill(task, surface, task->opacity); + _renderStroke(task, surface, task->opacity); + } + + return true; +} + + +bool SwRenderer::blend(BlendMethod method) +{ + if (surface->blendMethod == method) return true; + surface->blendMethod = method; + + switch (method) { + case BlendMethod::Add: + surface->blender = opBlendAdd; + break; + case BlendMethod::Screen: + surface->blender = opBlendScreen; + break; + case BlendMethod::Multiply: + surface->blender = opBlendMultiply; + break; + case BlendMethod::Overlay: + surface->blender = opBlendOverlay; + break; + case BlendMethod::Difference: + surface->blender = opBlendDifference; + break; + case BlendMethod::Exclusion: + surface->blender = opBlendExclusion; + break; + case BlendMethod::SrcOver: + surface->blender = opBlendSrcOver; + break; + case BlendMethod::Darken: + surface->blender = opBlendDarken; + break; + case BlendMethod::Lighten: + surface->blender = opBlendLighten; + break; + case BlendMethod::ColorDodge: + surface->blender = opBlendColorDodge; + break; + case BlendMethod::ColorBurn: + surface->blender = opBlendColorBurn; + break; + case BlendMethod::HardLight: + surface->blender = opBlendHardLight; + break; + case BlendMethod::SoftLight: + surface->blender = opBlendSoftLight; + break; + default: + surface->blender = nullptr; + break; + } + return false; +} + + +RenderRegion SwRenderer::region(RenderData data) +{ + return static_cast<SwTask*>(data)->bounds(); +} + + +bool SwRenderer::beginComposite(Compositor* cmp, CompositeMethod method, uint8_t opacity) +{ + if (!cmp) return false; + auto p = static_cast<SwCompositor*>(cmp); + + p->method = method; + p->opacity = opacity; + + //Current Context? 
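+ //Switch rendering back to the surface that created this compositor and attach it, so subsequent draws are composited with the requested method.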
+ if (p->method != CompositeMethod::None) { + surface = p->recoverSfc; + surface->compositor = p; + } + + return true; +} + + +bool SwRenderer::mempool(bool shared) +{ + if (shared == sharedMpool) return true; + + if (shared) { + if (!sharedMpool) { + if (!mpoolTerm(mpool)) return false; + mpool = globalMpool; + } + } else { + if (sharedMpool) mpool = mpoolInit(threadsCnt); + } + + sharedMpool = shared; + + if (mpool) return true; + return false; +} + + +Compositor* SwRenderer::target(const RenderRegion& region, ColorSpace cs) +{ + auto x = region.x; + auto y = region.y; + auto w = region.w; + auto h = region.h; + auto sw = static_cast<int32_t>(surface->w); + auto sh = static_cast<int32_t>(surface->h); + + //Out of boundary + if (x >= sw || y >= sh || x + w < 0 || y + h < 0) return nullptr; + + SwSurface* cmp = nullptr; + + auto reqChannelSize = CHANNEL_SIZE(cs); + + //Use cached data + for (auto p = compositors.data; p < compositors.end(); ++p) { + if ((*p)->compositor->valid && (*p)->compositor->image.channelSize == reqChannelSize) { + cmp = *p; + break; + } + } + + //New Composition + if (!cmp) { + cmp = new SwSurface; + + //Inherits attributes from main surface + *cmp = *surface; + + cmp->compositor = new SwCompositor; + + //TODO: We can optimize compositor surface size from (surface->stride x surface->h) to Parameter(w x h) + cmp->compositor->image.data = (pixel_t*)malloc(reqChannelSize * surface->stride * surface->h); + cmp->channelSize = cmp->compositor->image.channelSize = reqChannelSize; + + compositors.push(cmp); + } + + //Boundary Check + if (x + w > sw) w = (sw - x); + if (y + h > sh) h = (sh - y); + + cmp->compositor->recoverSfc = surface; + cmp->compositor->recoverCmp = surface->compositor; + cmp->compositor->valid = false; + cmp->compositor->bbox.min.x = x; + cmp->compositor->bbox.min.y = y; + cmp->compositor->bbox.max.x = x + w; + cmp->compositor->bbox.max.y = y + h; + cmp->compositor->image.stride = surface->stride; + cmp->compositor->image.w = surface->w; + cmp->compositor->image.h = surface->h; + cmp->compositor->image.direct = true; + + cmp->data = cmp->compositor->image.data; + cmp->w = cmp->compositor->image.w; + cmp->h = cmp->compositor->image.h; + + rasterClear(cmp, x, y, w, h); + + //Switch render target + surface = cmp; + + return cmp->compositor; +} + + +bool SwRenderer::endComposite(Compositor* cmp) +{ + if (!cmp) return false; + + auto p = static_cast<SwCompositor*>(cmp); + p->valid = true; + + //Recover Context + surface = p->recoverSfc; + surface->compositor = p->recoverCmp; + + //Default is alpha blending + if (p->method == CompositeMethod::None) { + return rasterImage(surface, &p->image, nullptr, nullptr, p->bbox, p->opacity); + } + + return true; +} + + +ColorSpace SwRenderer::colorSpace() +{ + if (surface) return surface->cs; + else return ColorSpace::Unsupported; +} + + +bool SwRenderer::dispose(RenderData data) +{ + auto task = static_cast<SwTask*>(data); + if (!task) return true; + task->done(); + task->dispose(); + + if (task->pushed) task->disposed = true; + else delete(task); + + return true; +} + + +void* SwRenderer::prepareCommon(SwTask* task, const RenderTransform* transform, const Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags) +{ + if (!surface) return task; + if (flags == RenderUpdateFlag::None) return task; + + //Finish previous task if it has duplicated request. + task->done(); + + //TODO: Failed threading them. It would be better if it's possible. 
+ //See: https://github.com/thorvg/thorvg/issues/1409 + //Guarantee composition targets get ready. + for (auto clip = clips.data; clip < clips.end(); ++clip) { + static_cast<SwTask*>(*clip)->done(); + } + + task->clips = clips; + + if (transform) { + if (!task->transform) task->transform = static_cast<Matrix*>(malloc(sizeof(Matrix))); + *task->transform = transform->m; + } else { + if (task->transform) free(task->transform); + task->transform = nullptr; + } + + //zero size? + if (task->transform) { + if (task->transform->e11 == 0.0f && task->transform->e12 == 0.0f) return task; //zero width + if (task->transform->e21 == 0.0f && task->transform->e22 == 0.0f) return task; //zero height + } + + task->opacity = opacity; + task->surface = surface; + task->mpool = mpool; + task->flags = flags; + task->bbox.min.x = mathMax(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.x)); + task->bbox.min.y = mathMax(static_cast<SwCoord>(0), static_cast<SwCoord>(vport.y)); + task->bbox.max.x = mathMin(static_cast<SwCoord>(surface->w), static_cast<SwCoord>(vport.x + vport.w)); + task->bbox.max.y = mathMin(static_cast<SwCoord>(surface->h), static_cast<SwCoord>(vport.y + vport.h)); + + if (!task->pushed) { + task->pushed = true; + tasks.push(task); + } + + TaskScheduler::request(task); + + return task; +} + + +RenderData SwRenderer::prepare(Surface* surface, const RenderMesh* mesh, RenderData data, const RenderTransform* transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags) +{ + //prepare task + auto task = static_cast<SwImageTask*>(data); + if (!task) task = new SwImageTask; + task->source = surface; + task->mesh = mesh; + return prepareCommon(task, transform, clips, opacity, flags); +} + + +RenderData SwRenderer::prepare(const Array<RenderData>& scene, RenderData data, const RenderTransform* transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags) +{ + //prepare task + auto task = static_cast<SwSceneTask*>(data); + if (!task) task = new SwSceneTask; + task->scene = scene; + + //TODO: Failed threading them. It would be better if it's possible. + //See: https://github.com/thorvg/thorvg/issues/1409 + //Guarantee composition targets get ready. 
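+ //done() blocks until each member task has finished, so its RLE data is valid when this scene task merges them.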
+ for (auto task = scene.data; task < scene.end(); ++task) { + static_cast<SwTask*>(*task)->done(); + } + return prepareCommon(task, transform, clips, opacity, flags); +} + + +RenderData SwRenderer::prepare(const RenderShape& rshape, RenderData data, const RenderTransform* transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags, bool clipper) +{ + //prepare task + auto task = static_cast<SwShapeTask*>(data); + if (!task) { + task = new SwShapeTask; + task->rshape = &rshape; + } + task->clipper = clipper; + + return prepareCommon(task, transform, clips, opacity, flags); +} + + +SwRenderer::SwRenderer():mpool(globalMpool) +{ +} + + +bool SwRenderer::init(uint32_t threads) +{ + if ((initEngineCnt++) > 0) return true; + + threadsCnt = threads; + + //Share the memory pool among the renderer + globalMpool = mpoolInit(threads); + if (!globalMpool) { + --initEngineCnt; + return false; + } + + return true; +} + + +int32_t SwRenderer::init() +{ + return initEngineCnt; +} + + +bool SwRenderer::term() +{ + if ((--initEngineCnt) > 0) return true; + + initEngineCnt = 0; + + _termEngine(); + + return true; +} + +SwRenderer* SwRenderer::gen() +{ + ++rendererCnt; + return new SwRenderer(); +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.h b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.h new file mode 100644 index 0000000000..4393740bd9 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRenderer.h @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef _TVG_SW_RENDERER_H_ +#define _TVG_SW_RENDERER_H_ + +#include "tvgRender.h" + +struct SwSurface; +struct SwTask; +struct SwCompositor; +struct SwMpool; + +namespace tvg +{ + +class SwRenderer : public RenderMethod +{ +public: + RenderData prepare(const RenderShape& rshape, RenderData data, const RenderTransform* transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags, bool clipper) override; + RenderData prepare(const Array<RenderData>& scene, RenderData data, const RenderTransform* transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags) override; + RenderData prepare(Surface* surface, const RenderMesh* mesh, RenderData data, const RenderTransform* transform, Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags) override; + bool preRender() override; + bool renderShape(RenderData data) override; + bool renderImage(RenderData data) override; + bool postRender() override; + bool dispose(RenderData data) override; + RenderRegion region(RenderData data) override; + RenderRegion viewport() override; + bool viewport(const RenderRegion& vp) override; + bool blend(BlendMethod method) override; + ColorSpace colorSpace() override; + + bool clear() override; + bool sync() override; + bool target(pixel_t* data, uint32_t stride, uint32_t w, uint32_t h, ColorSpace cs); + bool mempool(bool shared); + + Compositor* target(const RenderRegion& region, ColorSpace cs) override; + bool beginComposite(Compositor* cmp, CompositeMethod method, uint8_t opacity) override; + bool endComposite(Compositor* cmp) override; + void clearCompositors(); + + static SwRenderer* gen(); + static bool init(uint32_t threads); + static int32_t init(); + static bool term(); + +private: + SwSurface* surface = nullptr; //active surface + Array<SwTask*> tasks; //async task list + Array<SwSurface*> compositors; //render targets cache list + SwMpool* mpool; //private memory pool + RenderRegion vport; //viewport + bool sharedMpool = true; //memory-pool behavior policy + + SwRenderer(); + ~SwRenderer(); + + RenderData prepareCommon(SwTask* task, const RenderTransform* transform, const Array<RenderData>& clips, uint8_t opacity, RenderUpdateFlag flags); +}; + +} + +#endif /* _TVG_SW_RENDERER_H_ */ diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRle.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRle.cpp new file mode 100644 index 0000000000..a4a7fabdee --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwRle.cpp @@ -0,0 +1,1128 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + * The FreeType Project LICENSE + * ---------------------------- + + * 2006-Jan-27 + + * Copyright 1996-2002, 2006 by + * David Turner, Robert Wilhelm, and Werner Lemberg + + + + * Introduction + * ============ + + * The FreeType Project is distributed in several archive packages; + * some of them may contain, in addition to the FreeType font engine, + * various tools and contributions which rely on, or relate to, the + * FreeType Project. + + * This license applies to all files found in such packages, and + * which do not fall under their own explicit license. The license + * affects thus the FreeType font engine, the test programs, + * documentation and makefiles, at the very least. + + * This license was inspired by the BSD, Artistic, and IJG + * (Independent JPEG Group) licenses, which all encourage inclusion + * and use of free software in commercial and freeware products + * alike. As a consequence, its main points are that: + + * o We don't promise that this software works. However, we will be + * interested in any kind of bug reports. (`as is' distribution) + + * o You can use this software for whatever you want, in parts or + * full form, without having to pay us. (`royalty-free' usage) + + * o You may not pretend that you wrote this software. If you use + * it, or only parts of it, in a program, you must acknowledge + * somewhere in your documentation that you have used the + * FreeType code. (`credits') + + * We specifically permit and encourage the inclusion of this + * software, with or without modifications, in commercial products. + * We disclaim all warranties covering The FreeType Project and + * assume no liability related to The FreeType Project. + + + * Finally, many people asked us for a preferred form for a + * credit/disclaimer to use in compliance with this license. We thus + * encourage you to use the following text: + + * """ + * Portions of this software are copyright � <year> The FreeType + * Project (www.freetype.org). All rights reserved. + * """ + + * Please replace <year> with the value from the FreeType version you + * actually use. + +* Legal Terms +* =========== + +* 0. Definitions +* -------------- + +* Throughout this license, the terms `package', `FreeType Project', +* and `FreeType archive' refer to the set of files originally +* distributed by the authors (David Turner, Robert Wilhelm, and +* Werner Lemberg) as the `FreeType Project', be they named as alpha, +* beta or final release. + +* `You' refers to the licensee, or person using the project, where +* `using' is a generic term including compiling the project's source +* code as well as linking it to form a `program' or `executable'. +* This program is referred to as `a program using the FreeType +* engine'. + +* This license applies to all files distributed in the original +* FreeType Project, including all source code, binaries and +* documentation, unless otherwise stated in the file in its +* original, unmodified form as distributed in the original archive. +* If you are unsure whether or not a particular file is covered by +* this license, you must contact us to verify this. + +* The FreeType Project is copyright (C) 1996-2000 by David Turner, +* Robert Wilhelm, and Werner Lemberg. 
All rights reserved except as +* specified below. + +* 1. No Warranty +* -------------- + +* THE FREETYPE PROJECT IS PROVIDED `AS IS' WITHOUT WARRANTY OF ANY +* KIND, EITHER EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +* WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +* PURPOSE. IN NO EVENT WILL ANY OF THE AUTHORS OR COPYRIGHT HOLDERS +* BE LIABLE FOR ANY DAMAGES CAUSED BY THE USE OR THE INABILITY TO +* USE, OF THE FREETYPE PROJECT. + +* 2. Redistribution +* ----------------- + +* This license grants a worldwide, royalty-free, perpetual and +* irrevocable right and license to use, execute, perform, compile, +* display, copy, create derivative works of, distribute and +* sublicense the FreeType Project (in both source and object code +* forms) and derivative works thereof for any purpose; and to +* authorize others to exercise some or all of the rights granted +* herein, subject to the following conditions: + +* o Redistribution of source code must retain this license file +* (`FTL.TXT') unaltered; any additions, deletions or changes to +* the original files must be clearly indicated in accompanying +* documentation. The copyright notices of the unaltered, +* original files must be preserved in all copies of source +* files. + +* o Redistribution in binary form must provide a disclaimer that +* states that the software is based in part of the work of the +* FreeType Team, in the distribution documentation. We also +* encourage you to put an URL to the FreeType web page in your +* documentation, though this isn't mandatory. + +* These conditions apply to any software derived from or based on +* the FreeType Project, not just the unmodified files. If you use +* our work, you must acknowledge us. However, no fee need be paid +* to us. + +* 3. Advertising +* -------------- + +* Neither the FreeType authors and contributors nor you shall use +* the name of the other for commercial, advertising, or promotional +* purposes without specific prior written permission. + +* We suggest, but do not require, that you use one or more of the +* following phrases to refer to this software in your documentation +* or advertising materials: `FreeType Project', `FreeType Engine', +* `FreeType library', or `FreeType Distribution'. + +* As you have not signed this license, you are not required to +* accept it. However, as the FreeType Project is copyrighted +* material, only this license, or another one contracted with the +* authors, grants you the right to use, distribute, and modify it. +* Therefore, by using, distributing, or modifying the FreeType +* Project, you indicate that you understand and accept all the terms +* of this license. + +* 4. Contacts +* ----------- + +* There are two mailing lists related to FreeType: + +* o freetype@nongnu.org + +* Discusses general use and applications of FreeType, as well as +* future and wanted additions to the library and distribution. +* If you are looking for support, start in this list if you +* haven't found anything to help you in the documentation. + +* o freetype-devel@nongnu.org + +* Discusses bugs, as well as engine internals, design issues, +* specific licenses, porting, etc. 
+ +* Our home page can be found at + +* http://www.freetype.org +*/ + +#include <setjmp.h> +#include <limits.h> +#include <memory.h> +#include "tvgSwCommon.h" + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ + +constexpr auto MAX_SPANS = 256; +constexpr auto PIXEL_BITS = 8; //must be at least 6 bits! +constexpr auto ONE_PIXEL = (1L << PIXEL_BITS); + +using Area = long; + +struct Band +{ + SwCoord min, max; +}; + +struct Cell +{ + SwCoord x; + SwCoord cover; + Area area; + Cell *next; +}; + +struct RleWorker +{ + SwRleData* rle; + + SwPoint cellPos; + SwPoint cellMin; + SwPoint cellMax; + SwCoord cellXCnt; + SwCoord cellYCnt; + + Area area; + SwCoord cover; + + Cell* cells; + ptrdiff_t maxCells; + ptrdiff_t cellsCnt; + + SwPoint pos; + + SwPoint bezStack[32 * 3 + 1]; + int levStack[32]; + + SwOutline* outline; + + SwSpan spans[MAX_SPANS]; + int spansCnt; + int ySpan; + + int bandSize; + int bandShoot; + + jmp_buf jmpBuf; + + void* buffer; + long bufferSize; + + Cell** yCells; + SwCoord yCnt; + + bool invalid; + bool antiAlias; +}; + + +static inline SwPoint UPSCALE(const SwPoint& pt) +{ + return {SwCoord(((unsigned long) pt.x) << (PIXEL_BITS - 6)), SwCoord(((unsigned long) pt.y) << (PIXEL_BITS - 6))}; +} + + +static inline SwPoint TRUNC(const SwPoint& pt) +{ + return {pt.x >> PIXEL_BITS, pt.y >> PIXEL_BITS}; +} + + +static inline SwCoord TRUNC(const SwCoord x) +{ + return x >> PIXEL_BITS; +} + + +static inline SwPoint SUBPIXELS(const SwPoint& pt) +{ + return {SwCoord(((unsigned long) pt.x) << PIXEL_BITS), SwCoord(((unsigned long) pt.y) << PIXEL_BITS)}; +} + + +static inline SwCoord SUBPIXELS(const SwCoord x) +{ + return SwCoord(((unsigned long) x) << PIXEL_BITS); +} + +/* + * Approximate sqrt(x*x+y*y) using the `alpha max plus beta min' + * algorithm. We use alpha = 1, beta = 3/8, giving us results with a + * largest error less than 7% compared to the exact value. + */ +static inline SwCoord HYPOT(SwPoint pt) +{ + if (pt.x < 0) pt.x = -pt.x; + if (pt.y < 0) pt.y = -pt.y; + return ((pt.x > pt.y) ? (pt.x + (3 * pt.y >> 3)) : (pt.y + (3 * pt.x >> 3))); +} + +static void _genSpan(SwRleData* rle, const SwSpan* spans, uint32_t count) +{ + auto newSize = rle->size + count; + + /* allocate enough memory for new spans */ + /* alloc is required to prevent free and reallocation */ + /* when the rle needs to be regenerated because of attribute change. */ + if (rle->alloc < newSize) { + rle->alloc = (newSize * 2); + //OPTIMIZE: use mempool! 
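+ //Grow the buffer geometrically (twice the required size) so repeated span batches amortize the reallocation cost.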
+ rle->spans = static_cast<SwSpan*>(realloc(rle->spans, rle->alloc * sizeof(SwSpan))); + } + + //copy the new spans to the allocated memory + SwSpan* lastSpan = rle->spans + rle->size; + memcpy(lastSpan, spans, count * sizeof(SwSpan)); + + rle->size = newSize; +} + + +static void _horizLine(RleWorker& rw, SwCoord x, SwCoord y, SwCoord area, SwCoord acount) +{ + x += rw.cellMin.x; + y += rw.cellMin.y; + + //Clip Y range + if (y < rw.cellMin.y || y >= rw.cellMax.y) return; + + /* compute the coverage line's coverage, depending on the outline fill rule */ + /* the coverage percentage is area/(PIXEL_BITS*PIXEL_BITS*2) */ + auto coverage = static_cast<int>(area >> (PIXEL_BITS * 2 + 1 - 8)); //range 0 - 255 + + if (coverage < 0) coverage = -coverage; + + if (rw.outline->fillRule == FillRule::EvenOdd) { + coverage &= 511; + if (coverage > 255) coverage = 511 - coverage; + } else { + //normal non-zero winding rule + if (coverage > 255) coverage = 255; + } + + //span has ushort coordinates. check limit overflow + if (x >= SHRT_MAX) { + TVGERR("SW_ENGINE", "X-coordiante overflow!"); + x = SHRT_MAX; + } + if (y >= SHRT_MAX) { + TVGERR("SW_ENGINE", "Y Coordiante overflow!"); + y = SHRT_MAX; + } + + if (coverage > 0) { + if (!rw.antiAlias) coverage = 255; + auto count = rw.spansCnt; + auto span = rw.spans + count - 1; + + //see whether we can add this span to the current list + if ((count > 0) && (rw.ySpan == y) && + (span->x + span->len == x) && (span->coverage == coverage)) { + + //Clip x range + SwCoord xOver = 0; + if (x + acount >= rw.cellMax.x) xOver -= (x + acount - rw.cellMax.x); + if (x < rw.cellMin.x) xOver -= (rw.cellMin.x - x); + + //span->len += (acount + xOver) - 1; + span->len += (acount + xOver); + return; + } + + if (count >= MAX_SPANS) { + _genSpan(rw.rle, rw.spans, count); + rw.spansCnt = 0; + rw.ySpan = 0; + span = rw.spans; + } else { + ++span; + } + + //Clip x range + SwCoord xOver = 0; + if (x + acount >= rw.cellMax.x) xOver -= (x + acount - rw.cellMax.x); + if (x < rw.cellMin.x) { + xOver -= (rw.cellMin.x - x); + x = rw.cellMin.x; + } + + //Nothing to draw + if (acount + xOver <= 0) return; + + //add a span to the current list + span->x = x; + span->y = y; + span->len = (acount + xOver); + span->coverage = coverage; + ++rw.spansCnt; + rw.ySpan = y; + } +} + + +static void _sweep(RleWorker& rw) +{ + if (rw.cellsCnt == 0) return; + + rw.spansCnt = 0; + rw.ySpan = 0; + + for (int y = 0; y < rw.yCnt; ++y) { + auto cover = 0; + auto x = 0; + auto cell = rw.yCells[y]; + + while (cell) { + if (cell->x > x && cover != 0) _horizLine(rw, x, y, cover * (ONE_PIXEL * 2), cell->x - x); + cover += cell->cover; + auto area = cover * (ONE_PIXEL * 2) - cell->area; + if (area != 0 && cell->x >= 0) _horizLine(rw, cell->x, y, area, 1); + x = cell->x + 1; + cell = cell->next; + } + + if (cover != 0) _horizLine(rw, x, y, cover * (ONE_PIXEL * 2), rw.cellXCnt - x); + } + + if (rw.spansCnt > 0) _genSpan(rw.rle, rw.spans, rw.spansCnt); +} + + +static Cell* _findCell(RleWorker& rw) +{ + auto x = rw.cellPos.x; + if (x > rw.cellXCnt) x = rw.cellXCnt; + + auto pcell = &rw.yCells[rw.cellPos.y]; + + while(true) { + Cell* cell = *pcell; + if (!cell || cell->x > x) break; + if (cell->x == x) return cell; + pcell = &cell->next; + } + + if (rw.cellsCnt >= rw.maxCells) longjmp(rw.jmpBuf, 1); + + auto cell = rw.cells + rw.cellsCnt++; + cell->x = x; + cell->area = 0; + cell->cover = 0; + cell->next = *pcell; + *pcell = cell; + + return cell; +} + + +static void _recordCell(RleWorker& rw) +{ + if (rw.area | rw.cover) { 
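+ //The bitwise OR is a cheap "either value is non-zero" check before committing the accumulated area/cover to a cell.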
+ auto cell = _findCell(rw); + cell->area += rw.area; + cell->cover += rw.cover; + } +} + + +static void _setCell(RleWorker& rw, SwPoint pos) +{ + /* Move the cell pointer to a new position. We set the `invalid' */ + /* flag to indicate that the cell isn't part of those we're interested */ + /* in during the render phase. This means that: */ + /* */ + /* . the new vertical position must be within min_ey..max_ey-1. */ + /* . the new horizontal position must be strictly less than max_ex */ + /* */ + /* Note that if a cell is to the left of the clipping region, it is */ + /* actually set to the (min_ex-1) horizontal position. */ + + /* All cells that are on the left of the clipping region go to the + min_ex - 1 horizontal position. */ + pos.x -= rw.cellMin.x; + pos.y -= rw.cellMin.y; + + if (pos.x > rw.cellMax.x) pos.x = rw.cellMax.x; + + //Are we moving to a different cell? + if (pos != rw.cellPos) { + //Record the current one if it is valid + if (!rw.invalid) _recordCell(rw); + } + + rw.area = 0; + rw.cover = 0; + rw.cellPos = pos; + rw.invalid = ((unsigned)pos.y >= (unsigned)rw.cellYCnt || pos.x >= rw.cellXCnt); +} + + +static void _startCell(RleWorker& rw, SwPoint pos) +{ + if (pos.x > rw.cellMax.x) pos.x = rw.cellMax.x; + if (pos.x < rw.cellMin.x) pos.x = rw.cellMin.x; + + rw.area = 0; + rw.cover = 0; + rw.cellPos = pos - rw.cellMin; + rw.invalid = false; + + _setCell(rw, pos); +} + + +static void _moveTo(RleWorker& rw, const SwPoint& to) +{ + //record current cell, if any */ + if (!rw.invalid) _recordCell(rw); + + //start to a new position + _startCell(rw, TRUNC(to)); + + rw.pos = to; +} + + +static void _lineTo(RleWorker& rw, const SwPoint& to) +{ +#define SW_UDIV(a, b) \ + static_cast<SwCoord>(((unsigned long)(a) * (unsigned long)(b)) >> \ + (sizeof(long) * CHAR_BIT - PIXEL_BITS)) + + auto e1 = TRUNC(rw.pos); + auto e2 = TRUNC(to); + + //vertical clipping + if ((e1.y >= rw.cellMax.y && e2.y >= rw.cellMax.y) || (e1.y < rw.cellMin.y && e2.y < rw.cellMin.y)) { + rw.pos = to; + return; + } + + auto diff = to - rw.pos; + auto f1 = rw.pos - SUBPIXELS(e1); + SwPoint f2; + + //inside one cell + if (e1 == e2) { + ; + //any horizontal line + } else if (diff.y == 0) { + e1.x = e2.x; + _setCell(rw, e1); + } else if (diff.x == 0) { + //vertical line up + if (diff.y > 0) { + do { + f2.y = ONE_PIXEL; + rw.cover += (f2.y - f1.y); + rw.area += (f2.y - f1.y) * f1.x * 2; + f1.y = 0; + ++e1.y; + _setCell(rw, e1); + } while(e1.y != e2.y); + //vertical line down + } else { + do { + f2.y = 0; + rw.cover += (f2.y - f1.y); + rw.area += (f2.y - f1.y) * f1.x * 2; + f1.y = ONE_PIXEL; + --e1.y; + _setCell(rw, e1); + } while(e1.y != e2.y); + } + //any other line + } else { + Area prod = diff.x * f1.y - diff.y * f1.x; + + /* These macros speed up repetitive divisions by replacing them + with multiplications and right shifts. */ + auto dx_r = static_cast<long>(ULONG_MAX >> PIXEL_BITS) / (diff.x); + auto dy_r = static_cast<long>(ULONG_MAX >> PIXEL_BITS) / (diff.y); + + /* The fundamental value `prod' determines which side and the */ + /* exact coordinate where the line exits current cell. It is */ + /* also easily updated when moving from one cell to the next. 
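+ Concretely, `prod' is the cross product of the line direction (diff) with the fractional start position (f1) inside the cell; comparing it against the scaled steps px and py below tells through which border (left, up, right or down) the line leaves the cell.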
*/ + do { + auto px = diff.x * ONE_PIXEL; + auto py = diff.y * ONE_PIXEL; + + //left + if (prod <= 0 && prod - px > 0) { + f2 = {0, SW_UDIV(-prod, -dx_r)}; + prod -= py; + rw.cover += (f2.y - f1.y); + rw.area += (f2.y - f1.y) * (f1.x + f2.x); + f1 = {ONE_PIXEL, f2.y}; + --e1.x; + //up + } else if (prod - px <= 0 && prod - px + py > 0) { + prod -= px; + f2 = {SW_UDIV(-prod, dy_r), ONE_PIXEL}; + rw.cover += (f2.y - f1.y); + rw.area += (f2.y - f1.y) * (f1.x + f2.x); + f1 = {f2.x, 0}; + ++e1.y; + //right + } else if (prod - px + py <= 0 && prod + py >= 0) { + prod += py; + f2 = {ONE_PIXEL, SW_UDIV(prod, dx_r)}; + rw.cover += (f2.y - f1.y); + rw.area += (f2.y - f1.y) * (f1.x + f2.x); + f1 = {0, f2.y}; + ++e1.x; + //down + } else { + f2 = {SW_UDIV(prod, -dy_r), 0}; + prod += px; + rw.cover += (f2.y - f1.y); + rw.area += (f2.y - f1.y) * (f1.x + f2.x); + f1 = {f2.x, ONE_PIXEL}; + --e1.y; + } + + _setCell(rw, e1); + + } while(e1 != e2); + } + + f2 = {to.x - SUBPIXELS(e2.x), to.y - SUBPIXELS(e2.y)}; + rw.cover += (f2.y - f1.y); + rw.area += (f2.y - f1.y) * (f1.x + f2.x); + rw.pos = to; +} + + +static void _cubicTo(RleWorker& rw, const SwPoint& ctrl1, const SwPoint& ctrl2, const SwPoint& to) +{ + auto arc = rw.bezStack; + arc[0] = to; + arc[1] = ctrl2; + arc[2] = ctrl1; + arc[3] = rw.pos; + + //Short-cut the arc that crosses the current band + auto min = arc[0].y; + auto max = arc[0].y; + + SwCoord y; + for (auto i = 1; i < 4; ++i) { + y = arc[i].y; + if (y < min) min = y; + if (y > max) max = y; + } + + if (TRUNC(min) >= rw.cellMax.y || TRUNC(max) < rw.cellMin.y) goto draw; + + /* Decide whether to split or draw. See `Rapid Termination */ + /* Evaluation for Recursive Subdivision of Bezier Curves' by Thomas */ + /* F. Hain, at */ + /* http://www.cis.southalabama.edu/~hain/general/Publications/Bezier/Camera-ready%20CISST02%202.pdf */ + while (true) { + { + //diff is the P0 - P3 chord vector + auto diff = arc[3] - arc[0]; + auto L = HYPOT(diff); + + //avoid possible arithmetic overflow below by splitting + if (L > SHRT_MAX) goto split; + + //max deviation may be as much as (s/L) * 3/4 (if Hain's v = 1) + auto sLimit = L * (ONE_PIXEL / 6); + + auto diff1 = arc[1] - arc[0]; + auto s = diff.y * diff1.x - diff.x * diff1.y; + if (s < 0) s = -s; + if (s > sLimit) goto split; + + //s is L * the perpendicular distance from P2 to the line P0 - P3 + auto diff2 = arc[2] - arc[0]; + s = diff.y * diff2.x - diff.x * diff2.y; + if (s < 0) s = -s; + if (s > sLimit) goto split; + + /* Split super curvy segments where the off points are so far + from the chord that the angles P0-P1-P3 or P0-P2-P3 become + acute as detected by appropriate dot products */ + if (diff1.x * (diff1.x - diff.x) + diff1.y * (diff1.y - diff.y) > 0 || + diff2.x * (diff2.x - diff.x) + diff2.y * (diff2.y - diff.y) > 0) + goto split; + + //no reason to split + goto draw; + } + split: + mathSplitCubic(arc); + arc += 3; + continue; + + draw: + _lineTo(rw, arc[0]); + if (arc == rw.bezStack) return; + arc -= 3; + } +} + + +static void _decomposeOutline(RleWorker& rw) +{ + auto outline = rw.outline; + auto first = 0; //index of first point in contour + + for (auto cntr = outline->cntrs.data; cntr < outline->cntrs.end(); ++cntr) { + auto last = *cntr; + auto limit = outline->pts.data + last; + auto start = UPSCALE(outline->pts[first]); + auto pt = outline->pts.data + first; + auto types = outline->types.data + first; + + _moveTo(rw, UPSCALE(outline->pts[first])); + + while (pt < limit) { + ++pt; + ++types; + + //emit a single line_to + if (types[0] == 
SW_CURVE_TYPE_POINT) { + _lineTo(rw, UPSCALE(*pt)); + //types cubic + } else { + pt += 2; + types += 2; + + if (pt <= limit) { + _cubicTo(rw, UPSCALE(pt[-2]), UPSCALE(pt[-1]), UPSCALE(pt[0])); + continue; + } + _cubicTo(rw, UPSCALE(pt[-2]), UPSCALE(pt[-1]), start); + goto close; + } + } + _lineTo(rw, start); + close: + first = last + 1; + } +} + + +static int _genRle(RleWorker& rw) +{ + if (setjmp(rw.jmpBuf) == 0) { + _decomposeOutline(rw); + if (!rw.invalid) _recordCell(rw); + return 0; + } + return -1; //lack of cell memory +} + + +static SwSpan* _intersectSpansRegion(const SwRleData *clip, const SwRleData *target, SwSpan *outSpans, uint32_t outSpansCnt) +{ + auto out = outSpans; + auto spans = target->spans; + auto end = target->spans + target->size; + auto clipSpans = clip->spans; + auto clipEnd = clip->spans + clip->size; + + while (spans < end && clipSpans < clipEnd) { + //align y cooridnates. + if (clipSpans->y > spans->y) { + ++spans; + continue; + } + if (spans->y > clipSpans->y) { + ++clipSpans; + continue; + } + + //Try clipping with all clip spans which have a same y coordinate. + auto temp = clipSpans; + while(temp < clipEnd && outSpansCnt > 0 && temp->y == clipSpans->y) { + auto sx1 = spans->x; + auto sx2 = sx1 + spans->len; + auto cx1 = temp->x; + auto cx2 = cx1 + temp->len; + + //The span must be left(x1) to right(x2) direction. Not intersected. + if (cx2 < sx1 || sx2 < cx1) { + ++temp; + continue; + } + + //clip span region. + auto x = sx1 > cx1 ? sx1 : cx1; + auto len = (sx2 < cx2 ? sx2 : cx2) - x; + if (len > 0) { + out->x = x; + out->y = temp->y; + out->len = len; + out->coverage = (uint8_t)(((spans->coverage * temp->coverage) + 0xff) >> 8); + ++out; + --outSpansCnt; + } + ++temp; + } + ++spans; + } + return out; +} + + +static SwSpan* _intersectSpansRect(const SwBBox *bbox, const SwRleData *targetRle, SwSpan *outSpans, uint32_t outSpansCnt) +{ + auto out = outSpans; + auto spans = targetRle->spans; + auto end = targetRle->spans + targetRle->size; + auto minx = static_cast<int16_t>(bbox->min.x); + auto miny = static_cast<int16_t>(bbox->min.y); + auto maxx = minx + static_cast<int16_t>(bbox->max.x - bbox->min.x) - 1; + auto maxy = miny + static_cast<int16_t>(bbox->max.y - bbox->min.y) - 1; + + while (outSpansCnt > 0 && spans < end) { + if (spans->y > maxy) { + spans = end; + break; + } + if (spans->y < miny || spans->x > maxx || spans->x + spans->len <= minx) { + ++spans; + continue; + } + if (spans->x < minx) { + out->len = (spans->len - (minx - spans->x)) < (maxx - minx + 1) ? (spans->len - (minx - spans->x)) : (maxx - minx + 1); + out->x = minx; + } + else { + out->x = spans->x; + out->len = spans->len < (maxx - spans->x + 1) ? spans->len : (maxx - spans->x + 1); + } + if (out->len > 0) { + out->y = spans->y; + out->coverage = spans->coverage; + ++out; + --outSpansCnt; + } + ++spans; + } + return out; +} + + +static SwSpan* _mergeSpansRegion(const SwRleData *clip1, const SwRleData *clip2, SwSpan *outSpans) +{ + auto out = outSpans; + auto spans1 = clip1->spans; + auto end1 = clip1->spans + clip1->size; + auto spans2 = clip2->spans; + auto end2 = clip2->spans + clip2->size; + + //list two spans up in y order + //TODO: Remove duplicated regions? 
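+ /* Two-pointer merge: whichever list currently has the smaller (or equal) y is copied next, so the output stays sorted by y without any temporary buffer. */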
+ while (spans1 < end1 && spans2 < end2) { + while (spans1 < end1 && spans1->y <= spans2->y) { + *out = *spans1; + ++spans1; + ++out; + } + if (spans1 >= end1) break; + while (spans2 < end2 && spans2->y <= spans1->y) { + *out = *spans2; + ++spans2; + ++out; + } + } + + //Leftovers + while (spans1 < end1) { + *out = *spans1; + ++spans1; + ++out; + } + while (spans2 < end2) { + *out = *spans2; + ++spans2; + ++out; + } + + return out; +} + + +void _replaceClipSpan(SwRleData *rle, SwSpan* clippedSpans, uint32_t size) +{ + free(rle->spans); + rle->spans = clippedSpans; + rle->size = rle->alloc = size; +} + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + +SwRleData* rleRender(SwRleData* rle, const SwOutline* outline, const SwBBox& renderRegion, bool antiAlias) +{ + constexpr auto RENDER_POOL_SIZE = 16384L; + constexpr auto BAND_SIZE = 40; + + //TODO: We can preserve several static workers in advance + RleWorker rw; + Cell buffer[RENDER_POOL_SIZE / sizeof(Cell)]; + + //Init Cells + rw.buffer = buffer; + rw.bufferSize = sizeof(buffer); + rw.yCells = reinterpret_cast<Cell**>(buffer); + rw.cells = nullptr; + rw.maxCells = 0; + rw.cellsCnt = 0; + rw.area = 0; + rw.cover = 0; + rw.invalid = true; + rw.cellMin = renderRegion.min; + rw.cellMax = renderRegion.max; + rw.cellXCnt = rw.cellMax.x - rw.cellMin.x; + rw.cellYCnt = rw.cellMax.y - rw.cellMin.y; + rw.ySpan = 0; + rw.outline = const_cast<SwOutline*>(outline); + rw.bandSize = rw.bufferSize / (sizeof(Cell) * 8); //bandSize: 64 + rw.bandShoot = 0; + rw.antiAlias = antiAlias; + + if (!rle) rw.rle = reinterpret_cast<SwRleData*>(calloc(1, sizeof(SwRleData))); + else rw.rle = rle; + + //Generate RLE + Band bands[BAND_SIZE]; + Band* band; + + /* set up vertical bands */ + auto bandCnt = static_cast<int>((rw.cellMax.y - rw.cellMin.y) / rw.bandSize); + if (bandCnt == 0) bandCnt = 1; + else if (bandCnt >= BAND_SIZE) bandCnt = (BAND_SIZE - 1); + + auto min = rw.cellMin.y; + auto yMax = rw.cellMax.y; + SwCoord max; + int ret; + + for (int n = 0; n < bandCnt; ++n, min = max) { + max = min + rw.bandSize; + if (n == bandCnt -1 || max > yMax) max = yMax; + + bands[0].min = min; + bands[0].max = max; + band = bands; + + while (band >= bands) { + rw.yCells = static_cast<Cell**>(rw.buffer); + rw.yCnt = band->max - band->min; + + int cellStart = sizeof(Cell*) * (int)rw.yCnt; + int cellMod = cellStart % sizeof(Cell); + + if (cellMod > 0) cellStart += sizeof(Cell) - cellMod; + + auto cellEnd = rw.bufferSize; + cellEnd -= cellEnd % sizeof(Cell); + + auto cellsMax = reinterpret_cast<Cell*>((char*)rw.buffer + cellEnd); + rw.cells = reinterpret_cast<Cell*>((char*)rw.buffer + cellStart); + + if (rw.cells >= cellsMax) goto reduce_bands; + + rw.maxCells = cellsMax - rw.cells; + if (rw.maxCells < 2) goto reduce_bands; + + for (int y = 0; y < rw.yCnt; ++y) + rw.yCells[y] = nullptr; + + rw.cellsCnt = 0; + rw.invalid = true; + rw.cellMin.y = band->min; + rw.cellMax.y = band->max; + rw.cellYCnt = band->max - band->min; + + ret = _genRle(rw); + if (ret == 0) { + _sweep(rw); + --band; + continue; + } else if (ret == 1) { + goto error; + } + + reduce_bands: + /* render pool overflow: we will reduce the render band by half */ + auto bottom = band->min; + auto top = band->max; + auto middle = bottom + ((top - bottom) >> 1); + + /* This is too complex for a single scanline; there must + be some problems */ + if (middle == bottom) goto error; + + 
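+ /* Otherwise split the band in half and retry: the lower half is pushed onto the band stack and rendered next, and bandShoot counts these splits so the band size can be halved for later passes. */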
if (bottom - top >= rw.bandSize) ++rw.bandShoot; + + band[1].min = bottom; + band[1].max = middle; + band[0].min = middle; + band[0].max = top; + ++band; + } + } + + if (rw.bandShoot > 8 && rw.bandSize > 16) + rw.bandSize = (rw.bandSize >> 1); + + return rw.rle; + +error: + free(rw.rle); + rw.rle = nullptr; + return nullptr; +} + + +SwRleData* rleRender(const SwBBox* bbox) +{ + auto width = static_cast<uint16_t>(bbox->max.x - bbox->min.x); + auto height = static_cast<uint16_t>(bbox->max.y - bbox->min.y); + + auto rle = static_cast<SwRleData*>(malloc(sizeof(SwRleData))); + rle->spans = static_cast<SwSpan*>(malloc(sizeof(SwSpan) * height)); + rle->size = height; + rle->alloc = height; + + auto span = rle->spans; + for (uint16_t i = 0; i < height; ++i, ++span) { + span->x = bbox->min.x; + span->y = bbox->min.y + i; + span->len = width; + span->coverage = 255; + } + + return rle; +} + + +void rleReset(SwRleData* rle) +{ + if (!rle) return; + rle->size = 0; +} + + +void rleFree(SwRleData* rle) +{ + if (!rle) return; + if (rle->spans) free(rle->spans); + free(rle); +} + + +void rleMerge(SwRleData* rle, SwRleData* clip1, SwRleData* clip2) +{ + if (!rle || (!clip1 && !clip2)) return; + if (clip1 && clip1->size == 0 && clip2 && clip2->size == 0) return; + + TVGLOG("SW_ENGINE", "Unifying Rle!"); + + //clip1 is empty, just copy clip2 + if (!clip1 || clip1->size == 0) { + if (clip2) { + auto spans = static_cast<SwSpan*>(malloc(sizeof(SwSpan) * (clip2->size))); + memcpy(spans, clip2->spans, clip2->size); + _replaceClipSpan(rle, spans, clip2->size); + } else { + _replaceClipSpan(rle, nullptr, 0); + } + return; + } + + //clip2 is empty, just copy clip1 + if (!clip2 || clip2->size == 0) { + if (clip1) { + auto spans = static_cast<SwSpan*>(malloc(sizeof(SwSpan) * (clip1->size))); + memcpy(spans, clip1->spans, clip1->size); + _replaceClipSpan(rle, spans, clip1->size); + } else { + _replaceClipSpan(rle, nullptr, 0); + } + return; + } + + auto spanCnt = clip1->size + clip2->size; + auto spans = static_cast<SwSpan*>(malloc(sizeof(SwSpan) * spanCnt)); + auto spansEnd = _mergeSpansRegion(clip1, clip2, spans); + + _replaceClipSpan(rle, spans, spansEnd - spans); +} + + +void rleClipPath(SwRleData *rle, const SwRleData *clip) +{ + if (rle->size == 0 || clip->size == 0) return; + auto spanCnt = rle->size > clip->size ? rle->size : clip->size; + auto spans = static_cast<SwSpan*>(malloc(sizeof(SwSpan) * (spanCnt))); + auto spansEnd = _intersectSpansRegion(clip, rle, spans, spanCnt); + + _replaceClipSpan(rle, spans, spansEnd - spans); + + TVGLOG("SW_ENGINE", "Using ClipPath!"); +} + + +void rleClipRect(SwRleData *rle, const SwBBox* clip) +{ + if (rle->size == 0) return; + auto spans = static_cast<SwSpan*>(malloc(sizeof(SwSpan) * (rle->size))); + auto spansEnd = _intersectSpansRect(clip, rle, spans, rle->size); + + _replaceClipSpan(rle, spans, spansEnd - spans); + + TVGLOG("SW_ENGINE", "Using ClipRect!"); +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp new file mode 100644 index 0000000000..159898c750 --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwShape.cpp @@ -0,0 +1,654 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. 
+ + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "tvgSwCommon.h" +#include "tvgMath.h" +#include "tvgBezier.h" + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ + +struct Line +{ + Point pt1; + Point pt2; +}; + + +static float _lineLength(const Point& pt1, const Point& pt2) +{ + /* approximate sqrt(x*x + y*y) using alpha max plus beta min algorithm. + With alpha = 1, beta = 3/8, giving results with the largest error less + than 7% compared to the exact value. */ + Point diff = {pt2.x - pt1.x, pt2.y - pt1.y}; + if (diff.x < 0) diff.x = -diff.x; + if (diff.y < 0) diff.y = -diff.y; + return (diff.x > diff.y) ? 
(diff.x + diff.y * 0.375f) : (diff.y + diff.x * 0.375f); +} + + +static void _lineSplitAt(const Line& cur, float at, Line& left, Line& right) +{ + auto len = _lineLength(cur.pt1, cur.pt2); + auto dx = ((cur.pt2.x - cur.pt1.x) / len) * at; + auto dy = ((cur.pt2.y - cur.pt1.y) / len) * at; + left.pt1 = cur.pt1; + left.pt2.x = left.pt1.x + dx; + left.pt2.y = left.pt1.y + dy; + right.pt1 = left.pt2; + right.pt2 = cur.pt2; +} + + +static void _outlineEnd(SwOutline& outline) +{ + if (outline.pts.empty()) return; + outline.cntrs.push(outline.pts.count - 1); +} + + +static void _outlineMoveTo(SwOutline& outline, const Point* to, const Matrix* transform) +{ + if (outline.pts.count > 0) outline.cntrs.push(outline.pts.count - 1); + + outline.pts.push(mathTransform(to, transform)); + outline.types.push(SW_CURVE_TYPE_POINT); +} + + +static void _outlineLineTo(SwOutline& outline, const Point* to, const Matrix* transform) +{ + outline.pts.push(mathTransform(to, transform)); + outline.types.push(SW_CURVE_TYPE_POINT); +} + + +static void _outlineCubicTo(SwOutline& outline, const Point* ctrl1, const Point* ctrl2, const Point* to, const Matrix* transform) +{ + outline.pts.push(mathTransform(ctrl1, transform)); + outline.types.push(SW_CURVE_TYPE_CUBIC); + + outline.pts.push(mathTransform(ctrl2, transform)); + outline.types.push(SW_CURVE_TYPE_CUBIC); + + outline.pts.push(mathTransform(to, transform)); + outline.types.push(SW_CURVE_TYPE_POINT); +} + + +static void _outlineClose(SwOutline& outline) +{ + uint32_t i = 0; + + if (outline.cntrs.count > 0) i = outline.cntrs.last() + 1; + else i = 0; //First Path + + //Make sure there is at least one point in the current path + if (outline.pts.count == i) return; + + //Close the path + outline.pts.push(outline.pts[i]); + outline.types.push(SW_CURVE_TYPE_POINT); + outline.closed.push(true); +} + + +static void _dashLineTo(SwDashStroke& dash, const Point* to, const Matrix* transform) +{ + Line cur = {dash.ptCur, *to}; + auto len = _lineLength(cur.pt1, cur.pt2); + + if (len < dash.curLen) { + dash.curLen -= len; + if (!dash.curOpGap) { + _outlineMoveTo(*dash.outline, &dash.ptCur, transform); + _outlineLineTo(*dash.outline, to, transform); + } + } else { + while (len > dash.curLen) { + Line left, right; + if (dash.curLen > 0) { + len -= dash.curLen; + _lineSplitAt(cur, dash.curLen, left, right); + if (!dash.curOpGap) { + _outlineMoveTo(*dash.outline, &left.pt1, transform); + _outlineLineTo(*dash.outline, &left.pt2, transform); + } + } else { + right = cur; + } + dash.curIdx = (dash.curIdx + 1) % dash.cnt; + dash.curLen = dash.pattern[dash.curIdx]; + dash.curOpGap = !dash.curOpGap; + cur = right; + dash.ptCur = cur.pt1; + } + //leftovers + dash.curLen -= len; + if (!dash.curOpGap) { + _outlineMoveTo(*dash.outline, &cur.pt1, transform); + _outlineLineTo(*dash.outline, &cur.pt2, transform); + } + if (dash.curLen < 1 && TO_SWCOORD(len) > 1) { + //move to next dash + dash.curIdx = (dash.curIdx + 1) % dash.cnt; + dash.curLen = dash.pattern[dash.curIdx]; + dash.curOpGap = !dash.curOpGap; + } + } + dash.ptCur = *to; +} + + +static void _dashCubicTo(SwDashStroke& dash, const Point* ctrl1, const Point* ctrl2, const Point* to, const Matrix* transform) +{ + Bezier cur = {dash.ptCur, *ctrl1, *ctrl2, *to}; + auto len = bezLength(cur); + + if (len < dash.curLen) { + dash.curLen -= len; + if (!dash.curOpGap) { + _outlineMoveTo(*dash.outline, &dash.ptCur, transform); + _outlineCubicTo(*dash.outline, ctrl1, ctrl2, to, transform); + } + } else { + bool begin = true; //starting with move_to 
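+ /* Walk the curve dash by dash: each pass splits off the rest of the current pattern entry, emits it only when it is a dash (not a gap), then advances to the next entry. */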
+ while (len > dash.curLen) {
+ Bezier left, right;
+ if (dash.curLen > 0) {
+ len -= dash.curLen;
+ bezSplitAt(cur, dash.curLen, left, right);
+ if (!dash.curOpGap) {
+ // leftovers from a previous command don't require moveTo
+ if (begin || dash.pattern[dash.curIdx] - dash.curLen < FLT_EPSILON) {
+ _outlineMoveTo(*dash.outline, &left.start, transform);
+ begin = false;
+ }
+ _outlineCubicTo(*dash.outline, &left.ctrl1, &left.ctrl2, &left.end, transform);
+ }
+ } else {
+ right = cur;
+ }
+ dash.curIdx = (dash.curIdx + 1) % dash.cnt;
+ dash.curLen = dash.pattern[dash.curIdx];
+ dash.curOpGap = !dash.curOpGap;
+ cur = right;
+ dash.ptCur = right.start;
+ }
+ //leftovers
+ dash.curLen -= len;
+ if (!dash.curOpGap) {
+ _outlineMoveTo(*dash.outline, &cur.start, transform);
+ _outlineCubicTo(*dash.outline, &cur.ctrl1, &cur.ctrl2, &cur.end, transform);
+ }
+ if (dash.curLen < 1 && TO_SWCOORD(len) > 1) {
+ //move to next dash
+ dash.curIdx = (dash.curIdx + 1) % dash.cnt;
+ dash.curLen = dash.pattern[dash.curIdx];
+ dash.curOpGap = !dash.curOpGap;
+ }
+ }
+ dash.ptCur = *to;
+}
+
+
+static SwOutline* _genDashOutline(const RenderShape* rshape, const Matrix* transform, float length, SwMpool* mpool, unsigned tid)
+{
+ const PathCommand* cmds = rshape->path.cmds.data;
+ auto cmdCnt = rshape->path.cmds.count;
+ const Point* pts = rshape->path.pts.data;
+ auto ptsCnt = rshape->path.pts.count;
+
+ //No actual shape data
+ if (cmdCnt == 0 || ptsCnt == 0) return nullptr;
+
+ SwDashStroke dash;
+ auto offset = 0.0f;
+ auto trimmed = false;
+
+ dash.cnt = rshape->strokeDash((const float**)&dash.pattern, &offset);
+
+ //dash by trimming.
+ if (length > 0.0f && dash.cnt == 0) {
+ auto begin = length * rshape->stroke->trim.begin;
+ auto end = length * rshape->stroke->trim.end;
+
+ //TODO: mix trimming + dash style
+
+ //default
+ if (end > begin) {
+ if (begin > 0) dash.cnt += 4;
+ else dash.cnt += 2;
+ //looping
+ } else dash.cnt += 3;
+
+ dash.pattern = (float*)malloc(sizeof(float) * dash.cnt);
+
+ if (dash.cnt == 2) {
+ dash.pattern[0] = end - begin;
+ dash.pattern[1] = length - (end - begin);
+ } else if (dash.cnt == 3) {
+ dash.pattern[0] = end;
+ dash.pattern[1] = (begin - end);
+ dash.pattern[2] = length - begin;
+ } else {
+ dash.pattern[0] = 0; //zero dash to start with a space.
+ dash.pattern[1] = begin;
+ dash.pattern[2] = end - begin;
+ dash.pattern[3] = length - (end - begin);
+ }
+
+ trimmed = true;
+ //just a dash style.
+ } else {
+
+ if (dash.cnt == 0) return nullptr;
+ }
+
+ //offset?
+ auto patternLength = 0.0f;
+ uint32_t offIdx = 0;
+ if (!mathZero(offset)) {
+ for (size_t i = 0; i < dash.cnt; ++i) patternLength += dash.pattern[i];
+ bool isOdd = dash.cnt % 2;
+ if (isOdd) patternLength *= 2;
+
+ offset = fmod(offset, patternLength);
+ if (offset < 0) offset += patternLength;
+
+ for (size_t i = 0; i < dash.cnt * (1 + (size_t)isOdd); ++i, ++offIdx) {
+ auto curPattern = dash.pattern[i % dash.cnt];
+ if (offset < curPattern) break;
+ offset -= curPattern;
+ }
+ }
+
+ dash.outline = mpoolReqDashOutline(mpool, tid);
+
+ //smart reservation
+ auto closeCnt = 0;
+ auto moveCnt = 0;
+
+ for (auto cmd = rshape->path.cmds.data; cmd < rshape->path.cmds.end(); ++cmd) {
+ if (*cmd == PathCommand::Close) ++closeCnt;
+ else if (*cmd == PathCommand::MoveTo) ++moveCnt;
+ }
+
+ //No exact count is known. Reserve approximately 20x...
+ //OPTIMIZE: we can directly copy the path points when the close is occupied with a point.
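+ /* Note on the offset handling above: with a pattern of {10, 5} and an offset of 37, patternLength is 15, the offset folds to 7, and the walk stops at index 0 with 7 units consumed, so the first emitted dash is the remaining 3 units of pattern[0] (illustrative numbers only). */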
+ dash.outline->pts.grow(20 * (closeCnt + ptsCnt + 1)); + dash.outline->types.grow(20 * (closeCnt + ptsCnt + 1)); + dash.outline->cntrs.grow(20 * (moveCnt + 1)); + + while (cmdCnt-- > 0) { + switch (*cmds) { + case PathCommand::Close: { + _dashLineTo(dash, &dash.ptStart, transform); + break; + } + case PathCommand::MoveTo: { + //reset the dash + dash.curIdx = offIdx % dash.cnt; + dash.curLen = dash.pattern[dash.curIdx] - offset; + dash.curOpGap = offIdx % 2; + dash.ptStart = dash.ptCur = *pts; + ++pts; + break; + } + case PathCommand::LineTo: { + _dashLineTo(dash, pts, transform); + ++pts; + break; + } + case PathCommand::CubicTo: { + _dashCubicTo(dash, pts, pts + 1, pts + 2, transform); + pts += 3; + break; + } + } + ++cmds; + } + + _outlineEnd(*dash.outline); + + if (trimmed) free(dash.pattern); + + return dash.outline; +} + + +static float _outlineLength(const RenderShape* rshape) +{ + const PathCommand* cmds = rshape->path.cmds.data; + auto cmdCnt = rshape->path.cmds.count; + const Point* pts = rshape->path.pts.data; + auto ptsCnt = rshape->path.pts.count; + + //No actual shape data + if (cmdCnt == 0 || ptsCnt == 0) return 0.0f; + + const Point* close = nullptr; + auto length = 0.0f; + + //Compute the whole length + while (cmdCnt-- > 0) { + switch (*cmds) { + case PathCommand::Close: { + length += mathLength(pts - 1, close); + ++pts; + break; + } + case PathCommand::MoveTo: { + close = pts; + ++pts; + break; + } + case PathCommand::LineTo: { + length += mathLength(pts - 1, pts); + ++pts; + break; + } + case PathCommand::CubicTo: { + length += bezLength({*(pts - 1), *pts, *(pts + 1), *(pts + 2)}); + pts += 3; + break; + } + } + ++cmds; + } + return length; +} + + +static bool _axisAlignedRect(const SwOutline* outline) +{ + //Fast Track: axis-aligned rectangle? + if (outline->pts.count != 5) return false; + + auto pt1 = outline->pts.data + 0; + auto pt2 = outline->pts.data + 1; + auto pt3 = outline->pts.data + 2; + auto pt4 = outline->pts.data + 3; + + auto a = SwPoint{pt1->x, pt3->y}; + auto b = SwPoint{pt3->x, pt1->y}; + + if ((*pt2 == a && *pt4 == b) || (*pt2 == b && *pt4 == a)) return true; + + return false; +} + + +static bool _genOutline(SwShape* shape, const RenderShape* rshape, const Matrix* transform, SwMpool* mpool, unsigned tid, bool hasComposite) +{ + const PathCommand* cmds = rshape->path.cmds.data; + auto cmdCnt = rshape->path.cmds.count; + const Point* pts = rshape->path.pts.data; + auto ptsCnt = rshape->path.pts.count; + + //No actual shape data + if (cmdCnt == 0 || ptsCnt == 0) return false; + + //smart reservation + auto moveCnt = 0; + auto closeCnt = 0; + + for (auto cmd = rshape->path.cmds.data; cmd < rshape->path.cmds.end(); ++cmd) { + if (*cmd == PathCommand::Close) ++closeCnt; + else if (*cmd == PathCommand::MoveTo) ++moveCnt; + } + + shape->outline = mpoolReqOutline(mpool, tid); + auto outline = shape->outline; + + //OPTIMIZE: we can directly copy the path points when the close is occupied with a point. + outline->pts.grow(ptsCnt + closeCnt + 1); + outline->types.grow(ptsCnt + closeCnt + 1); + outline->cntrs.grow(moveCnt + 1); + + //Dash outlines are always opened. + //Only normal outlines use this information, it sholud be same to their contour counts. 
+ outline->closed.reserve(outline->cntrs.reserved); + + memset(outline->closed.data, 0x0, sizeof(bool) * outline->closed.reserved); + + //Generate Outlines + while (cmdCnt-- > 0) { + switch (*cmds) { + case PathCommand::Close: { + _outlineClose(*outline); + break; + } + case PathCommand::MoveTo: { + _outlineMoveTo(*outline, pts, transform); + ++pts; + break; + } + case PathCommand::LineTo: { + _outlineLineTo(*outline, pts, transform); + ++pts; + break; + } + case PathCommand::CubicTo: { + _outlineCubicTo(*outline, pts, pts + 1, pts + 2, transform); + pts += 3; + break; + } + } + ++cmds; + } + + _outlineEnd(*outline); + + outline->fillRule = rshape->rule; + shape->outline = outline; + + shape->fastTrack = (!hasComposite && _axisAlignedRect(shape->outline)); + return true; +} + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + +bool shapePrepare(SwShape* shape, const RenderShape* rshape, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid, bool hasComposite) +{ + if (!_genOutline(shape, rshape, transform, mpool, tid, hasComposite)) return false; + if (!mathUpdateOutlineBBox(shape->outline, clipRegion, renderRegion, shape->fastTrack)) return false; + + //Keep it for Rasterization Region + shape->bbox = renderRegion; + + //Check valid region + if (renderRegion.max.x - renderRegion.min.x < 1 && renderRegion.max.y - renderRegion.min.y < 1) return false; + + //Check boundary + if (renderRegion.min.x >= clipRegion.max.x || renderRegion.min.y >= clipRegion.max.y || + renderRegion.max.x <= clipRegion.min.x || renderRegion.max.y <= clipRegion.min.y) return false; + + return true; +} + + +bool shapePrepared(const SwShape* shape) +{ + return shape->rle ? true : false; +} + + +bool shapeGenRle(SwShape* shape, TVG_UNUSED const RenderShape* rshape, bool antiAlias) +{ + //FIXME: Should we draw it? 
+ //Case: Stroke Line + //if (shape.outline->opened) return true; + + //Case A: Fast Track Rectangle Drawing + if (shape->fastTrack) return true; + + //Case B: Normal Shape RLE Drawing + if ((shape->rle = rleRender(shape->rle, shape->outline, shape->bbox, antiAlias))) return true; + + return false; +} + + +void shapeDelOutline(SwShape* shape, SwMpool* mpool, uint32_t tid) +{ + mpoolRetOutline(mpool, tid); + shape->outline = nullptr; +} + + +void shapeReset(SwShape* shape) +{ + rleReset(shape->rle); + rleReset(shape->strokeRle); + shape->fastTrack = false; + shape->bbox.reset(); +} + + +void shapeFree(SwShape* shape) +{ + rleFree(shape->rle); + shapeDelFill(shape); + + if (shape->stroke) { + rleFree(shape->strokeRle); + strokeFree(shape->stroke); + } +} + + +void shapeDelStroke(SwShape* shape) +{ + if (!shape->stroke) return; + rleFree(shape->strokeRle); + shape->strokeRle = nullptr; + strokeFree(shape->stroke); + shape->stroke = nullptr; +} + + +void shapeResetStroke(SwShape* shape, const RenderShape* rshape, const Matrix* transform) +{ + if (!shape->stroke) shape->stroke = static_cast<SwStroke*>(calloc(1, sizeof(SwStroke))); + auto stroke = shape->stroke; + if (!stroke) return; + + strokeReset(stroke, rshape, transform); + rleReset(shape->strokeRle); +} + + +bool shapeGenStrokeRle(SwShape* shape, const RenderShape* rshape, const Matrix* transform, const SwBBox& clipRegion, SwBBox& renderRegion, SwMpool* mpool, unsigned tid) +{ + SwOutline* shapeOutline = nullptr; + SwOutline* strokeOutline = nullptr; + auto dashStroking = false; + auto ret = true; + + auto length = rshape->strokeTrim() ? _outlineLength(rshape) : 0.0f; + + //Dash style (+trimming) + if (rshape->stroke->dashCnt > 0 || length > 0) { + shapeOutline = _genDashOutline(rshape, transform, length, mpool, tid); + if (!shapeOutline) return false; + dashStroking = true; + //Normal style + } else { + if (!shape->outline) { + if (!_genOutline(shape, rshape, transform, mpool, tid, false)) return false; + } + shapeOutline = shape->outline; + } + + if (!strokeParseOutline(shape->stroke, *shapeOutline)) { + ret = false; + goto clear; + } + + strokeOutline = strokeExportOutline(shape->stroke, mpool, tid); + + if (!mathUpdateOutlineBBox(strokeOutline, clipRegion, renderRegion, false)) { + ret = false; + goto clear; + } + + shape->strokeRle = rleRender(shape->strokeRle, strokeOutline, renderRegion, true); + +clear: + if (dashStroking) mpoolRetDashOutline(mpool, tid); + mpoolRetStrokeOutline(mpool, tid); + + return ret; +} + + +bool shapeGenFillColors(SwShape* shape, const Fill* fill, const Matrix* transform, SwSurface* surface, uint8_t opacity, bool ctable) +{ + return fillGenColorTable(shape->fill, fill, transform, surface, opacity, ctable); +} + + +bool shapeGenStrokeFillColors(SwShape* shape, const Fill* fill, const Matrix* transform, SwSurface* surface, uint8_t opacity, bool ctable) +{ + return fillGenColorTable(shape->stroke->fill, fill, transform, surface, opacity, ctable); +} + + +void shapeResetFill(SwShape* shape) +{ + if (!shape->fill) { + shape->fill = static_cast<SwFill*>(calloc(1, sizeof(SwFill))); + if (!shape->fill) return; + } + fillReset(shape->fill); +} + + +void shapeResetStrokeFill(SwShape* shape) +{ + if (!shape->stroke->fill) { + shape->stroke->fill = static_cast<SwFill*>(calloc(1, sizeof(SwFill))); + if (!shape->stroke->fill) return; + } + fillReset(shape->stroke->fill); +} + + +void shapeDelFill(SwShape* shape) +{ + if (!shape->fill) return; + fillFree(shape->fill); + shape->fill = nullptr; +} + + +void 
shapeDelStrokeFill(SwShape* shape) +{ + if (!shape->stroke->fill) return; + fillFree(shape->stroke->fill); + shape->stroke->fill = nullptr; +} diff --git a/thirdparty/thorvg/src/renderer/sw_engine/tvgSwStroke.cpp b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwStroke.cpp new file mode 100644 index 0000000000..b1bdccbbba --- /dev/null +++ b/thirdparty/thorvg/src/renderer/sw_engine/tvgSwStroke.cpp @@ -0,0 +1,910 @@ +/* + * Copyright (c) 2020 - 2023 the ThorVG project. All rights reserved. + + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include <string.h> +#include <math.h> +#include "tvgSwCommon.h" + +/************************************************************************/ +/* Internal Class Implementation */ +/************************************************************************/ + +static constexpr auto SW_STROKE_TAG_POINT = 1; +static constexpr auto SW_STROKE_TAG_CUBIC = 2; +static constexpr auto SW_STROKE_TAG_BEGIN = 4; +static constexpr auto SW_STROKE_TAG_END = 8; + +static inline SwFixed SIDE_TO_ROTATE(const int32_t s) +{ + return (SW_ANGLE_PI2 - static_cast<SwFixed>(s) * SW_ANGLE_PI); +} + + +static inline void SCALE(const SwStroke& stroke, SwPoint& pt) +{ + pt.x = static_cast<SwCoord>(pt.x * stroke.sx); + pt.y = static_cast<SwCoord>(pt.y * stroke.sy); +} + + +static void _growBorder(SwStrokeBorder* border, uint32_t newPts) +{ + auto maxOld = border->maxPts; + auto maxNew = border->ptsCnt + newPts; + + if (maxNew <= maxOld) return; + + auto maxCur = maxOld; + + while (maxCur < maxNew) + maxCur += (maxCur >> 1) + 16; + //OPTIMIZE: use mempool! + border->pts = static_cast<SwPoint*>(realloc(border->pts, maxCur * sizeof(SwPoint))); + border->tags = static_cast<uint8_t*>(realloc(border->tags, maxCur * sizeof(uint8_t))); + border->maxPts = maxCur; +} + + +static void _borderClose(SwStrokeBorder* border, bool reverse) +{ + auto start = border->start; + auto count = border->ptsCnt; + + //Don't record empty paths! 
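+ /* A sub-path holding no more than its start point is dropped by rewinding ptsCnt; otherwise the adjusted last point replaces the start point and, when requested, the point and tag order is reversed (used for the left border of closed sub-paths). */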
+ if (count <= start + 1U) { + border->ptsCnt = start; + } else { + /* Copy the last point to the start of this sub-path, + since it contains the adjusted starting coordinates */ + border->ptsCnt = --count; + border->pts[start] = border->pts[count]; + + if (reverse) { + //reverse the points + auto pt1 = border->pts + start + 1; + auto pt2 = border->pts + count - 1; + + while (pt1 < pt2) { + auto tmp = *pt1; + *pt1 = *pt2; + *pt2 = tmp; + ++pt1; + --pt2; + } + + //reverse the tags + auto tag1 = border->tags + start + 1; + auto tag2 = border->tags + count - 1; + + while (tag1 < tag2) { + auto tmp = *tag1; + *tag1 = *tag2; + *tag2 = tmp; + ++tag1; + --tag2; + } + } + + border->tags[start] |= SW_STROKE_TAG_BEGIN; + border->tags[count - 1] |= SW_STROKE_TAG_END; + } + + border->start = -1; + border->movable = false; +} + + +static void _borderCubicTo(SwStrokeBorder* border, const SwPoint& ctrl1, const SwPoint& ctrl2, const SwPoint& to) +{ + _growBorder(border, 3); + + auto pt = border->pts + border->ptsCnt; + auto tag = border->tags + border->ptsCnt; + + pt[0] = ctrl1; + pt[1] = ctrl2; + pt[2] = to; + + tag[0] = SW_STROKE_TAG_CUBIC; + tag[1] = SW_STROKE_TAG_CUBIC; + tag[2] = SW_STROKE_TAG_POINT; + + border->ptsCnt += 3; + border->movable = false; +} + + +static void _borderArcTo(SwStrokeBorder* border, const SwPoint& center, SwFixed radius, SwFixed angleStart, SwFixed angleDiff, SwStroke& stroke) +{ + constexpr SwFixed ARC_CUBIC_ANGLE = SW_ANGLE_PI / 2; + SwPoint a = {static_cast<SwCoord>(radius), 0}; + mathRotate(a, angleStart); + SCALE(stroke, a); + a += center; + + auto total = angleDiff; + auto angle = angleStart; + auto rotate = (angleDiff >= 0) ? SW_ANGLE_PI2 : -SW_ANGLE_PI2; + + while (total != 0) { + auto step = total; + if (step > ARC_CUBIC_ANGLE) step = ARC_CUBIC_ANGLE; + else if (step < -ARC_CUBIC_ANGLE) step = -ARC_CUBIC_ANGLE; + + auto next = angle + step; + auto theta = step; + if (theta < 0) theta = -theta; + + theta >>= 1; + + //compute end point + SwPoint b = {static_cast<SwCoord>(radius), 0}; + mathRotate(b, next); + SCALE(stroke, b); + b += center; + + //compute first and second control points + auto length = mathMulDiv(radius, mathSin(theta) * 4, (0x10000L + mathCos(theta)) * 3); + + SwPoint a2 = {static_cast<SwCoord>(length), 0}; + mathRotate(a2, angle + rotate); + SCALE(stroke, a2); + a2 += a; + + SwPoint b2 = {static_cast<SwCoord>(length), 0}; + mathRotate(b2, next - rotate); + SCALE(stroke, b2); + b2 += b; + + //add cubic arc + _borderCubicTo(border, a2, b2, b); + + //process the rest of the arc? + a = b; + total -= step; + angle = next; + } +} + + +static void _borderLineTo(SwStrokeBorder* border, const SwPoint& to, bool movable) +{ + if (border->movable) { + //move last point + border->pts[border->ptsCnt - 1] = to; + } else { + //don't add zero-length line_to + if (border->ptsCnt > 0 && (border->pts[border->ptsCnt - 1] - to).small()) return; + + _growBorder(border, 1); + border->pts[border->ptsCnt] = to; + border->tags[border->ptsCnt] = SW_STROKE_TAG_POINT; + border->ptsCnt += 1; + } + + border->movable = movable; +} + + +static void _borderMoveTo(SwStrokeBorder* border, SwPoint& to) +{ + //close current open path if any? 
+ if (border->start >= 0) _borderClose(border, false); + + border->start = border->ptsCnt; + border->movable = false; + + _borderLineTo(border, to, false); +} + + +static void _arcTo(SwStroke& stroke, int32_t side) +{ + auto border = stroke.borders + side; + auto rotate = SIDE_TO_ROTATE(side); + auto total = mathDiff(stroke.angleIn, stroke.angleOut); + if (total == SW_ANGLE_PI) total = -rotate * 2; + + _borderArcTo(border, stroke.center, stroke.width, stroke.angleIn + rotate, total, stroke); + border->movable = false; +} + + +static void _outside(SwStroke& stroke, int32_t side, SwFixed lineLength) +{ + auto border = stroke.borders + side; + + if (stroke.join == StrokeJoin::Round) { + _arcTo(stroke, side); + } else { + //this is a mitered (pointed) or beveled (truncated) corner + auto rotate = SIDE_TO_ROTATE(side); + auto bevel = (stroke.join == StrokeJoin::Bevel) ? true : false; + SwFixed phi = 0; + SwFixed thcos = 0; + + if (!bevel) { + auto theta = mathDiff(stroke.angleIn, stroke.angleOut); + if (theta == SW_ANGLE_PI) { + theta = rotate; + phi = stroke.angleIn; + } else { + theta /= 2; + phi = stroke.angleIn + theta + rotate; + } + + thcos = mathCos(theta); + auto sigma = mathMultiply(stroke.miterlimit, thcos); + + //is miter limit exceeded? + if (sigma < 0x10000L) bevel = true; + } + + //this is a bevel (broken angle) + if (bevel) { + SwPoint delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, stroke.angleOut + rotate); + SCALE(stroke, delta); + delta += stroke.center; + border->movable = false; + _borderLineTo(border, delta, false); + //this is a miter (intersection) + } else { + auto length = mathDivide(stroke.width, thcos); + SwPoint delta = {static_cast<SwCoord>(length), 0}; + mathRotate(delta, phi); + SCALE(stroke, delta); + delta += stroke.center; + _borderLineTo(border, delta, false); + + /* Now add and end point + Only needed if not lineto (lineLength is zero for curves) */ + if (lineLength == 0) { + delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, stroke.angleOut + rotate); + SCALE(stroke, delta); + delta += stroke.center; + _borderLineTo(border, delta, false); + } + } + } +} + + +static void _inside(SwStroke& stroke, int32_t side, SwFixed lineLength) +{ + auto border = stroke.borders + side; + auto theta = mathDiff(stroke.angleIn, stroke.angleOut) / 2; + SwPoint delta; + bool intersect = false; + + /* Only intersect borders if between two line_to's and both + lines are long enough (line length is zero for curves). 
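+ The minimum length used below, |width * tan(theta)|, is the distance from the corner at which the two inner offset segments would actually meet; if either segment is shorter than that, the intersection would fall beyond it, so the border falls back to a plain offset point.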
*/ + if (border->movable && lineLength > 0) { + //compute minimum required length of lines + SwFixed minLength = abs(mathMultiply(stroke.width, mathTan(theta))); + if (stroke.lineLength >= minLength && lineLength >= minLength) intersect = true; + } + + auto rotate = SIDE_TO_ROTATE(side); + + if (!intersect) { + delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, stroke.angleOut + rotate); + SCALE(stroke, delta); + delta += stroke.center; + border->movable = false; + } else { + //compute median angle + auto phi = stroke.angleIn + theta; + auto thcos = mathCos(theta); + delta = {static_cast<SwCoord>(mathDivide(stroke.width, thcos)), 0}; + mathRotate(delta, phi + rotate); + SCALE(stroke, delta); + delta += stroke.center; + } + + _borderLineTo(border, delta, false); +} + + +void _processCorner(SwStroke& stroke, SwFixed lineLength) +{ + auto turn = mathDiff(stroke.angleIn, stroke.angleOut); + + //no specific corner processing is required if the turn is 0 + if (turn == 0) return; + + //when we turn to the right, the inside side is 0 + int32_t inside = 0; + + //otherwise, the inside is 1 + if (turn < 0) inside = 1; + + //process the inside + _inside(stroke, inside, lineLength); + + //process the outside + _outside(stroke, 1 - inside, lineLength); +} + + +void _firstSubPath(SwStroke& stroke, SwFixed startAngle, SwFixed lineLength) +{ + SwPoint delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, startAngle + SW_ANGLE_PI2); + SCALE(stroke, delta); + + auto pt = stroke.center + delta; + auto border = stroke.borders; + _borderMoveTo(border, pt); + + pt = stroke.center - delta; + ++border; + _borderMoveTo(border, pt); + + /* Save angle, position and line length for last join + lineLength is zero for curves */ + stroke.subPathAngle = startAngle; + stroke.firstPt = false; + stroke.subPathLineLength = lineLength; +} + + +static void _lineTo(SwStroke& stroke, const SwPoint& to) +{ + auto delta = to - stroke.center; + + //a zero-length lineto is a no-op; avoid creating a spurious corner + if (delta.zero()) return; + + //compute length of line + auto angle = mathAtan(delta); + + /* The lineLength is used to determine the intersection of strokes outlines. + The scale needs to be reverted since the stroke width has not been scaled. + An alternative option is to scale the width of the stroke properly by + calculating the mixture of the sx/sy rating on the stroke direction. */ + delta.x = static_cast<SwCoord>(delta.x / stroke.sx); + delta.y = static_cast<SwCoord>(delta.y / stroke.sy); + auto lineLength = mathLength(delta); + + delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, angle + SW_ANGLE_PI2); + SCALE(stroke, delta); + + //process corner if necessary + if (stroke.firstPt) { + /* This is the first segment of a subpath. We need to add a point to each border + at their respective starting point locations. 
*/ + _firstSubPath(stroke, angle, lineLength); + } else { + //process the current corner + stroke.angleOut = angle; + _processCorner(stroke, lineLength); + } + + //now add a line segment to both the inside and outside paths + auto border = stroke.borders; + auto side = 1; + + while (side >= 0) { + auto pt = to + delta; + + //the ends of lineto borders are movable + _borderLineTo(border, pt, true); + + delta.x = -delta.x; + delta.y = -delta.y; + + --side; + ++border; + } + + stroke.angleIn = angle; + stroke.center = to; + stroke.lineLength = lineLength; +} + + +static void _cubicTo(SwStroke& stroke, const SwPoint& ctrl1, const SwPoint& ctrl2, const SwPoint& to) +{ + SwPoint bezStack[37]; //TODO: static? + auto limit = bezStack + 32; + auto arc = bezStack; + auto firstArc = true; + arc[0] = to; + arc[1] = ctrl2; + arc[2] = ctrl1; + arc[3] = stroke.center; + + while (arc >= bezStack) { + SwFixed angleIn, angleOut, angleMid; + + //initialize with current direction + angleIn = angleOut = angleMid = stroke.angleIn; + + if (arc < limit && !mathSmallCubic(arc, angleIn, angleMid, angleOut)) { + if (stroke.firstPt) stroke.angleIn = angleIn; + mathSplitCubic(arc); + arc += 3; + continue; + } + + if (firstArc) { + firstArc = false; + //process corner if necessary + if (stroke.firstPt) { + _firstSubPath(stroke, angleIn, 0); + } else { + stroke.angleOut = angleIn; + _processCorner(stroke, 0); + } + } else if (abs(mathDiff(stroke.angleIn, angleIn)) > (SW_ANGLE_PI / 8) / 4) { + //if the deviation from one arc to the next is too great add a round corner + stroke.center = arc[3]; + stroke.angleOut = angleIn; + stroke.join = StrokeJoin::Round; + + _processCorner(stroke, 0); + + //reinstate line join style + stroke.join = stroke.joinSaved; + } + + //the arc's angle is small enough; we can add it directly to each border + auto theta1 = mathDiff(angleIn, angleMid) / 2; + auto theta2 = mathDiff(angleMid, angleOut) / 2; + auto phi1 = mathMean(angleIn, angleMid); + auto phi2 = mathMean(angleMid, angleOut); + auto length1 = mathDivide(stroke.width, mathCos(theta1)); + auto length2 = mathDivide(stroke.width, mathCos(theta2)); + SwFixed alpha0 = 0; + + //compute direction of original arc + if (stroke.handleWideStrokes) { + alpha0 = mathAtan(arc[0] - arc[3]); + } + + auto border = stroke.borders; + int32_t side = 0; + + while (side < 2) { + auto rotate = SIDE_TO_ROTATE(side); + + //compute control points + SwPoint _ctrl1 = {static_cast<SwCoord>(length1), 0}; + mathRotate(_ctrl1, phi1 + rotate); + SCALE(stroke, _ctrl1); + _ctrl1 += arc[2]; + + SwPoint _ctrl2 = {static_cast<SwCoord>(length2), 0}; + mathRotate(_ctrl2, phi2 + rotate); + SCALE(stroke, _ctrl2); + _ctrl2 += arc[1]; + + //compute end point + SwPoint _end = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(_end, angleOut + rotate); + SCALE(stroke, _end); + _end += arc[0]; + + if (stroke.handleWideStrokes) { + /* determine whether the border radius is greater than the radius of + curvature of the original arc */ + auto _start = border->pts[border->ptsCnt - 1]; + auto alpha1 = mathAtan(_end - _start); + + //is the direction of the border arc opposite to that of the original arc? 
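+ /* This happens when the stroke radius exceeds the radius of curvature of the arc: the offset curve flips direction, so the code below locates a pivot point via the sine rule and walks around the negative sector instead of emitting the reversed curve directly. */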
+ if (abs(mathDiff(alpha0, alpha1)) > SW_ANGLE_PI / 2) { + + //use the sine rule to find the intersection point + auto beta = mathAtan(arc[3] - _start); + auto gamma = mathAtan(arc[0] - _end); + auto bvec = _end - _start; + auto blen = mathLength(bvec); + auto sinA = abs(mathSin(alpha1 - gamma)); + auto sinB = abs(mathSin(beta - gamma)); + auto alen = mathMulDiv(blen, sinA, sinB); + + SwPoint delta = {static_cast<SwCoord>(alen), 0}; + mathRotate(delta, beta); + delta += _start; + + //circumnavigate the negative sector backwards + border->movable = false; + _borderLineTo(border, delta, false); + _borderLineTo(border, _end, false); + _borderCubicTo(border, _ctrl2, _ctrl1, _start); + + //and then move to the endpoint + _borderLineTo(border, _end, false); + + ++side; + ++border; + continue; + } + } + _borderCubicTo(border, _ctrl1, _ctrl2, _end); + ++side; + ++border; + } + arc -= 3; + stroke.angleIn = angleOut; + } + stroke.center = to; +} + + +static void _addCap(SwStroke& stroke, SwFixed angle, int32_t side) +{ + if (stroke.cap == StrokeCap::Square) { + auto rotate = SIDE_TO_ROTATE(side); + auto border = stroke.borders + side; + + SwPoint delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, angle); + SCALE(stroke, delta); + + SwPoint delta2 = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta2, angle + rotate); + SCALE(stroke, delta2); + delta += stroke.center + delta2; + + _borderLineTo(border, delta, false); + + delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, angle); + SCALE(stroke, delta); + + delta2 = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta2, angle - rotate); + SCALE(stroke, delta2); + delta += delta2 + stroke.center; + + _borderLineTo(border, delta, false); + + } else if (stroke.cap == StrokeCap::Round) { + + stroke.angleIn = angle; + stroke.angleOut = angle + SW_ANGLE_PI; + _arcTo(stroke, side); + return; + + } else { //Butt + auto rotate = SIDE_TO_ROTATE(side); + auto border = stroke.borders + side; + + SwPoint delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, angle + rotate); + SCALE(stroke, delta); + delta += stroke.center; + + _borderLineTo(border, delta, false); + + delta = {static_cast<SwCoord>(stroke.width), 0}; + mathRotate(delta, angle - rotate); + SCALE(stroke, delta); + delta += stroke.center; + + _borderLineTo(border, delta, false); + } +} + + +static void _addReverseLeft(SwStroke& stroke, bool opened) +{ + auto right = stroke.borders + 0; + auto left = stroke.borders + 1; + auto newPts = left->ptsCnt - left->start; + + if (newPts <= 0) return; + + _growBorder(right, newPts); + + auto dstPt = right->pts + right->ptsCnt; + auto dstTag = right->tags + right->ptsCnt; + auto srcPt = left->pts + left->ptsCnt - 1; + auto srcTag = left->tags + left->ptsCnt - 1; + + while (srcPt >= left->pts + left->start) { + *dstPt = *srcPt; + *dstTag = *srcTag; + + if (opened) { + dstTag[0] &= ~(SW_STROKE_TAG_BEGIN | SW_STROKE_TAG_END); + } else { + //switch begin/end tags if necessary + auto ttag = dstTag[0] & (SW_STROKE_TAG_BEGIN | SW_STROKE_TAG_END); + if (ttag == SW_STROKE_TAG_BEGIN || ttag == SW_STROKE_TAG_END) + dstTag[0] ^= (SW_STROKE_TAG_BEGIN | SW_STROKE_TAG_END); + } + --srcPt; + --srcTag; + ++dstPt; + ++dstTag; + } + + left->ptsCnt = left->start; + right->ptsCnt += newPts; + right->movable = false; + left->movable = false; +} + + +static void _beginSubPath(SwStroke& stroke, const SwPoint& to, bool closed) +{ + /* We cannot process the first point because there is not enough + information regarding its 
corner/cap. Later, it will be processed + in the _endSubPath() */ + + stroke.firstPt = true; + stroke.center = to; + stroke.closedSubPath = closed; + + /* Determine if we need to check whether the border radius is greater + than the radius of curvature of a curve, to handle this case specially. + This is only required if bevel joins or butt caps may be created because + round & miter joins and round & square caps cover the nagative sector + created with wide strokes. */ + if ((stroke.join != StrokeJoin::Round) || (!stroke.closedSubPath && stroke.cap == StrokeCap::Butt)) + stroke.handleWideStrokes = true; + else + stroke.handleWideStrokes = false; + + stroke.ptStartSubPath = to; + stroke.angleIn = 0; +} + + +static void _endSubPath(SwStroke& stroke) +{ + if (stroke.closedSubPath) { + //close the path if needed + if (stroke.center != stroke.ptStartSubPath) + _lineTo(stroke, stroke.ptStartSubPath); + + //process the corner + stroke.angleOut = stroke.subPathAngle; + auto turn = mathDiff(stroke.angleIn, stroke.angleOut); + + //No specific corner processing is required if the turn is 0 + if (turn != 0) { + + //when we turn to the right, the inside is 0 + int32_t inside = 0; + + //otherwise, the inside is 1 + if (turn < 0) inside = 1; + + _inside(stroke, inside, stroke.subPathLineLength); //inside + _outside(stroke, 1 - inside, stroke.subPathLineLength); //outside + } + + _borderClose(stroke.borders + 0, false); + _borderClose(stroke.borders + 1, true); + } else { + auto right = stroke.borders; + + /* all right, this is an opened path, we need to add a cap between + right & left, add the reverse of left, then add a final cap + between left & right */ + _addCap(stroke, stroke.angleIn, 0); + + //add reversed points from 'left' to 'right' + _addReverseLeft(stroke, true); + + //now add the final cap + stroke.center = stroke.ptStartSubPath; + _addCap(stroke, stroke.subPathAngle + SW_ANGLE_PI, 0); + + /* now end the right subpath accordingly. The left one is rewind + and deosn't need further processing */ + _borderClose(right, false); + } +} + + +static void _getCounts(SwStrokeBorder* border, uint32_t& ptsCnt, uint32_t& cntrsCnt) +{ + auto count = border->ptsCnt; + auto tags = border->tags; + uint32_t _ptsCnt = 0; + uint32_t _cntrsCnt = 0; + bool inCntr = false; + + while (count > 0) { + if (tags[0] & SW_STROKE_TAG_BEGIN) { + if (inCntr) goto fail; + inCntr = true; + } else if (!inCntr) goto fail; + + if (tags[0] & SW_STROKE_TAG_END) { + inCntr = false; + ++_cntrsCnt; + } + --count; + ++_ptsCnt; + ++tags; + } + + if (inCntr) goto fail; + + ptsCnt = _ptsCnt; + cntrsCnt = _cntrsCnt; + + return; + +fail: + ptsCnt = 0; + cntrsCnt = 0; +} + + +static void _exportBorderOutline(const SwStroke& stroke, SwOutline* outline, uint32_t side) +{ + auto border = stroke.borders + side; + if (border->ptsCnt == 0) return; + + memcpy(outline->pts.data + outline->pts.count, border->pts, border->ptsCnt * sizeof(SwPoint)); + + auto cnt = border->ptsCnt; + auto src = border->tags; + auto tags = outline->types.data + outline->types.count; + auto idx = outline->pts.count; + + while (cnt > 0) { + if (*src & SW_STROKE_TAG_POINT) *tags = SW_CURVE_TYPE_POINT; + else if (*src & SW_STROKE_TAG_CUBIC) *tags = SW_CURVE_TYPE_CUBIC; + else TVGERR("SW_ENGINE", "Invalid stroke tag was given! 
= %d", *src); + if (*src & SW_STROKE_TAG_END) outline->cntrs.push(idx); + ++src; + ++tags; + ++idx; + --cnt; + } + outline->pts.count += border->ptsCnt; + outline->types.count += border->ptsCnt; +} + + +/************************************************************************/ +/* External Class Implementation */ +/************************************************************************/ + +void strokeFree(SwStroke* stroke) +{ + if (!stroke) return; + + //free borders + if (stroke->borders[0].pts) free(stroke->borders[0].pts); + if (stroke->borders[0].tags) free(stroke->borders[0].tags); + if (stroke->borders[1].pts) free(stroke->borders[1].pts); + if (stroke->borders[1].tags) free(stroke->borders[1].tags); + + fillFree(stroke->fill); + stroke->fill = nullptr; + + free(stroke); +} + + +void strokeReset(SwStroke* stroke, const RenderShape* rshape, const Matrix* transform) +{ + if (transform) { + stroke->sx = sqrtf(powf(transform->e11, 2.0f) + powf(transform->e21, 2.0f)); + stroke->sy = sqrtf(powf(transform->e12, 2.0f) + powf(transform->e22, 2.0f)); + } else { + stroke->sx = stroke->sy = 1.0f; + } + + stroke->width = HALF_STROKE(rshape->strokeWidth()); + stroke->cap = rshape->strokeCap(); + stroke->miterlimit = static_cast<SwFixed>(rshape->strokeMiterlimit()) << 16; + + //Save line join: it can be temporarily changed when stroking curves... + stroke->joinSaved = stroke->join = rshape->strokeJoin(); + + stroke->borders[0].ptsCnt = 0; + stroke->borders[0].start = -1; + stroke->borders[1].ptsCnt = 0; + stroke->borders[1].start = -1; +} + + +bool strokeParseOutline(SwStroke* stroke, const SwOutline& outline) +{ + uint32_t first = 0; + uint32_t i = 0; + + for (auto cntr = outline.cntrs.data; cntr < outline.cntrs.end(); ++cntr, ++i) { + auto last = *cntr; //index of last point in contour + auto limit = outline.pts.data + last; + + //Skip empty points + if (last <= first) { + first = last + 1; + continue; + } + + auto start = outline.pts[first]; + auto pt = outline.pts.data + first; + auto types = outline.types.data + first; + auto type = types[0]; + + //A contour cannot start with a cubic control point + if (type == SW_CURVE_TYPE_CUBIC) return false; + + auto closed = outline.closed.data ? outline.closed.data[i]: false; + + _beginSubPath(*stroke, start, closed); + + while (pt < limit) { + ++pt; + ++types; + + //emit a signel line_to + if (types[0] == SW_CURVE_TYPE_POINT) { + _lineTo(*stroke, *pt); + //types cubic + } else { + if (pt + 1 > limit || types[1] != SW_CURVE_TYPE_CUBIC) return false; + + pt += 2; + types += 2; + + if (pt <= limit) { + _cubicTo(*stroke, pt[-2], pt[-1], pt[0]); + continue; + } + _cubicTo(*stroke, pt[-2], pt[-1], start); + goto close; + } + } + close: + if (!stroke->firstPt) _endSubPath(*stroke); + first = last + 1; + } + return true; +} + + +SwOutline* strokeExportOutline(SwStroke* stroke, SwMpool* mpool, unsigned tid) +{ + uint32_t count1, count2, count3, count4; + + _getCounts(stroke->borders + 0, count1, count2); + _getCounts(stroke->borders + 1, count3, count4); + + auto ptsCnt = count1 + count3; + auto cntrsCnt = count2 + count4; + + auto outline = mpoolReqStrokeOutline(mpool, tid); + outline->pts.reserve(ptsCnt); + outline->types.reserve(ptsCnt); + outline->cntrs.reserve(cntrsCnt); + + _exportBorderOutline(*stroke, outline, 0); //left + _exportBorderOutline(*stroke, outline, 1); //right + + return outline; +} |
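Note: the span clipping done by _intersectSpansRect() above boils down to a little interval arithmetic per span. The sketch below is not part of the ThorVG sources; it uses a simplified Span struct and a hypothetical clipSpan() helper purely to illustrate the same clipping step in isolation.

#include <algorithm>
#include <cstdint>
#include <cstdio>

struct Span { uint16_t x, y, len; uint8_t coverage; };

//Clip 'in' against the inclusive box [minx, maxx] x [miny, maxy]; returns false when nothing remains.
static bool clipSpan(const Span& in, int minx, int miny, int maxx, int maxy, Span& out)
{
    if (in.y < miny || in.y > maxy) return false;
    auto x1 = std::max<int>(in.x, minx);
    auto x2 = std::min<int>(in.x + in.len, maxx + 1);   //one past the last covered pixel
    if (x2 <= x1) return false;
    out = {static_cast<uint16_t>(x1), in.y, static_cast<uint16_t>(x2 - x1), in.coverage};
    return true;
}

int main()
{
    Span s = {10, 5, 30, 255};   //covers x = 10..39 on row 5
    Span clipped;
    if (clipSpan(s, 20, 0, 34, 10, clipped)) printf("x=%d len=%d\n", clipped.x, clipped.len);   //prints x=20 len=15
    return 0;
}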