Diffstat (limited to 'drivers/gles_common/rasterizer_canvas_batcher.h')
-rw-r--r-- | drivers/gles_common/rasterizer_canvas_batcher.h | 3129 |
1 file changed, 3129 insertions, 0 deletions
diff --git a/drivers/gles_common/rasterizer_canvas_batcher.h b/drivers/gles_common/rasterizer_canvas_batcher.h new file mode 100644 index 0000000000..1836fd68d2 --- /dev/null +++ b/drivers/gles_common/rasterizer_canvas_batcher.h @@ -0,0 +1,3129 @@ +/*************************************************************************/ +/* rasterizer_canvas_batcher.h */ +/*************************************************************************/ +/* This file is part of: */ +/* GODOT ENGINE */ +/* https://godotengine.org */ +/*************************************************************************/ +/* Copyright (c) 2007-2021 Juan Linietsky, Ariel Manzur. */ +/* Copyright (c) 2014-2021 Godot Engine contributors (cf. AUTHORS.md). */ +/* */ +/* Permission is hereby granted, free of charge, to any person obtaining */ +/* a copy of this software and associated documentation files (the */ +/* "Software"), to deal in the Software without restriction, including */ +/* without limitation the rights to use, copy, modify, merge, publish, */ +/* distribute, sublicense, and/or sell copies of the Software, and to */ +/* permit persons to whom the Software is furnished to do so, subject to */ +/* the following conditions: */ +/* */ +/* The above copyright notice and this permission notice shall be */ +/* included in all copies or substantial portions of the Software. */ +/* */ +/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ +/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ +/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.*/ +/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ +/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ +/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ +/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ +/*************************************************************************/ + +#ifndef RASTERIZER_CANVAS_BATCHER_H +#define RASTERIZER_CANVAS_BATCHER_H + +#include "core/os/os.h" +#include "core/templates/local_vector.h" +#include "rasterizer_array.h" +#include "rasterizer_asserts.h" +#include "rasterizer_storage_common.h" +#include "rasterizer_version.h" + +#ifdef GODOT_4 +#include "core/config/project_settings.h" +#include "servers/rendering/renderer_compositor.h" +#else +#include "core/project_settings.h" +#include "servers/visual/rasterizer.h" +#endif + +// We are using the curiously recurring template pattern +// https://en.wikipedia.org/wiki/Curiously_recurring_template_pattern +// for static polymorphism. + +// This makes it super easy to access +// data / call funcs in the derived rasterizers from the base without writing and +// maintaining a boatload of virtual functions. +// In addition it ensures that the vtable will not be used and the function calls can be optimized, +// because it gives compile-time static polymorphism.
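// (Illustration, not part of this commit: a minimal standalone sketch of the
// CRTP idiom described above, with hypothetical names. The base class casts
// `this` to the derived template parameter, so the call is resolved at
// compile time with no virtual dispatch.)
//
//     template <class T>
//     struct BatcherBase {
//         void draw() { static_cast<T *>(this)->draw_impl(); } // statically dispatched, no vtable
//     };
//
//     struct BatcherGLES2 : public BatcherBase<BatcherGLES2> {
//         void draw_impl() { /* backend-specific drawing */ }
//     };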
+ +// These macros make it simpler and less verbose to define (and redefine) the inline functions +// template preamble +#define T_PREAMBLE template <class T, typename T_STORAGE> +// class preamble +#define C_PREAMBLE RasterizerCanvasBatcher<T, T_STORAGE> +// generic preamble +#define PREAMBLE(RET_T) \ + T_PREAMBLE \ + RET_T C_PREAMBLE + +template <class T, typename T_STORAGE> +class RasterizerCanvasBatcher { +public: + // used to determine whether we use hardware transform (none) + // software transform all verts, or software transform just a translate + // (no rotate or scale) + enum TransformMode { + TM_NONE, + TM_ALL, + TM_TRANSLATE, + }; + + // pod versions of vector and color and RID, need to be 32 bit for vertex format + struct BatchVector2 { + float x, y; + void set(float xx, float yy) { + x = xx; + y = yy; + } + void set(const Vector2 &p_o) { + x = p_o.x; + y = p_o.y; + } + void to(Vector2 &r_o) const { + r_o.x = x; + r_o.y = y; + } + }; + + struct BatchColor { + float r, g, b, a; + void set_white() { + r = 1.0f; + g = 1.0f; + b = 1.0f; + a = 1.0f; + } + void set(const Color &p_c) { + r = p_c.r; + g = p_c.g; + b = p_c.b; + a = p_c.a; + } + void set(float rr, float gg, float bb, float aa) { + r = rr; + g = gg; + b = bb; + a = aa; + } + bool operator==(const BatchColor &p_c) const { + return (r == p_c.r) && (g == p_c.g) && (b == p_c.b) && (a == p_c.a); + } + bool operator!=(const BatchColor &p_c) const { return (*this == p_c) == false; } + bool equals(const Color &p_c) const { + return (r == p_c.r) && (g == p_c.g) && (b == p_c.b) && (a == p_c.a); + } + const float *get_data() const { return &r; } + String to_string() const { + String sz = "{"; + const float *data = get_data(); + for (int c = 0; c < 4; c++) { + float f = data[c]; + int val = ((f * 255.0f) + 0.5f); + sz += String(Variant(val)) + " "; + } + sz += "}"; + return sz; + } + }; + + // simplest FVF - local or baked position + struct BatchVertex { + // must be 32 bit pod + BatchVector2 pos; + BatchVector2 uv; + }; + + // simple FVF but also incorporating baked color + struct BatchVertexColored : public BatchVertex { + // must be 32 bit pod + BatchColor col; + }; + + // if we are using normal mapping, we need light angles to be sent + struct BatchVertexLightAngled : public BatchVertexColored { + // must be pod + float light_angle; + }; + + // CUSTOM SHADER vertex formats. These are larger but will probably + // be needed with custom shaders in order to have the data accessible in the shader. + + // if we are using COLOR in vertex shader but not position (VERTEX) + struct BatchVertexModulated : public BatchVertexLightAngled { + BatchColor modulate; + }; + + struct BatchTransform { + BatchVector2 translate; + BatchVector2 basis[2]; + }; + + // last resort, especially for custom shader, we put everything possible into a huge FVF + // not very efficient, but better than no batching at all. + struct BatchVertexLarge : public BatchVertexModulated { + // must be pod + BatchTransform transform; + }; + + // Batch should be as small as possible, and ideally nicely aligned (is 32 bytes at the moment) + struct Batch { + RasterizerStorageCommon::BatchType type; // should be 16 bit + uint16_t batch_texture_id; + + // also item reference number + uint32_t first_command; + + // in the case of DEFAULT, this is num commands. + // with rects, is number of commands and rects.
+ // with lines, is number of lines + uint32_t num_commands; + + // first vertex of this batch in the vertex lists + uint32_t first_vert; + + BatchColor color; + }; + + struct BatchTex { + enum TileMode : uint32_t { + TILE_OFF, + TILE_NORMAL, + TILE_FORCE_REPEAT, + }; + RID RID_texture; + RID RID_normal; + TileMode tile_mode; + BatchVector2 tex_pixel_size; + uint32_t flags; + }; + + // items in a list to be sorted prior to joining + struct BSortItem { + // have a function to keep as pod, rather than operator + void assign(const BSortItem &o) { + item = o.item; + z_index = o.z_index; + } + RendererCanvasRender::Item *item; + int z_index; + }; + + // batch item may represent 1 or more items + struct BItemJoined { + uint32_t first_item_ref; + uint32_t num_item_refs; + + Rect2 bounding_rect; + + // note the z_index may only be correct for the first of the joined item references + // this has implications for light culling with z ranged lights. + int16_t z_index; + + // these are defined in RasterizerStorageCommon::BatchFlags + uint16_t flags; + + // we are always splitting items with lots of commands, + // and items with unhandled primitives (default) + bool use_hardware_transform() const { return num_item_refs == 1; } + }; + + struct BItemRef { + RendererCanvasRender::Item *item; + Color final_modulate; + }; + + struct BLightRegion { + void reset() { + light_bitfield = 0; + shadow_bitfield = 0; + too_many_lights = false; + } + uint64_t light_bitfield; + uint64_t shadow_bitfield; + bool too_many_lights; // we can only do light region optimization if there are 64 or less lights + }; + + struct BatchData { + BatchData() { + reset_flush(); + reset_joined_item(); + + gl_vertex_buffer = 0; + gl_index_buffer = 0; + max_quads = 0; + vertex_buffer_size_units = 0; + vertex_buffer_size_bytes = 0; + index_buffer_size_units = 0; + index_buffer_size_bytes = 0; + + use_colored_vertices = false; + + settings_use_batching = false; + settings_max_join_item_commands = 0; + settings_colored_vertex_format_threshold = 0.0f; + settings_batch_buffer_num_verts = 0; + scissor_threshold_area = 0.0f; + joined_item_batch_flags = 0; + diagnose_frame = false; + next_diagnose_tick = 10000; + diagnose_frame_number = 9999999999; // some high number + join_across_z_indices = true; + settings_item_reordering_lookahead = 0; + + settings_use_batching_original_choice = false; + settings_flash_batching = false; + settings_diagnose_frame = false; + settings_scissor_lights = false; + settings_scissor_threshold = -1.0f; + settings_use_single_rect_fallback = false; + settings_use_software_skinning = true; + settings_ninepatch_mode = 0; // default + settings_light_max_join_items = 16; + + settings_uv_contract = false; + settings_uv_contract_amount = 0.0f; + + buffer_mode_batch_upload_send_null = true; + buffer_mode_batch_upload_flag_stream = false; + + stats_items_sorted = 0; + stats_light_items_joined = 0; + } + + // called for each joined item + void reset_joined_item() { + // noop but left in as a stub + } + + // called after each flush + void reset_flush() { + batches.reset(); + batch_textures.reset(); + + vertices.reset(); + light_angles.reset(); + vertex_colors.reset(); + vertex_modulates.reset(); + vertex_transforms.reset(); + + total_quads = 0; + total_verts = 0; + total_color_changes = 0; + + use_light_angles = false; + use_modulate = false; + use_large_verts = false; + fvf = RasterizerStorageCommon::FVF_REGULAR; + } + + unsigned int gl_vertex_buffer; + unsigned int gl_index_buffer; + + uint32_t max_quads; + uint32_t 
vertex_buffer_size_units; + uint32_t vertex_buffer_size_bytes; + uint32_t index_buffer_size_units; + uint32_t index_buffer_size_bytes; + + // small vertex FVF type - pos and UV. + // This will always be written to initially, but can be translated + // to larger FVFs if necessary. + RasterizerArray<BatchVertex> vertices; + + // extra data which can be stored during prefilling, for later translation to larger FVFs + RasterizerArray<float> light_angles; + RasterizerArray<BatchColor> vertex_colors; // these aren't usually used, but are for polys + RasterizerArray<BatchColor> vertex_modulates; + RasterizerArray<BatchTransform> vertex_transforms; + + // instead of having a different buffer for each vertex FVF type + // we have a special array big enough for the biggest FVF + // which can have a changeable unit size, and reuse it. + RasterizerUnitArray unit_vertices; + + RasterizerArray<Batch> batches; + RasterizerArray<Batch> batches_temp; // used for translating to colored vertex batches + RasterizerArray_non_pod<BatchTex> batch_textures; // the only reason this is non-POD is because of RIDs + + // SHOULD THESE BE IN FILLSTATE? + // flexible vertex format. + // all verts have pos and UV. + // some have color, some light angles etc. + RasterizerStorageCommon::FVF fvf; + bool use_colored_vertices; + bool use_light_angles; + bool use_modulate; + bool use_large_verts; + + // if the shader is using MODULATE, we prevent baking color so the final_modulate can + // be read in the shader. + // if the shader is reading VERTEX, we prevent baking vertex positions with extra matrices etc + // to prevent the read position being incorrect. + // These flags are defined in RasterizerStorageCommon::BatchFlags + uint32_t joined_item_batch_flags; + + RasterizerArray<BItemJoined> items_joined; + RasterizerArray<BItemRef> item_refs; + + // items are sorted prior to joining + RasterizerArray<BSortItem> sort_items; + + // new for Godot 4 .. the client outputs a linked list so we need to convert this + // to a linear array + LocalVector<RendererCanvasRender::Item::Command *> command_shortlist; + + // counts + int total_quads; + int total_verts; + + // we keep a record of how many color changes caused new batches + // if the colors are causing an excessive number of batches, we switch + // to alternate batching method and add color to the vertex format. 
+ int total_color_changes; + + // measured in pixels, recalculated each frame + float scissor_threshold_area; + + // diagnose this frame, every nTh frame when settings_diagnose_frame is on + bool diagnose_frame; + String frame_string; + uint32_t next_diagnose_tick; + uint64_t diagnose_frame_number; + + // whether to join items across z_indices - this can interfere with z ranged lights, + // so has to be disabled in some circumstances + bool join_across_z_indices; + + // global settings + bool settings_use_batching; // the current use_batching (affected by flash) + bool settings_use_batching_original_choice; // the choice entered in project settings + bool settings_flash_batching; // for regression testing, flash between non-batched and batched renderer + bool settings_diagnose_frame; // print out batches to help optimize / regression test + int settings_max_join_item_commands; + float settings_colored_vertex_format_threshold; + int settings_batch_buffer_num_verts; + bool settings_scissor_lights; + float settings_scissor_threshold; // 0.0 to 1.0 + int settings_item_reordering_lookahead; + bool settings_use_single_rect_fallback; + bool settings_use_software_skinning; + int settings_light_max_join_items; + int settings_ninepatch_mode; + + // buffer orphaning modes + bool buffer_mode_batch_upload_send_null; + bool buffer_mode_batch_upload_flag_stream; + + // uv contraction + bool settings_uv_contract; + float settings_uv_contract_amount; + + // only done on diagnose frame + void reset_stats() { + stats_items_sorted = 0; + stats_light_items_joined = 0; + } + + // frame stats (just for monitoring and debugging) + int stats_items_sorted; + int stats_light_items_joined; + } bdata; + + struct FillState { + void reset_flush() { + // don't reset members that need to be preserved after flushing + // half way through a list of commands + curr_batch = 0; + batch_tex_id = -1; + texpixel_size = Vector2(1, 1); + contract_uvs = false; + + sequence_batch_type_flags = 0; + } + + void reset_joined_item(bool p_use_hardware_transform) { + reset_flush(); + use_hardware_transform = p_use_hardware_transform; + extra_matrix_sent = false; + } + + // for batching multiple types, we don't allow mixing RECTs / LINEs etc. 
+ // using flags allows quicker rejection of sequences with different batch types + uint32_t sequence_batch_type_flags; + + Batch *curr_batch; + int batch_tex_id; + bool use_hardware_transform; + bool contract_uvs; + Vector2 texpixel_size; + Color final_modulate; + TransformMode transform_mode; + TransformMode orig_transform_mode; + + // support for extra matrices + bool extra_matrix_sent; // whether sent on this item (in which case software transform can't be used until end of item) + int transform_extra_command_number_p1; // plus one to allow fast checking against zero + Transform2D transform_combined; // final * extra + }; + + // used during try_join + struct RenderItemState { + RenderItemState() { reset(); } + void reset() { + current_clip = nullptr; + shader_cache = nullptr; + rebind_shader = true; + prev_use_skeleton = false; + last_blend_mode = -1; + canvas_last_material = RID(); + item_group_z = 0; + item_group_light = nullptr; + final_modulate = Color(-1.0, -1.0, -1.0, -1.0); // just something unlikely + + joined_item_batch_type_flags_curr = 0; + joined_item_batch_type_flags_prev = 0; + + joined_item = nullptr; + } + + RendererCanvasRender::Item *current_clip; + typename T_STORAGE::Shader *shader_cache; + bool rebind_shader; + bool prev_use_skeleton; + bool prev_distance_field; + int last_blend_mode; + RID canvas_last_material; + Color final_modulate; + + // used for joining items only + BItemJoined *joined_item; + bool join_batch_break; + BLightRegion light_region; + + // we need some logic to prevent joining items that have vastly different batch types + // these are defined in RasterizerStorageCommon::BatchTypeFlags + uint32_t joined_item_batch_type_flags_curr; + uint32_t joined_item_batch_type_flags_prev; + + // 'item group' is data over a single call to canvas_render_items + int item_group_z; + Color item_group_modulate; + RendererCanvasRender::Light *item_group_light; + Transform2D item_group_base_transform; + } _render_item_state; + + bool use_nvidia_rect_workaround; + + ////////////////////////////////////////////////////////////////////////////// + // End of structs used by the batcher. Beginning of funcs.
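// (Illustration, not part of this commit: roughly how the structs above are
// driven by the functions declared below. prefill_joined_item() returns true
// when it runs out of vertex buffer space partway through an item, so the
// caller flushes and resumes, and FillState::reset_flush() deliberately
// preserves the members that must survive a flush. Paraphrased control flow,
// not verbatim engine code:
//
//     fill_state.reset_joined_item(joined_item.use_hardware_transform());
//     while (prefill_joined_item(fill_state, ...)) { // true == buffer full, more commands left
//         flush_render_batches(...);                 // draw what we have so far
//         fill_state.reset_flush();                  // keep the cross-flush members
//     }
//     flush_render_batches(...);                     // final flush for this joined item
// )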
+private: + // curiously recurring template pattern - allows access to functions in the DERIVED class + // this is kind of like using virtual functions but more efficient as they are resolved at compile time + T_STORAGE *get_storage() { return static_cast<const T *>(this)->storage; } + const T_STORAGE *get_storage() const { return static_cast<const T *>(this)->storage; } + T *get_this() { return static_cast<T *>(this); } + const T *get_this() const { return static_cast<const T *>(this); } + +protected: + // main functions called from the rasterizer canvas + void batch_constructor(); + void batch_initialize(); + + void batch_canvas_begin(); + void batch_canvas_end(); + void batch_canvas_render_items_begin(const Color &p_modulate, RendererCanvasRender::Light *p_light, const Transform2D &p_base_transform); + void batch_canvas_render_items_end(); + void batch_canvas_render_items(RendererCanvasRender::Item *p_item_list, int p_z, const Color &p_modulate, RendererCanvasRender::Light *p_light, const Transform2D &p_base_transform); + + // recording and sorting items from the initial pass + void record_items(RendererCanvasRender::Item *p_item_list, int p_z); + void join_sorted_items(); + void sort_items(); + bool _sort_items_match(const BSortItem &p_a, const BSortItem &p_b) const; + bool sort_items_from(int p_start); + + // joining logic + bool _disallow_item_join_if_batch_types_too_different(RenderItemState &r_ris, uint32_t btf_allowed); + bool _detect_item_batch_break(RenderItemState &r_ris, RendererCanvasRender::Item *p_ci, bool &r_batch_break); + + // drives the loop filling batches and flushing + void render_joined_item_commands(const BItemJoined &p_bij, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit); + +private: + // flush once full or end of joined item + void flush_render_batches(RendererCanvasRender::Item *p_first_item, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, uint32_t p_sequence_batch_type_flags); + + // a single joined item can contain multiple itemrefs, and thus create lots of batches +#ifdef GODOT_3 + bool prefill_joined_item(FillState &r_fill_state, int &r_command_start, RendererCanvasRender::Item *p_item, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material); +#else + // command start given a separate name to make it easier to tell apart godot 3 and 4 + bool prefill_joined_item(FillState &r_fill_state, RendererCanvasRender::Item::Command **r_first_command, RendererCanvasRender::Item *p_item, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material); +#endif + + // prefilling different types of batch + + // default batch is an 'unhandled' legacy type batch that will be drawn with the legacy path, + // all other batches are accelerated.
+ void _prefill_default_batch(FillState &r_fill_state, int p_command_num, const RendererCanvasRender::Item &p_item); + + // accelerated batches +#ifdef GODOT_3 + bool _prefill_line(RendererCanvasRender::Item::GD_COMMAND_LINE *p_line, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item *p_item, bool multiply_final_modulate); + template <bool SEND_LIGHT_ANGLES> + bool _prefill_ninepatch(RendererCanvasRender::Item::CommandNinePatch *p_np, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item *p_item, bool multiply_final_modulate); + template <bool SEND_LIGHT_ANGLES> + bool _prefill_polygon(RendererCanvasRender::Item::CommandPolygon *p_poly, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item *p_item, bool multiply_final_modulate); + template <bool SEND_LIGHT_ANGLES> +#endif + bool _prefill_rect(RendererCanvasRender::Item::CommandRect *rect, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item::Command *const *commands, RendererCanvasRender::Item *p_item, bool multiply_final_modulate); + + // dealing with textures + int _batch_find_or_create_tex(const RID &p_texture, const RID &p_normal, bool p_tile, int p_previous_match); + +protected: + // legacy support for non batched mode + void _legacy_canvas_item_render_commands(RendererCanvasRender::Item *p_item, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material); + + // light scissoring + bool _light_scissor_begin(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect) const; + bool _light_find_intersection(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect, Rect2 &r_cliprect) const; + void _calculate_scissor_threshold_area(); + +private: + // translating vertex formats prior to rendering + void _translate_batches_to_vertex_colored_FVF(); + template <class BATCH_VERTEX_TYPE, bool INCLUDE_LIGHT_ANGLES, bool INCLUDE_MODULATE, bool INCLUDE_LARGE> + void _translate_batches_to_larger_FVF(uint32_t p_sequence_batch_type_flags); + +protected: + // accessory funcs + void _software_transform_vertex(BatchVector2 &r_v, const Transform2D &p_tr) const; + void _software_transform_vertex(Vector2 &r_v, const Transform2D &p_tr) const; + TransformMode _find_transform_mode(const Transform2D &p_tr) const { + // decide whether to do translate only for software transform + if ((p_tr.elements[0].x == 1.0f) && + (p_tr.elements[0].y == 0.0f) && + (p_tr.elements[1].x == 0.0f) && + (p_tr.elements[1].y == 1.0f)) { + return TM_TRANSLATE; + } + + return TM_ALL; + } + +#ifdef GODOT_3 + bool _software_skin_poly(RendererCanvasRender::Item::CommandPolygon *p_poly, RendererCanvasRender::Item *p_item, BatchVertex *bvs, BatchColor *vertex_colors, const FillState &p_fill_state, const BatchColor *p_precalced_colors); +#endif + + typename T_STORAGE::Texture *_get_canvas_texture(const RID &p_texture) const { + if (p_texture.is_valid()) { + typename T_STORAGE::Texture *texture = get_storage()->texture_owner.getornull(p_texture); + + if (texture) { + return texture->get_ptr(); + } + } + + return nullptr; + } + +public: + Batch *_batch_request_new(bool p_blank = true) { + Batch *batch = bdata.batches.request(); + if (!batch) { + // grow the batches + bdata.batches.grow(); + + // and the temporary batches (used for color verts) + bdata.batches_temp.reset(); + 
bdata.batches_temp.grow(); + + // this should always succeed after growing + batch = bdata.batches.request(); + RAST_DEBUG_ASSERT(batch); + } + + if (p_blank) + memset(batch, 0, sizeof(Batch)); + + return batch; + } + + BatchVertex *_batch_vertex_request_new() { + return bdata.vertices.request(); + } + +protected: +#ifdef GODOT_4 + int godot4_commands_count(RendererCanvasRender::Item::Command *p_comm) const { + int count = 0; + while (p_comm) { + count++; + p_comm = p_comm->next; + } + return count; + } + + unsigned int godot4_commands_to_vector(RendererCanvasRender::Item::Command *p_comm, LocalVector<RendererCanvasRender::Item::Command *> &p_list) { + p_list.clear(); + while (p_comm) { + p_list.push_back(p_comm); + p_comm = p_comm->next; + } + return p_list.size(); + } +#endif + + // no need to compile these in for release builds; they are unneeded outside the editor and only add to executable size +#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) +#include "batch_diagnose.inc" +#endif +}; + +PREAMBLE(void)::batch_canvas_begin() { + // diagnose_frame? + bdata.frame_string = ""; // just in case, always set this as we don't want a string leak in release... +#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) + if (bdata.settings_diagnose_frame) { + bdata.diagnose_frame = false; + + uint32_t tick = OS::get_singleton()->get_ticks_msec(); + uint64_t frame = Engine::get_singleton()->get_frames_drawn(); + + if (tick >= bdata.next_diagnose_tick) { + bdata.next_diagnose_tick = tick + 10000; + + // the plus one is to prevent starting diagnosis halfway through a frame + bdata.diagnose_frame_number = frame + 1; + } + + if (frame == bdata.diagnose_frame_number) { + bdata.diagnose_frame = true; + bdata.reset_stats(); + } + + if (bdata.diagnose_frame) { + bdata.frame_string = "canvas_begin FRAME " + itos(frame) + "\n"; + } + } +#endif +} + +PREAMBLE(void)::batch_canvas_end() { +#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) + if (bdata.diagnose_frame) { + bdata.frame_string += "canvas_end\n"; + if (bdata.stats_items_sorted) { + bdata.frame_string += "\titems reordered: " + itos(bdata.stats_items_sorted) + "\n"; + } + if (bdata.stats_light_items_joined) { + bdata.frame_string += "\tlight items joined: " + itos(bdata.stats_light_items_joined) + "\n"; + } + + print_line(bdata.frame_string); + } +#endif +} + +PREAMBLE(void)::batch_canvas_render_items_begin(const Color &p_modulate, RendererCanvasRender::Light *p_light, const Transform2D &p_base_transform) { + // if we are debugging, flash each frame between batching renderer and old version to compare for regressions + if (bdata.settings_flash_batching) { + if ((Engine::get_singleton()->get_frames_drawn() % 2) == 0) + bdata.settings_use_batching = true; + else + bdata.settings_use_batching = false; + } + + if (!bdata.settings_use_batching) { + return; + } + + // this only needs to be done when screen size changes, but this should be + // infrequent enough + _calculate_scissor_threshold_area(); + + // set up render item state for all the z_indexes (this is common to all z_indexes) + _render_item_state.reset(); + _render_item_state.item_group_modulate = p_modulate; + _render_item_state.item_group_light = p_light; + _render_item_state.item_group_base_transform = p_base_transform; + _render_item_state.light_region.reset(); + + // batch break must be preserved over the different z indices, + // to prevent joining to an item on a previous index if not allowed + _render_item_state.join_batch_break = false; + + // whether to join across z indices depends on whether
there are z ranged lights. + // joined z_index items can be wrongly classified with z ranged lights. + bdata.join_across_z_indices = true; + + int light_count = 0; + while (p_light) { + light_count++; + + if ((p_light->z_min != GD_VS::CANVAS_ITEM_Z_MIN) || (p_light->z_max != GD_VS::CANVAS_ITEM_Z_MAX)) { + // prevent joining across z indices. This would have caused visual regressions + bdata.join_across_z_indices = false; + } + + p_light = p_light->next_ptr; + } + + // can't use the light region bitfield if there are too many lights + // hopefully most games won't blow this limit.. + // if they do they will work but it won't batch join items just in case + if (light_count > 64) { + _render_item_state.light_region.too_many_lights = true; + } +} + +PREAMBLE(void)::batch_canvas_render_items_end() { + if (!bdata.settings_use_batching) { + return; + } + + join_sorted_items(); + +#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) + if (bdata.diagnose_frame) { + bdata.frame_string += "items\n"; + } +#endif + + // batching render is deferred until after going through all the z_indices, joining all the items + get_this()->canvas_render_items_implementation(0, 0, _render_item_state.item_group_modulate, + _render_item_state.item_group_light, + _render_item_state.item_group_base_transform); + + bdata.items_joined.reset(); + bdata.item_refs.reset(); + bdata.sort_items.reset(); +} + +PREAMBLE(void)::batch_canvas_render_items(RendererCanvasRender::Item *p_item_list, int p_z, const Color &p_modulate, RendererCanvasRender::Light *p_light, const Transform2D &p_base_transform) { + // stage 1 : join similar items, so that their state changes are not repeated, + // and commands from joined items can be batched together + if (bdata.settings_use_batching) { + record_items(p_item_list, p_z); + return; + } + + // only legacy renders at this stage, batched renderer doesn't render until canvas_render_items_end() + get_this()->canvas_render_items_implementation(p_item_list, p_z, p_modulate, p_light, p_base_transform); +} + +// Default batches will not occur in software transform only items +// EXCEPT IN THE CASE OF SINGLE RECTS (and this may well not occur, check the logic in prefill_join_item TYPE_RECT) +// but can occur where transform commands have been sent during hardware batch +PREAMBLE(void)::_prefill_default_batch(FillState &r_fill_state, int p_command_num, const RendererCanvasRender::Item &p_item) { + if (r_fill_state.curr_batch->type == RasterizerStorageCommon::BT_DEFAULT) { + // don't need to flush an extra transform command? 
+ if (!r_fill_state.transform_extra_command_number_p1) { + // another default command, just add to the existing batch + r_fill_state.curr_batch->num_commands++; + } else { +#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) + if (r_fill_state.transform_extra_command_number_p1 != p_command_num) { + WARN_PRINT_ONCE("_prefill_default_batch : transform_extra_command_number_p1 != p_command_num"); + } +#endif + // if the first member of the batch is a transform we have to be careful + if (!r_fill_state.curr_batch->num_commands) { + // there can be leading useless extra transforms (sometimes happens with debug collision polys) + // we need to rejig the first_command for the first useful transform + r_fill_state.curr_batch->first_command += r_fill_state.transform_extra_command_number_p1 - 1; + } + + // we do have a pending extra transform command to flush + // either the extra transform is in the prior command, or not, in which case we need 2 batches + r_fill_state.curr_batch->num_commands += 2; + + r_fill_state.transform_extra_command_number_p1 = 0; // mark as sent + r_fill_state.extra_matrix_sent = true; + + // the original mode should always be hardware transform .. + // test this assumption + //CRASH_COND(r_fill_state.orig_transform_mode != TM_NONE); + r_fill_state.transform_mode = r_fill_state.orig_transform_mode; + + // do we need to restore anything else? + } + } else { + // end of previous different type batch, so start new default batch + + // first consider whether there is a dirty extra matrix to send + if (r_fill_state.transform_extra_command_number_p1) { + // get which command the extra is in, and blank all the records as it no longer is stored CPU side + int extra_command = r_fill_state.transform_extra_command_number_p1 - 1; // plus 1 based + r_fill_state.transform_extra_command_number_p1 = 0; + r_fill_state.extra_matrix_sent = true; + + // send the extra to the GPU in a batch + r_fill_state.curr_batch = _batch_request_new(); + r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_DEFAULT; + r_fill_state.curr_batch->first_command = extra_command; + r_fill_state.curr_batch->num_commands = 1; + + // revert to the original transform mode + // e.g. go back to NONE if we were in hardware transform mode + r_fill_state.transform_mode = r_fill_state.orig_transform_mode; + + // reset the original transform if we are going back to software mode, + // because the extra is now done on the GPU... + // (any subsequent extras are sent directly to the GPU, no deferring) + if (r_fill_state.orig_transform_mode != TM_NONE) { + r_fill_state.transform_combined = p_item.final_transform; + } + + // can possibly combine batch with the next one in some cases + // this is more efficient than having an extra batch especially for the extra + if ((extra_command + 1) == p_command_num) { + r_fill_state.curr_batch->num_commands = 2; + return; + } + } + + // start default batch + r_fill_state.curr_batch = _batch_request_new(); + r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_DEFAULT; + r_fill_state.curr_batch->first_command = p_command_num; + r_fill_state.curr_batch->num_commands = 1; + } +} + +PREAMBLE(int)::_batch_find_or_create_tex(const RID &p_texture, const RID &p_normal, bool p_tile, int p_previous_match) { + // optimization .. 
in 99% of cases the last matched value will be the same, so no need to traverse the list + if (p_previous_match > 0) // if it is zero, it will get hit first in the linear search anyway + { + const BatchTex &batch_texture = bdata.batch_textures[p_previous_match]; + + // note for future reference, if RID implementation changes, this could become more expensive + if ((batch_texture.RID_texture == p_texture) && (batch_texture.RID_normal == p_normal)) { + // tiling mode must also match + bool tiles = batch_texture.tile_mode != BatchTex::TILE_OFF; + + if (tiles == p_tile) + // match! + return p_previous_match; + } + } + + // not the previous match .. we will do a linear search ... slower, but should happen + // not very often except with non-batchable runs, which are going to be slow anyway + // n.b. could possibly be replaced later by a fast hash table + for (int n = 0; n < bdata.batch_textures.size(); n++) { + const BatchTex &batch_texture = bdata.batch_textures[n]; + if ((batch_texture.RID_texture == p_texture) && (batch_texture.RID_normal == p_normal)) { + // tiling mode must also match + bool tiles = batch_texture.tile_mode != BatchTex::TILE_OFF; + + if (tiles == p_tile) + // match! + return n; + } + } + + // pushing back from local variable .. not ideal but has to use a Vector because non pod + // due to RIDs + BatchTex new_batch_tex; + new_batch_tex.RID_texture = p_texture; + new_batch_tex.RID_normal = p_normal; + + // get the texture + typename T_STORAGE::Texture *texture = _get_canvas_texture(p_texture); + + if (texture) { + // special case, there can be textures with no width or height + int w = texture->width; + int h = texture->height; + + if (!w || !h) { + w = 1; + h = 1; + } + + new_batch_tex.tex_pixel_size.x = 1.0 / w; + new_batch_tex.tex_pixel_size.y = 1.0 / h; + new_batch_tex.flags = texture->flags; + } else { + // maybe doesn't need doing... + new_batch_tex.tex_pixel_size.x = 1.0f; + new_batch_tex.tex_pixel_size.y = 1.0f; + new_batch_tex.flags = 0; + } + + if (p_tile) { + if (texture) { + // default + new_batch_tex.tile_mode = BatchTex::TILE_NORMAL; + + // no hardware support for non power of 2 tiling + if (!get_storage()->config.support_npot_repeat_mipmap) { + if (next_power_of_2(texture->alloc_width) != (unsigned int)texture->alloc_width && next_power_of_2(texture->alloc_height) != (unsigned int)texture->alloc_height) { + new_batch_tex.tile_mode = BatchTex::TILE_FORCE_REPEAT; + } + } + } else { + // this should not happen?
+ new_batch_tex.tile_mode = BatchTex::TILE_OFF; + } + } else { + new_batch_tex.tile_mode = BatchTex::TILE_OFF; + } + + // push back + bdata.batch_textures.push_back(new_batch_tex); + + return bdata.batch_textures.size() - 1; +} + +PREAMBLE(void)::batch_constructor() { + bdata.settings_use_batching = false; + +#ifdef GLES_OVER_GL + use_nvidia_rect_workaround = GLOBAL_GET("rendering/quality/2d/use_nvidia_rect_flicker_workaround"); +#else + // Not needed (a priori) on GLES devices + use_nvidia_rect_workaround = false; +#endif +} + +PREAMBLE(void)::batch_initialize() { +#define BATCHING_LOAD_PROJECT_SETTINGS + +#ifdef BATCHING_LOAD_PROJECT_SETTINGS + bdata.settings_use_batching = GLOBAL_GET("rendering/batching/options/use_batching"); + bdata.settings_max_join_item_commands = GLOBAL_GET("rendering/batching/parameters/max_join_item_commands"); + bdata.settings_colored_vertex_format_threshold = GLOBAL_GET("rendering/batching/parameters/colored_vertex_format_threshold"); + bdata.settings_item_reordering_lookahead = GLOBAL_GET("rendering/batching/parameters/item_reordering_lookahead"); + bdata.settings_light_max_join_items = GLOBAL_GET("rendering/batching/lights/max_join_items"); + bdata.settings_use_single_rect_fallback = GLOBAL_GET("rendering/batching/options/single_rect_fallback"); + bdata.settings_use_software_skinning = GLOBAL_GET("rendering/quality/2d/use_software_skinning"); + bdata.settings_ninepatch_mode = GLOBAL_GET("rendering/quality/2d/ninepatch_mode"); + + // alternatively only enable uv contract if pixel snap is in use, + // but with this enable bool, it should not be necessary + bdata.settings_uv_contract = GLOBAL_GET("rendering/batching/precision/uv_contract"); + bdata.settings_uv_contract_amount = (float)GLOBAL_GET("rendering/batching/precision/uv_contract_amount") / 1000000.0f; + + // we can use the threshold to determine whether to turn scissoring off or on + bdata.settings_scissor_threshold = GLOBAL_GET("rendering/batching/lights/scissor_area_threshold"); +#endif + + if (bdata.settings_scissor_threshold > 0.999f) { + bdata.settings_scissor_lights = false; + } else { + bdata.settings_scissor_lights = true; + + // apply power of 4 relationship for the area, as most of the important changes + // will be happening at low values of scissor threshold + bdata.settings_scissor_threshold *= bdata.settings_scissor_threshold; + bdata.settings_scissor_threshold *= bdata.settings_scissor_threshold; + } + + // The sweet spot on my desktop for cache is actually smaller than the max, and this + // is the default. This saves memory too so we will use it for now; needs testing to see whether this varies according + // to device / platform. +#ifdef BATCHING_LOAD_PROJECT_SETTINGS + bdata.settings_batch_buffer_num_verts = GLOBAL_GET("rendering/batching/parameters/batch_buffer_size"); + + // override the use_batching setting in the editor + // (note that if the editor can't start, you can't change the use_batching project setting!) + if (Engine::get_singleton()->is_editor_hint()) { + bool use_in_editor = GLOBAL_GET("rendering/batching/options/use_batching_in_editor"); + bdata.settings_use_batching = use_in_editor; + + // fix some settings in the editor, as the performance is not worth the risk + bdata.settings_use_single_rect_fallback = false; + } +#endif + + // if we are using batching, we will purposefully disable the nvidia workaround. + // This is because the only reason to use the single rect fallback is the approx 2x speed + // of the uniform drawing technique.
If we used nvidia workaround, speed would be + // approx equal to the batcher drawing technique (indexed primitive + VB). + if (bdata.settings_use_batching) { + use_nvidia_rect_workaround = false; + } + + // For debugging, if flash is set in project settings, it will flash on alternate frames + // between the non-batched renderer and the batched renderer, + // in order to find regressions. + // This should not be used except during development. + // make a note of the original choice in case we are flashing on and off the batching + bdata.settings_use_batching_original_choice = bdata.settings_use_batching; + +#ifdef BATCHING_LOAD_PROJECT_SETTINGS + bdata.settings_flash_batching = GLOBAL_GET("rendering/batching/debug/flash_batching"); +#endif + if (!bdata.settings_use_batching) { + // no flash when batching turned off + bdata.settings_flash_batching = false; + } + + // frame diagnosis. print out the batches every nth frame + bdata.settings_diagnose_frame = false; + if (!Engine::get_singleton()->is_editor_hint() && bdata.settings_use_batching) { +#ifdef BATCHING_LOAD_PROJECT_SETTINGS + bdata.settings_diagnose_frame = GLOBAL_GET("rendering/batching/debug/diagnose_frame"); +#endif + } + + // the maximum num quads in a batch is limited by GLES2. We can have only 16 bit indices, + // which means we can address a vertex buffer of max size 65535. 4 vertices are needed per quad. + + // Note this determines the memory use by the vertex buffer vector. max quads (65536/4)-1 + // but can be reduced to save memory if really required (will result in more batches though) + const int max_possible_quads = (65536 / 4) - 1; + const int min_possible_quads = 8; // some reasonable small value + + // value from project settings + int max_quads = bdata.settings_batch_buffer_num_verts / 4; + + // sanity checks + max_quads = CLAMP(max_quads, min_possible_quads, max_possible_quads); + bdata.settings_max_join_item_commands = CLAMP(bdata.settings_max_join_item_commands, 0, 65535); + bdata.settings_colored_vertex_format_threshold = CLAMP(bdata.settings_colored_vertex_format_threshold, 0.0f, 1.0f); + bdata.settings_scissor_threshold = CLAMP(bdata.settings_scissor_threshold, 0.0f, 1.0f); + bdata.settings_light_max_join_items = CLAMP(bdata.settings_light_max_join_items, 0, 65535); + bdata.settings_item_reordering_lookahead = CLAMP(bdata.settings_item_reordering_lookahead, 0, 65535); + + // allow user to override the api usage techniques using project settings + // bdata.buffer_mode_batch_upload_send_null = GLOBAL_GET("rendering/options/api_usage_batching/send_null"); + // bdata.buffer_mode_batch_upload_flag_stream = GLOBAL_GET("rendering/options/api_usage_batching/flag_stream"); + + // for debug purposes, output a string with the batching options + String batching_options_string = "OpenGL ES Batching: "; + if (bdata.settings_use_batching) { + batching_options_string += "ON"; + + if (OS::get_singleton()->is_stdout_verbose()) { + batching_options_string += "\n\tOPTIONS\n"; + batching_options_string += "\tmax_join_item_commands " + itos(bdata.settings_max_join_item_commands) + "\n"; + batching_options_string += "\tcolored_vertex_format_threshold " + String(Variant(bdata.settings_colored_vertex_format_threshold)) + "\n"; + batching_options_string += "\tbatch_buffer_size " + itos(bdata.settings_batch_buffer_num_verts) + "\n"; + batching_options_string += "\tlight_scissor_area_threshold " + String(Variant(bdata.settings_scissor_threshold)) + "\n"; + + batching_options_string += "\titem_reordering_lookahead " + 
itos(bdata.settings_item_reordering_lookahead) + "\n"; + batching_options_string += "\tlight_max_join_items " + itos(bdata.settings_light_max_join_items) + "\n"; + batching_options_string += "\tsingle_rect_fallback " + String(Variant(bdata.settings_use_single_rect_fallback)) + "\n"; + + batching_options_string += "\tdebug_flash " + String(Variant(bdata.settings_flash_batching)) + "\n"; + batching_options_string += "\tdiagnose_frame " + String(Variant(bdata.settings_diagnose_frame)); + } + + print_line(batching_options_string); + } + + // special case, for colored vertex format threshold. + // as the comparison is >=, we want to be able to totally turn on or off + // conversion to colored vertex format at the extremes, so we will force + // 1.0 to be just above 1.0 + if (bdata.settings_colored_vertex_format_threshold > 0.995f) { + bdata.settings_colored_vertex_format_threshold = 1.01f; + } + + // save memory when batching off + if (!bdata.settings_use_batching) { + max_quads = 0; + } + + uint32_t sizeof_batch_vert = sizeof(BatchVertex); + + bdata.max_quads = max_quads; + + // 4 verts per quad + bdata.vertex_buffer_size_units = max_quads * 4; + + // the index buffer can be longer than 65535, only the indices need to be within this range + bdata.index_buffer_size_units = max_quads * 6; + + const int max_verts = bdata.vertex_buffer_size_units; + + // this comes out at approx 64K for non-colored vertex buffer, and 128K for colored vertex buffer + bdata.vertex_buffer_size_bytes = max_verts * sizeof_batch_vert; + bdata.index_buffer_size_bytes = bdata.index_buffer_size_units * 2; // 16 bit inds + + // create equal number of normal and (max) unit sized verts (as the normal may need to be translated to a larger FVF) + bdata.vertices.create(max_verts); // 512k + bdata.unit_vertices.create(max_verts, sizeof(BatchVertexLarge)); + + // extra data per vert needed for larger FVFs + bdata.light_angles.create(max_verts); + bdata.vertex_colors.create(max_verts); + bdata.vertex_modulates.create(max_verts); + bdata.vertex_transforms.create(max_verts); + + // num batches will be auto increased dynamically if required + bdata.batches.create(1024); + bdata.batches_temp.create(bdata.batches.max_size()); + + // batch textures can also be increased dynamically + bdata.batch_textures.create(32); +} + +PREAMBLE(bool)::_light_scissor_begin(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect) const { + float area_item = p_item_rect.size.x * p_item_rect.size.y; // double check these are always positive + + // quick reject .. the area of pixels saved can never be more than the area of the item + if (area_item < bdata.scissor_threshold_area) { + return false; + } + + Rect2 cliprect; + if (!_light_find_intersection(p_item_rect, p_light_xform, p_light_rect, cliprect)) { + // should not really occur .. 
but just in case + cliprect = Rect2(0, 0, 0, 0); + } else { + // some conditions not to scissor + // determine the area (fill rate) that will be saved + float area_cliprect = cliprect.size.x * cliprect.size.y; + float area_saved = area_item - area_cliprect; + + // if area saved is too small, don't scissor + if (area_saved < bdata.scissor_threshold_area) { + return false; + } + } + + int rh = get_storage()->frame.current_rt->height; + + int y = rh - (cliprect.position.y + cliprect.size.y); +#ifdef GODOT_3 + if (get_storage()->frame.current_rt->flags[RendererStorage::RENDER_TARGET_VFLIP]) + y = cliprect.position.y; +#endif + get_this()->gl_enable_scissor(cliprect.position.x, y, cliprect.size.width, cliprect.size.height); + + return true; +} + +PREAMBLE(bool)::_light_find_intersection(const Rect2 &p_item_rect, const Transform2D &p_light_xform, const Rect2 &p_light_rect, Rect2 &r_cliprect) const { + // transform light to world space (note this is done in the earlier intersection test, so could + // be made more efficient) + Vector2 pts[4] = { + p_light_xform.xform(p_light_rect.position), + p_light_xform.xform(Vector2(p_light_rect.position.x + p_light_rect.size.x, p_light_rect.position.y)), + p_light_xform.xform(Vector2(p_light_rect.position.x, p_light_rect.position.y + p_light_rect.size.y)), + p_light_xform.xform(Vector2(p_light_rect.position.x + p_light_rect.size.x, p_light_rect.position.y + p_light_rect.size.y)), + }; + + // calculate the light bound rect in world space + Rect2 lrect(pts[0].x, pts[0].y, 0, 0); + for (int n = 1; n < 4; n++) { + lrect.expand_to(pts[n]); + } + + // intersection between the 2 rects + // they should probably always intersect, because of earlier check, but just in case... + if (!p_item_rect.intersects(lrect)) + return false; + + // note this does almost the same as Rect2.clip but slightly more efficient for our use case + r_cliprect.position.x = MAX(p_item_rect.position.x, lrect.position.x); + r_cliprect.position.y = MAX(p_item_rect.position.y, lrect.position.y); + + Point2 item_rect_end = p_item_rect.position + p_item_rect.size; + Point2 lrect_end = lrect.position + lrect.size; + + r_cliprect.size.x = MIN(item_rect_end.x, lrect_end.x) - r_cliprect.position.x; + r_cliprect.size.y = MIN(item_rect_end.y, lrect_end.y) - r_cliprect.position.y; + + return true; +} + +PREAMBLE(void)::_calculate_scissor_threshold_area() { + if (!bdata.settings_scissor_lights) { + return; + } + + // scissor area threshold is 0.0 to 1.0 in the settings for ease of use. + // we need to translate to an absolute area to determine quickly whether + // to scissor. + if (bdata.settings_scissor_threshold < 0.0001f) { + bdata.scissor_threshold_area = -1.0f; // will always pass + } else { + // in pixels + int w = get_storage()->frame.current_rt->width; + int h = get_storage()->frame.current_rt->height; + + int screen_area = w * h; + + bdata.scissor_threshold_area = bdata.settings_scissor_threshold * screen_area; + } +} + +#ifdef GODOT_3 +PREAMBLE(bool)::_prefill_line(RendererCanvasRender::Item::GD_COMMAND_LINE *p_line, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item *p_item, bool multiply_final_modulate) { + bool change_batch = false; + + // we have separate batch types for non and anti aliased lines. + // You can't batch the different types together. 
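// (Illustration, not part of this commit: how the sequence flag test just
// below rejects mixed batch types. A BTF_RECT flag is assumed here alongside
// the BTF_LINE / BTF_POLY flags this file uses.
//
//     uint32_t seq = RasterizerStorageCommon::BTF_RECT; // sequence so far contains rects
//     if (seq & ~RasterizerStorageCommon::BTF_LINE) {   // nonzero: a non-LINE type is present
//         // cannot join - break the fill and restart from this command
//     }
//
// A LINE-only sequence would mask to zero, so a new line could join it.)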
+ RasterizerStorageCommon::BatchType line_batch_type = RasterizerStorageCommon::BT_LINE; + uint32_t line_batch_flags = RasterizerStorageCommon::BTF_LINE; +#ifdef GLES_OVER_GL + if (p_line->antialiased) { + line_batch_type = RasterizerStorageCommon::BT_LINE_AA; + line_batch_flags = RasterizerStorageCommon::BTF_LINE_AA; + } +#endif + + // conditions for creating a new batch + if (r_fill_state.curr_batch->type != line_batch_type) { + if (r_fill_state.sequence_batch_type_flags & (~line_batch_flags)) { + // don't allow joining to a different sequence type + r_command_start = command_num; + return true; + } + r_fill_state.sequence_batch_type_flags |= line_batch_flags; + + change_batch = true; + } + + // get the baked line color + Color col = p_line->color; + + if (multiply_final_modulate) + col *= r_fill_state.final_modulate; + + BatchColor bcol; + bcol.set(col); + + // if the color has changed we need a new batch + // (only single color line batches supported so far) + if (r_fill_state.curr_batch->color != bcol) + change_batch = true; + + // not sure if needed + r_fill_state.batch_tex_id = -1; + + // try to create vertices BEFORE creating a batch, + // because if the vertex buffer is full, we need to finish this + // function, draw what we have so far, and then start a new set of batches + + // request multiple vertices at a time, this is more efficient + BatchVertex *bvs = bdata.vertices.request(2); + if (!bvs) { + // run out of space in the vertex buffer .. finish this function and draw what we have so far + // return where we got to + r_command_start = command_num; + return true; + } + + if (change_batch) { + // open new batch (this should never fail, it dynamically grows) + r_fill_state.curr_batch = _batch_request_new(false); + + r_fill_state.curr_batch->type = line_batch_type; + r_fill_state.curr_batch->color = bcol; + r_fill_state.curr_batch->batch_texture_id = -1; + r_fill_state.curr_batch->first_command = command_num; + r_fill_state.curr_batch->num_commands = 1; + //r_fill_state.curr_batch->first_quad = bdata.total_quads; + r_fill_state.curr_batch->first_vert = bdata.total_verts; + } else { + // we could alternatively do the count when closing a batch .. 
perhaps more efficient + r_fill_state.curr_batch->num_commands++; + } + + // fill the geometry + Vector2 from = p_line->from; + Vector2 to = p_line->to; + + if (r_fill_state.transform_mode != TM_NONE) { + _software_transform_vertex(from, r_fill_state.transform_combined); + _software_transform_vertex(to, r_fill_state.transform_combined); + } + + bvs[0].pos.set(from); + bvs[0].uv.set(0, 0); // may not be necessary + bvs[1].pos.set(to); + bvs[1].uv.set(0, 0); + + bdata.total_verts += 2; + + return false; +} +#endif // godot 3 + +//unsigned int _ninepatch_apply_tiling_modes(RendererCanvasRender::Item::CommandNinePatch *p_np, Rect2 &r_source) { +// unsigned int rect_flags = 0; + +// switch (p_np->axis_x) { +// default: +// break; +// case VisualServer::NINE_PATCH_TILE: { +// r_source.size.x = p_np->rect.size.x; +// rect_flags = RendererCanvasRender::CANVAS_RECT_TILE; +// } break; +// case VisualServer::NINE_PATCH_TILE_FIT: { +// // prevent divide by zero (may never happen) +// if (r_source.size.x) { +// int units = p_np->rect.size.x / r_source.size.x; +// if (!units) +// units++; +// r_source.size.x = r_source.size.x * units; +// rect_flags = RendererCanvasRender::CANVAS_RECT_TILE; +// } +// } break; +// } + +// switch (p_np->axis_y) { +// default: +// break; +// case VisualServer::NINE_PATCH_TILE: { +// r_source.size.y = p_np->rect.size.y; +// rect_flags = RendererCanvasRender::CANVAS_RECT_TILE; +// } break; +// case VisualServer::NINE_PATCH_TILE_FIT: { +// // prevent divide by zero (may never happen) +// if (r_source.size.y) { +// int units = p_np->rect.size.y / r_source.size.y; +// if (!units) +// units++; +// r_source.size.y = r_source.size.y * units; +// rect_flags = RendererCanvasRender::CANVAS_RECT_TILE; +// } +// } break; +// } + +// return rect_flags; +//} + +#ifdef GODOT_3 + +T_PREAMBLE +template <bool SEND_LIGHT_ANGLES> +bool C_PREAMBLE::_prefill_ninepatch(RendererCanvasRender::Item::CommandNinePatch *p_np, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item *p_item, bool multiply_final_modulate) { + typename T_STORAGE::Texture *tex = get_storage()->texture_owner.getornull(p_np->texture); + + if (!tex) { + // FIXME: Handle textureless ninepatch gracefully + WARN_PRINT("NinePatch without texture not supported yet in GLES2 backend, skipping."); + return false; + } + if (tex->width == 0 || tex->height == 0) { + WARN_PRINT("Cannot set empty texture to NinePatch."); + return false; + } + + // first check there are enough verts for this to complete successfully + if (bdata.vertices.size() + (4 * 9) > bdata.vertices.max_size()) { + // return where we got to + r_command_start = command_num; + return true; + } + + // create a temporary rect so we can reuse the rect routine + RendererCanvasRender::Item::CommandRect trect; + + trect.texture = p_np->texture; +#ifdef GODOT_3 + trect.normal_map = p_np->normal_map; +#endif + trect.modulate = p_np->color; + trect.flags = RendererCanvasRender::CANVAS_RECT_REGION; + + //Size2 texpixel_size(1.0f / tex->width, 1.0f / tex->height); + + Rect2 source = p_np->source; + if (source.size.x == 0 && source.size.y == 0) { + source.size.x = tex->width; + source.size.y = tex->height; + } + + float screen_scale = 1.0f; + + // optional crazy ninepatch scaling mode + if ((bdata.settings_ninepatch_mode == 1) && (source.size.x != 0) && (source.size.y != 0)) { + screen_scale = MIN(p_np->rect.size.x / source.size.x, p_np->rect.size.y / source.size.y); + screen_scale = MIN(1.0, screen_scale); + } + + // deal with 
nine patch texture wrapping modes + // this is switched off because it may not be possible with batching + // trect.flags |= _ninepatch_apply_tiling_modes(p_np, source); + + // translate to rects + Rect2 &rt = trect.rect; + Rect2 &src = trect.source; + + float tex_margin_left = p_np->margin[MARGIN_LEFT]; + float tex_margin_right = p_np->margin[MARGIN_RIGHT]; + float tex_margin_top = p_np->margin[MARGIN_TOP]; + float tex_margin_bottom = p_np->margin[MARGIN_BOTTOM]; + + float x[4]; + x[0] = p_np->rect.position.x; + x[1] = x[0] + (p_np->margin[MARGIN_LEFT] * screen_scale); + x[3] = x[0] + (p_np->rect.size.x); + x[2] = x[3] - (p_np->margin[MARGIN_RIGHT] * screen_scale); + + float y[4]; + y[0] = p_np->rect.position.y; + y[1] = y[0] + (p_np->margin[MARGIN_TOP] * screen_scale); + y[3] = y[0] + (p_np->rect.size.y); + y[2] = y[3] - (p_np->margin[MARGIN_BOTTOM] * screen_scale); + + float u[4]; + u[0] = source.position.x; + u[1] = u[0] + tex_margin_left; + u[3] = u[0] + source.size.x; + u[2] = u[3] - tex_margin_right; + + float v[4]; + v[0] = source.position.y; + v[1] = v[0] + tex_margin_top; + v[3] = v[0] + source.size.y; + v[2] = v[3] - tex_margin_bottom; + + // temporarily override to prevent single rect fallback + bool single_rect_fallback = bdata.settings_use_single_rect_fallback; + bdata.settings_use_single_rect_fallback = false; + + // each line of the ninepatch + for (int line = 0; line < 3; line++) { + rt.position = Vector2(x[0], y[line]); + rt.size = Vector2(x[1] - x[0], y[line + 1] - y[line]); + src.position = Vector2(u[0], v[line]); + src.size = Vector2(u[1] - u[0], v[line + 1] - v[line]); + _prefill_rect<SEND_LIGHT_ANGLES>(&trect, r_fill_state, r_command_start, command_num, command_count, nullptr, p_item, multiply_final_modulate); + + if ((line == 1) && (!p_np->draw_center)) + ; + else { + rt.position.x = x[1]; + rt.size.x = x[2] - x[1]; + src.position.x = u[1]; + src.size.x = u[2] - u[1]; + _prefill_rect<SEND_LIGHT_ANGLES>(&trect, r_fill_state, r_command_start, command_num, command_count, nullptr, p_item, multiply_final_modulate); + } + + rt.position.x = x[2]; + rt.size.x = x[3] - x[2]; + src.position.x = u[2]; + src.size.x = u[3] - u[2]; + _prefill_rect<SEND_LIGHT_ANGLES>(&trect, r_fill_state, r_command_start, command_num, command_count, nullptr, p_item, multiply_final_modulate); + } + + // restore single rect fallback + bdata.settings_use_single_rect_fallback = single_rect_fallback; + return false; +} + +T_PREAMBLE +template <bool SEND_LIGHT_ANGLES> +bool C_PREAMBLE::_prefill_polygon(RendererCanvasRender::Item::CommandPolygon *p_poly, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item *p_item, bool multiply_final_modulate) { + bool change_batch = false; + + // conditions for creating a new batch + if (r_fill_state.curr_batch->type != RasterizerStorageCommon::BT_POLY) { + // don't allow joining to a different sequence type + if (r_fill_state.sequence_batch_type_flags & (~RasterizerStorageCommon::BTF_POLY)) { + // don't allow joining to a different sequence type + r_command_start = command_num; + return true; + } + r_fill_state.sequence_batch_type_flags |= RasterizerStorageCommon::BTF_POLY; + + change_batch = true; + } + + int num_inds = p_poly->indices.size(); + + // nothing to draw? + if (!num_inds) + return false; + + // we aren't using indices, so will transform verts more than once .. less efficient. 
+ // could be done with a temporary vertex buffer + BatchVertex *bvs = bdata.vertices.request(num_inds); + if (!bvs) { + // run out of space in the vertex buffer .. finish this function and draw what we have so far + // return where we got to + r_command_start = command_num; + return true; + } + + BatchColor *vertex_colors = bdata.vertex_colors.request(num_inds); + RAST_DEBUG_ASSERT(vertex_colors); + + // are we using large FVF? + //////////////////////////////////// + const bool use_large_verts = bdata.use_large_verts; + const bool use_modulate = bdata.use_modulate; + + BatchColor *vertex_modulates = nullptr; + if (use_modulate) { + vertex_modulates = bdata.vertex_modulates.request(num_inds); + RAST_DEBUG_ASSERT(vertex_modulates); + // precalc the vertex modulate (will be shared by all verts) + // we store the modulate as an attribute in the fvf rather than a uniform + vertex_modulates[0].set(r_fill_state.final_modulate); + } + + BatchTransform *pBT = nullptr; + if (use_large_verts) { + pBT = bdata.vertex_transforms.request(num_inds); + RAST_DEBUG_ASSERT(pBT); + // precalc the batch transform (will be shared by all verts) + // we store the transform as an attribute in the fvf rather than a uniform + const Transform2D &tr = r_fill_state.transform_combined; + + pBT[0].translate.set(tr.elements[2]); + // could do swizzling in shader? + pBT[0].basis[0].set(tr.elements[0][0], tr.elements[1][0]); + pBT[0].basis[1].set(tr.elements[0][1], tr.elements[1][1]); + } + //////////////////////////////////// + + // the modulate is always baked + Color modulate; + if (!use_large_verts && !use_modulate && multiply_final_modulate) + modulate = r_fill_state.final_modulate; + else + modulate = Color(1, 1, 1, 1); + + int old_batch_tex_id = r_fill_state.batch_tex_id; + r_fill_state.batch_tex_id = _batch_find_or_create_tex(p_poly->texture, p_poly->normal_map, false, old_batch_tex_id); + + // conditions for creating a new batch + if (old_batch_tex_id != r_fill_state.batch_tex_id) { + change_batch = true; + } + + // N.B. polygons don't have color thus don't need a batch change with color + // This code is left as reference in case of problems. + // if (!r_fill_state.curr_batch->color.equals(modulate)) { + // change_batch = true; + // bdata.total_color_changes++; + // } + + if (change_batch) { + // put the tex pixel size in a local (less verbose and can be a register) + const BatchTex &batchtex = bdata.batch_textures[r_fill_state.batch_tex_id]; + batchtex.tex_pixel_size.to(r_fill_state.texpixel_size); + + if (bdata.settings_uv_contract) { + r_fill_state.contract_uvs = (batchtex.flags & GD_VS::TEXTURE_FLAG_FILTER) == 0; + } + + // open new batch (this should never fail, it dynamically grows) + r_fill_state.curr_batch = _batch_request_new(false); + + r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_POLY; + + // modulate unused except for debugging? + r_fill_state.curr_batch->color.set(modulate); + r_fill_state.curr_batch->batch_texture_id = r_fill_state.batch_tex_id; + r_fill_state.curr_batch->first_command = command_num; + r_fill_state.curr_batch->num_commands = num_inds; + // r_fill_state.curr_batch->num_elements = num_inds; + r_fill_state.curr_batch->first_vert = bdata.total_verts; + } else { + // we could alternatively do the count when closing a batch .. 
perhaps more efficient
+		r_fill_state.curr_batch->num_commands += num_inds;
+	}
+
+	// PRECALCULATE THE COLORS (there may be fewer colors than there are indices,
+	// in either the hardware or software path)
+	BatchColor vcol;
+	int num_verts = p_poly->points.size();
+
+	// in special cases, only 1 color is specified by convention, so we want to preset this
+	// to use in all verts.
+	if (p_poly->colors.size())
+		vcol.set(p_poly->colors[0] * modulate);
+	else
+		// color is undefined, use modulate color straight
+		vcol.set(modulate);
+
+	BatchColor *precalced_colors = (BatchColor *)alloca(num_verts * sizeof(BatchColor));
+
+	// two-stage setup of the precalculated colors:
+	// fill the specified colors first, then pad the remainder with the last color
+	int num_colors_specified = p_poly->colors.size();
+
+	for (int n = 0; n < num_colors_specified; n++) {
+		vcol.set(p_poly->colors[n] * modulate);
+		precalced_colors[n] = vcol;
+	}
+	for (int n = num_colors_specified; n < num_verts; n++) {
+		precalced_colors[n] = vcol;
+	}
+
+	if (!_software_skin_poly(p_poly, p_item, bvs, vertex_colors, r_fill_state, precalced_colors)) {
+		for (int n = 0; n < num_inds; n++) {
+			int ind = p_poly->indices[n];
+
+			RAST_DEV_DEBUG_ASSERT(ind < p_poly->points.size());
+
+			// this could be moved outside the loop
+			if (r_fill_state.transform_mode != TM_NONE) {
+				Vector2 pos = p_poly->points[ind];
+				_software_transform_vertex(pos, r_fill_state.transform_combined);
+				bvs[n].pos.set(pos.x, pos.y);
+			} else {
+				const Point2 &pos = p_poly->points[ind];
+				bvs[n].pos.set(pos.x, pos.y);
+			}
+
+			if (ind < p_poly->uvs.size()) {
+				const Point2 &uv = p_poly->uvs[ind];
+				bvs[n].uv.set(uv.x, uv.y);
+			} else {
+				bvs[n].uv.set(0.0f, 0.0f);
+			}
+
+			vertex_colors[n] = precalced_colors[ind];
+
+			if (use_modulate) {
+				vertex_modulates[n] = vertex_modulates[0];
+			}
+
+			if (use_large_verts) {
+				// reuse precalced transform (same for each vertex within polygon)
+				pBT[n] = pBT[0];
+			}
+		}
+	} // if not software skinning
+	else {
+		// software skinning extra passes
+		if (use_modulate) {
+			for (int n = 0; n < num_inds; n++) {
+				vertex_modulates[n] = vertex_modulates[0];
+			}
+		}
+		// not sure if this will produce garbage if software skinning is changing vertex pos
+		// in the shader, but is included for completeness
+		if (use_large_verts) {
+			for (int n = 0; n < num_inds; n++) {
+				pBT[n] = pBT[0];
+			}
+		}
+	}
+
+	// increment total vert count
+	bdata.total_verts += num_inds;
+
+	return false;
+}
+
+PREAMBLE(bool)::_software_skin_poly(RendererCanvasRender::Item::CommandPolygon *p_poly, RendererCanvasRender::Item *p_item, BatchVertex *bvs, BatchColor *vertex_colors, const FillState &p_fill_state, const BatchColor *p_precalced_colors) {
+	// alternatively could check get_this()->state.using_skeleton
+	if (p_item->skeleton == RID())
+		return false;
+
+	int num_inds = p_poly->indices.size();
+	int num_verts = p_poly->points.size();
+
+	RID skeleton = p_item->skeleton;
+	int bone_count = RendererStorage::base_singleton->skeleton_get_bone_count(skeleton);
+
+	// we want a temporary buffer of positions to transform
+	Vector2 *pTemps = (Vector2 *)alloca(num_verts * sizeof(Vector2));
+	memset((void *)pTemps, 0, num_verts * sizeof(Vector2));
+
+	// these are used in the shader but don't appear to be needed for software transform
+	// const Transform2D &skel_trans = get_this()->state.skeleton_transform;
+	// const Transform2D &skel_trans_inv = get_this()->state.skeleton_transform_inverse;
+
+	// get the bone transforms.
+	// this is not ideal because we don't know in advance which bones are needed
+	// for any particular poly, but it depends on how cheap the skeleton_bone_get_transform_2d call is
+	Transform2D *bone_transforms = (Transform2D *)alloca(bone_count * sizeof(Transform2D));
+	for (int b = 0; b < bone_count; b++) {
+		bone_transforms[b] = RendererStorage::base_singleton->skeleton_bone_get_transform_2d(skeleton, b);
+	}
+
+	if (num_verts && (p_poly->bones.size() == num_verts * 4) && (p_poly->weights.size() == p_poly->bones.size())) {
+		const Transform2D &item_transform = p_item->xform;
+		Transform2D item_transform_inv = item_transform.affine_inverse();
+
+		for (int n = 0; n < num_verts; n++) {
+			const Vector2 &src_pos = p_poly->points[n];
+			Vector2 &dst_pos = pTemps[n];
+
+			// there can be an offset on the polygon at rigging time, this has to be accounted for
+			// note it may be possible that this could be concatenated with the bone transforms to save extra transforms - not sure yet
+			Vector2 src_pos_back_transformed = item_transform.xform(src_pos);
+
+			float total_weight = 0.0f;
+
+			for (int k = 0; k < 4; k++) {
+				int bone_id = p_poly->bones[n * 4 + k];
+				float weight = p_poly->weights[n * 4 + k];
+				if (weight == 0.0f)
+					continue;
+
+				total_weight += weight;
+
+				RAST_DEBUG_ASSERT(bone_id < bone_count);
+				const Transform2D &bone_tr = bone_transforms[bone_id];
+
+				Vector2 pos = bone_tr.xform(src_pos_back_transformed);
+
+				dst_pos += pos * weight;
+			}
+
+			// this is some unexplained weirdness with verts with no weights,
+			// but it seemed to work for the example project ... watch for regressions
+			if (total_weight < 0.01f)
+				dst_pos = src_pos;
+			else {
+				dst_pos /= total_weight;
+
+				// retransform back from the poly offset space
+				dst_pos = item_transform_inv.xform(dst_pos);
+			}
+		}
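+
+		// In effect, each vertex above has just been blended as (illustrative comment
+		// sketch only, not extra code):
+		//
+		//   dst = item_xform_inv * ( sum_k( w_k * bone_k * (item_xform * src) ) / sum_k( w_k ) )
+		//
+		// i.e. move the vertex from the rigging offset space into skeleton space, blend it
+		// through up to 4 weighted bone transforms, renormalize, then transform back.
+
+		// software transform with combined matrix?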
+ if (p_fill_state.transform_mode != TM_NONE) { + for (int n = 0; n < num_verts; n++) { + Vector2 &dst_pos = pTemps[n]; + _software_transform_vertex(dst_pos, p_fill_state.transform_combined); + } + } + + } // if bone format matches + else { + // not supported + } + + // output to the batch verts + for (int n = 0; n < num_inds; n++) { + int ind = p_poly->indices[n]; + + RAST_DEV_DEBUG_ASSERT(ind < num_verts); + const Point2 &pos = pTemps[ind]; + bvs[n].pos.set(pos.x, pos.y); + + if (ind < p_poly->uvs.size()) { + const Point2 &uv = p_poly->uvs[ind]; + bvs[n].uv.set(uv.x, uv.y); + } else { + bvs[n].uv.set(0.0f, 0.0f); + } + + vertex_colors[n] = p_precalced_colors[ind]; + } + + return true; +} + +T_PREAMBLE +template <bool SEND_LIGHT_ANGLES> +bool C_PREAMBLE::_prefill_rect(RendererCanvasRender::Item::CommandRect *rect, FillState &r_fill_state, int &r_command_start, int command_num, int command_count, RendererCanvasRender::Item::Command *const *commands, RendererCanvasRender::Item *p_item, bool multiply_final_modulate) { + bool change_batch = false; + + // conditions for creating a new batch + if (r_fill_state.curr_batch->type != RasterizerStorageCommon::BT_RECT) { + // don't allow joining to a different sequence type + if (r_fill_state.sequence_batch_type_flags & (~RasterizerStorageCommon::BTF_RECT)) { + // don't allow joining to a different sequence type + r_command_start = command_num; + return true; + } + r_fill_state.sequence_batch_type_flags |= RasterizerStorageCommon::BTF_RECT; + + change_batch = true; + + // check for special case if there is only a single or small number of rects, + // in which case we will use the legacy default rect renderer + // because it is faster for single rects + + // we only want to do this if not a joined item with more than 1 item, + // because joined items with more than 1, the command * will be incorrect + // NOTE - this is assuming that use_hardware_transform means that it is a non-joined item!! + // If that assumption is incorrect this will go horribly wrong. + if (bdata.settings_use_single_rect_fallback && r_fill_state.use_hardware_transform) { + bool is_single_rect = false; + int command_num_next = command_num + 1; + if (command_num_next < command_count) { + RendererCanvasRender::Item::Command *command_next = commands[command_num_next]; + if ((command_next->type != RendererCanvasRender::Item::Command::TYPE_RECT) && (command_next->type != RendererCanvasRender::Item::Command::TYPE_TRANSFORM)) { + is_single_rect = true; + } + } else { + is_single_rect = true; + } + // if it is a rect on its own, do exactly the same as the default routine + if (is_single_rect) { + _prefill_default_batch(r_fill_state, command_num, *p_item); + return false; + } + } // if use hardware transform + } + + // try to create vertices BEFORE creating a batch, + // because if the vertex buffer is full, we need to finish this + // function, draw what we have so far, and then start a new set of batches + + // request FOUR vertices at a time, this is more efficient + BatchVertex *bvs = bdata.vertices.request(4); + if (!bvs) { + // run out of space in the vertex buffer .. finish this function and draw what we have so far + // return where we got to + r_command_start = command_num; + return true; + } + + // are we using large FVF? 
+ const bool use_large_verts = bdata.use_large_verts; + const bool use_modulate = bdata.use_modulate; + + Color col = rect->modulate; + + if (!use_large_verts) { + if (multiply_final_modulate) { + col *= r_fill_state.final_modulate; + } + } + + // instead of doing all the texture preparation for EVERY rect, + // we build a list of texture combinations and do this once off. + // This means we have a potentially rather slow step to identify which texture combo + // using the RIDs. + int old_batch_tex_id = r_fill_state.batch_tex_id; + +#ifdef GODOT_4 + r_fill_state.batch_tex_id = _batch_find_or_create_tex(rect->texture, RID(), rect->flags & RendererCanvasRender::CANVAS_RECT_TILE, old_batch_tex_id); +#else + r_fill_state.batch_tex_id = _batch_find_or_create_tex(rect->texture, rect->normal_map, rect->flags & RendererCanvasRender::CANVAS_RECT_TILE, old_batch_tex_id); +#endif + + //r_fill_state.use_light_angles = send_light_angles; + if (SEND_LIGHT_ANGLES) { + bdata.use_light_angles = true; + } + + // conditions for creating a new batch + if (old_batch_tex_id != r_fill_state.batch_tex_id) { + change_batch = true; + } + + // we need to treat color change separately because we need to count these + // to decide whether to switch on the fly to colored vertices. + if (!r_fill_state.curr_batch->color.equals(col)) { + change_batch = true; + bdata.total_color_changes++; + } + + if (change_batch) { + // put the tex pixel size in a local (less verbose and can be a register) + const BatchTex &batchtex = bdata.batch_textures[r_fill_state.batch_tex_id]; + batchtex.tex_pixel_size.to(r_fill_state.texpixel_size); + + if (bdata.settings_uv_contract) { + r_fill_state.contract_uvs = (batchtex.flags & GD_VS::TEXTURE_FLAG_FILTER) == 0; + } + + // need to preserve texpixel_size between items + //r_fill_state.texpixel_size = r_fill_state.texpixel_size; + + // open new batch (this should never fail, it dynamically grows) + r_fill_state.curr_batch = _batch_request_new(false); + + r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_RECT; + r_fill_state.curr_batch->color.set(col); + r_fill_state.curr_batch->batch_texture_id = r_fill_state.batch_tex_id; + r_fill_state.curr_batch->first_command = command_num; + r_fill_state.curr_batch->num_commands = 1; + //r_fill_state.curr_batch->first_quad = bdata.total_quads; + r_fill_state.curr_batch->first_vert = bdata.total_verts; + } else { + // we could alternatively do the count when closing a batch .. perhaps more efficient + r_fill_state.curr_batch->num_commands++; + } + + // fill the quad geometry + Vector2 mins = rect->rect.position; + + if (r_fill_state.transform_mode == TM_TRANSLATE) { + if (!use_large_verts) { + _software_transform_vertex(mins, r_fill_state.transform_combined); + } + } + + Vector2 maxs = mins + rect->rect.size; + + // just aliases + BatchVertex *bA = &bvs[0]; + BatchVertex *bB = &bvs[1]; + BatchVertex *bC = &bvs[2]; + BatchVertex *bD = &bvs[3]; + + bA->pos.x = mins.x; + bA->pos.y = mins.y; + + bB->pos.x = maxs.x; + bB->pos.y = mins.y; + + bC->pos.x = maxs.x; + bC->pos.y = maxs.y; + + bD->pos.x = mins.x; + bD->pos.y = maxs.y; + + // possibility of applying flips here for normal mapping .. 
but they don't seem to be used
+	if (rect->rect.size.x < 0) {
+		SWAP(bA->pos, bB->pos);
+		SWAP(bC->pos, bD->pos);
+	}
+	if (rect->rect.size.y < 0) {
+		SWAP(bA->pos, bD->pos);
+		SWAP(bB->pos, bC->pos);
+	}
+
+	if (r_fill_state.transform_mode == TM_ALL) {
+		if (!use_large_verts) {
+			_software_transform_vertex(bA->pos, r_fill_state.transform_combined);
+			_software_transform_vertex(bB->pos, r_fill_state.transform_combined);
+			_software_transform_vertex(bC->pos, r_fill_state.transform_combined);
+			_software_transform_vertex(bD->pos, r_fill_state.transform_combined);
+		}
+	}
+
+	// uvs
+	Vector2 src_min;
+	Vector2 src_max;
+	if (rect->flags & RendererCanvasRender::CANVAS_RECT_REGION) {
+		src_min = rect->source.position;
+		src_max = src_min + rect->source.size;
+
+		src_min *= r_fill_state.texpixel_size;
+		src_max *= r_fill_state.texpixel_size;
+
+		const float uv_epsilon = bdata.settings_uv_contract_amount;
+
+		// contract the UVs slightly inward to prevent precision errors causing the GPU
+		// to sample a texel line just outside the source rect.
+		// this is very difficult to get right.
+		if (r_fill_state.contract_uvs) {
+			src_min.x += uv_epsilon;
+			src_min.y += uv_epsilon;
+			src_max.x -= uv_epsilon;
+			src_max.y -= uv_epsilon;
+		}
+	} else {
+		src_min = Vector2(0, 0);
+		src_max = Vector2(1, 1);
+	}
+
+	// 10% faster calculating the max first
+	Vector2 uvs[4] = {
+		src_min,
+		Vector2(src_max.x, src_min.y),
+		src_max,
+		Vector2(src_min.x, src_max.y),
+	};
+
+	// flips are tracked for encoding in the light angle.
+	// they should be optimized out when light angles are not in use.
+	bool flip_h = false;
+	bool flip_v = false;
+
+	if (rect->flags & RendererCanvasRender::CANVAS_RECT_TRANSPOSE) {
+		SWAP(uvs[1], uvs[3]);
+	}
+
+	if (rect->flags & RendererCanvasRender::CANVAS_RECT_FLIP_H) {
+		SWAP(uvs[0], uvs[1]);
+		SWAP(uvs[2], uvs[3]);
+		flip_h = !flip_h;
+		flip_v = !flip_v;
+	}
+	if (rect->flags & RendererCanvasRender::CANVAS_RECT_FLIP_V) {
+		SWAP(uvs[0], uvs[3]);
+		SWAP(uvs[1], uvs[2]);
+		flip_v = !flip_v;
+	}
+
+	bA->uv.set(uvs[0]);
+	bB->uv.set(uvs[1]);
+	bC->uv.set(uvs[2]);
+	bD->uv.set(uvs[3]);
+
+	// modulate
+	if (use_modulate) {
+		// store the final modulate separately from the rect modulate
+		BatchColor *pBC = bdata.vertex_modulates.request(4);
+		RAST_DEBUG_ASSERT(pBC);
+		pBC[0].set(r_fill_state.final_modulate);
+		pBC[1] = pBC[0];
+		pBC[2] = pBC[0];
+		pBC[3] = pBC[0];
+	}
+
+	if (use_large_verts) {
+		// store the transform separately
+		BatchTransform *pBT = bdata.vertex_transforms.request(4);
+		RAST_DEBUG_ASSERT(pBT);
+
+		const Transform2D &tr = r_fill_state.transform_combined;
+
+		pBT[0].translate.set(tr.elements[2]);
+		// could do swizzling in shader?
+		pBT[0].basis[0].set(tr.elements[0][0], tr.elements[1][0]);
+		pBT[0].basis[1].set(tr.elements[0][1], tr.elements[1][1]);
+
+		pBT[1] = pBT[0];
+		pBT[2] = pBT[0];
+		pBT[3] = pBT[0];
+	}
+
+	if (SEND_LIGHT_ANGLES) {
+		// we can either keep the light angles in sync with the verts when writing,
+		// or sync them up during translation. We are syncing in translation.
+		// N.B. There may be batches that don't require light_angles between batches that do.
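+
+		// For reference, a decoding sketch of the scheme used below (illustrative only;
+		// the angle is reconstructed on the shader side):
+		//
+		//   float a = abs(light_angle) - 1.0;   // remove the +1.0 zero-sign guard
+		//   bool vflip = light_angle < 0.0;     // the sign encodes a vertical flip
+		//   vec2 dir = vec2(cos(a), sin(a));    // rotation of the rect's x axis
+		//
+		// A horizontal flip is pre-baked by rotating the angle 180 degrees.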
+		float *angles = bdata.light_angles.request(4);
+		RAST_DEBUG_ASSERT(angles);
+
+		float angle = 0.0f;
+		const float TWO_PI = Math_PI * 2;
+
+		if (r_fill_state.transform_mode != TM_NONE) {
+			const Transform2D &tr = r_fill_state.transform_combined;
+
+			// apply to the x axis
+			// the x axis and y axis can be taken directly from the transform (no need to xform identity vectors)
+			Vector2 x_axis(tr.elements[0][0], tr.elements[1][0]);
+
+			// have to do a y axis to check for scaling flips
+			// this is a hassle and adds extra slowness. We could only allow flips via the flags.
+			Vector2 y_axis(tr.elements[0][1], tr.elements[1][1]);
+
+			// has the x / y axis been flipped due to scaling?
+			float cross = x_axis.cross(y_axis);
+			if (cross < 0.0f) {
+				flip_v = !flip_v;
+			}
+
+			// passing an angle is smaller than a vector, it can be reconstructed in the shader
+			angle = x_axis.angle();
+
+			// we don't want negative angles, as negative is used to encode flips.
+			// This moves the range from -PI to PI to 0 to TWO_PI
+			if (angle < 0.0f)
+				angle += TWO_PI;
+
+		} // if transform needed
+
+		// if horizontal flip, angle is shifted by 180 degrees
+		if (flip_h) {
+			angle += Math_PI;
+
+			// mod to get back to 0 to TWO_PI range
+			angle = fmodf(angle, TWO_PI);
+		}
+
+		// add 1 (to take care of zero floating point error with sign)
+		angle += 1.0f;
+
+		// flip if necessary to indicate a vertical flip in the shader
+		if (flip_v)
+			angle *= -1.0f;
+
+		// the light angle must be sent for each vert, rather than as a single uniform
+		// as in the legacy (uniform based) draw method.
+		// this has the benefit of enabling batching with light angles.
+		for (int n = 0; n < 4; n++) {
+			angles[n] = angle;
+		}
+	}
+
+	// increment quad count
+	bdata.total_quads++;
+	bdata.total_verts += 4;
+
+	return false;
+}
+
+// This function may be called MULTIPLE TIMES for each item, so needs to record how far it has got
+PREAMBLE(bool)::prefill_joined_item(FillState &r_fill_state, int &r_command_start, RendererCanvasRender::Item *p_item, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material) {
+	// we will prefill batches and vertices ready for sending in one go to the vertex buffer
+	int command_count = p_item->commands.size();
+	RendererCanvasRender::Item::Command *const *commands = p_item->commands.ptr();
+
+	// checking the color for not being white makes this slightly faster (a measured ratio
+	// of roughly 92/90) in the common case where it is white
+	bool multiply_final_modulate = false;
+	if (!r_fill_state.use_hardware_transform && (r_fill_state.final_modulate != Color(1, 1, 1, 1))) {
+		multiply_final_modulate = true;
+	}
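+
+	// Illustrative sketch of the caller's contract (paraphrased from
+	// render_joined_item_commands below, not extra code):
+	//
+	//   int command_start = 0;
+	//   while (command_start < command_count) {
+	//       if (prefill_joined_item(fill_state, command_start, item, ...)) {
+	//           // returned true: the vertex buffer filled up mid-item, so flush
+	//           // what we have and resume from the recorded command_start
+	//           flush_render_batches(first_item, ...);
+	//       }
+	//   }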
+
+	// start batch is a dummy batch (tex id -1) .. could be made more efficient
+	if (!r_fill_state.curr_batch) {
+		// OLD METHOD, but left dangling zero length default batches
+		// r_fill_state.curr_batch = _batch_request_new();
+		// r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_DEFAULT;
+		// r_fill_state.curr_batch->first_command = r_command_start;
+		// should tex_id be set to -1? check this
+
+		// allocate dummy batch on the stack, it should always get replaced
+		// note that the rest of the structure is uninitialized, this should not matter
+		// provided the type is checked before anything else.
+		r_fill_state.curr_batch = (Batch *)alloca(sizeof(Batch));
+		r_fill_state.curr_batch->type = RasterizerStorageCommon::BT_DUMMY;
+
+		// this is assumed to be the case
+		//CRASH_COND (r_fill_state.transform_extra_command_number_p1);
+	}
+
+	// we need to return which command we got up to, so
+	// store this outside the loop
+	int command_num;
+
+	// do as many commands as possible until the vertex buffer will be full up
+	for (command_num = r_command_start; command_num < command_count; command_num++) {
+		RendererCanvasRender::Item::Command *command = commands[command_num];
+
+		switch (command->type) {
+			default: {
+				_prefill_default_batch(r_fill_state, command_num, *p_item);
+			} break;
+			case RendererCanvasRender::Item::Command::TYPE_TRANSFORM: {
+				// if the extra matrix has been sent already,
+				// break this extra matrix software path (as we don't want to unset it on the GPU etc)
+				if (r_fill_state.extra_matrix_sent) {
+					_prefill_default_batch(r_fill_state, command_num, *p_item);
+
+					// keep track of the combined matrix on the CPU in parallel, in case we use large vertex format
+					RendererCanvasRender::Item::CommandTransform *transform = static_cast<RendererCanvasRender::Item::CommandTransform *>(command);
+					const Transform2D &extra_matrix = transform->xform;
+					r_fill_state.transform_combined = p_item->final_transform * extra_matrix;
+				} else {
+					// Extra matrix fast path.
+					// Instead of sending the command immediately, we store the modified transform (in combined)
+					// for software transform, and only flush this transform command if we NEED to (i.e. we want to
+					// render some default commands)
+					RendererCanvasRender::Item::CommandTransform *transform = static_cast<RendererCanvasRender::Item::CommandTransform *>(command);
+					const Transform2D &extra_matrix = transform->xform;
+
+					if (r_fill_state.use_hardware_transform) {
+						// if we are using hardware transform mode, we have already sent the final transform,
+						// so we only want to software transform the extra matrix
+						r_fill_state.transform_combined = extra_matrix;
+					} else {
+						r_fill_state.transform_combined = p_item->final_transform * extra_matrix;
+					}
+					// after a transform command, always use some form of software transform (either the combined final + extra, or just the extra)
+					// until we flush this dirty extra matrix because we need to render default commands.
+					r_fill_state.transform_mode = _find_transform_mode(r_fill_state.transform_combined);
+
+					// make a note of which command the dirty extra matrix is stored in, so we can send it later
+					// if necessary
+					r_fill_state.transform_extra_command_number_p1 = command_num + 1; // plus 1 so we can test against zero
+				}
+			} break;
+			case RendererCanvasRender::Item::Command::TYPE_RECT: {
+				RendererCanvasRender::Item::CommandRect *rect = static_cast<RendererCanvasRender::Item::CommandRect *>(command);
+
+				// unoptimized - could this be done once per batch / batch texture?
+				bool send_light_angles = rect->normal_map != RID();
+
+				bool buffer_full = false;
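+
+				// Illustrative sketch of the dispatch pattern used here: a runtime flag is
+				// promoted to a compile-time template parameter, so the per-vertex branch
+				// is compiled out of the hot loop:
+				//
+				//   if (flag)  do_work<true>(...);   // instantiation with the feature on
+				//   else       do_work<false>(...);  // instantiation with the feature off
+
+				// the template params must be explicit for compilation,
+				// this forces building the multiple versions of the function.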
+ if (send_light_angles) { + buffer_full = _prefill_rect<true>(rect, r_fill_state, r_command_start, command_num, command_count, commands, p_item, multiply_final_modulate); + } else { + buffer_full = _prefill_rect<false>(rect, r_fill_state, r_command_start, command_num, command_count, commands, p_item, multiply_final_modulate); + } + + if (buffer_full) + return true; + + } break; + case RendererCanvasRender::Item::Command::TYPE_NINEPATCH: { + RendererCanvasRender::Item::CommandNinePatch *np = static_cast<RendererCanvasRender::Item::CommandNinePatch *>(command); + + if ((np->axis_x != VisualServer::NINE_PATCH_STRETCH) || (np->axis_y != VisualServer::NINE_PATCH_STRETCH)) { + // not accelerated + _prefill_default_batch(r_fill_state, command_num, *p_item); + continue; + } + + // unoptimized - could this be done once per batch / batch texture? + bool send_light_angles = np->normal_map != RID(); + + bool buffer_full = false; + + if (send_light_angles) + buffer_full = _prefill_ninepatch<true>(np, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); + else + buffer_full = _prefill_ninepatch<false>(np, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); + + if (buffer_full) + return true; + + } break; + + case RendererCanvasRender::Item::Command::TYPE_LINE: { + RendererCanvasRender::Item::CommandLine *line = static_cast<RendererCanvasRender::Item::CommandLine *>(command); + + if (line->width <= 1) { + bool buffer_full = _prefill_line(line, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); + + if (buffer_full) + return true; + } else { + // not accelerated + _prefill_default_batch(r_fill_state, command_num, *p_item); + } + } break; + + case RendererCanvasRender::Item::Command::TYPE_POLYGON: { + RendererCanvasRender::Item::CommandPolygon *polygon = static_cast<RendererCanvasRender::Item::CommandPolygon *>(command); +#ifdef GLES_OVER_GL + // anti aliasing not accelerated .. it is problematic because it requires a 2nd line drawn around the outside of each + // poly, which would require either a second list of indices or a second list of vertices for this step + if (polygon->antialiased) { + // not accelerated + _prefill_default_batch(r_fill_state, command_num, *p_item); + } else { +#endif + // not using software skinning? + if (!bdata.settings_use_software_skinning && get_this()->state.using_skeleton) { + // not accelerated + _prefill_default_batch(r_fill_state, command_num, *p_item); + } else { + // unoptimized - could this be done once per batch / batch texture? + bool send_light_angles = polygon->normal_map != RID(); + + bool buffer_full = false; + + if (send_light_angles) { + // NYI + _prefill_default_batch(r_fill_state, command_num, *p_item); + //buffer_full = prefill_polygon<true>(polygon, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); + } else + buffer_full = _prefill_polygon<false>(polygon, r_fill_state, r_command_start, command_num, command_count, p_item, multiply_final_modulate); + + if (buffer_full) + return true; + } // if not using hardware skinning path +#ifdef GLES_OVER_GL + } // if not anti-aliased poly +#endif + + } break; + } + } + + // VERY IMPORTANT to return where we got to, because this func may be called multiple + // times per item. + // Don't miss out on this step by calling return earlier in the function without setting r_command_start. 
+ r_command_start = command_num; + + return false; +} + +PREAMBLE(void)::flush_render_batches(RendererCanvasRender::Item *p_first_item, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, uint32_t p_sequence_batch_type_flags) { + // some heuristic to decide whether to use colored verts. + // feel free to tweak this. + // this could use hysteresis, to prevent jumping between methods + // .. however probably not necessary + bdata.use_colored_vertices = false; + + RasterizerStorageCommon::FVF backup_fvf = bdata.fvf; + + // the batch type in this flush can override the fvf from the joined item. + // The joined item uses the material to determine fvf, assuming a rect... + // however with custom drawing, lines or polys may be drawn. + // lines contain no color (this is stored in the batch), and polys contain vertex and color only. + if (p_sequence_batch_type_flags & (RasterizerStorageCommon::BTF_LINE | RasterizerStorageCommon::BTF_LINE_AA)) { + // do nothing, use the default regular FVF + bdata.fvf = RasterizerStorageCommon::FVF_REGULAR; + } else { + // switch from regular to colored? + if (bdata.fvf == RasterizerStorageCommon::FVF_REGULAR) { + // only check whether to convert if there are quads (prevent divide by zero) + // and we haven't decided to prevent color baking (due to e.g. MODULATE + // being used in a shader) + if (bdata.total_quads && !(bdata.joined_item_batch_flags & RasterizerStorageCommon::PREVENT_COLOR_BAKING)) { + // minus 1 to prevent single primitives (ratio 1.0) always being converted to colored.. + // in that case it is slightly cheaper to just have the color as part of the batch + float ratio = (float)(bdata.total_color_changes - 1) / (float)bdata.total_quads; + + // use bigger than or equal so that 0.0 threshold can force always using colored verts + if (ratio >= bdata.settings_colored_vertex_format_threshold) { + bdata.use_colored_vertices = true; + bdata.fvf = RasterizerStorageCommon::FVF_COLOR; + } + } + + // if we used vertex colors + if (bdata.vertex_colors.size()) { + bdata.use_colored_vertices = true; + bdata.fvf = RasterizerStorageCommon::FVF_COLOR; + } + + // needs light angles? 
+ if (bdata.use_light_angles) { + bdata.fvf = RasterizerStorageCommon::FVF_LIGHT_ANGLE; + } + } + + backup_fvf = bdata.fvf; + } // if everything else except lines + + // translate if required to larger FVFs + switch (bdata.fvf) { + case RasterizerStorageCommon::FVF_UNBATCHED: // should not happen + break; + case RasterizerStorageCommon::FVF_REGULAR: // no change + break; + case RasterizerStorageCommon::FVF_COLOR: { + // special case, where vertex colors are used (polys) + if (!bdata.vertex_colors.size()) + _translate_batches_to_larger_FVF<BatchVertexColored, false, false, false>(p_sequence_batch_type_flags); + else + // normal, reduce number of batches by baking batch colors + _translate_batches_to_vertex_colored_FVF(); + } break; + case RasterizerStorageCommon::FVF_LIGHT_ANGLE: + _translate_batches_to_larger_FVF<BatchVertexLightAngled, true, false, false>(p_sequence_batch_type_flags); + break; + case RasterizerStorageCommon::FVF_MODULATED: + _translate_batches_to_larger_FVF<BatchVertexModulated, true, true, false>(p_sequence_batch_type_flags); + break; + case RasterizerStorageCommon::FVF_LARGE: + _translate_batches_to_larger_FVF<BatchVertexLarge, true, true, true>(p_sequence_batch_type_flags); + break; + } + + // send buffers to opengl + get_this()->_batch_upload_buffers(); + + RendererCanvasRender::Item::Command *const *commands = p_first_item->commands.ptr(); + +#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED) + if (bdata.diagnose_frame) { + diagnose_batches(commands); + } +#endif + + get_this()->render_batches(commands, p_current_clip, r_reclip, p_material); + + // if we overrode the fvf for lines, set it back to the joined item fvf + bdata.fvf = backup_fvf; + + // overwrite source buffers with garbage if error checking +#ifdef RASTERIZER_EXTRA_CHECKS + _debug_write_garbage(); +#endif +} + +#endif // godot 3 + +PREAMBLE(void)::render_joined_item_commands(const BItemJoined &p_bij, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material, bool p_lit) { + RendererCanvasRender::Item *item = 0; + RendererCanvasRender::Item *first_item = bdata.item_refs[p_bij.first_item_ref].item; + + // fill_state and bdata have once off setup per joined item, and a smaller reset on flush + FillState fill_state; + fill_state.reset_joined_item(p_bij.use_hardware_transform()); + + bdata.reset_joined_item(); + + // should this joined item be using large FVF? + if (p_bij.flags & RasterizerStorageCommon::USE_MODULATE_FVF) { + bdata.use_modulate = true; + bdata.fvf = RasterizerStorageCommon::FVF_MODULATED; + } + if (p_bij.flags & RasterizerStorageCommon::USE_LARGE_FVF) { + bdata.use_modulate = true; + bdata.use_large_verts = true; + bdata.fvf = RasterizerStorageCommon::FVF_LARGE; + } + + // in the special case of custom shaders that read from VERTEX (i.e. 
vertex position) + // we want to disable software transform of extra matrix + if (bdata.joined_item_batch_flags & RasterizerStorageCommon::PREVENT_VERTEX_BAKING) { + fill_state.extra_matrix_sent = true; + } + + for (unsigned int i = 0; i < p_bij.num_item_refs; i++) { + const BItemRef &ref = bdata.item_refs[p_bij.first_item_ref + i]; + item = ref.item; + + if (!p_lit) { + // if not lit we use the complex calculated final modulate + fill_state.final_modulate = ref.final_modulate; + } else { + // if lit we ignore canvas modulate and just use the item modulate + fill_state.final_modulate = item->final_modulate; + } + +#ifdef GODOT_3 + int command_count = item->commands.size(); + int command_start = 0; +#endif + + // ONCE OFF fill state setup, that will be retained over multiple calls to + // prefill_joined_item() + fill_state.transform_combined = item->final_transform; + + // decide the initial transform mode, and make a backup + // in orig_transform_mode in case we need to switch back + if (!fill_state.use_hardware_transform) { + fill_state.transform_mode = _find_transform_mode(fill_state.transform_combined); + } else { + fill_state.transform_mode = TM_NONE; + } + fill_state.orig_transform_mode = fill_state.transform_mode; + + // keep track of when we added an extra matrix + // so we can defer sending until we see a default command + fill_state.transform_extra_command_number_p1 = 0; + +#ifdef GODOT_4 + RendererCanvasRender::Item::Command *current_command = item->commands; + while (current_command) { +#else + while (command_start < command_count) { +#endif + // fill as many batches as possible (until all done, or the vertex buffer is full) +#ifdef GODOT_4 + bool bFull = get_this()->prefill_joined_item(fill_state, current_command, item, p_current_clip, r_reclip, p_material); +#else + bool bFull = get_this()->prefill_joined_item(fill_state, command_start, item, p_current_clip, r_reclip, p_material); +#endif + + if (bFull) { + // always pass first item (commands for default are always first item) + flush_render_batches(first_item, p_current_clip, r_reclip, p_material, fill_state.sequence_batch_type_flags); + + // zero all the batch data ready for a new run + bdata.reset_flush(); + + // don't zero all the fill state, some may need to be preserved + fill_state.reset_flush(); + } + } + } + + // flush if any left + flush_render_batches(first_item, p_current_clip, r_reclip, p_material, fill_state.sequence_batch_type_flags); + + // zero all the batch data ready for a new run + bdata.reset_flush(); +} + +PREAMBLE(void)::_legacy_canvas_item_render_commands(RendererCanvasRender::Item *p_item, RendererCanvasRender::Item *p_current_clip, bool &r_reclip, typename T_STORAGE::Material *p_material) { +#ifdef GODOT_3 + int command_count = p_item->commands.size(); + RendererCanvasRender::Item::Command *const *commands = p_item->commands.ptr(); +#else + // reuse the same list each time to prevent needless dynamic allocations + unsigned int command_count = godot4_commands_to_vector(p_item->commands, bdata.command_shortlist); + RendererCanvasRender::Item::Command *const *commands = nullptr; + if (command_count) + commands = &bdata.command_shortlist[0]; +#endif + + // legacy .. 
just create one massive batch and render everything as before + bdata.batches.reset(); + Batch *batch = _batch_request_new(); + batch->type = RasterizerStorageCommon::BT_DEFAULT; + batch->num_commands = command_count; + + get_this()->render_batches(commands, p_current_clip, r_reclip, p_material); + bdata.reset_flush(); +} + +PREAMBLE(void)::record_items(RendererCanvasRender::Item *p_item_list, int p_z) { + while (p_item_list) { + BSortItem *s = bdata.sort_items.request_with_grow(); + + s->item = p_item_list; + s->z_index = p_z; + + p_item_list = p_item_list->next; + } +} + +PREAMBLE(void)::join_sorted_items() { + /* + sort_items(); + + int z = GD_VS::CANVAS_ITEM_Z_MIN; + _render_item_state.item_group_z = z; + + for (int s = 0; s < bdata.sort_items.size(); s++) { + const BSortItem &si = bdata.sort_items[s]; + RendererCanvasRender::Item *ci = si.item; + + // change z? + if (si.z_index != z) { + z = si.z_index; + + // may not be required + _render_item_state.item_group_z = z; + + // if z ranged lights are present, sometimes we have to disable joining over z_indices. + // we do this here. + // Note this restriction may be able to be relaxed with light bitfields, investigate! + if (!bdata.join_across_z_indices) { + _render_item_state.join_batch_break = true; + } + } + + bool join; + + if (_render_item_state.join_batch_break) { + // always start a new batch for this item + join = false; + + // could be another batch break (i.e. prevent NEXT item from joining this) + // so we still need to run try_join_item + // even though we know join is false. + // also we need to run try_join_item for every item because it keeps the state up to date, + // if we didn't run it the state would be out of date. + get_this()->try_join_item(ci, _render_item_state, _render_item_state.join_batch_break); + } else { + join = get_this()->try_join_item(ci, _render_item_state, _render_item_state.join_batch_break); + } + + // assume the first item will always return no join + if (!join) { + _render_item_state.joined_item = bdata.items_joined.request_with_grow(); + _render_item_state.joined_item->first_item_ref = bdata.item_refs.size(); + _render_item_state.joined_item->num_item_refs = 1; + _render_item_state.joined_item->bounding_rect = ci->global_rect_cache; + _render_item_state.joined_item->z_index = z; + _render_item_state.joined_item->flags = bdata.joined_item_batch_flags; + + // we need some logic to prevent joining items that have vastly different batch types + _render_item_state.joined_item_batch_type_flags_prev = _render_item_state.joined_item_batch_type_flags_curr; + + // add the reference + BItemRef *r = bdata.item_refs.request_with_grow(); + r->item = ci; + // we are storing final_modulate in advance per item reference + // for baking into vertex colors. + // this may not be ideal... as we are increasing the size of item reference, + // but it is stupidly complex to calculate later, which would probably be slower. + r->final_modulate = _render_item_state.final_modulate; + } else { + RAST_DEBUG_ASSERT(_render_item_state.joined_item != 0); + _render_item_state.joined_item->num_item_refs += 1; + _render_item_state.joined_item->bounding_rect = _render_item_state.joined_item->bounding_rect.merge(ci->global_rect_cache); + + BItemRef *r = bdata.item_refs.request_with_grow(); + r->item = ci; + r->final_modulate = _render_item_state.final_modulate; + } + + } // for s through sort items +*/ +} + +#ifdef GODOT_3 +PREAMBLE(void)::sort_items() { + // turned off? 
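+	// (a lookahead of 0 disables item reordering entirely)
+	//
+	// Illustrative example of what reordering buys (hypothetical scene): given the draw order
+	//   [sprite A (tex1), sprite B (tex2), sprite C (tex1)]
+	// if C does not overlap B, C can be moved to draw before B, giving
+	//   [A, C, B] - A and C now share tex1 and can be batched into one draw call.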
+	if (!bdata.settings_item_reordering_lookahead) {
+		return;
+	}
+
+	for (int s = 0; s < bdata.sort_items.size() - 2; s++) {
+		if (sort_items_from(s)) {
+#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
+			bdata.stats_items_sorted++;
+#endif
+		}
+	}
+}
+
+PREAMBLE(bool)::_sort_items_match(const BSortItem &p_a, const BSortItem &p_b) const {
+	const RendererCanvasRender::Item *a = p_a.item;
+	const RendererCanvasRender::Item *b = p_b.item;
+
+	if (b->commands.size() != 1)
+		return false;
+
+	// tested outside function
+	// if (a->commands.size() != 1)
+	//	return false;
+
+	const RendererCanvasRender::Item::Command &cb = *b->commands[0];
+	if (cb.type != RendererCanvasRender::Item::Command::TYPE_RECT)
+		return false;
+
+	const RendererCanvasRender::Item::Command &ca = *a->commands[0];
+	// tested outside function
+	// if (ca.type != Item::Command::TYPE_RECT)
+	//	return false;
+
+	const RendererCanvasRender::Item::CommandRect *rect_a = static_cast<const RendererCanvasRender::Item::CommandRect *>(&ca);
+	const RendererCanvasRender::Item::CommandRect *rect_b = static_cast<const RendererCanvasRender::Item::CommandRect *>(&cb);
+
+	if (rect_a->texture != rect_b->texture)
+		return false;
+
+	/* ALTERNATIVE APPROACH NOT LIMITED TO RECTS
+const RendererCanvasRender::Item::Command &ca = *a->commands[0];
+const RendererCanvasRender::Item::Command &cb = *b->commands[0];
+
+if (ca.type != cb.type)
+	return false;
+
+// do textures match?
+switch (ca.type)
+{
+default:
+	break;
+case RendererCanvasRender::Item::Command::TYPE_RECT:
+	{
+		const RendererCanvasRender::Item::CommandRect *comm_a = static_cast<const RendererCanvasRender::Item::CommandRect *>(&ca);
+		const RendererCanvasRender::Item::CommandRect *comm_b = static_cast<const RendererCanvasRender::Item::CommandRect *>(&cb);
+		if (comm_a->texture != comm_b->texture)
+			return false;
+	}
+	break;
+case RendererCanvasRender::Item::Command::TYPE_POLYGON:
+	{
+		const RendererCanvasRender::Item::CommandPolygon *comm_a = static_cast<const RendererCanvasRender::Item::CommandPolygon *>(&ca);
+		const RendererCanvasRender::Item::CommandPolygon *comm_b = static_cast<const RendererCanvasRender::Item::CommandPolygon *>(&cb);
+		if (comm_a->texture != comm_b->texture)
+			return false;
+	}
+	break;
+}
+*/
+
+	return true;
+}
+
+PREAMBLE(bool)::sort_items_from(int p_start) {
+#if defined(TOOLS_ENABLED) && defined(DEBUG_ENABLED)
+	ERR_FAIL_COND_V((p_start + 1) >= bdata.sort_items.size(), false);
+#endif
+
+	const BSortItem &start = bdata.sort_items[p_start];
+	int start_z = start.z_index;
+
+	// check start is the right type for sorting
+	if (start.item->commands.size() != 1) {
+		return false;
+	}
+	const RendererCanvasRender::Item::Command &command_start = *start.item->commands[0];
+	if (command_start.type != RendererCanvasRender::Item::Command::TYPE_RECT) {
+		return false;
+	}
+
+	BSortItem &second = bdata.sort_items[p_start + 1];
+	if (second.z_index != start_z) {
+		// no sorting across z indices (for now)
+		return false;
+	}
+
+	// if the neighbours are already a good match
+	if (_sort_items_match(start, second)) // order is crucial, start first
+	{
+		return false;
+	}
+
+	// local cached aabb
+	Rect2 second_AABB = second.item->global_rect_cache;
+
+	// if the start and 2nd items overlap, can do no more
+	if (start.item->global_rect_cache.intersects(second_AABB)) {
+		return false;
+	}
+
+	// which neighbour to test
+	int test_last = 2 + bdata.settings_item_reordering_lookahead;
+	for (int test = 2; test < test_last; test++) {
+		int test_sort_item_id = p_start + test;
+
+		// if we've got to the end of the list, can't sort any more, give up
+		if (test_sort_item_id >= bdata.sort_items.size()) {
+			return false;
+		}
+
+		BSortItem *test_sort_item = &bdata.sort_items[test_sort_item_id];
+
+		// across z indices?
+		if (test_sort_item->z_index != start_z) {
+			return false;
+		}
+
+		RendererCanvasRender::Item *test_item = test_sort_item->item;
+
+		// if the test item overlaps the second item, we can't swap, AT ALL
+		// because swapping an item OVER this one would cause artefacts
+		if (second_AABB.intersects(test_item->global_rect_cache)) {
+			return false;
+		}
+
+		// do they match?
+		if (!_sort_items_match(start, *test_sort_item)) // order is crucial, start first
+		{
+			continue;
+		}
+
+		// we can only swap if there are no AABB overlaps with sandwiched neighbours
+		bool ok = true;
+
+		// start from 2, no need to check 1 as the second has already been checked against this item
+		// in the intersection test above
+		for (int sn = 2; sn < test; sn++) {
+			BSortItem *sandwich_neighbour = &bdata.sort_items[p_start + sn];
+			if (test_item->global_rect_cache.intersects(sandwich_neighbour->item->global_rect_cache)) {
+				ok = false;
+				break;
+			}
+		}
+		if (!ok) {
+			continue;
+		}
+
+		// it is ok to exchange them!
+		BSortItem temp;
+		temp.assign(second);
+		second.assign(*test_sort_item);
+		test_sort_item->assign(temp);
+
+		return true;
+	} // for test
+
+	return false;
+}
+#endif // godot 3
+
+PREAMBLE(void)::_software_transform_vertex(BatchVector2 &r_v, const Transform2D &p_tr) const {
+	Vector2 vc(r_v.x, r_v.y);
+	vc = p_tr.xform(vc);
+	r_v.set(vc);
+}
+
+PREAMBLE(void)::_software_transform_vertex(Vector2 &r_v, const Transform2D &p_tr) const {
+	r_v = p_tr.xform(r_v);
+}
+
+PREAMBLE(void)::_translate_batches_to_vertex_colored_FVF() {
+	// zeros the size and sets up how big each unit is
+	bdata.unit_vertices.prepare(sizeof(BatchVertexColored));
+
+	const BatchColor *source_vertex_colors = &bdata.vertex_colors[0];
+	RAST_DEBUG_ASSERT(bdata.vertex_colors.size() == bdata.vertices.size());
+
+	int num_verts = bdata.vertices.size();
+
+	for (int n = 0; n < num_verts; n++) {
+		const BatchVertex &bv = bdata.vertices[n];
+
+		BatchVertexColored *cv = (BatchVertexColored *)bdata.unit_vertices.request();
+
+		cv->pos = bv.pos;
+		cv->uv = bv.uv;
+		cv->col = *source_vertex_colors++;
+	}
+}
+
+// Translation always involves adding color to the FVF, which enables
+// joining of batches that have different colors.
+// There is a trade-off. Non-colored verts are smaller so work faster, but
+// there comes a point where it is better to just use colored verts to avoid lots of
+// batches.
+// In addition this can optionally add light angles to the FVF, necessary for normal mapping.
+T_PREAMBLE
+template <class BATCH_VERTEX_TYPE, bool INCLUDE_LIGHT_ANGLES, bool INCLUDE_MODULATE, bool INCLUDE_LARGE>
+void C_PREAMBLE::_translate_batches_to_larger_FVF(uint32_t p_sequence_batch_type_flags) {
+	bool include_poly_color = false;
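+
+	// For scale, the per-vertex sizes being traded off here, derived from the structs at
+	// the top of this file (illustrative; assumes tight packing of 32 bit floats, no padding):
+	//
+	//   BatchVertex            16 bytes (pos + uv)
+	//   BatchVertexColored     32 bytes (+ color)
+	//   BatchVertexLightAngled 36 bytes (+ light angle)
+	//   BatchVertexModulated   52 bytes (+ modulate)
+	//   BatchVertexLarge       76 bytes (+ 2x3 transform)
+
+	// we ONLY want to include the color verts in translation when using polys,
+	// as rects do not write vertex colors, only colors per batch.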
+ if (p_sequence_batch_type_flags & RasterizerStorageCommon::BTF_POLY) { + include_poly_color = INCLUDE_LIGHT_ANGLES | INCLUDE_MODULATE | INCLUDE_LARGE; + } + + // zeros the size and sets up how big each unit is + bdata.unit_vertices.prepare(sizeof(BATCH_VERTEX_TYPE)); + bdata.batches_temp.reset(); + + // As the vertices_colored and batches_temp are 'mirrors' of the non-colored version, + // the sizes should be equal, and allocations should never fail. Hence the use of debug + // asserts to check program flow, these should not occur at runtime unless the allocation + // code has been altered. + RAST_DEBUG_ASSERT(bdata.unit_vertices.max_size() == bdata.vertices.max_size()); + RAST_DEBUG_ASSERT(bdata.batches_temp.max_size() == bdata.batches.max_size()); + + Color curr_col(-1.0f, -1.0f, -1.0f, -1.0f); + + Batch *dest_batch = nullptr; + + const BatchColor *source_vertex_colors = &bdata.vertex_colors[0]; + const float *source_light_angles = &bdata.light_angles[0]; + const BatchColor *source_vertex_modulates = &bdata.vertex_modulates[0]; + const BatchTransform *source_vertex_transforms = &bdata.vertex_transforms[0]; + + // translate the batches into vertex colored batches + for (int n = 0; n < bdata.batches.size(); n++) { + const Batch &source_batch = bdata.batches[n]; + + // does source batch use light angles? + const BatchTex &btex = bdata.batch_textures[source_batch.batch_texture_id]; + bool source_batch_uses_light_angles = btex.RID_normal != RID(); + + bool needs_new_batch = true; + + if (dest_batch) { + if (dest_batch->type == source_batch.type) { + if (source_batch.type == RasterizerStorageCommon::BT_RECT) { + if (dest_batch->batch_texture_id == source_batch.batch_texture_id) { + // add to previous batch + dest_batch->num_commands += source_batch.num_commands; + needs_new_batch = false; + + // create the colored verts (only if not default) + //int first_vert = source_batch.first_quad * 4; + //int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands); + int first_vert = source_batch.first_vert; + int end_vert = first_vert + (4 * source_batch.num_commands); + + for (int v = first_vert; v < end_vert; v++) { + RAST_DEV_DEBUG_ASSERT(bdata.vertices.size()); + const BatchVertex &bv = bdata.vertices[v]; + BATCH_VERTEX_TYPE *cv = (BATCH_VERTEX_TYPE *)bdata.unit_vertices.request(); + RAST_DEBUG_ASSERT(cv); + cv->pos = bv.pos; + cv->uv = bv.uv; + cv->col = source_batch.color; + + if (INCLUDE_LIGHT_ANGLES) { + RAST_DEV_DEBUG_ASSERT(bdata.light_angles.size()); + // this is required to allow compilation with non light angle vertex. + // it should be compiled out. + BatchVertexLightAngled *lv = (BatchVertexLightAngled *)cv; + if (source_batch_uses_light_angles) + lv->light_angle = *source_light_angles++; + else + lv->light_angle = 0.0f; // dummy, unused in vertex shader (could possibly be left uninitialized, but probably bad idea) + } // if including light angles + + if (INCLUDE_MODULATE) { + RAST_DEV_DEBUG_ASSERT(bdata.vertex_modulates.size()); + BatchVertexModulated *mv = (BatchVertexModulated *)cv; + mv->modulate = *source_vertex_modulates++; + } // including modulate + + if (INCLUDE_LARGE) { + RAST_DEV_DEBUG_ASSERT(bdata.vertex_transforms.size()); + BatchVertexLarge *lv = (BatchVertexLarge *)cv; + lv->transform = *source_vertex_transforms++; + } // if including large + } + } // textures match + } else { + // default + // we can still join, but only under special circumstances + // does this ever happen? 
not sure at this stage, but left for future expansion + uint32_t source_last_command = source_batch.first_command + source_batch.num_commands; + if (source_last_command == dest_batch->first_command) { + dest_batch->num_commands += source_batch.num_commands; + needs_new_batch = false; + } // if the commands line up exactly + } + } // if both batches are the same type + + } // if dest batch is valid + + if (needs_new_batch) { + dest_batch = bdata.batches_temp.request(); + RAST_DEBUG_ASSERT(dest_batch); + + *dest_batch = source_batch; + + // create the colored verts (only if not default) + if (source_batch.type != RasterizerStorageCommon::BT_DEFAULT) { + // int first_vert = source_batch.first_quad * 4; + // int end_vert = 4 * (source_batch.first_quad + source_batch.num_commands); + int first_vert = source_batch.first_vert; + int end_vert = first_vert + (4 * source_batch.num_commands); + + for (int v = first_vert; v < end_vert; v++) { + RAST_DEV_DEBUG_ASSERT(bdata.vertices.size()); + const BatchVertex &bv = bdata.vertices[v]; + BATCH_VERTEX_TYPE *cv = (BATCH_VERTEX_TYPE *)bdata.unit_vertices.request(); + RAST_DEBUG_ASSERT(cv); + cv->pos = bv.pos; + cv->uv = bv.uv; + + // polys are special, they can have per vertex colors + if (!include_poly_color) { + cv->col = source_batch.color; + } else { + RAST_DEV_DEBUG_ASSERT(bdata.vertex_colors.size()); + cv->col = *source_vertex_colors++; + } + + if (INCLUDE_LIGHT_ANGLES) { + RAST_DEV_DEBUG_ASSERT(bdata.light_angles.size()); + // this is required to allow compilation with non light angle vertex. + // it should be compiled out. + BatchVertexLightAngled *lv = (BatchVertexLightAngled *)cv; + if (source_batch_uses_light_angles) + lv->light_angle = *source_light_angles++; + else + lv->light_angle = 0.0f; // dummy, unused in vertex shader (could possibly be left uninitialized, but probably bad idea) + } // if using light angles + + if (INCLUDE_MODULATE) { + RAST_DEV_DEBUG_ASSERT(bdata.vertex_modulates.size()); + BatchVertexModulated *mv = (BatchVertexModulated *)cv; + mv->modulate = *source_vertex_modulates++; + } // including modulate + + if (INCLUDE_LARGE) { + RAST_DEV_DEBUG_ASSERT(bdata.vertex_transforms.size()); + BatchVertexLarge *lv = (BatchVertexLarge *)cv; + lv->transform = *source_vertex_transforms++; + } // if including large + } + } + } + } + + // copy the temporary batches to the master batch list (this could be avoided but it makes the code cleaner) + bdata.batches.copy_from(bdata.batches_temp); +} + +PREAMBLE(bool)::_disallow_item_join_if_batch_types_too_different(RenderItemState &r_ris, uint32_t btf_allowed) { + r_ris.joined_item_batch_type_flags_curr |= btf_allowed; + + bool disallow = false; + + if (r_ris.joined_item_batch_type_flags_prev & (~btf_allowed)) + disallow = true; + + return disallow; +} + +#ifdef GODOT_3 +PREAMBLE(bool)::_detect_item_batch_break(RenderItemState &r_ris, RendererCanvasRender::Item *p_ci, bool &r_batch_break) { + int command_count = p_ci->commands.size(); + + // Any item that contains commands that are default + // (i.e. not handled by software transform and the batching renderer) should not be joined. + + // ALSO batched types that differ in what the vertex format is needed to be should not be + // joined. + + // In order to work this out, it does a lookahead through the commands, + // which could potentially be very expensive. 
As such it makes sense to put a limit on this + // to some small number, which will catch nearly all cases which need joining, + // but not be overly expensive in the case of items with large numbers of commands. + + // It is hard to know what this number should be, empirically, + // and this has not been fully investigated. It works to join single sprite items when set to 1 or above. + // Note that there is a cost to increasing this because it has to look in advance through + // the commands. + // On the other hand joining items where possible will usually be better up to a certain + // number where the cost of software transform is higher than separate drawcalls with hardware + // transform. + + // if there are more than this number of commands in the item, we + // don't allow joining (separate state changes, and hardware transform) + // This is set to quite a conservative (low) number until investigated properly. + // const int MAX_JOIN_ITEM_COMMANDS = 16; + + r_ris.joined_item_batch_type_flags_curr = 0; + + if (command_count > bdata.settings_max_join_item_commands) { + return true; + } else { + RendererCanvasRender::Item::Command *const *commands = p_ci->commands.ptr(); + + // run through the commands looking for one that could prevent joining + for (int command_num = 0; command_num < command_count; command_num++) { + RendererCanvasRender::Item::Command *command = commands[command_num]; + RAST_DEBUG_ASSERT(command); + + switch (command->type) { + default: { + //r_batch_break = true; + return true; + } break; + case RendererCanvasRender::Item::Command::TYPE_LINE: { + // special case, only batches certain lines + RendererCanvasRender::Item::CommandLine *line = static_cast<RendererCanvasRender::Item::CommandLine *>(command); + + if (line->width > 1) { + //r_batch_break = true; + return true; + } + + if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_LINE | RasterizerStorageCommon::BTF_LINE_AA)) { + return true; + } + } break; + case RendererCanvasRender::Item::Command::TYPE_POLYGON: { + // only allow polygons to join if they aren't skeleton + RendererCanvasRender::Item::CommandPolygon *poly = static_cast<RendererCanvasRender::Item::CommandPolygon *>(command); + +#ifdef GLES_OVER_GL + // anti aliasing not accelerated + if (poly->antialiased) + return true; +#endif + + // light angles not yet implemented, treat as default + if (poly->normal_map != RID()) + return true; + + if (!get_this()->bdata.settings_use_software_skinning && poly->bones.size()) + return true; + + if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_POLY)) { + //r_batch_break = true; + return true; + } + } break; + case RendererCanvasRender::Item::Command::TYPE_RECT: { + if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_RECT)) + return true; + } break; + case RendererCanvasRender::Item::Command::TYPE_NINEPATCH: { + // do not handle tiled ninepatches, these can't be batched and need to use legacy method + RendererCanvasRender::Item::CommandNinePatch *np = static_cast<RendererCanvasRender::Item::CommandNinePatch *>(command); + if ((np->axis_x != VisualServer::NINE_PATCH_STRETCH) || (np->axis_y != VisualServer::NINE_PATCH_STRETCH)) + return true; + + if (_disallow_item_join_if_batch_types_too_different(r_ris, RasterizerStorageCommon::BTF_RECT)) + return true; + } break; + case RendererCanvasRender::Item::Command::TYPE_TRANSFORM: { + // compatible with all types + } break; + } // switch + + } // for through 
commands
+
+	} // else
+
+	// special case, back buffer copy, so don't join
+	if (p_ci->copy_back_buffer) {
+		return true;
+	}
+
+	return false;
+}
+#endif
+
+#undef PREAMBLE
+#undef T_PREAMBLE
+#undef C_PREAMBLE
+
+#endif // RASTERIZER_CANVAS_BATCHER_H