Merge commit godotengine/godot@0f5f3bc9546b46b2029fc8896dc859697f1eab97

author: Spartan322 <Megacake1234@gmail.com> 2024-11-11 09:07:04 -0500
committer: Spartan322 <Megacake1234@gmail.com> 2024-11-11 09:08:01 -0500
commit: 62fbec9f6f0722a1f9825c17f073742932082228 (patch)
tree: a10abf56ba93705731da1aaf338f2cf21403c6ad /core/string
parent: e7894c2c4efdd51049a21af4892005381fe57cd6 (diff)
parent: 0f5f3bc9546b46b2029fc8896dc859697f1eab97 (diff)
download: redot-engine-62fbec9f6f0722a1f9825c17f073742932082228.tar.gz
4 files changed, 471 insertions, 2 deletions
diff --git a/core/string/fuzzy_search.cpp b/core/string/fuzzy_search.cpp
new file mode 100644
index 0000000000..57b82ac829
--- /dev/null
+++ b/core/string/fuzzy_search.cpp
@@ -0,0 +1,351 @@
+/**************************************************************************/
+/*  fuzzy_search.cpp                                                      */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             REDOT ENGINE                               */
+/*                        https://redotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2024-present Redot Engine contributors                   */
+/*                                          (see REDOT_AUTHORS.md)        */
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#include "fuzzy_search.h"
+
+constexpr float cull_factor = 0.1f;
+constexpr float cull_cutoff = 30.0f;
+const String boundary_chars = "/\\-_.";
+
+static bool _is_valid_interval(const Vector2i &p_interval) {
+	// Empty intervals are represented as (-1, -1).
+	return p_interval.x >= 0 && p_interval.y >= p_interval.x;
+}
+
+static Vector2i _extend_interval(const Vector2i &p_a, const Vector2i &p_b) {
+	if (!_is_valid_interval(p_a)) {
+		return p_b;
+	}
+	if (!_is_valid_interval(p_b)) {
+		return p_a;
+	}
+	return Vector2i(MIN(p_a.x, p_b.x), MAX(p_a.y, p_b.y));
+}
+
+static bool _is_word_boundary(const String &p_str, int p_index) {
+	if (p_index == -1 || p_index == p_str.size()) {
+		return true;
+	}
+	return boundary_chars.find_char(p_str[p_index]) != -1;
+}
+
+bool FuzzySearchToken::try_exact_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset) const {
+	p_match.token_idx = idx;
+	p_match.token_length = string.length();
+	int match_idx = p_target.find(string, p_offset);
+	if (match_idx == -1) {
+		return false;
+	}
+	p_match.add_substring(match_idx, string.length());
+	return true;
+}
+
+bool FuzzySearchToken::try_fuzzy_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset, int p_miss_budget) const {
+	p_match.token_idx = idx;
+	p_match.token_length = string.length();
+	int run_start = -1;
+	int run_len = 0;
+
+	// Search for the subsequence p_token in p_target starting from p_offset, recording each substring for
+	// later scoring and display.
+	for (int i = 0; i < string.length(); i++) {
+		int new_offset = p_target.find_char(string[i], p_offset);
+		if (new_offset < 0) {
+			p_miss_budget--;
+			if (p_miss_budget < 0) {
+				return false;
+			}
+		} else {
+			if (run_start == -1 || p_offset != new_offset) {
+				if (run_start != -1) {
+					p_match.add_substring(run_start, run_len);
+				}
+				run_start = new_offset;
+				run_len = 1;
+			} else {
+				run_len += 1;
+			}
+			p_offset = new_offset + 1;
+		}
+	}
+
+	if (run_start != -1) {
+		p_match.add_substring(run_start, run_len);
+	}
+
+	return true;
+}
+
+void FuzzyTokenMatch::add_substring(int p_substring_start, int p_substring_length) {
+	substrings.append(Vector2i(p_substring_start, p_substring_length));
+	matched_length += p_substring_length;
+	Vector2i substring_interval = { p_substring_start, p_substring_start + p_substring_length - 1 };
+	interval = _extend_interval(interval, substring_interval);
+}
+
+bool FuzzyTokenMatch::intersects(const Vector2i &p_other_interval) const {
+	if (!_is_valid_interval(interval) || !_is_valid_interval(p_other_interval)) {
+		return false;
+	}
+	return interval.y >= p_other_interval.x && interval.x <= p_other_interval.y;
+}
+
+bool FuzzySearchResult::can_add_token_match(const FuzzyTokenMatch &p_match) const {
+	if (p_match.get_miss_count() > miss_budget) {
+		return false;
+	}
+
+	if (p_match.intersects(match_interval)) {
+		if (token_matches.size() == 1) {
+			return false;
+		}
+		for (const FuzzyTokenMatch &existing_match : token_matches) {
+			if (existing_match.intersects(p_match.interval)) {
+				return false;
+			}
+		}
+	}
+
+	return true;
+}
+
+bool FuzzyTokenMatch::is_case_insensitive(const String &p_original, const String &p_adjusted) const {
+	for (const Vector2i &substr : substrings) {
+		const int end = substr.x + substr.y;
+		for (int i = substr.x; i < end; i++) {
+			if (p_original[i] != p_adjusted[i]) {
+				return true;
+			}
+		}
+	}
+	return false;
+}
+
+void FuzzySearchResult::score_token_match(FuzzyTokenMatch &p_match, bool p_case_insensitive) const {
+	// This can always be tweaked more. The intuition is that exact matches should almost always
+	// be prioritized over broken up matches, and other criteria more or less act as tie breakers.
+
+	p_match.score = -20 * p_match.get_miss_count() - (p_case_insensitive ? 3 : 0);
+
+	for (const Vector2i &substring : p_match.substrings) {
+		// Score longer substrings higher than short substrings.
+		int substring_score = substring.y * substring.y;
+		// Score matches deeper in path higher than shallower matches
+		if (substring.x > dir_index) {
+			substring_score *= 2;
+		}
+		// Score matches on a word boundary higher than matches within a word
+		if (_is_word_boundary(target, substring.x - 1) || _is_word_boundary(target, substring.x + substring.y)) {
+			substring_score += 4;
+		}
+		// Score exact query matches higher than non-compact subsequence matches
+		if (substring.y == p_match.token_length) {
+			substring_score += 100;
+		}
+		p_match.score += substring_score;
+	}
+}
+
+void FuzzySearchResult::maybe_apply_score_bonus() {
+	// This adds a small bonus to results which match tokens in the same order they appear in the query.
+	int *token_range_starts = (int *)alloca(sizeof(int) * token_matches.size());
+
+	for (const FuzzyTokenMatch &match : token_matches) {
+		token_range_starts[match.token_idx] = match.interval.x;
+	}
+
+	int last = token_range_starts[0];
+	for (int i = 1; i < token_matches.size(); i++) {
+		if (last > token_range_starts[i]) {
+			return;
+		}
+		last = token_range_starts[i];
+	}
+
+	score += 1;
+}
+
+void FuzzySearchResult::add_token_match(const FuzzyTokenMatch &p_match) {
+	score += p_match.score;
+	match_interval = _extend_interval(match_interval, p_match.interval);
+	miss_budget -= p_match.get_miss_count();
+	token_matches.append(p_match);
+}
+
+void remove_low_scores(Vector<FuzzySearchResult> &p_results, float p_cull_score) {
+	// Removes all results with score < p_cull_score in-place.
+	int i = 0;
+	int j = p_results.size() - 1;
+	FuzzySearchResult *results = p_results.ptrw();
+
+	while (true) {
+		// Advances i to an element to remove and j to an element to keep.
+		while (j >= i && results[j].score < p_cull_score) {
+			j--;
+		}
+		while (i < j && results[i].score >= p_cull_score) {
+			i++;
+		}
+		if (i >= j) {
+			break;
+		}
+		results[i++] = results[j--];
+	}
+
+	p_results.resize(j + 1);
+}
+
+void FuzzySearch::sort_and_filter(Vector<FuzzySearchResult> &p_results) const {
+	if (p_results.is_empty()) {
+		return;
+	}
+
+	float avg_score = 0;
+	float max_score = 0;
+
+	for (const FuzzySearchResult &result : p_results) {
+		avg_score += result.score;
+		max_score = MAX(max_score, result.score);
+	}
+
+	// TODO: Tune scoring and culling here to display fewer subsequence soup matches when good matches
+	//  are available.
+	avg_score /= p_results.size();
+	float cull_score = MIN(cull_cutoff, Math::lerp(avg_score, max_score, cull_factor));
+	remove_low_scores(p_results, cull_score);
+
+	struct FuzzySearchResultComparator {
+		bool operator()(const FuzzySearchResult &p_lhs, const FuzzySearchResult &p_rhs) const {
+			// Sort on (score, length, alphanumeric) to ensure consistent ordering.
+			if (p_lhs.score == p_rhs.score) {
+				if (p_lhs.target.length() == p_rhs.target.length()) {
+					return p_lhs.target < p_rhs.target;
+				}
+				return p_lhs.target.length() < p_rhs.target.length();
+			}
+			return p_lhs.score > p_rhs.score;
+		}
+	};
+
+	SortArray<FuzzySearchResult, FuzzySearchResultComparator> sorter;
+
+	if (p_results.size() > max_results) {
+		sorter.partial_sort(0, p_results.size(), max_results, p_results.ptrw());
+		p_results.resize(max_results);
+	} else {
+		sorter.sort(p_results.ptrw(), p_results.size());
+	}
+}
+
+void FuzzySearch::set_query(const String &p_query) {
+	tokens.clear();
+	for (const String &string : p_query.split(" ", false)) {
+		tokens.append({ static_cast<int>(tokens.size()), string });
+	}
+
+	case_sensitive = !p_query.is_lowercase();
+
+	struct TokenComparator {
+		bool operator()(const FuzzySearchToken &A, const FuzzySearchToken &B) const {
+			if (A.string.length() == B.string.length()) {
+				return A.idx < B.idx;
+			}
+			return A.string.length() > B.string.length();
+		}
+	};
+
+	// Prioritize matching longer tokens before shorter ones since match overlaps are not accepted.
+	tokens.sort_custom<TokenComparator>();
+}
+
+bool FuzzySearch::search(const String &p_target, FuzzySearchResult &p_result) const {
+	p_result.target = p_target;
+	p_result.dir_index = p_target.rfind_char('/');
+	p_result.miss_budget = max_misses;
+
+	String adjusted_target = case_sensitive ? p_target : p_target.to_lower();
+
+	// For each token, eagerly generate subsequences starting from index 0 and keep the best scoring one
+	// which does not conflict with prior token matches. This is not ensured to find the highest scoring
+	// combination of matches, or necessarily the highest scoring single subsequence, as it only considers
+	// eager subsequences for a given index, and likewise eagerly finds matches for each token in sequence.
+	for (const FuzzySearchToken &token : tokens) {
+		FuzzyTokenMatch best_match;
+		int offset = start_offset;
+
+		while (true) {
+			FuzzyTokenMatch match;
+			if (allow_subsequences) {
+				if (!token.try_fuzzy_match(match, adjusted_target, offset, p_result.miss_budget)) {
+					break;
+				}
+			} else {
+				if (!token.try_exact_match(match, adjusted_target, offset)) {
+					break;
+				}
+			}
+			if (p_result.can_add_token_match(match)) {
+				p_result.score_token_match(match, match.is_case_insensitive(p_target, adjusted_target));
+				if (best_match.token_idx == -1 || best_match.score < match.score) {
+					best_match = match;
+				}
+			}
+			if (_is_valid_interval(match.interval)) {
+				offset = match.interval.x + 1;
+			} else {
+				break;
+			}
+		}
+
+		if (best_match.token_idx == -1) {
+			return false;
+		}
+
+		p_result.add_token_match(best_match);
+	}
+
+	p_result.maybe_apply_score_bonus();
+	return true;
+}
+
+void FuzzySearch::search_all(const PackedStringArray &p_targets, Vector<FuzzySearchResult> &p_results) const {
+	p_results.clear();
+
+	for (const String &target : p_targets) {
+		FuzzySearchResult result;
+		if (search(target, result)) {
+			p_results.append(result);
+		}
+	}
+
+	sort_and_filter(p_results);
+}
diff --git a/core/string/fuzzy_search.h b/core/string/fuzzy_search.h
new file mode 100644
index 0000000000..4e2564421c
--- /dev/null
+++ b/core/string/fuzzy_search.h
@@ -0,0 +1,103 @@
+/**************************************************************************/
+/*  fuzzy_search.h                                                        */
+/**************************************************************************/
+/*                         This file is part of:                          */
+/*                             REDOT ENGINE                               */
+/*                        https://redotengine.org                         */
+/**************************************************************************/
+/* Copyright (c) 2024-present Redot Engine contributors                   */
+/*                                          (see REDOT_AUTHORS.md)        */
+/* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */
+/* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur.                  */
+/*                                                                        */
+/* Permission is hereby granted, free of charge, to any person obtaining  */
+/* a copy of this software and associated documentation files (the        */
+/* "Software"), to deal in the Software without restriction, including    */
+/* without limitation the rights to use, copy, modify, merge, publish,    */
+/* distribute, sublicense, and/or sell copies of the Software, and to     */
+/* permit persons to whom the Software is furnished to do so, subject to  */
+/* the following conditions:                                              */
+/*                                                                        */
+/* The above copyright notice and this permission notice shall be         */
+/* included in all copies or substantial portions of the Software.        */
+/*                                                                        */
+/* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,        */
+/* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF     */
+/* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */
+/* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY   */
+/* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,   */
+/* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE      */
+/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.                 */
+/**************************************************************************/
+
+#ifndef FUZZY_SEARCH_H
+#define FUZZY_SEARCH_H
+
+#include "core/variant/variant.h"
+
+class FuzzyTokenMatch;
+
+struct FuzzySearchToken {
+	int idx = -1;
+	String string;
+
+	bool try_exact_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset) const;
+	bool try_fuzzy_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset, int p_miss_budget) const;
+};
+
+class FuzzyTokenMatch {
+	friend struct FuzzySearchToken;
+	friend class FuzzySearchResult;
+	friend class FuzzySearch;
+
+	int matched_length = 0;
+	int token_length = 0;
+	int token_idx = -1;
+	Vector2i interval = Vector2i(-1, -1); // x and y are both inclusive indices.
+
+	void add_substring(int p_substring_start, int p_substring_length);
+	bool intersects(const Vector2i &p_other_interval) const;
+	bool is_case_insensitive(const String &p_original, const String &p_adjusted) const;
+	int get_miss_count() const { return token_length - matched_length; }
+
+public:
+	int score = 0;
+	Vector<Vector2i> substrings; // x is start index, y is length.
+};
+
+class FuzzySearchResult {
+	friend class FuzzySearch;
+
+	int miss_budget = 0;
+	Vector2i match_interval = Vector2i(-1, -1);
+
+	bool can_add_token_match(const FuzzyTokenMatch &p_match) const;
+	void score_token_match(FuzzyTokenMatch &p_match, bool p_case_insensitive) const;
+	void add_token_match(const FuzzyTokenMatch &p_match);
+	void maybe_apply_score_bonus();
+
+public:
+	String target;
+	int score = 0;
+	int dir_index = -1;
+	Vector<FuzzyTokenMatch> token_matches;
+};
+
+class FuzzySearch {
+	Vector<FuzzySearchToken> tokens;
+
+	void sort_and_filter(Vector<FuzzySearchResult> &p_results) const;
+
+public:
+	int start_offset = 0;
+	bool case_sensitive = false;
+	int max_results = 100;
+	int max_misses = 2;
+	bool allow_subsequences = true;
+
+	void set_query(const String &p_query);
+	bool search(const String &p_target, FuzzySearchResult &p_result) const;
+	void search_all(const PackedStringArray &p_targets, Vector<FuzzySearchResult> &p_results) const;
+};
+
+#endif // FUZZY_SEARCH_H
diff --git a/core/string/ustring.cpp b/core/string/ustring.cpp
index 9bfc0a67cb..033b5f2864 100644
--- a/core/string/ustring.cpp
+++ b/core/string/ustring.cpp
@@ -3389,7 +3389,7 @@ int String::find(const char *p_str, int p_from) const {
 	return -1;
 }
 
-int String::find_char(const char32_t &p_char, int p_from) const {
+int String::find_char(char32_t p_char, int p_from) const {
 	return _cowdata.find(p_char, p_from);
 }
 
@@ -3626,6 +3626,10 @@ int String::rfind(const char *p_str, int p_from) const {
 	return -1;
 }
 
+int String::rfind_char(char32_t p_char, int p_from) const {
+	return _cowdata.rfind(p_char, p_from);
+}
+
 int String::rfindn(const String &p_str, int p_from) const {
 	// establish a limit
 	int limit = length() - p_str.length();
@@ -3839,6 +3843,15 @@ bool String::is_quoted() const {
 	return is_enclosed_in("\"") || is_enclosed_in("'");
 }
 
+bool String::is_lowercase() const {
+	for (const char32_t *str = &operator[](0); *str; str++) {
+		if (is_unicode_upper_case(*str)) {
+			return false;
+		}
+	}
+	return true;
+}
+
 int String::_count(const String &p_string, int p_from, int p_to, bool p_case_insensitive) const {
 	if (p_string.is_empty()) {
 		return 0;
diff --git a/core/string/ustring.h b/core/string/ustring.h
index 11be148084..d363228ecb 100644
--- a/core/string/ustring.h
+++ b/core/string/ustring.h
@@ -289,11 +289,12 @@ public:
 	String substr(int p_from, int p_chars = -1) const;
 	int find(const String &p_str, int p_from = 0) const; ///< return <0 if failed
 	int find(const char *p_str, int p_from = 0) const; ///< return <0 if failed
-	int find_char(const char32_t &p_char, int p_from = 0) const; ///< return <0 if failed
+	int find_char(char32_t p_char, int p_from = 0) const; ///< return <0 if failed
 	int findn(const String &p_str, int p_from = 0) const; ///< return <0 if failed, case insensitive
 	int findn(const char *p_str, int p_from = 0) const; ///< return <0 if failed
 	int rfind(const String &p_str, int p_from = -1) const; ///< return <0 if failed
 	int rfind(const char *p_str, int p_from = -1) const; ///< return <0 if failed
+	int rfind_char(char32_t p_char, int p_from = -1) const; ///< return <0 if failed
 	int rfindn(const String &p_str, int p_from = -1) const; ///< return <0 if failed, case insensitive
 	int rfindn(const char *p_str, int p_from = -1) const; ///< return <0 if failed
 	int findmk(const Vector<String> &p_keys, int p_from = 0, int *r_key = nullptr) const; ///< return <0 if failed
@@ -307,6 +308,7 @@ public:
 	bool is_subsequence_of(const String &p_string) const;
 	bool is_subsequence_ofn(const String &p_string) const;
 	bool is_quoted() const;
+	bool is_lowercase() const;
 	Vector<String> bigrams() const;
 	float similarity(const String &p_string) const;
 	String format(const Variant &values, const String &placeholder = "{_}") const;
author	Spartan322 <Megacake1234@gmail.com>	2024-11-11 09:07:04 -0500
committer	Spartan322 <Megacake1234@gmail.com>	2024-11-11 09:08:01 -0500
commit	62fbec9f6f0722a1f9825c17f073742932082228 (patch)
tree	a10abf56ba93705731da1aaf338f2cf21403c6ad /core/string
parent	e7894c2c4efdd51049a21af4892005381fe57cd6 (diff)
parent	0f5f3bc9546b46b2029fc8896dc859697f1eab97 (diff)
download	redot-engine-62fbec9f6f0722a1f9825c17f073742932082228.tar.gz