/**************************************************************************/ /* fuzzy_search.cpp */ /**************************************************************************/ /* This file is part of: */ /* REDOT ENGINE */ /* https://redotengine.org */ /**************************************************************************/ /* Copyright (c) 2024-present Redot Engine contributors */ /* (see REDOT_AUTHORS.md) */ /* Copyright (c) 2014-present Godot Engine contributors (see AUTHORS.md). */ /* Copyright (c) 2007-2014 Juan Linietsky, Ariel Manzur. */ /* */ /* Permission is hereby granted, free of charge, to any person obtaining */ /* a copy of this software and associated documentation files (the */ /* "Software"), to deal in the Software without restriction, including */ /* without limitation the rights to use, copy, modify, merge, publish, */ /* distribute, sublicense, and/or sell copies of the Software, and to */ /* permit persons to whom the Software is furnished to do so, subject to */ /* the following conditions: */ /* */ /* The above copyright notice and this permission notice shall be */ /* included in all copies or substantial portions of the Software. */ /* */ /* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, */ /* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF */ /* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. */ /* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY */ /* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, */ /* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE */ /* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ /**************************************************************************/ #include "fuzzy_search.h" constexpr float cull_factor = 0.1f; constexpr float cull_cutoff = 30.0f; const String boundary_chars = "/\\-_."; static bool _is_valid_interval(const Vector2i &p_interval) { // Empty intervals are represented as (-1, -1). return p_interval.x >= 0 && p_interval.y >= p_interval.x; } static Vector2i _extend_interval(const Vector2i &p_a, const Vector2i &p_b) { if (!_is_valid_interval(p_a)) { return p_b; } if (!_is_valid_interval(p_b)) { return p_a; } return Vector2i(MIN(p_a.x, p_b.x), MAX(p_a.y, p_b.y)); } static bool _is_word_boundary(const String &p_str, int p_index) { if (p_index == -1 || p_index == p_str.size()) { return true; } return boundary_chars.find_char(p_str[p_index]) != -1; } bool FuzzySearchToken::try_exact_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset) const { p_match.token_idx = idx; p_match.token_length = string.length(); int match_idx = p_target.find(string, p_offset); if (match_idx == -1) { return false; } p_match.add_substring(match_idx, string.length()); return true; } bool FuzzySearchToken::try_fuzzy_match(FuzzyTokenMatch &p_match, const String &p_target, int p_offset, int p_miss_budget) const { p_match.token_idx = idx; p_match.token_length = string.length(); int run_start = -1; int run_len = 0; // Search for the subsequence p_token in p_target starting from p_offset, recording each substring for // later scoring and display. for (int i = 0; i < string.length(); i++) { int new_offset = p_target.find_char(string[i], p_offset); if (new_offset < 0) { p_miss_budget--; if (p_miss_budget < 0) { return false; } } else { if (run_start == -1 || p_offset != new_offset) { if (run_start != -1) { p_match.add_substring(run_start, run_len); } run_start = new_offset; run_len = 1; } else { run_len += 1; } p_offset = new_offset + 1; } } if (run_start != -1) { p_match.add_substring(run_start, run_len); } return true; } void FuzzyTokenMatch::add_substring(int p_substring_start, int p_substring_length) { substrings.append(Vector2i(p_substring_start, p_substring_length)); matched_length += p_substring_length; Vector2i substring_interval = { p_substring_start, p_substring_start + p_substring_length - 1 }; interval = _extend_interval(interval, substring_interval); } bool FuzzyTokenMatch::intersects(const Vector2i &p_other_interval) const { if (!_is_valid_interval(interval) || !_is_valid_interval(p_other_interval)) { return false; } return interval.y >= p_other_interval.x && interval.x <= p_other_interval.y; } bool FuzzySearchResult::can_add_token_match(const FuzzyTokenMatch &p_match) const { if (p_match.get_miss_count() > miss_budget) { return false; } if (p_match.intersects(match_interval)) { if (token_matches.size() == 1) { return false; } for (const FuzzyTokenMatch &existing_match : token_matches) { if (existing_match.intersects(p_match.interval)) { return false; } } } return true; } bool FuzzyTokenMatch::is_case_insensitive(const String &p_original, const String &p_adjusted) const { for (const Vector2i &substr : substrings) { const int end = substr.x + substr.y; for (int i = substr.x; i < end; i++) { if (p_original[i] != p_adjusted[i]) { return true; } } } return false; } void FuzzySearchResult::score_token_match(FuzzyTokenMatch &p_match, bool p_case_insensitive) const { // This can always be tweaked more. The intuition is that exact matches should almost always // be prioritized over broken up matches, and other criteria more or less act as tie breakers. p_match.score = -20 * p_match.get_miss_count() - (p_case_insensitive ? 3 : 0); for (const Vector2i &substring : p_match.substrings) { // Score longer substrings higher than short substrings. int substring_score = substring.y * substring.y; // Score matches deeper in path higher than shallower matches if (substring.x > dir_index) { substring_score *= 2; } // Score matches on a word boundary higher than matches within a word if (_is_word_boundary(target, substring.x - 1) || _is_word_boundary(target, substring.x + substring.y)) { substring_score += 4; } // Score exact query matches higher than non-compact subsequence matches if (substring.y == p_match.token_length) { substring_score += 100; } p_match.score += substring_score; } } void FuzzySearchResult::maybe_apply_score_bonus() { // This adds a small bonus to results which match tokens in the same order they appear in the query. int *token_range_starts = (int *)alloca(sizeof(int) * token_matches.size()); for (const FuzzyTokenMatch &match : token_matches) { token_range_starts[match.token_idx] = match.interval.x; } int last = token_range_starts[0]; for (int i = 1; i < token_matches.size(); i++) { if (last > token_range_starts[i]) { return; } last = token_range_starts[i]; } score += 1; } void FuzzySearchResult::add_token_match(const FuzzyTokenMatch &p_match) { score += p_match.score; match_interval = _extend_interval(match_interval, p_match.interval); miss_budget -= p_match.get_miss_count(); token_matches.append(p_match); } void remove_low_scores(Vector &p_results, float p_cull_score) { // Removes all results with score < p_cull_score in-place. int i = 0; int j = p_results.size() - 1; FuzzySearchResult *results = p_results.ptrw(); while (true) { // Advances i to an element to remove and j to an element to keep. while (j >= i && results[j].score < p_cull_score) { j--; } while (i < j && results[i].score >= p_cull_score) { i++; } if (i >= j) { break; } results[i++] = results[j--]; } p_results.resize(j + 1); } void FuzzySearch::sort_and_filter(Vector &p_results) const { if (p_results.is_empty()) { return; } float avg_score = 0; float max_score = 0; for (const FuzzySearchResult &result : p_results) { avg_score += result.score; max_score = MAX(max_score, result.score); } // TODO: Tune scoring and culling here to display fewer subsequence soup matches when good matches // are available. avg_score /= p_results.size(); float cull_score = MIN(cull_cutoff, Math::lerp(avg_score, max_score, cull_factor)); remove_low_scores(p_results, cull_score); struct FuzzySearchResultComparator { bool operator()(const FuzzySearchResult &p_lhs, const FuzzySearchResult &p_rhs) const { // Sort on (score, length, alphanumeric) to ensure consistent ordering. if (p_lhs.score == p_rhs.score) { if (p_lhs.target.length() == p_rhs.target.length()) { return p_lhs.target < p_rhs.target; } return p_lhs.target.length() < p_rhs.target.length(); } return p_lhs.score > p_rhs.score; } }; SortArray sorter; if (p_results.size() > max_results) { sorter.partial_sort(0, p_results.size(), max_results, p_results.ptrw()); p_results.resize(max_results); } else { sorter.sort(p_results.ptrw(), p_results.size()); } } void FuzzySearch::set_query(const String &p_query) { tokens.clear(); for (const String &string : p_query.split(" ", false)) { tokens.append({ static_cast(tokens.size()), string }); } case_sensitive = !p_query.is_lowercase(); struct TokenComparator { bool operator()(const FuzzySearchToken &A, const FuzzySearchToken &B) const { if (A.string.length() == B.string.length()) { return A.idx < B.idx; } return A.string.length() > B.string.length(); } }; // Prioritize matching longer tokens before shorter ones since match overlaps are not accepted. tokens.sort_custom(); } bool FuzzySearch::search(const String &p_target, FuzzySearchResult &p_result) const { p_result.target = p_target; p_result.dir_index = p_target.rfind_char('/'); p_result.miss_budget = max_misses; String adjusted_target = case_sensitive ? p_target : p_target.to_lower(); // For each token, eagerly generate subsequences starting from index 0 and keep the best scoring one // which does not conflict with prior token matches. This is not ensured to find the highest scoring // combination of matches, or necessarily the highest scoring single subsequence, as it only considers // eager subsequences for a given index, and likewise eagerly finds matches for each token in sequence. for (const FuzzySearchToken &token : tokens) { FuzzyTokenMatch best_match; int offset = start_offset; while (true) { FuzzyTokenMatch match; if (allow_subsequences) { if (!token.try_fuzzy_match(match, adjusted_target, offset, p_result.miss_budget)) { break; } } else { if (!token.try_exact_match(match, adjusted_target, offset)) { break; } } if (p_result.can_add_token_match(match)) { p_result.score_token_match(match, match.is_case_insensitive(p_target, adjusted_target)); if (best_match.token_idx == -1 || best_match.score < match.score) { best_match = match; } } if (_is_valid_interval(match.interval)) { offset = match.interval.x + 1; } else { break; } } if (best_match.token_idx == -1) { return false; } p_result.add_token_match(best_match); } p_result.maybe_apply_score_bonus(); return true; } void FuzzySearch::search_all(const PackedStringArray &p_targets, Vector &p_results) const { p_results.clear(); for (const String &target : p_targets) { FuzzySearchResult result; if (search(target, result)) { p_results.append(result); } } sort_and_filter(p_results); }