Refactor the natural string compare and sort algorithm

Add a new string fragment type for whitespace and punctuation, which need
to be assessed separately from letters and symbols.

Use wxUint64 instead of long for storing the value of a numeric fragment.

Use collate instead of compare for non-numeric fragments.

Change the names of the public comparison functions: the implementation
provided by wxWidgets is now named wxCmpNaturalGeneric(), and the function
for common public use is wxCmpNatural(), which calls a native function in
wxMSW and wxCmpNaturalGeneric() elsewhere.

Try harder in wxCmpNaturalGeneric() if wxRegEx is unavailable: do not
just make a simple string comparison, but perform a case-insensitive
collation.

Make some other changes to simplify and possibly speed up the code.
This commit is contained in:
PB
2020-07-02 18:15:25 +02:00
committed by Vadim Zeitlin
parent 371c4b1366
commit 83a2a1e505
4 changed files with 282 additions and 237 deletions

View File

@@ -53,7 +53,7 @@ WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2); int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2);
WXDLLIMPEXP_BASE WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2); int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
inline int wxCMPFUNC_CONV wxNaturalStringSortAscending(const wxString& s1, const wxString& s2) inline int wxCMPFUNC_CONV wxNaturalStringSortAscending(const wxString& s1, const wxString& s2)
{ {

View File

@@ -397,7 +397,7 @@ int wxStringSortDescending(const wxString& s1, const wxString& s2);
@see wxDictionaryStringSortDescending(), @see wxDictionaryStringSortDescending(),
wxStringSortAscending(), wxStringSortAscending(),
wxNaturalStringSortAscending() wxNaturalStringSortAscending()
@since 3.1.0 @since 3.1.0
*/ */
int wxDictionaryStringSortAscending(const wxString& s1, const wxString& s2); int wxDictionaryStringSortAscending(const wxString& s1, const wxString& s2);
@@ -416,88 +416,84 @@ int wxDictionaryStringSortAscending(const wxString& s1, const wxString& s2);
int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2); int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2);
/** /**
Comparison function used for Natural Sort. Comparison function comparing strings in natural order.
Functions in the same way as wxDictionaryStringSortAscending(), with
the exception that numbers within the string are recognised, and
compared numerically, rather than alphabetically. When used for
sorting, the result is that e.g. file names containing numbers are
sorted in a natural way.
This function will use an OS native function if one is available, This function can be used with wxSortedArrayString::Sort()
to ensure that the sort order is the same as the OS uses. or passed as an argument to wxSortedArrayString constructor.
Comparison is case insensitive. See wxCmpNatural() for more information about how natural
sort order is implemented.
e.g. Sorting using wxDictionaryStringSortAscending() results in:
- file1.txt @see wxNaturalStringSortDescending(),
- file10.txt wxStringSortAscending(), wxDictionaryStringSortAscending()
- file100.txt
- file2.txt @since 3.1.4
- file20.txt */
- file3.txt
e.g. Sorting using wxNaturalStringSortAscending() results in:
- file1.txt
- file2.txt
- file3.txt
- file11.txt
- file20.txt
- file100.txt
@see wxNaturalStringSortDescending(),
wxStringSortAscending(),
wxDictionaryStringSortAscending()
@since 3.1.2
*/
int wxNaturalStringSortAscending(const wxString& s1, const wxString& s2); int wxNaturalStringSortAscending(const wxString& s1, const wxString& s2);
/**
Comparison function comparing strings in reverse natural order.
/** This function can be used with wxSortedArrayString::Sort()
Comparison function comparing strings in reverse natural order. or passed as an argument to wxSortedArrayString constructor.
See wxNaturalStringSortAscending() for the natural sort description. See wxCmpNatural() for more information about how natural
sort order is implemented.
@see wxNaturalStringSortAscending(),
wxStringSortDescending(), @see wxNaturalStringSortAscending(),
wxDictionaryStringSortDescending() wxStringSortDescending(), wxDictionaryStringSortDescending()
@since 3.1.2 @since 3.1.4
*/ */
int wxNaturalStringSortDescending(const wxString& s1, const wxString& s2); int wxNaturalStringSortDescending(const wxString& s1, const wxString& s2);
/**
This function compares strings using case-insensitive collation and
additionally, numbers within strings are recognised and compared
numerically, rather than alphabetically. When used for sorting,
the result is that e.g. file names containing numbers are sorted
in a natural way.
/** For example, sorting with a simple string comparison results in:
This is wxWidgets' own implementation of the natural sort comparison - file1.txt
function. This will be used whenever an OS native function is not available. - file10.txt
- file100.txt
Since OS native implementations might differ from each other, the user might - file2.txt
wish to use this function which behaves in the same way across all platforms. - file20.txt
- file3.txt
@since 3.1.2
*/
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2);
But sorting the same strings in natural sort order results in:
- file1.txt
- file2.txt
- file3.txt
- file10.txt
- file20.txt
- file100.txt
/** wxCmpNatural() uses an OS native natural sort function when available
Comparison function, identical to wxNaturalStringSortAscending(). (currently only under Microsoft Windows), wxCmpNaturalGeneric() otherwise.
In fact, wxNaturalStringSortAscending() and wxNaturalStringSortDescending()
are both implemented using this function.
When an OS native natural sort function is available, that will be used,
otherwise wxCmpNatural() will be used.
Be aware that OS native implementations might differ from each other, and Be aware that OS native implementations might differ from each other,
might change behaviour from release to release. and might change behaviour from release to release.
@see wxNaturalStringSortAscending(), @see wxNaturalStringSortAscending(), wxNaturalStringSortDescending()
wxNaturalStringSortDescending()
@since 3.1.4
@since 3.1.2 */
*/ int wxCmpNatural(const wxString& s1, const wxString& s2);
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2);
/**
This is wxWidgets' own implementation of the natural sort comparison function.
Requires wxRegEx; if it is unavailable, numbers within strings are not
recognised and only case-insensitive collation is performed.
@see wxCmpNatural()
@since 3.1.4
*/
int wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
// ============================================================================ // ============================================================================

View File

@@ -20,13 +20,14 @@
#endif #endif
#include "wx/arrstr.h" #include "wx/arrstr.h"
#include "wx/regex.h"
#include "wx/scopedarray.h" #include "wx/scopedarray.h"
#include "wx/wxcrt.h"
#include "wx/beforestd.h" #include "wx/beforestd.h"
#include <algorithm> #include <algorithm>
#include <functional> #include <functional>
#include "wx/afterstd.h" #include "wx/afterstd.h"
#include "wx/regex.h"
#if defined( __WINDOWS__ ) #if defined( __WINDOWS__ )
#include <shlwapi.h> #include <shlwapi.h>
@@ -728,143 +729,149 @@ wxArrayString wxSplit(const wxString& str, const wxChar sep, const wxChar escape
return ret; return ret;
} }
#if wxUSE_REGEX
namespace // enum, class and functions needed by wxCmpNatural(). namespace // helpers needed by wxCmpNaturalGeneric()
{ {
enum wxStringFragmentType // Used for comparison of string parts
struct wxStringFragment
{
// Fragment types are generally sorted like this:
// Empty < SpaceOrPunct < Digit < LetterOrSymbol
// Fragments of the same type are compared as follows:
// SpaceOrPunct - collated, Digit - as numbers using value
// LetterOrSymbol - lower-cased and then collated
enum Type
{ {
wxFRAGMENT_TYPE_EMPTY = 0, Empty,
wxFRAGMENT_TYPE_ALPHA = 1, SpaceOrPunct, // whitespace or punctuation
wxFRAGMENT_TYPE_DIGIT = 2 Digit, // a sequence of decimal digits
LetterOrSymbol // letters and symbols, i.e., anything not covered by the above types
}; };
wxStringFragment() : type(Empty), value(0) {}
// ---------------------------------------------------------------------------- Type type;
// wxStringFragment wxString text;
// ---------------------------------------------------------------------------- wxUint64 value; // used only for Digit type
// };
// Lightweight object returned by GetNaturalFragment().
// Represents either a number, or a string which contains no numerical digits.
class wxStringFragment wxStringFragment GetFragment(wxString& text)
{
static const wxRegEx reSpaceOrPunct(wxS("^([[:space:]]|[[:punct:]])+"));
// Limit the length to make sure the value will fit into a wxUint64
static const wxRegEx reDigit(wxS("^[[:digit:]]{1,19}"));
static const wxRegEx reLetterOrSymbol("^[^[:space:]|[:punct:]|[:digit:]]+");
if ( text.empty() )
return wxStringFragment();
wxStringFragment fragment;
size_t length = 0;
// In attempt to minimize the number of wxRegEx.Matches() calls,
// try to do them from the most expected to the least expected
// string fragment type.
if ( reLetterOrSymbol.Matches(text) )
{ {
public: if ( reLetterOrSymbol.GetMatch(NULL, &length) )
wxStringFragment() {
: type(wxFRAGMENT_TYPE_EMPTY) fragment.type = wxStringFragment::LetterOrSymbol;
{} fragment.text = text.Left(length);
}
wxString text; }
long value; else if ( reDigit.Matches(text) )
wxStringFragmentType type;
};
wxStringFragment GetFragment(wxString& text)
{ {
static const wxRegEx naturalNumeric(wxS("[0-9]+")); if ( reDigit.GetMatch(NULL, &length) )
static const wxRegEx naturalAlpha(wxS("[^0-9]+"));
size_t digitStart = 0;
size_t digitLength = 0;
size_t alphaStart = 0;
size_t alphaLength = 0;
wxStringFragment fragment;
if ( text.empty() )
return fragment;
if ( naturalNumeric.Matches(text) )
{ {
naturalNumeric.GetMatch(&digitStart, &digitLength, 0); fragment.type = wxStringFragment::Digit;
fragment.text = text.Left(length);
fragment.text.ToULongLong(&fragment.value);
} }
}
if ( naturalAlpha.Matches(text) ) else if ( reSpaceOrPunct.Matches(text) )
{
if ( reSpaceOrPunct.GetMatch(NULL, &length) )
{ {
naturalAlpha.GetMatch(&alphaStart, &alphaLength, 0); fragment.type = wxStringFragment::SpaceOrPunct;
fragment.text = text.Left(length);
} }
if ( alphaStart == 0 )
{
fragment.text = text.Mid(0, alphaLength);
fragment.value = 0;
fragment.type = wxFRAGMENT_TYPE_ALPHA;
text.erase(0, alphaLength);
}
if ( digitStart == 0 )
{
fragment.text = text.Mid(0, digitLength);
fragment.text.ToLong(&fragment.value);
fragment.type = wxFRAGMENT_TYPE_DIGIT;
text.erase(0, digitLength);
}
return fragment;
} }
int CompareFragmentNatural(const wxStringFragment& lhs, const wxStringFragment& rhs) text.erase(0, length);
return fragment;
}
int CompareFragmentNatural(const wxStringFragment& lhs, const wxStringFragment& rhs)
{
switch ( lhs.type )
{ {
if ( (lhs.type == wxFRAGMENT_TYPE_ALPHA) && case wxStringFragment::Empty:
(rhs.type == wxFRAGMENT_TYPE_ALPHA) ) switch ( rhs.type )
{
return lhs.text.CmpNoCase(rhs.text);
}
if ( (lhs.type == wxFRAGMENT_TYPE_DIGIT) &&
(rhs.type == wxFRAGMENT_TYPE_DIGIT) )
{
if ( lhs.value == rhs.value )
{ {
return 0; case wxStringFragment::Empty:
return 0;
case wxStringFragment::SpaceOrPunct:
case wxStringFragment::Digit:
case wxStringFragment::LetterOrSymbol:
return -1;
} }
if ( lhs.value < rhs.value ) case wxStringFragment::SpaceOrPunct:
switch ( rhs.type )
{ {
return -1; case wxStringFragment::Empty:
return 1;
case wxStringFragment::SpaceOrPunct:
return wxStrcoll_String(lhs.text, rhs.text);
case wxStringFragment::Digit:
case wxStringFragment::LetterOrSymbol:
return -1;
} }
if ( lhs.value > rhs.value ) case wxStringFragment::Digit:
switch ( rhs.type )
{ {
return 1; case wxStringFragment::Empty:
case wxStringFragment::SpaceOrPunct:
return 1;
case wxStringFragment::Digit:
if ( lhs.value > rhs.value )
return 1;
else if ( lhs.value < rhs.value )
return -1;
else
return 0;
case wxStringFragment::LetterOrSymbol:
return -1;
} }
}
if ( (lhs.type == wxFRAGMENT_TYPE_DIGIT) && case wxStringFragment::LetterOrSymbol:
(rhs.type == wxFRAGMENT_TYPE_ALPHA) ) switch ( rhs.type )
{ {
return -1; case wxStringFragment::Empty:
} case wxStringFragment::SpaceOrPunct:
case wxStringFragment::Digit:
if ( (lhs.type == wxFRAGMENT_TYPE_ALPHA) && return 1;
(rhs.type == wxFRAGMENT_TYPE_DIGIT) ) case wxStringFragment::LetterOrSymbol:
{ return wxStrcoll_String(lhs.text.Lower(), rhs.text.Lower());
return 1; }
}
if ( lhs.type == wxFRAGMENT_TYPE_EMPTY )
{
return -1;
}
if ( rhs.type == wxFRAGMENT_TYPE_EMPTY )
{
return 1;
}
return 0;
} }
// all possible cases should be covered by the switch above
// but return also from here to prevent the compiler warning
return 1;
}
} // unnamed namespace } // unnamed namespace
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// wxCmpNaturalNative // wxCmpNaturalGeneric
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// //
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2) int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{ {
wxString lhs(s1); wxString lhs(s1);
wxString rhs(s2); wxString rhs(s2);
@@ -873,19 +880,28 @@ int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
while ( (comparison == 0) && (!lhs.empty() || !rhs.empty()) ) while ( (comparison == 0) && (!lhs.empty() || !rhs.empty()) )
{ {
wxStringFragment fragmentL = GetFragment(lhs); const wxStringFragment fragmentLHS = GetFragment(lhs);
wxStringFragment fragmentR = GetFragment(rhs); const wxStringFragment fragmentRHS = GetFragment(rhs);
comparison = CompareFragmentNatural(fragmentL, fragmentR);
comparison = CompareFragmentNatural(fragmentLHS, fragmentRHS);
} }
return comparison; return comparison;
} }
#else
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{
return wxStrcoll_String(s1.Lower(), s2.Lower());
}
#endif // #if wxUSE_REGEX
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Declaration of StrCmpLogicalW() // Declaration of StrCmpLogicalW()
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// //
// In some distributions of MinGW32, this function is exported in the library, // In some distributions of MinGW32, this function is exported in the library,
// but not declared in shlwapi.h. Therefore we declare it here. // but not declared in shlwapi.h. Therefore we declare it here.
#if defined( __MINGW32_TOOLCHAIN__ ) #if defined( __MINGW32_TOOLCHAIN__ )
@@ -894,19 +910,17 @@ int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// wxCmpNaturalNative // wxCmpNatural
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// //
// If a native version of Natural sort is available, then use that, otherwise // If a native version of Natural sort is available, then use that, otherwise
// use the wxWidgets version, wxCmpNatural(). // use the generic version.
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2) inline int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
{ {
#if defined( __WINDOWS__ ) #if defined( __WINDOWS__ )
return StrCmpLogicalW( s1.wc_str(), s2.wc_str() ); return StrCmpLogicalW(s1.wc_str(), s2.wc_str());
#else
#else return wxCmpNaturalGeneric(s1, s2);
return wxCmpNatural( s1, s2 ); #endif // #if defined( __WINDOWS__ )
#endif
} }

View File

@@ -782,47 +782,82 @@ void ArraysTestCase::IndexFromEnd()
} }
TEST_CASE("wxNaturalStringSortAscending()", "[array][sort][string]") TEST_CASE("wxNaturalStringComparisonGeneric()", "[wxString][compare]")
{ {
wxString s01("3String"); #if !wxUSE_REGEX
wxString s02("21String"); WARN("Skipping wxCmpNaturalGeneric() tests: wxRegEx not available");
#else
wxString s03("100string"); // simple string comparison
wxString s04("100String"); CHECK(wxCmpNaturalGeneric("a", "a") == 0);
CHECK(wxCmpNaturalGeneric("a", "z") < 0);
wxString s05("10String"); CHECK(wxCmpNaturalGeneric("z", "a") > 0);
wxString s06("Str3ing");
wxString s07("Str20ing"); // case insensitivity
wxString s08("Str200ing"); CHECK(wxCmpNaturalGeneric("a", "A") == 0);
wxString s09("String8"); CHECK(wxCmpNaturalGeneric("A", "a") == 0);
wxString s10("String90"); CHECK(wxCmpNaturalGeneric("AB", "a") > 0);
CHECK(wxCmpNaturalGeneric("a", "AB") < 0);
wxString s11("7String3");
wxString s12("07String20"); // empty strings sort before whitespace and punctuation
wxString s13("007String100"); CHECK(wxCmpNaturalGeneric("", " ") < 0);
CHECK(wxCmpNaturalGeneric(" ", "") > 0);
CHECK(wxCmpNatural(s01, s02) < 0); CHECK(wxCmpNaturalGeneric("", ",") < 0);
CHECK(wxCmpNatural(s02, s03) < 0); CHECK(wxCmpNaturalGeneric(",", "") > 0);
CHECK(wxCmpNatural(s03, s04) == 0); // Check that case is ignored
CHECK(wxCmpNatural(s05, s06) < 0); // empty strings sort before numbers
CHECK(wxCmpNatural(s06, s07) < 0); CHECK(wxCmpNaturalGeneric("", "0") < 0);
CHECK(wxCmpNatural(s07, s08) < 0); CHECK(wxCmpNaturalGeneric("0", "") > 0);
CHECK(wxCmpNatural(s08, s09) < 0);
CHECK(wxCmpNatural(s09, s10) < 0); // empty strings sort before letters and symbols
CHECK(wxCmpNatural(s11, s12) < 0); CHECK(wxCmpNaturalGeneric("", "abc") < 0);
CHECK(wxCmpNatural(s12, s13) < 0); CHECK(wxCmpNaturalGeneric("abc", "") > 0);
CHECK(wxCmpNatural(s01, s01) == 0); // Check that equality works in all cases
CHECK(wxCmpNatural(s02, s02) == 0); // whitespace and punctiation sort before numbers
CHECK(wxCmpNatural(s03, s03) == 0); CHECK(wxCmpNaturalGeneric(" ", "1") < 0);
CHECK(wxCmpNatural(s04, s04) == 0); CHECK(wxCmpNaturalGeneric("1", " ") > 0);
CHECK(wxCmpNatural(s05, s05) == 0); CHECK(wxCmpNaturalGeneric(",", "1") < 0);
CHECK(wxCmpNatural(s06, s06) == 0); CHECK(wxCmpNaturalGeneric("1", ",") > 0);
CHECK(wxCmpNatural(s07, s07) == 0);
CHECK(wxCmpNatural(s08, s08) == 0); // strings containing numbers sort before letters and symbols
CHECK(wxCmpNatural(s09, s09) == 0); CHECK(wxCmpNaturalGeneric("00", "a") < 0);
CHECK(wxCmpNatural(s10, s10) == 0); CHECK(wxCmpNaturalGeneric("a", "00") > 0);
CHECK(wxCmpNatural(s11, s11) == 0);
CHECK(wxCmpNatural(s12, s12) == 0); // strings containing numbers are compared by their value
CHECK(wxCmpNatural(s13, s13) == 0); CHECK(wxCmpNaturalGeneric("01", "1") == 0);
CHECK(wxCmpNaturalGeneric("1", "01") == 0);
CHECK(wxCmpNaturalGeneric("1", "05") < 0);
CHECK(wxCmpNaturalGeneric("05", "1") > 0);
CHECK(wxCmpNaturalGeneric("10", "5") > 0);
CHECK(wxCmpNaturalGeneric("5", "10") < 0);
CHECK(wxCmpNaturalGeneric("1", "9999999999999999999") < 0);
CHECK(wxCmpNaturalGeneric("9999999999999999999", "1") > 0);
// comparing strings composed from whitespace,
// punctuation, numbers, letters, and symbols
CHECK(wxCmpNaturalGeneric("1st", " 1st") > 0);
CHECK(wxCmpNaturalGeneric(" 1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", ",1st") > 0);
CHECK(wxCmpNaturalGeneric(",1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", "01st") == 0);
CHECK(wxCmpNaturalGeneric("01st", "1st") == 0);
CHECK(wxCmpNaturalGeneric("10th", "5th") > 0);
CHECK(wxCmpNaturalGeneric("5th", "10th") < 0);
CHECK(wxCmpNaturalGeneric("a1st", "a01st") == 0);
CHECK(wxCmpNaturalGeneric("a01st", "a1st") == 0);
CHECK(wxCmpNaturalGeneric("a10th", "a5th") > 0);
CHECK(wxCmpNaturalGeneric("a5th", "a10th") < 0);
CHECK(wxCmpNaturalGeneric("a 10th", "a5th") < 0);
CHECK(wxCmpNaturalGeneric("a5th", "a 10th") > 0);
CHECK(wxCmpNaturalGeneric("a1st1", "a01st01") == 0);
CHECK(wxCmpNaturalGeneric("a01st01", "a1st1") == 0);
CHECK(wxCmpNaturalGeneric("a10th10", "a5th5") > 0);
CHECK(wxCmpNaturalGeneric("a5th5", "a10th10") < 0);
CHECK(wxCmpNaturalGeneric("a 10th 10", "a5th 5") < 0);
CHECK(wxCmpNaturalGeneric("a5th 5", "a 10th 10") > 0);
#endif // #if !wxUSE_REGEX
} }