Refactor the natural string compare and sort algorithm

Add a new string fragment type for whitespace and punctuation which needs
to be assessed separately from letters and symbols.

Use wxUint64 instead of long for storing the value for numeric fragment.

Use collate instead of compare for non-numeric fragments.

Change names for the public comparison functions: the function provided by
wxWidgets is now named wxCmpNaturalGeneric(), and the one for common public use
is wxCmpNatural(), which calls a native function in wxMSW and
wxCmpNaturalGeneric() elsewhere.

Try harder in wxCmpNaturalGeneric() if wxRegEx is unavailable: do not
just make a simple string comparison, but perform a case-insensitive
collation.

Make some other changes to simplify and possibly speed up the code.
This commit is contained in:
PB
2020-07-02 18:15:25 +02:00
committed by Vadim Zeitlin
parent 371c4b1366
commit 83a2a1e505
4 changed files with 282 additions and 237 deletions

View File

@@ -53,7 +53,7 @@ WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2);
WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2);
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
inline int wxCMPFUNC_CONV wxNaturalStringSortAscending(const wxString& s1, const wxString& s2)
{

View File

@@ -417,20 +417,45 @@ int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2);
/**
Comparison function used for Natural Sort.
Comparison function comparing strings in natural order.
Functions in the same way as wxDictionaryStringSortAscending(), with
the exception that numbers within the string are recognised, and
compared numerically, rather than alphabetically. When used for
sorting, the result is that e.g. file names containing numbers are
sorted in a natural way.
This function can be used with wxSortedArrayString::Sort()
or passed as an argument to wxSortedArrayString constructor.
This function will use an OS native function if one is available,
to ensure that the sort order is the same as the OS uses.
See wxCmpNatural() for more information about how natural
sort order is implemented.
Comparison is case insensitive.
@see wxNaturalStringSortDescending(),
wxStringSortAscending(), wxDictionaryStringSortAscending()
e.g. Sorting using wxDictionaryStringSortAscending() results in:
@since 3.1.4
*/
int wxNaturalStringSortAscending(const wxString& s1, const wxString& s2);
/**
Comparison function comparing strings in reverse natural order.
This function can be used with wxSortedArrayString::Sort()
or passed as an argument to wxSortedArrayString constructor.
See wxCmpNatural() for more information about how natural
sort order is implemented.
@see wxNaturalStringSortAscending(),
wxStringSortDescending(), wxDictionaryStringSortDescending()
@since 3.1.4
*/
int wxNaturalStringSortDescending(const wxString& s1, const wxString& s2);
/**
This function compares strings using case-insensitive collation and
additionally, numbers within strings are recognised and compared
numerically, rather than alphabetically. When used for sorting,
the result is that e.g. file names containing numbers are sorted
in a natural way.
For example, sorting with a simple string comparison results in:
- file1.txt
- file10.txt
- file100.txt
@@ -438,66 +463,37 @@ int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2);
- file20.txt
- file3.txt
e.g. Sorting using wxNaturalStringSortAscending() results in:
But sorting the same strings in natural sort order results in:
- file1.txt
- file2.txt
- file3.txt
- file10.txt
- file11.txt
- file20.txt
- file100.txt
@see wxNaturalStringSortDescending(),
wxStringSortAscending(),
wxDictionaryStringSortAscending()
wxCmpNatural() uses an OS native natural sort function when available
(currently only under Microsoft Windows), wxCmpNaturalGeneric() otherwise.
@since 3.1.2
Be aware that OS native implementations might differ from each other,
and might change behaviour from release to release.
@see wxNaturalStringSortAscending(), wxNaturalStringSortDescending()
@since 3.1.4
*/
int wxNaturalStringSortAscending(const wxString& s1, const wxString& s2);
int wxCmpNatural(const wxString& s1, const wxString& s2);
/**
Comparison function comparing strings in reverse natural order.
This is wxWidgets' own implementation of the natural sort comparison function.
See wxNaturalStringSortAscending() for the natural sort description.
Requires wxRegEx: if it is unavailable, numbers within strings are not
recognised and only case-insensitive collation is performed.
@see wxNaturalStringSortAscending(),
wxStringSortDescending(),
wxDictionaryStringSortDescending()
@see wxCmpNatural()
@since 3.1.2
@since 3.1.4
*/
int wxNaturalStringSortDescending(const wxString& s1, const wxString& s2);
/**
This is wxWidgets' own implementation of the natural sort comparison
function. This will be used whenever an OS native function is not available.
Since OS native implementations might differ from each other, the user might
wish to use this function which behaves in the same way across all platforms.
@since 3.1.2
*/
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2);
/**
Comparison function, identical to wxNaturalStringSortAscending().
In fact, wxNaturalStringSortAscending() and wxNaturalStringSortDescending()
are both implemented using this function.
When an OS native natural sort function is available, that will be used,
otherwise wxCmpNatural() will be used.
Be aware that OS native implementations might differ from each other, and
might change behaviour from release to release.
@see wxNaturalStringSortAscending(),
wxNaturalStringSortDescending()
@since 3.1.2
*/
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2);
int wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
// ============================================================================

View File

@@ -20,13 +20,14 @@
#endif
#include "wx/arrstr.h"
#include "wx/regex.h"
#include "wx/scopedarray.h"
#include "wx/wxcrt.h"
#include "wx/beforestd.h"
#include <algorithm>
#include <functional>
#include "wx/afterstd.h"
#include "wx/regex.h"
#if defined( __WINDOWS__ )
#include <shlwapi.h>
@@ -728,143 +729,149 @@ wxArrayString wxSplit(const wxString& str, const wxChar sep, const wxChar escape
return ret;
}
#if wxUSE_REGEX
namespace // enum, class and functions needed by wxCmpNatural().
namespace // helpers needed by wxCmpNaturalGeneric()
{
enum wxStringFragmentType
// Used for comparison of string parts
struct wxStringFragment
{
wxFRAGMENT_TYPE_EMPTY = 0,
wxFRAGMENT_TYPE_ALPHA = 1,
wxFRAGMENT_TYPE_DIGIT = 2
// Fragment types are generally sorted like this:
// Empty < SpaceOrPunct < Digit < LetterOrSymbol
// Fragments of the same type are compared as follows:
// SpaceOrPunct - collated, Digit - as numbers using value
// LetterOrSymbol - lower-cased and then collated
enum Type
{
Empty,
SpaceOrPunct, // whitespace or punctuation
Digit, // a sequence of decimal digits
LetterOrSymbol // letters and symbols, i.e., anything not covered by the above types
};
wxStringFragment() : type(Empty), value(0) {}
// ----------------------------------------------------------------------------
// wxStringFragment
// ----------------------------------------------------------------------------
//
// Lightweight object returned by GetNaturalFragment().
// Represents either a number, or a string which contains no numerical digits.
class wxStringFragment
{
public:
wxStringFragment()
: type(wxFRAGMENT_TYPE_EMPTY)
{}
Type type;
wxString text;
long value;
wxStringFragmentType type;
wxUint64 value; // used only for Digit type
};
wxStringFragment GetFragment(wxString& text)
{
static const wxRegEx naturalNumeric(wxS("[0-9]+"));
static const wxRegEx naturalAlpha(wxS("[^0-9]+"));
size_t digitStart = 0;
size_t digitLength = 0;
size_t alphaStart = 0;
size_t alphaLength = 0;
wxStringFragment fragment;
static const wxRegEx reSpaceOrPunct(wxS("^([[:space:]]|[[:punct:]])+"));
// Limit the length to make sure the value will fit into a wxUint64
static const wxRegEx reDigit(wxS("^[[:digit:]]{1,19}"));
static const wxRegEx reLetterOrSymbol("^[^[:space:]|[:punct:]|[:digit:]]+");
if ( text.empty() )
return fragment;
return wxStringFragment();
if ( naturalNumeric.Matches(text) )
wxStringFragment fragment;
size_t length = 0;
// In an attempt to minimize the number of wxRegEx.Matches() calls,
// try to do them from the most expected to the least expected
// string fragment type.
if ( reLetterOrSymbol.Matches(text) )
{
naturalNumeric.GetMatch(&digitStart, &digitLength, 0);
}
if ( naturalAlpha.Matches(text) )
{
naturalAlpha.GetMatch(&alphaStart, &alphaLength, 0);
}
if ( alphaStart == 0 )
{
fragment.text = text.Mid(0, alphaLength);
fragment.value = 0;
fragment.type = wxFRAGMENT_TYPE_ALPHA;
text.erase(0, alphaLength);
}
if ( digitStart == 0 )
{
fragment.text = text.Mid(0, digitLength);
fragment.text.ToLong(&fragment.value);
fragment.type = wxFRAGMENT_TYPE_DIGIT;
text.erase(0, digitLength);
if ( reLetterOrSymbol.GetMatch(NULL, &length) )
{
fragment.type = wxStringFragment::LetterOrSymbol;
fragment.text = text.Left(length);
}
}
else if ( reDigit.Matches(text) )
{
if ( reDigit.GetMatch(NULL, &length) )
{
fragment.type = wxStringFragment::Digit;
fragment.text = text.Left(length);
fragment.text.ToULongLong(&fragment.value);
}
}
else if ( reSpaceOrPunct.Matches(text) )
{
if ( reSpaceOrPunct.GetMatch(NULL, &length) )
{
fragment.type = wxStringFragment::SpaceOrPunct;
fragment.text = text.Left(length);
}
}
text.erase(0, length);
return fragment;
}
int CompareFragmentNatural(const wxStringFragment& lhs, const wxStringFragment& rhs)
{
if ( (lhs.type == wxFRAGMENT_TYPE_ALPHA) &&
(rhs.type == wxFRAGMENT_TYPE_ALPHA) )
switch ( lhs.type )
{
return lhs.text.CmpNoCase(rhs.text);
}
if ( (lhs.type == wxFRAGMENT_TYPE_DIGIT) &&
(rhs.type == wxFRAGMENT_TYPE_DIGIT) )
{
if ( lhs.value == rhs.value )
case wxStringFragment::Empty:
switch ( rhs.type )
{
case wxStringFragment::Empty:
return 0;
}
if ( lhs.value < rhs.value )
{
case wxStringFragment::SpaceOrPunct:
case wxStringFragment::Digit:
case wxStringFragment::LetterOrSymbol:
return -1;
}
case wxStringFragment::SpaceOrPunct:
switch ( rhs.type )
{
case wxStringFragment::Empty:
return 1;
case wxStringFragment::SpaceOrPunct:
return wxStrcoll_String(lhs.text, rhs.text);
case wxStringFragment::Digit:
case wxStringFragment::LetterOrSymbol:
return -1;
}
case wxStringFragment::Digit:
switch ( rhs.type )
{
case wxStringFragment::Empty:
case wxStringFragment::SpaceOrPunct:
return 1;
case wxStringFragment::Digit:
if ( lhs.value > rhs.value )
{
return 1;
}
}
if ( (lhs.type == wxFRAGMENT_TYPE_DIGIT) &&
(rhs.type == wxFRAGMENT_TYPE_ALPHA) )
{
else if ( lhs.value < rhs.value )
return -1;
}
if ( (lhs.type == wxFRAGMENT_TYPE_ALPHA) &&
(rhs.type == wxFRAGMENT_TYPE_DIGIT) )
{
return 1;
}
if ( lhs.type == wxFRAGMENT_TYPE_EMPTY )
{
return -1;
}
if ( rhs.type == wxFRAGMENT_TYPE_EMPTY )
{
return 1;
}
else
return 0;
case wxStringFragment::LetterOrSymbol:
return -1;
}
case wxStringFragment::LetterOrSymbol:
switch ( rhs.type )
{
case wxStringFragment::Empty:
case wxStringFragment::SpaceOrPunct:
case wxStringFragment::Digit:
return 1;
case wxStringFragment::LetterOrSymbol:
return wxStrcoll_String(lhs.text.Lower(), rhs.text.Lower());
}
}
// all possible cases should be covered by the switch above
// but return also from here to prevent the compiler warning
return 1;
}
} // unnamed namespace
// ----------------------------------------------------------------------------
// wxCmpNaturalNative
// wxCmpNaturalGeneric
// ----------------------------------------------------------------------------
//
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{
wxString lhs(s1);
wxString rhs(s2);
@@ -873,14 +880,23 @@ int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
while ( (comparison == 0) && (!lhs.empty() || !rhs.empty()) )
{
wxStringFragment fragmentL = GetFragment(lhs);
wxStringFragment fragmentR = GetFragment(rhs);
comparison = CompareFragmentNatural(fragmentL, fragmentR);
const wxStringFragment fragmentLHS = GetFragment(lhs);
const wxStringFragment fragmentRHS = GetFragment(rhs);
comparison = CompareFragmentNatural(fragmentLHS, fragmentRHS);
}
return comparison;
}
#else
// Fallback used when wxUSE_REGEX is disabled: digit runs are not split out
// and compared as numbers here; both strings are lower-cased and then
// collated as a whole.
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{
return wxStrcoll_String(s1.Lower(), s2.Lower());
}
#endif // #if wxUSE_REGEX
// ----------------------------------------------------------------------------
// Declaration of StrCmpLogicalW()
@@ -894,19 +910,17 @@ int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
// ----------------------------------------------------------------------------
// wxCmpNaturalNative
// wxCmpNatural
// ----------------------------------------------------------------------------
//
// If a native version of Natural sort is available, then use that, otherwise
// use the wxWidgets version, wxCmpNatural().
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2)
// use the generic version.
inline int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
{
#if defined( __WINDOWS__ )
return StrCmpLogicalW(s1.wc_str(), s2.wc_str());
#else
return wxCmpNatural( s1, s2 );
#endif
return wxCmpNaturalGeneric(s1, s2);
#endif // #if defined( __WINDOWS__ )
}

View File

@@ -782,47 +782,82 @@ void ArraysTestCase::IndexFromEnd()
}
TEST_CASE("wxNaturalStringSortAscending()", "[array][sort][string]")
TEST_CASE("wxNaturalStringComparisonGeneric()", "[wxString][compare]")
{
wxString s01("3String");
wxString s02("21String");
#if !wxUSE_REGEX
WARN("Skipping wxCmpNaturalGeneric() tests: wxRegEx not available");
#else
// simple string comparison
CHECK(wxCmpNaturalGeneric("a", "a") == 0);
CHECK(wxCmpNaturalGeneric("a", "z") < 0);
CHECK(wxCmpNaturalGeneric("z", "a") > 0);
wxString s03("100string");
wxString s04("100String");
// case insensitivity
CHECK(wxCmpNaturalGeneric("a", "A") == 0);
CHECK(wxCmpNaturalGeneric("A", "a") == 0);
CHECK(wxCmpNaturalGeneric("AB", "a") > 0);
CHECK(wxCmpNaturalGeneric("a", "AB") < 0);
wxString s05("10String");
wxString s06("Str3ing");
wxString s07("Str20ing");
wxString s08("Str200ing");
wxString s09("String8");
wxString s10("String90");
// empty strings sort before whitespace and punctuation
CHECK(wxCmpNaturalGeneric("", " ") < 0);
CHECK(wxCmpNaturalGeneric(" ", "") > 0);
CHECK(wxCmpNaturalGeneric("", ",") < 0);
CHECK(wxCmpNaturalGeneric(",", "") > 0);
wxString s11("7String3");
wxString s12("07String20");
wxString s13("007String100");
// empty strings sort before numbers
CHECK(wxCmpNaturalGeneric("", "0") < 0);
CHECK(wxCmpNaturalGeneric("0", "") > 0);
CHECK(wxCmpNatural(s01, s02) < 0);
CHECK(wxCmpNatural(s02, s03) < 0);
CHECK(wxCmpNatural(s03, s04) == 0); // Check that case is ignored
CHECK(wxCmpNatural(s05, s06) < 0);
CHECK(wxCmpNatural(s06, s07) < 0);
CHECK(wxCmpNatural(s07, s08) < 0);
CHECK(wxCmpNatural(s08, s09) < 0);
CHECK(wxCmpNatural(s09, s10) < 0);
CHECK(wxCmpNatural(s11, s12) < 0);
CHECK(wxCmpNatural(s12, s13) < 0);
CHECK(wxCmpNatural(s01, s01) == 0); // Check that equality works in all cases
CHECK(wxCmpNatural(s02, s02) == 0);
CHECK(wxCmpNatural(s03, s03) == 0);
CHECK(wxCmpNatural(s04, s04) == 0);
CHECK(wxCmpNatural(s05, s05) == 0);
CHECK(wxCmpNatural(s06, s06) == 0);
CHECK(wxCmpNatural(s07, s07) == 0);
CHECK(wxCmpNatural(s08, s08) == 0);
CHECK(wxCmpNatural(s09, s09) == 0);
CHECK(wxCmpNatural(s10, s10) == 0);
CHECK(wxCmpNatural(s11, s11) == 0);
CHECK(wxCmpNatural(s12, s12) == 0);
CHECK(wxCmpNatural(s13, s13) == 0);
// empty strings sort before letters and symbols
CHECK(wxCmpNaturalGeneric("", "abc") < 0);
CHECK(wxCmpNaturalGeneric("abc", "") > 0);
// whitespace and punctuation sort before numbers
CHECK(wxCmpNaturalGeneric(" ", "1") < 0);
CHECK(wxCmpNaturalGeneric("1", " ") > 0);
CHECK(wxCmpNaturalGeneric(",", "1") < 0);
CHECK(wxCmpNaturalGeneric("1", ",") > 0);
// strings containing numbers sort before letters and symbols
CHECK(wxCmpNaturalGeneric("00", "a") < 0);
CHECK(wxCmpNaturalGeneric("a", "00") > 0);
// strings containing numbers are compared by their value
CHECK(wxCmpNaturalGeneric("01", "1") == 0);
CHECK(wxCmpNaturalGeneric("1", "01") == 0);
CHECK(wxCmpNaturalGeneric("1", "05") < 0);
CHECK(wxCmpNaturalGeneric("05", "1") > 0);
CHECK(wxCmpNaturalGeneric("10", "5") > 0);
CHECK(wxCmpNaturalGeneric("5", "10") < 0);
CHECK(wxCmpNaturalGeneric("1", "9999999999999999999") < 0);
CHECK(wxCmpNaturalGeneric("9999999999999999999", "1") > 0);
// comparing strings composed from whitespace,
// punctuation, numbers, letters, and symbols
CHECK(wxCmpNaturalGeneric("1st", " 1st") > 0);
CHECK(wxCmpNaturalGeneric(" 1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", ",1st") > 0);
CHECK(wxCmpNaturalGeneric(",1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", "01st") == 0);
CHECK(wxCmpNaturalGeneric("01st", "1st") == 0);
CHECK(wxCmpNaturalGeneric("10th", "5th") > 0);
CHECK(wxCmpNaturalGeneric("5th", "10th") < 0);
CHECK(wxCmpNaturalGeneric("a1st", "a01st") == 0);
CHECK(wxCmpNaturalGeneric("a01st", "a1st") == 0);
CHECK(wxCmpNaturalGeneric("a10th", "a5th") > 0);
CHECK(wxCmpNaturalGeneric("a5th", "a10th") < 0);
CHECK(wxCmpNaturalGeneric("a 10th", "a5th") < 0);
CHECK(wxCmpNaturalGeneric("a5th", "a 10th") > 0);
CHECK(wxCmpNaturalGeneric("a1st1", "a01st01") == 0);
CHECK(wxCmpNaturalGeneric("a01st01", "a1st1") == 0);
CHECK(wxCmpNaturalGeneric("a10th10", "a5th5") > 0);
CHECK(wxCmpNaturalGeneric("a5th5", "a10th10") < 0);
CHECK(wxCmpNaturalGeneric("a 10th 10", "a5th 5") < 0);
CHECK(wxCmpNaturalGeneric("a5th 5", "a 10th 10") > 0);
#endif // #if !wxUSE_REGEX
}