Refactor the natural string compare and sort algorithm

Add a new string fragment type for whitespace and punctuation, which need
to be assessed separately from letters and symbols.

Use wxUint64 instead of long for storing the value of a numeric fragment.

Use collate instead of compare for non-numeric fragments.

Change the names of the public comparison functions: the function provided by
wxWidgets itself is now named wxCmpNaturalGeneric(), and the one intended for
common public use is wxCmpNatural(), which calls a native function in wxMSW
and wxCmpNaturalGeneric() elsewhere.

Try harder in wxCmpNaturalGeneric() if wxRegEx is unavailable: do not
just make a simple string comparison, but perform a case-insensitive
collation.

Make some other changes to simplify and possibly speed up the code.
This commit is contained in:
PB
2020-07-02 18:15:25 +02:00
committed by Vadim Zeitlin
parent 371c4b1366
commit 83a2a1e505
4 changed files with 282 additions and 237 deletions

View File

@@ -53,7 +53,7 @@ WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2); int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2);
WXDLLIMPEXP_BASE WXDLLIMPEXP_BASE
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2); int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
inline int wxCMPFUNC_CONV wxNaturalStringSortAscending(const wxString& s1, const wxString& s2) inline int wxCMPFUNC_CONV wxNaturalStringSortAscending(const wxString& s1, const wxString& s2)
{ {

View File

@@ -416,21 +416,46 @@ int wxDictionaryStringSortAscending(const wxString& s1, const wxString& s2);
int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2); int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2);
/** /**
Comparison function used for Natural Sort. Comparison function comparing strings in natural order.
Functions in the same way as wxDictionaryStringSortAscending(), with This function can be used with wxSortedArrayString::Sort()
the exception that numbers within the string are recognised, and or passed as an argument to wxSortedArrayString constructor.
compared numerically, rather than alphabetically. When used for
sorting, the result is that e.g. file names containing numbers are
sorted in a natural way.
This function will use an OS native function if one is available, See wxCmpNatural() for more information about how natural
to ensure that the sort order is the same as the OS uses. sort order is implemented.
Comparison is case insensitive. @see wxNaturalStringSortDescending(),
wxStringSortAscending(), wxDictionaryStringSortAscending()
e.g. Sorting using wxDictionaryStringSortAscending() results in: @since 3.1.4
*/
int wxNaturalStringSortAscending(const wxString& s1, const wxString& s2);
/**
Comparison function comparing strings in reverse natural order.
This function can be used with wxSortedArrayString::Sort()
or passed as an argument to wxSortedArrayString constructor.
See wxCmpNatural() for more information about how natural
sort order is implemented.
@see wxNaturalStringSortAscending(),
wxStringSortDescending(), wxDictionaryStringSortDescending()
@since 3.1.4
*/
int wxNaturalStringSortDescending(const wxString& s1, const wxString& s2);
/**
This function compares strings using case-insensitive collation and
additionally, numbers within strings are recognised and compared
numerically, rather than alphabetically. When used for sorting,
the result is that e.g. file names containing numbers are sorted
in a natural way.
For example, sorting with a simple string comparison results in:
- file1.txt - file1.txt
- file10.txt - file10.txt
- file100.txt - file100.txt
@@ -438,66 +463,37 @@ int wxDictionaryStringSortDescending(const wxString& s1, const wxString& s2);
- file20.txt - file20.txt
- file3.txt - file3.txt
e.g. Sorting using wxNaturalStringSortAscending() results in: But sorting the same strings in natural sort order results in:
- file1.txt - file1.txt
- file2.txt - file2.txt
- file3.txt - file3.txt
- file11.txt - file10.txt
- file20.txt - file20.txt
- file100.txt - file100.txt
@see wxNaturalStringSortDescending(), wxCmpNatural() uses an OS native natural sort function when available
wxStringSortAscending(), (currently only under Microsoft Windows), wxCmpNaturalGeneric() otherwise.
wxDictionaryStringSortAscending()
@since 3.1.2 Be aware that OS native implementations might differ from each other,
*/ and might change behaviour from release to release.
int wxNaturalStringSortAscending(const wxString& s1, const wxString& s2);
@see wxNaturalStringSortAscending(), wxNaturalStringSortDescending()
/** @since 3.1.4
Comparison function comparing strings in reverse natural order. */
int wxCmpNatural(const wxString& s1, const wxString& s2);
See wxNaturalStringSortAscending() for the natural sort description. /**
This is wxWidgets' own implementation of the natural sort comparison function.
@see wxNaturalStringSortAscending(), Requires wxRegEx; if it is unavailable, numbers within strings are not
wxStringSortDescending(), recognised and only case-insensitive collation is performed.
wxDictionaryStringSortDescending()
@since 3.1.2 @see wxCmpNatural()
*/
int wxNaturalStringSortDescending(const wxString& s1, const wxString& s2);
@since 3.1.4
/** */
This is wxWidgets' own implementation of the natural sort comparison int wxCmpNaturalGeneric(const wxString& s1, const wxString& s2);
function. This will be used whenever an OS native function is not available.
Since OS native implementations might differ from each other, the user might
wish to use this function which behaves in the same way across all platforms.
@since 3.1.2
*/
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2);
/**
Comparison function, identical to wxNaturalStringSortAscending().
In fact, wxNaturalStringSortAscending() and wxNaturalStringSortDescending()
are both implemented using this function.
When an OS native natural sort function is available, that will be used,
otherwise wxCmpNatural() will be used.
Be aware that OS native implementations might differ from each other, and
might change behaviour from release to release.
@see wxNaturalStringSortAscending(),
wxNaturalStringSortDescending()
@since 3.1.2
*/
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2);
// ============================================================================ // ============================================================================

View File

@@ -20,13 +20,14 @@
#endif #endif
#include "wx/arrstr.h" #include "wx/arrstr.h"
#include "wx/regex.h"
#include "wx/scopedarray.h" #include "wx/scopedarray.h"
#include "wx/wxcrt.h"
#include "wx/beforestd.h" #include "wx/beforestd.h"
#include <algorithm> #include <algorithm>
#include <functional> #include <functional>
#include "wx/afterstd.h" #include "wx/afterstd.h"
#include "wx/regex.h"
#if defined( __WINDOWS__ ) #if defined( __WINDOWS__ )
#include <shlwapi.h> #include <shlwapi.h>
@@ -728,143 +729,149 @@ wxArrayString wxSplit(const wxString& str, const wxChar sep, const wxChar escape
return ret; return ret;
} }
#if wxUSE_REGEX
namespace // enum, class and functions needed by wxCmpNatural(). namespace // helpers needed by wxCmpNaturalGeneric()
{ {
enum wxStringFragmentType // Used for comparison of string parts
struct wxStringFragment
{
// Fragment types are generally sorted like this:
// Empty < SpaceOrPunct < Digit < LetterOrSymbol
// Fragments of the same type are compared as follows:
// SpaceOrPunct - collated, Digit - as numbers using value
// LetterOrSymbol - lower-cased and then collated
enum Type
{ {
wxFRAGMENT_TYPE_EMPTY = 0, Empty,
wxFRAGMENT_TYPE_ALPHA = 1, SpaceOrPunct, // whitespace or punctuation
wxFRAGMENT_TYPE_DIGIT = 2 Digit, // a sequence of decimal digits
LetterOrSymbol // letters and symbols, i.e., anything not covered by the above types
}; };
wxStringFragment() : type(Empty), value(0) {}
// ---------------------------------------------------------------------------- Type type;
// wxStringFragment
// ----------------------------------------------------------------------------
//
// Lightweight object returned by GetNaturalFragment().
// Represents either a number, or a string which contains no numerical digits.
class wxStringFragment
{
public:
wxStringFragment()
: type(wxFRAGMENT_TYPE_EMPTY)
{}
wxString text; wxString text;
long value; wxUint64 value; // used only for Digit type
wxStringFragmentType type; };
};
wxStringFragment GetFragment(wxString& text) wxStringFragment GetFragment(wxString& text)
{ {
static const wxRegEx naturalNumeric(wxS("[0-9]+")); static const wxRegEx reSpaceOrPunct(wxS("^([[:space:]]|[[:punct:]])+"));
static const wxRegEx naturalAlpha(wxS("[^0-9]+")); // Limit the length to make sure the value will fit into a wxUint64
static const wxRegEx reDigit(wxS("^[[:digit:]]{1,19}"));
size_t digitStart = 0; static const wxRegEx reLetterOrSymbol("^[^[:space:]|[:punct:]|[:digit:]]+");
size_t digitLength = 0;
size_t alphaStart = 0;
size_t alphaLength = 0;
wxStringFragment fragment;
if ( text.empty() ) if ( text.empty() )
return wxStringFragment();
wxStringFragment fragment;
size_t length = 0;
// In attempt to minimize the number of wxRegEx.Matches() calls,
// try to do them from the most expected to the least expected
// string fragment type.
if ( reLetterOrSymbol.Matches(text) )
{
if ( reLetterOrSymbol.GetMatch(NULL, &length) )
{
fragment.type = wxStringFragment::LetterOrSymbol;
fragment.text = text.Left(length);
}
}
else if ( reDigit.Matches(text) )
{
if ( reDigit.GetMatch(NULL, &length) )
{
fragment.type = wxStringFragment::Digit;
fragment.text = text.Left(length);
fragment.text.ToULongLong(&fragment.value);
}
}
else if ( reSpaceOrPunct.Matches(text) )
{
if ( reSpaceOrPunct.GetMatch(NULL, &length) )
{
fragment.type = wxStringFragment::SpaceOrPunct;
fragment.text = text.Left(length);
}
}
text.erase(0, length);
return fragment; return fragment;
}
if ( naturalNumeric.Matches(text) ) int CompareFragmentNatural(const wxStringFragment& lhs, const wxStringFragment& rhs)
{
switch ( lhs.type )
{ {
naturalNumeric.GetMatch(&digitStart, &digitLength, 0); case wxStringFragment::Empty:
} switch ( rhs.type )
if ( naturalAlpha.Matches(text) )
{
naturalAlpha.GetMatch(&alphaStart, &alphaLength, 0);
}
if ( alphaStart == 0 )
{
fragment.text = text.Mid(0, alphaLength);
fragment.value = 0;
fragment.type = wxFRAGMENT_TYPE_ALPHA;
text.erase(0, alphaLength);
}
if ( digitStart == 0 )
{
fragment.text = text.Mid(0, digitLength);
fragment.text.ToLong(&fragment.value);
fragment.type = wxFRAGMENT_TYPE_DIGIT;
text.erase(0, digitLength);
}
return fragment;
}
int CompareFragmentNatural(const wxStringFragment& lhs, const wxStringFragment& rhs)
{
if ( (lhs.type == wxFRAGMENT_TYPE_ALPHA) &&
(rhs.type == wxFRAGMENT_TYPE_ALPHA) )
{
return lhs.text.CmpNoCase(rhs.text);
}
if ( (lhs.type == wxFRAGMENT_TYPE_DIGIT) &&
(rhs.type == wxFRAGMENT_TYPE_DIGIT) )
{
if ( lhs.value == rhs.value )
{ {
case wxStringFragment::Empty:
return 0; return 0;
} case wxStringFragment::SpaceOrPunct:
case wxStringFragment::Digit:
if ( lhs.value < rhs.value ) case wxStringFragment::LetterOrSymbol:
{
return -1; return -1;
} }
case wxStringFragment::SpaceOrPunct:
switch ( rhs.type )
{
case wxStringFragment::Empty:
return 1;
case wxStringFragment::SpaceOrPunct:
return wxStrcoll_String(lhs.text, rhs.text);
case wxStringFragment::Digit:
case wxStringFragment::LetterOrSymbol:
return -1;
}
case wxStringFragment::Digit:
switch ( rhs.type )
{
case wxStringFragment::Empty:
case wxStringFragment::SpaceOrPunct:
return 1;
case wxStringFragment::Digit:
if ( lhs.value > rhs.value ) if ( lhs.value > rhs.value )
{
return 1; return 1;
} else if ( lhs.value < rhs.value )
}
if ( (lhs.type == wxFRAGMENT_TYPE_DIGIT) &&
(rhs.type == wxFRAGMENT_TYPE_ALPHA) )
{
return -1; return -1;
} else
if ( (lhs.type == wxFRAGMENT_TYPE_ALPHA) &&
(rhs.type == wxFRAGMENT_TYPE_DIGIT) )
{
return 1;
}
if ( lhs.type == wxFRAGMENT_TYPE_EMPTY )
{
return -1;
}
if ( rhs.type == wxFRAGMENT_TYPE_EMPTY )
{
return 1;
}
return 0; return 0;
case wxStringFragment::LetterOrSymbol:
return -1;
} }
case wxStringFragment::LetterOrSymbol:
switch ( rhs.type )
{
case wxStringFragment::Empty:
case wxStringFragment::SpaceOrPunct:
case wxStringFragment::Digit:
return 1;
case wxStringFragment::LetterOrSymbol:
return wxStrcoll_String(lhs.text.Lower(), rhs.text.Lower());
}
}
// all possible cases should be covered by the switch above
// but return also from here to prevent the compiler warning
return 1;
}
} // unnamed namespace } // unnamed namespace
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// wxCmpNaturalNative // wxCmpNaturalGeneric
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// //
int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2) int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{ {
wxString lhs(s1); wxString lhs(s1);
wxString rhs(s2); wxString rhs(s2);
@@ -873,14 +880,23 @@ int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
while ( (comparison == 0) && (!lhs.empty() || !rhs.empty()) ) while ( (comparison == 0) && (!lhs.empty() || !rhs.empty()) )
{ {
wxStringFragment fragmentL = GetFragment(lhs); const wxStringFragment fragmentLHS = GetFragment(lhs);
wxStringFragment fragmentR = GetFragment(rhs); const wxStringFragment fragmentRHS = GetFragment(rhs);
comparison = CompareFragmentNatural(fragmentL, fragmentR);
comparison = CompareFragmentNatural(fragmentLHS, fragmentRHS);
} }
return comparison; return comparison;
} }
#else
int wxCMPFUNC_CONV wxCmpNaturalGeneric(const wxString& s1, const wxString& s2)
{
return wxStrcoll_String(s1.Lower(), s2.Lower());
}
#endif // #if wxUSE_REGEX
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// Declaration of StrCmpLogicalW() // Declaration of StrCmpLogicalW()
@@ -894,19 +910,17 @@ int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// wxCmpNaturalNative // wxCmpNatural
// ---------------------------------------------------------------------------- // ----------------------------------------------------------------------------
// //
// If a native version of Natural sort is available, then use that, otherwise // If a native version of Natural sort is available, then use that, otherwise
// use the wxWidgets version, wxCmpNatural(). // use the generic version.
int wxCMPFUNC_CONV wxCmpNaturalNative(const wxString& s1, const wxString& s2) inline int wxCMPFUNC_CONV wxCmpNatural(const wxString& s1, const wxString& s2)
{ {
#if defined( __WINDOWS__ ) #if defined( __WINDOWS__ )
return StrCmpLogicalW( s1.wc_str(), s2.wc_str() ); return StrCmpLogicalW(s1.wc_str(), s2.wc_str());
#else
#else return wxCmpNaturalGeneric(s1, s2);
return wxCmpNatural( s1, s2 ); #endif // #if defined( __WINDOWS__ )
#endif
} }

View File

@@ -782,47 +782,82 @@ void ArraysTestCase::IndexFromEnd()
} }
TEST_CASE("wxNaturalStringSortAscending()", "[array][sort][string]") TEST_CASE("wxNaturalStringComparisonGeneric()", "[wxString][compare]")
{ {
wxString s01("3String"); #if !wxUSE_REGEX
wxString s02("21String"); WARN("Skipping wxCmpNaturalGeneric() tests: wxRegEx not available");
#else
// simple string comparison
CHECK(wxCmpNaturalGeneric("a", "a") == 0);
CHECK(wxCmpNaturalGeneric("a", "z") < 0);
CHECK(wxCmpNaturalGeneric("z", "a") > 0);
wxString s03("100string"); // case insensitivity
wxString s04("100String"); CHECK(wxCmpNaturalGeneric("a", "A") == 0);
CHECK(wxCmpNaturalGeneric("A", "a") == 0);
CHECK(wxCmpNaturalGeneric("AB", "a") > 0);
CHECK(wxCmpNaturalGeneric("a", "AB") < 0);
wxString s05("10String"); // empty strings sort before whitespace and punctuation
wxString s06("Str3ing"); CHECK(wxCmpNaturalGeneric("", " ") < 0);
wxString s07("Str20ing"); CHECK(wxCmpNaturalGeneric(" ", "") > 0);
wxString s08("Str200ing"); CHECK(wxCmpNaturalGeneric("", ",") < 0);
wxString s09("String8"); CHECK(wxCmpNaturalGeneric(",", "") > 0);
wxString s10("String90");
wxString s11("7String3"); // empty strings sort before numbers
wxString s12("07String20"); CHECK(wxCmpNaturalGeneric("", "0") < 0);
wxString s13("007String100"); CHECK(wxCmpNaturalGeneric("0", "") > 0);
CHECK(wxCmpNatural(s01, s02) < 0); // empty strings sort before letters and symbols
CHECK(wxCmpNatural(s02, s03) < 0); CHECK(wxCmpNaturalGeneric("", "abc") < 0);
CHECK(wxCmpNatural(s03, s04) == 0); // Check that case is ignored CHECK(wxCmpNaturalGeneric("abc", "") > 0);
CHECK(wxCmpNatural(s05, s06) < 0);
CHECK(wxCmpNatural(s06, s07) < 0); // whitespace and punctuation sort before numbers
CHECK(wxCmpNatural(s07, s08) < 0); CHECK(wxCmpNaturalGeneric(" ", "1") < 0);
CHECK(wxCmpNatural(s08, s09) < 0); CHECK(wxCmpNaturalGeneric("1", " ") > 0);
CHECK(wxCmpNatural(s09, s10) < 0); CHECK(wxCmpNaturalGeneric(",", "1") < 0);
CHECK(wxCmpNatural(s11, s12) < 0); CHECK(wxCmpNaturalGeneric("1", ",") > 0);
CHECK(wxCmpNatural(s12, s13) < 0);
CHECK(wxCmpNatural(s01, s01) == 0); // Check that equality works in all cases // strings containing numbers sort before letters and symbols
CHECK(wxCmpNatural(s02, s02) == 0); CHECK(wxCmpNaturalGeneric("00", "a") < 0);
CHECK(wxCmpNatural(s03, s03) == 0); CHECK(wxCmpNaturalGeneric("a", "00") > 0);
CHECK(wxCmpNatural(s04, s04) == 0);
CHECK(wxCmpNatural(s05, s05) == 0); // strings containing numbers are compared by their value
CHECK(wxCmpNatural(s06, s06) == 0); CHECK(wxCmpNaturalGeneric("01", "1") == 0);
CHECK(wxCmpNatural(s07, s07) == 0); CHECK(wxCmpNaturalGeneric("1", "01") == 0);
CHECK(wxCmpNatural(s08, s08) == 0); CHECK(wxCmpNaturalGeneric("1", "05") < 0);
CHECK(wxCmpNatural(s09, s09) == 0); CHECK(wxCmpNaturalGeneric("05", "1") > 0);
CHECK(wxCmpNatural(s10, s10) == 0); CHECK(wxCmpNaturalGeneric("10", "5") > 0);
CHECK(wxCmpNatural(s11, s11) == 0); CHECK(wxCmpNaturalGeneric("5", "10") < 0);
CHECK(wxCmpNatural(s12, s12) == 0); CHECK(wxCmpNaturalGeneric("1", "9999999999999999999") < 0);
CHECK(wxCmpNatural(s13, s13) == 0); CHECK(wxCmpNaturalGeneric("9999999999999999999", "1") > 0);
// comparing strings composed from whitespace,
// punctuation, numbers, letters, and symbols
CHECK(wxCmpNaturalGeneric("1st", " 1st") > 0);
CHECK(wxCmpNaturalGeneric(" 1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", ",1st") > 0);
CHECK(wxCmpNaturalGeneric(",1st", "1st") < 0);
CHECK(wxCmpNaturalGeneric("1st", "01st") == 0);
CHECK(wxCmpNaturalGeneric("01st", "1st") == 0);
CHECK(wxCmpNaturalGeneric("10th", "5th") > 0);
CHECK(wxCmpNaturalGeneric("5th", "10th") < 0);
CHECK(wxCmpNaturalGeneric("a1st", "a01st") == 0);
CHECK(wxCmpNaturalGeneric("a01st", "a1st") == 0);
CHECK(wxCmpNaturalGeneric("a10th", "a5th") > 0);
CHECK(wxCmpNaturalGeneric("a5th", "a10th") < 0);
CHECK(wxCmpNaturalGeneric("a 10th", "a5th") < 0);
CHECK(wxCmpNaturalGeneric("a5th", "a 10th") > 0);
CHECK(wxCmpNaturalGeneric("a1st1", "a01st01") == 0);
CHECK(wxCmpNaturalGeneric("a01st01", "a1st1") == 0);
CHECK(wxCmpNaturalGeneric("a10th10", "a5th5") > 0);
CHECK(wxCmpNaturalGeneric("a5th5", "a10th10") < 0);
CHECK(wxCmpNaturalGeneric("a 10th 10", "a5th 5") < 0);
CHECK(wxCmpNaturalGeneric("a5th 5", "a 10th 10") > 0);
#endif // #if !wxUSE_REGEX
} }