Skip to content

Commit

Permalink
Brand new .naturalSort() code, it was previously borrowed code that w…
Browse files Browse the repository at this point in the history
…as slow and vulnerable to RegExp DoS, the new code is made by myself, has no RegExp, only matching forward, is more flexible and easier to maintain (#3)
  • Loading branch information
cronvel committed Aug 17, 2021
1 parent 694f697 commit 9cac4c2
Show file tree
Hide file tree
Showing 5 changed files with 150 additions and 58 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG
Original file line number Diff line number Diff line change
@@ -1,4 +1,10 @@

v0.12.8
-------

Brand new .naturalSort() code. The previous code was borrowed, slow, and vulnerable to RegExp DoS; the new code is my own, uses no RegExp, only matches forward, and is more flexible and easier to maintain (#3)


v0.12.7
-------

Expand Down
161 changes: 111 additions & 50 deletions lib/naturalSort.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,57 +28,118 @@



/*
* Natural Sort algorithm for Javascript - Version 0.8 - Released under MIT license
* Author: Jim Palmer (based on chunking idea from Dave Koelle)
*/
module.exports = function( a , b ) {
var re = /(^([+-]?(?:\d*)(?:\.\d*)?(?:[eE][+-]?\d+)?)?$|^0x[\da-fA-F]+$|\d+)/g ,
sre = /^\s+|\s+$/g , // trim pre-post whitespace
snre = /\s+/g , // normalize all whitespace to single ' ' character
dre = /(^([\w ]+,?[\w ]+)?[\w ]+,?[\w ]+\d+:\d+(:\d+)?[\w ]?|^\d{1,4}[/-]\d{1,4}[/-]\d{1,4}|^\w+, \w+ \d+, \d{4})/ ,
hre = /^0x[0-9a-f]+$/i ,
ore = /^0/ ,
i = function( s ) {
return ( '' + s ).toLowerCase().replace( sre , '' ) ;
} ,
// convert all to strings strip whitespace
x = i( a ) || '' ,
y = i( b ) || '' ,
// chunk/tokenize
xN = x.replace( re , '\0$1\0' ).replace( /\0$/ , '' )
.replace( /^\0/ , '' )
.split( '\0' ) ,
yN = y.replace( re , '\0$1\0' ).replace( /\0$/ , '' )
.replace( /^\0/ , '' )
.split( '\0' ) ,
// numeric, hex or date detection
xD = parseInt( x.match( hre ) , 16 ) || ( xN.length !== 1 && Date.parse( x ) ) ,
yD = parseInt( y.match( hre ) , 16 ) || xD && y.match( dre ) && Date.parse( y ) || null ,
normChunk = function( s , l ) {
// normalize spaces; find floats not starting with '0', string or 0 if not defined (Clint Priest)
return ( ! s.match( ore ) || l === 1 ) && parseFloat( s ) || s.replace( snre , ' ' ).replace( sre , '' ) || 0 ; // jshint ignore:line
} ,
oFxNcL , oFyNcL ;
// first try and sort Hex codes or Dates
if ( yD ) {
if ( xD < yD ) { return -1 ; }
else if ( xD > yD ) { return 1 ; }
// Character classes returned by getCharacterClass().
// The numeric values matter: naturalSort() orders two characters of different
// classes by subtracting these values, so the resulting order is
// control < word separator < letter < number < symbol.
const CONTROL_CLASS = 1 ;
const WORD_SEPARATOR_CLASS = 2 ;
const LETTER_CLASS = 3 ;
const NUMBER_CLASS = 4 ;
const SYMBOL_CLASS = 5 ;



/*
	Classify one character for natural sorting.

	* char: the character itself (a one-unit string), used only for the case test
	* code: the char code of that character

	Returns one of the *_CLASS constants. The checks are ordered: word separators
	are tested first (so a tab is a separator, not a control char), then control
	codes, then ASCII digits. Anything left is a letter if it has distinct
	upper/lower case forms, and a symbol otherwise.
*/
function getCharacterClass( char , code ) {
	var charClass ;

	if ( isWordSeparator( code ) ) {
		charClass = WORD_SEPARATOR_CLASS ;
	}
	else if ( code === 0x7f || code <= 0x1f ) {
		charClass = CONTROL_CLASS ;
	}
	else if ( isNumber( code ) ) {
		charClass = NUMBER_CLASS ;
	}
	else if ( char.toLowerCase() === char.toUpperCase() ) {
		// No notion of case: not considered a letter
		charClass = SYMBOL_CLASS ;
	}
	else {
		charClass = LETTER_CLASS ;
	}

	return charClass ;
}



/*
	Tell whether a char code is a word separator.

	* code: a char code (as returned by String#charCodeAt())

	Returns true for space (0x20), tab (0x09), no-break space (0xa0),
	hyphen (0x2d) and underscore (0x5f).
*/
function isWordSeparator( code ) {
if (
// space, tab, no-break space
code === 0x20 || code === 0x09 || code === 0xa0 ||
// hyphen, underscore
code === 0x2d || code === 0x5f
) {
return true ;
}
// NOTE(review): the loop below uses xN, yN, normChunk, oFxNcL and oFyNcL, none of
// which are declared in this function. These lines look like removed lines of the
// previous implementation interleaved by the diff view -- confirm before relying on them.
// natural sorting through split numeric strings and default strings
for( var cLoc = 0 , xNl = xN.length , yNl = yN.length , numS = Math.max( xNl , yNl ) ; cLoc < numS ; cLoc ++ ) {
oFxNcL = normChunk( xN[cLoc] , xNl ) ;
oFyNcL = normChunk( yN[cLoc] , yNl ) ;
// handle numeric vs string comparison - number < string - (Kyle Adams)
if ( isNaN( oFxNcL ) !== isNaN( oFyNcL ) ) { return ( isNaN( oFxNcL ) ) ? 1 : -1 ; }
// rely on string comparison if different types - i.e. '02' < 2 != '02' < '2'
else if ( typeof oFxNcL !== typeof oFyNcL ) {
oFxNcL += '' ;
oFyNcL += '' ;

// Any other code is not a word separator
return false ;
}



/*
	Tell whether a char code is an ASCII digit.

	* code: a char code (as returned by String#charCodeAt())

	Returns true for codes 0x30 ('0') through 0x39 ('9') inclusive, false for
	anything else, including NaN (what charCodeAt() yields past the end of a string).
*/
function isNumber( code ) {
	return code >= 0x30 && code <= 0x39 ;
}



/*
	Natural-order comparator, usable as the callback of Array#sort().

	* a, b: the values to compare, both are coerced to strings

	Returns a negative number if a sorts before b, a positive number if a sorts
	after b, and 0 when nothing distinguishes them.

	Both strings are trimmed, then walked forward in parallel, one "unit" at a time:
	* characters of different classes are ordered by class (see getCharacterClass())
	* a run of word separators on each side counts as a single unit
	* control chars and symbols are ordered by char code
	* letters are compared case-insensitively
	* a run of digits is parsed as a whole number and compared numerically

	Differences that should only matter as tie-breakers (letter case, number of
	digits used to write the same number) are stored in `advantage`, and only used
	if the walk finds no stronger difference.
*/
function naturalSort( a , b ) {
// Force string values
a = '' + a ;
b = '' + b ;

var aIndex , aEndIndex , aChar , aCode , aClass , aCharLc , aNumber ,
aTrim = a.trim() ,
aLength = aTrim.length ,
bIndex , bEndIndex , bChar , bCode , bClass , bCharLc , bNumber ,
bTrim = b.trim() ,
bLength = bTrim.length ,
// First tie-breaker found so far, 0 means none yet
advantage = 0 ;

// The two indexes advance independently: separator runs and digit runs may have different lengths on each side
for ( aIndex = bIndex = 0 ; aIndex < aLength && bIndex < bLength ; aIndex ++ , bIndex ++ ) {
aChar = aTrim[ aIndex ] ;
bChar = bTrim[ bIndex ] ;
aCode = aTrim.charCodeAt( aIndex ) ;
bCode = bTrim.charCodeAt( bIndex ) ;
aClass = getCharacterClass( aChar , aCode ) ;
bClass = getCharacterClass( bChar , bCode ) ;
// Different classes: the numeric order of the class constants decides
if ( aClass !== bClass ) { return aClass - bClass ; }

switch ( aClass ) {
case WORD_SEPARATOR_CLASS :
// Eat all the following separators on each side, so any run of separators is equivalent to any other.
// Past the end of the string charCodeAt() returns NaN, which is not a separator, so the loops stop there.
while ( isWordSeparator( aTrim.charCodeAt( aIndex + 1 ) ) ) { aIndex ++ ; }
while ( isWordSeparator( bTrim.charCodeAt( bIndex + 1 ) ) ) { bIndex ++ ; }
break ;

case CONTROL_CLASS :
case SYMBOL_CLASS :
// Plain char code order
if ( aCode !== bCode ) { return aCode - bCode ; }
break ;

case LETTER_CLASS :
// Case-insensitive comparison first
aCharLc = aChar.toLowerCase() ;
bCharLc = bChar.toLowerCase() ;
if ( aCharLc !== bCharLc ) { return aCharLc > bCharLc ? 1 : -1 ; }

// As a last resort, we would sort uppercase first:
// same letter but different case, the side that is not lowercase gets the advantage (only the first such difference is kept)
if ( ! advantage && aChar !== bChar ) { advantage = aChar !== aCharLc ? -1 : 1 ; }

break ;

case NUMBER_CLASS :
// Lookup for a whole number and parse it: [ index , endIndex ) is the run of consecutive digits
// NOTE(review): parseFloat() yields a double, so very long digit runs (beyond ~15-16 significant digits) may compare as equal
aEndIndex = aIndex + 1 ;
while ( isNumber( aTrim.charCodeAt( aEndIndex ) ) ) { aEndIndex ++ ; }
aNumber = parseFloat( aTrim.slice( aIndex , aEndIndex ) ) ;

bEndIndex = bIndex + 1 ;
while ( isNumber( bTrim.charCodeAt( bEndIndex ) ) ) { bEndIndex ++ ; }
bNumber = parseFloat( bTrim.slice( bIndex , bEndIndex ) ) ;

if ( aNumber !== bNumber ) { return aNumber - bNumber ; }

// As a last resort, we would sort the number with the less char first (e.g. '12' before '012')
if ( ! advantage && aEndIndex - aIndex !== bEndIndex - bIndex ) { advantage = ( aEndIndex - aIndex ) - ( bEndIndex - bIndex ) ; }

// Advance the index at the end of the number area (minus one: the for loop increments both indexes)
aIndex = aEndIndex - 1 ;
bIndex = bEndIndex - 1 ;
break ;
}
// NOTE(review): the next two statements, and the `return 0 ; } ;` after the loop, use oFxNcL/oFyNcL,
// which this function never declares. They look like removed lines of the previous implementation
// interleaved by the diff view -- confirm before relying on them.
if ( oFxNcL < oFyNcL ) { return -1 ; }
if ( oFxNcL > oFyNcL ) { return 1 ; }
}
return 0 ;
} ;

// If there was an “advantage”, use it now
if ( advantage ) { return advantage ; }

// Finally, sort by remaining char (the string with fewer unconsumed chars first), or by trimmed length or by full length
return ( aLength - aIndex ) - ( bLength - bIndex ) || aLength - bLength || a.length - b.length ;
}

module.exports = naturalSort ;

2 changes: 1 addition & 1 deletion lib/string.js
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@



var stringKit = {} ;
const stringKit = {} ;
module.exports = stringKit ;


Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "string-kit",
"version": "0.12.7",
"version": "0.12.8",
"engines": {
"node": ">=6.0.0"
},
Expand Down
37 changes: 31 additions & 6 deletions test/string-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1063,6 +1063,37 @@ describe( "Fuzzy string matching" , () => {



describe( "Natural sort" , () => {

	it( "basic natural sort tests" , () => {
		// Each entry is: [ input array , expected order once sorted with string.naturalSort ]
		const cases = [
			[ [ 'one' , 'two' , 'three' ] , [ 'one' , 'three' , 'two' ] ] ,

			// Letters are compared case-insensitively
			[ [ 'one' , 'two' , 'Three' ] , [ 'one' , 'Three' , 'two' ] ] ,
			[ [ 'One' , 'Two' , 'three' ] , [ 'One' , 'three' , 'Two' ] ] ,

			// When only the case differs, uppercase comes first
			[ [ 'one' , 'One' , 'two' , 'Two' , 'Three' , 'three' ] , [ 'One' , 'one' , 'Three' , 'three' , 'Two' , 'two' ] ] ,

			// Digit runs are compared by numeric value
			[ [ 'abc121' , 'abc17' , 'abc12' , 'abc134' ] , [ 'abc12' , 'abc17' , 'abc121' , 'abc134' ] ] ,

			// Surrounding white space and the kind of word separator do not matter
			[ [ ' One ' , ' Two ' , 'three' ] , [ ' One ' , 'three' , ' Two ' ] ] ,
			[ [ 'abc 121' , 'abc 17' , 'abc 12' , 'abc 134' ] , [ 'abc 12' , 'abc 17' , 'abc 121' , 'abc 134' ] ] ,
			[ [ 'a-123-a' , 'a_12_a' , 'a 18 a' ] , [ 'a_12_a' , 'a 18 a' , 'a-123-a' ] ] ,
			[ [ 'a_123_a' , 'a-12-a' , 'a 18 a' ] , [ 'a-12-a' , 'a 18 a' , 'a_123_a' ] ] ,

			// Same numeric value: the number written with fewer chars comes first
			[ [ 'abc00012' , 'abc012' , 'abc017' , 'abc12' , 'abc134' ] , [ 'abc12' , 'abc012' , 'abc00012' , 'abc017' , 'abc134' ] ] ,

			// Symbols
			[ [ ';+$' , '!:;,' , '“”' ] , [ "!:;," , ";+$" , "“”" ] ]
		] ;

		for ( const [ input , expected ] of cases ) {
			expect( input.sort( string.naturalSort ) ).to.equal( expected ) ;
		}
	} ) ;
} ) ;



describe( "Misc" , () => {

it( ".resize()" , () => {
Expand All @@ -1071,12 +1102,6 @@ describe( "Misc" , () => {
expect( string.resize( 'bobby' , 8 ) ).to.be( 'bobby ' ) ;
} ) ;

it( ".naturalSort()" , () => {
expect( [ 'one' , 'two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'three' , 'two' ] ) ;
expect( [ 'one' , 'two' , 'Three' ].sort( string.naturalSort ) ).to.equal( [ 'one' , 'Three' , 'two' ] ) ;
expect( [ 'One' , 'Two' , 'three' ].sort( string.naturalSort ) ).to.equal( [ 'One' , 'three' , 'Two' ] ) ;
} ) ;

it( ".occurrenceCount()" , () => {
expect( string.occurrenceCount( '' , '' ) ).to.be( 0 ) ;
expect( string.occurrenceCount( 'three' , '' ) ).to.be( 0 ) ;
Expand Down

0 comments on commit 9cac4c2

Please sign in to comment.