How to sort file names naturally
1. Problem
I recently wrote a tool gomdtoc to generate a Table of Contents for my notes directory. In order to sort the file names, I used the sort.Strings().
But it not working as I expected, the problem is: it sorts the names alphabetically.
Suppose we have a list like this:
ab abc1 abc01 abc2 abc5 abc10
After sorting by sort.Strings()
ab abc01 abc1 abc10 abc2 abc5
The abc2 should be placed before abc10, this is not expected, I want it sorted naturally.
2. Natual Sort Order
In computing, natural sort order (or natural sorting) is the ordering of strings in alphabetical order, except that multi-digit numbers are treated atomically, i.e., as if they were a single character. Natural sort order has been promoted as being more human-friendly ("natural") than machine-oriented, pure alphabetical sort order.
3. Implementations
For the full code see https://github.com/dreamjz/golang-notes/blob/main/blog/natural-sort/natural_sort.go
3.1 Using regular expression to separate number/string
The first thing that comes to my mind is using regexp to separate number and string.
func NaturalLess03(str1, str2 string) bool {
	pStr1 := processWithRegexp(str1)
	pStr2 := processWithRegexp(str2)
	minLen := int(math.Min(float64(len(pStr1)), float64(len(pStr2))))
	for i := 0; i < minLen; i++ {
		e1 := pStr1[i]
		e2 := pStr2[i]
		if e1 == e2 {
			continue
		}
		ei1, err1 := strconv.Atoi(e1)
		ei2, err2 := strconv.Atoi(e2)
		switch {
		case err1 == nil && err2 == nil && ei1 != ei2:
			return ei1 < ei2
		case err1 != nil && err2 != nil:
			return e1 < e2
		case err1 != nil:
			return true
		case err2 != nil:
			return false
		}
	}
	return len(str1) < len(str2)
}
func processWithRegexp(str string) []string {
	reg := regexp.MustCompile(`\d+|\D+`)
	return reg.FindAllString(str, -1)
}
3.2 Using pointer/index to separate number/string
The approach is the same as above, but we use the index to separate number and string.
func NaturalLess02(str1, str2 string) bool {
	pStr1 := processStr(str1)
	pStr2 := processStr(str2)
	minLen := int(math.Min(float64(len(pStr1)), float64(len(pStr2))))
	for i := 0; i < minLen; i++ {
		e1 := pStr1[i]
		e2 := pStr2[i]
		if e1 == e2 {
			continue
		}
		ei1, err1 := strconv.Atoi(e1)
		ei2, err2 := strconv.Atoi(e2)
		switch {
		case err1 == nil && err2 == nil && ei1 != ei2:
			return ei1 < ei2
		case err1 != nil && err2 != nil:
			return e1 < e2
		case err1 != nil:
			return true
		case err2 != nil:
			return false
		}
	}
	return len(str1) < len(str2)
}
func processStr(str string) []string {
	var processedStr []string
	for i := 0; i < len(str); {
		j := i + 1
		for ; j < len(str); j++ {
			if isDigit(str[i]) != isDigit(str[j]) {
				break
			}
		}
		processedStr = append(processedStr, str[i:j])
		i = j
	}
	return processedStr
}
func isDigit(b byte) bool {
	return '0' <= b && b <= '9'
}
3.3 Using pointer/index without separating number/string (Recommended)
This approach compares rune-by-rune, without separating numbers or strings.
Code from https://github.com/fvbommel/sortorder/blob/main/natsort.go
func NaturalLess01(str1, str2 string) bool {
	idx1, idx2 := 0, 0
	for idx1 < len(str1) && idx2 < len(str2) {
		c1, c2 := str1[idx1], str2[idx2]
		dig1, dig2 := isDigit(c1), isDigit(c2)
		switch {
		case dig1 != dig2: // Digits before other characters
			return dig1
		case !dig1: // && !dig2
			if c1 != c2 {
				return c1 < c2
			}
			idx1++
			idx2++
		default: // Digits
            // Eat zeros
			for ; idx1 < len(str1) && str1[idx1] == '0'; idx1++ {
			}
			for ; idx2 < len(str2) && str2[idx2] == '0'; idx2++ {
			}
            // Eat all digits
			nonZero1, nonZero2 := idx1, idx2
			for ; idx1 < len(str1) && isDigit(str1[idx1]); idx1++ {
			}
			for ; idx2 < len(str2) && isDigit(str2[idx2]); idx2++ {
			}
            // If lengths of numbers with non-zero prefix differ, the shorter
			// one is less.
			if len1, len2 := idx1-nonZero1, idx2-nonZero2; len1 != len2 {
				return len1 < len2
			}
            // If they're equally long, string comparison is correct.
			if nr1, nr2 := str1[nonZero1:idx1], str2[nonZero2:idx2]; nr1 != nr2 {
				return nr1 < nr2
			}
            // Otherwise, the one with less zeros is less.
			// Because everything up to the number is equal, comparing the index
			// after the zeros is sufficient.
			if nonZero1 != nonZero2 {
				return nonZero1 < nonZero2
			}
		}
	}
    	// So far they are identical. At least one is ended. If the other continues,
	// it sorts last.
	return len(str1) < len(str2)
}
3.4 Simple test
const PrintFormat = "%-25s: %v\n"
func main() {
	strs := []string{
		"ab", "abc1",
		"abc01", "abc2",
		"abc5", "abc10",
	}
	fmt.Printf(PrintFormat, "Origin", strs)
	var strs00, strs01, strs02, strs03 []string
	strs00 = make([]string, len(strs))
	strs01 = make([]string, len(strs))
	strs02 = make([]string, len(strs))
	strs03 = make([]string, len(strs))
	copy(strs00, strs)
	sort.Strings(strs00)
	fmt.Printf(PrintFormat, "Sort by stdsorter", strs00)
	copy(strs01, strs)
	sort.Slice(strs01, func(i, j int) bool {
		return NaturalLess01(strs01[i], strs01[j])
	})
	fmt.Printf(PrintFormat, "Sort by NaturalLess01", strs01)
	copy(strs02, strs)
	sort.Slice(strs02, func(i, j int) bool {
		return NaturalLess02(strs02[i], strs02[j])
	})
	fmt.Printf(PrintFormat, "Sort by NaturalLess02", strs02)
	copy(strs03, strs)
	sort.Slice(strs03, func(i, j int) bool {
		return NaturalLess03(strs03[i], strs03[j])
	})
	fmt.Printf(PrintFormat, "Sort by NaturalLess03", strs03)
}
Origin                   : [ab abc1 abc01 abc2 abc5 abc10]
Sort by stdsorter        : [ab abc01 abc1 abc10 abc2 abc5]
Sort by NaturalLess01    : [ab abc1 abc01 abc2 abc5 abc10]
Sort by NaturalLess02    : [ab abc1 abc01 abc2 abc5 abc10]
Sort by NaturalLess03    : [ab abc1 abc01 abc2 abc5 abc10]
4. Benchmark
4.1 Generate test set
type generator struct {
	src *rand.Rand
}
func (g *generator) NexInt(max int) int {
	return g.src.Intn(max)
}
func (g *generator) NextString() (str string) {
	strlen := g.src.Intn(6) + 3
	numlen := g.src.Intn(3) + 1
	numpos := g.src.Intn(strlen + 1)
	var num string
	for i := 0; i < numlen; i++ {
		num += strconv.Itoa(g.src.Intn(10))
	}
	for i := 0; i < strlen+1; i++ {
		if i == numpos {
			str += num
		} else {
			str += string('a' + rune(g.src.Intn(16)))
		}
	}
	return str
}
func testSet(seed int) [][]string {
	gen := &generator{
		src: rand.New(rand.NewSource(int64(seed))),
	}
	n := 1000
	if testing.Short() {
		n = 1
	}
	set := make([][]string, n)
	for i := range set {
		strings := make([]string, 10000)
		for idx := range strings {
			strings[idx] = gen.NextString()
		}
		set[i] = strings
	}
	return set
}
4.2 Benchmark
func BenchmarkStdStringSort00(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()
			sort.Strings(arr)
		}
	}
}
func BenchmarkNaturalStringSort01(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			// Resetting the test set to be unsorted does not count.
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()
			sort.Slice(arr, func(i, j int) bool {
				return NaturalLess01(arr[i], arr[j])
			})
		}
	}
}
func BenchmarkNaturalStringSort02(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			// Resetting the test set to be unsorted does not count.
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()
			sort.Slice(arr, func(i, j int) bool {
				return NaturalLess02(arr[i], arr[j])
			})
		}
	}
}
func BenchmarkNaturalStringSort03(b *testing.B) {
	set := testSet(300)
	arr := make([]string, len(set[0]))
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		for _, list := range set {
			// Resetting the test set to be unsorted does not count.
			b.StopTimer()
			copy(arr, list)
			b.StartTimer()
			sort.Slice(arr, func(i, j int) bool {
				return NaturalLess03(arr[i], arr[j])
			})
		}
	}
}
Start test:
$ go test -bench '.' -benchmem .

As above, the NaturalLess01 is the most efficient approach.
However, the approach that uses regexp even timed out 😂.
