match.go
4.7 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
// Copyright (c) 2012-2016 The go-diff authors. All rights reserved.
// https://github.com/sergi/go-diff
// See the included LICENSE file for license details.
//
// go-diff is a Go implementation of Google's Diff, Match, and Patch library
// Original library is Copyright (c) 2006 Google Inc.
// http://code.google.com/p/google-diff-match-patch/
package diffmatchpatch
import (
"math"
)
// MatchMain locates the best instance of 'pattern' in 'text' near 'loc'.
// Returns -1 if no match found.
func (dmp *DiffMatchPatch) MatchMain(text, pattern string, loc int) int {
// Check for null inputs not needed since null can't be passed in C#.
loc = int(math.Max(0, math.Min(float64(loc), float64(len(text)))))
if text == pattern {
// Shortcut (potentially not guaranteed by the algorithm)
return 0
} else if len(text) == 0 {
// Nothing to match.
return -1
} else if loc+len(pattern) <= len(text) && text[loc:loc+len(pattern)] == pattern {
// Perfect match at the perfect spot! (Includes case of null pattern)
return loc
}
// Do a fuzzy compare.
return dmp.MatchBitap(text, pattern, loc)
}
// MatchBitap locates the best instance of 'pattern' in 'text' near 'loc' using the Bitap algorithm.
// Returns -1 if no match was found.
func (dmp *DiffMatchPatch) MatchBitap(text, pattern string, loc int) int {
// Initialise the alphabet.
s := dmp.MatchAlphabet(pattern)
// Highest score beyond which we give up.
scoreThreshold := dmp.MatchThreshold
// Is there a nearby exact match? (speedup)
bestLoc := indexOf(text, pattern, loc)
if bestLoc != -1 {
scoreThreshold = math.Min(dmp.matchBitapScore(0, bestLoc, loc,
pattern), scoreThreshold)
// What about in the other direction? (speedup)
bestLoc = lastIndexOf(text, pattern, loc+len(pattern))
if bestLoc != -1 {
scoreThreshold = math.Min(dmp.matchBitapScore(0, bestLoc, loc,
pattern), scoreThreshold)
}
}
// Initialise the bit arrays.
matchmask := 1 << uint((len(pattern) - 1))
bestLoc = -1
var binMin, binMid int
binMax := len(pattern) + len(text)
lastRd := []int{}
for d := 0; d < len(pattern); d++ {
// Scan for the best match; each iteration allows for one more error. Run a binary search to determine how far from 'loc' we can stray at this error level.
binMin = 0
binMid = binMax
for binMin < binMid {
if dmp.matchBitapScore(d, loc+binMid, loc, pattern) <= scoreThreshold {
binMin = binMid
} else {
binMax = binMid
}
binMid = (binMax-binMin)/2 + binMin
}
// Use the result from this iteration as the maximum for the next.
binMax = binMid
start := int(math.Max(1, float64(loc-binMid+1)))
finish := int(math.Min(float64(loc+binMid), float64(len(text))) + float64(len(pattern)))
rd := make([]int, finish+2)
rd[finish+1] = (1 << uint(d)) - 1
for j := finish; j >= start; j-- {
var charMatch int
if len(text) <= j-1 {
// Out of range.
charMatch = 0
} else if _, ok := s[text[j-1]]; !ok {
charMatch = 0
} else {
charMatch = s[text[j-1]]
}
if d == 0 {
// First pass: exact match.
rd[j] = ((rd[j+1] << 1) | 1) & charMatch
} else {
// Subsequent passes: fuzzy match.
rd[j] = ((rd[j+1]<<1)|1)&charMatch | (((lastRd[j+1] | lastRd[j]) << 1) | 1) | lastRd[j+1]
}
if (rd[j] & matchmask) != 0 {
score := dmp.matchBitapScore(d, j-1, loc, pattern)
// This match will almost certainly be better than any existing match. But check anyway.
if score <= scoreThreshold {
// Told you so.
scoreThreshold = score
bestLoc = j - 1
if bestLoc > loc {
// When passing loc, don't exceed our current distance from loc.
start = int(math.Max(1, float64(2*loc-bestLoc)))
} else {
// Already passed loc, downhill from here on in.
break
}
}
}
}
if dmp.matchBitapScore(d+1, loc, loc, pattern) > scoreThreshold {
// No hope for a (better) match at greater error levels.
break
}
lastRd = rd
}
return bestLoc
}
// matchBitapScore computes and returns the score for a match with e errors and x location.
func (dmp *DiffMatchPatch) matchBitapScore(e, x, loc int, pattern string) float64 {
accuracy := float64(e) / float64(len(pattern))
proximity := math.Abs(float64(loc - x))
if dmp.MatchDistance == 0 {
// Dodge divide by zero error.
if proximity == 0 {
return accuracy
}
return 1.0
}
return accuracy + (proximity / float64(dmp.MatchDistance))
}
// MatchAlphabet initialises the alphabet for the Bitap algorithm.
func (dmp *DiffMatchPatch) MatchAlphabet(pattern string) map[byte]int {
s := map[byte]int{}
charPattern := []byte(pattern)
for _, c := range charPattern {
_, ok := s[c]
if !ok {
s[c] = 0
}
}
i := 0
for _, c := range charPattern {
value := s[c] | int(uint(1)<<uint((len(pattern)-i-1)))
s[c] = value
i++
}
return s
}