96 lines
2.5 KiB
Go
96 lines
2.5 KiB
Go
// Package matcher contains functions for matching template.Block to a string.
|
|
package matcher
|
|
|
|
import (
|
|
"math"
|
|
|
|
"git.max-richter.dev/max/marka/parser/utils"
|
|
"git.max-richter.dev/max/marka/template"
|
|
"github.com/agext/levenshtein"
|
|
)
|
|
|
|
// Block matches a template.Block to a section inside a string
|
|
type Block struct {
|
|
Start, End int
|
|
Block template.Block
|
|
src *string
|
|
}
|
|
|
|
func (m Block) GetContent() string {
|
|
if m.src == nil || m.Start < 0 || m.End > len(*m.src) || m.Start > m.End {
|
|
return ""
|
|
}
|
|
return (*m.src)[m.Start:m.End]
|
|
}
|
|
|
|
// MatchBlocksFuzzy finds anchor positions for all BlockMatching blocks using
|
|
// Levenshtein distance (tolerant matching), then returns ONLY the BlockData
|
|
// segments as gaps between those anchors.
|
|
func MatchBlocksFuzzy(markdown string, templateBlocks []template.Block, maxDist float64) []Block {
|
|
var out []Block
|
|
|
|
lastIndex := 0
|
|
for i, b := range templateBlocks {
|
|
if b.Type == template.MatchingBlock {
|
|
start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), 0.3)
|
|
if end != -1 {
|
|
if i > 0 {
|
|
previousBlock := templateBlocks[i-1]
|
|
if previousBlock.Type == template.DataBlock {
|
|
out = append(out, Block{
|
|
Start: lastIndex,
|
|
End: start,
|
|
Block: previousBlock,
|
|
src: &markdown,
|
|
})
|
|
}
|
|
}
|
|
lastIndex = end
|
|
}
|
|
}
|
|
}
|
|
|
|
// Handle the last block
|
|
if len(templateBlocks) > 0 {
|
|
lastBlock := templateBlocks[len(templateBlocks)-1]
|
|
if lastBlock.Type == template.DataBlock {
|
|
out = append(out, Block{
|
|
Start: lastIndex,
|
|
End: len(markdown),
|
|
Block: lastBlock,
|
|
src: &markdown,
|
|
})
|
|
}
|
|
}
|
|
|
|
return out
|
|
}
|
|
|
|
func FuzzyFind(haystack string, from int, needle string, maxDist float64) (start int, end int) {
|
|
bestStart, bestEnd, bestDist := -1, -1, math.MaxFloat64
|
|
needleLen := len(needle)
|
|
minWindow := max(1, needleLen-int(float64(needleLen)*maxDist)-1)
|
|
maxWindow := needleLen + int(float64(needleLen)*maxDist) + 1
|
|
|
|
for i := from; i < len(haystack); i++ {
|
|
for windowSize := minWindow; windowSize <= maxWindow && i+windowSize <= len(haystack); windowSize++ {
|
|
sub := haystack[i : i+windowSize]
|
|
dist := levenshtein.Distance(sub, needle, nil)
|
|
maxLen := max(needleLen, windowSize)
|
|
norm := float64(dist)/float64(maxLen) + float64(utils.Abs(windowSize-needleLen))*0.01/float64(maxLen)
|
|
|
|
if norm < bestDist {
|
|
bestStart, bestEnd, bestDist = i, i+windowSize, norm
|
|
}
|
|
}
|
|
if bestDist <= 0.05 {
|
|
break
|
|
}
|
|
}
|
|
|
|
if bestStart >= 0 && bestDist <= maxDist+0.01 {
|
|
return bestStart, bestEnd
|
|
}
|
|
return -1, -1
|
|
}
|