big tings
This commit is contained in:
95
parser/matcher/matcher.go
Normal file
95
parser/matcher/matcher.go
Normal file
@@ -0,0 +1,95 @@
|
||||
// Package matcher contains functions for matching template.Block to a string.
|
||||
package matcher
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser/utils"
|
||||
"git.max-richter.dev/max/marka/template"
|
||||
"github.com/agext/levenshtein"
|
||||
)
|
||||
|
||||
// Block matches a template.Block to a section inside a string
|
||||
type Block struct {
|
||||
Start, End int
|
||||
Block template.Block
|
||||
src *string
|
||||
}
|
||||
|
||||
func (m Block) GetContent() string {
|
||||
if m.src == nil || m.Start < 0 || m.End > len(*m.src) || m.Start > m.End {
|
||||
return ""
|
||||
}
|
||||
return (*m.src)[m.Start:m.End]
|
||||
}
|
||||
|
||||
// MatchBlocksFuzzy finds anchor positions for all BlockMatching blocks using
|
||||
// Levenshtein distance (tolerant matching), then returns ONLY the BlockData
|
||||
// segments as gaps between those anchors.
|
||||
func MatchBlocksFuzzy(markdown string, templateBlocks []template.Block, maxDist float64) []Block {
|
||||
var out []Block
|
||||
|
||||
lastIndex := 0
|
||||
for i, b := range templateBlocks {
|
||||
if b.Type == template.MatchingBlock {
|
||||
start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), 0.3)
|
||||
if end != -1 {
|
||||
if i > 0 {
|
||||
previousBlock := templateBlocks[i-1]
|
||||
if previousBlock.Type == template.DataBlock {
|
||||
out = append(out, Block{
|
||||
Start: lastIndex,
|
||||
End: start,
|
||||
Block: previousBlock,
|
||||
src: &markdown,
|
||||
})
|
||||
}
|
||||
}
|
||||
lastIndex = end
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the last block
|
||||
if len(templateBlocks) > 0 {
|
||||
lastBlock := templateBlocks[len(templateBlocks)-1]
|
||||
if lastBlock.Type == template.DataBlock {
|
||||
out = append(out, Block{
|
||||
Start: lastIndex,
|
||||
End: len(markdown),
|
||||
Block: lastBlock,
|
||||
src: &markdown,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func FuzzyFind(haystack string, from int, needle string, maxDist float64) (start int, end int) {
|
||||
bestStart, bestEnd, bestDist := -1, -1, math.MaxFloat64
|
||||
needleLen := len(needle)
|
||||
minWindow := max(1, needleLen-int(float64(needleLen)*maxDist)-1)
|
||||
maxWindow := needleLen + int(float64(needleLen)*maxDist) + 1
|
||||
|
||||
for i := from; i < len(haystack); i++ {
|
||||
for windowSize := minWindow; windowSize <= maxWindow && i+windowSize <= len(haystack); windowSize++ {
|
||||
sub := haystack[i : i+windowSize]
|
||||
dist := levenshtein.Distance(sub, needle, nil)
|
||||
maxLen := max(needleLen, windowSize)
|
||||
norm := float64(dist)/float64(maxLen) + float64(utils.Abs(windowSize-needleLen))*0.01/float64(maxLen)
|
||||
|
||||
if norm < bestDist {
|
||||
bestStart, bestEnd, bestDist = i, i+windowSize, norm
|
||||
}
|
||||
}
|
||||
if bestDist <= 0.05 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if bestStart >= 0 && bestDist <= maxDist+0.01 {
|
||||
return bestStart, bestEnd
|
||||
}
|
||||
return -1, -1
|
||||
}
|
87
parser/matcher/matcher_test.go
Normal file
87
parser/matcher/matcher_test.go
Normal file
@@ -0,0 +1,87 @@
|
||||
package matcher_test
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser/matcher"
|
||||
"git.max-richter.dev/max/marka/parser/utils"
|
||||
"git.max-richter.dev/max/marka/registry"
|
||||
"git.max-richter.dev/max/marka/template"
|
||||
)
|
||||
|
||||
func TestFuzzyFindAll(t *testing.T) {
|
||||
recipeMd := utils.ReadTestDataFile(t, "baguette.md")
|
||||
|
||||
tests := []struct {
|
||||
Needle string
|
||||
Start, End, StartIndex int
|
||||
}{
|
||||
{StartIndex: 0, Needle: "# Ingredients\n", Start: 77, End: 91},
|
||||
{StartIndex: 0, Needle: "# Ingrdients\n", Start: 77, End: 91},
|
||||
{StartIndex: 0, Needle: "# Inrdients\n", Start: 77, End: 91},
|
||||
{StartIndex: 0, Needle: "---\n", Start: 0, End: 4},
|
||||
{StartIndex: 4, Needle: "---\n", Start: 29, End: 33},
|
||||
{StartIndex: 0, Needle: "# Steps\n", Start: 116, End: 124},
|
||||
{StartIndex: 0, Needle: "# Stps\n", Start: 116, End: 124},
|
||||
{StartIndex: 0, Needle: "# Step\n", Start: 116, End: 124},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
start, end := matcher.FuzzyFind(recipeMd, test.StartIndex, test.Needle, 0.3) // allow 50% error
|
||||
|
||||
if start != test.Start || end != test.End {
|
||||
t.Errorf("Start or end do not match: Needle=%q Start=%d/%d End=%d/%d", test.Needle, test.Start, start, test.End, end)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestFuzzyBlockMatch(t *testing.T) {
|
||||
recipeMd := utils.ReadTestDataFile(t, "baguette.md")
|
||||
schemaMd, err := registry.GetTemplate("Recipe")
|
||||
if err != nil {
|
||||
t.Errorf("Failed to load template: %s", err.Error())
|
||||
t.FailNow()
|
||||
}
|
||||
blocks, err := template.CompileTemplate(schemaMd)
|
||||
if err != nil {
|
||||
t.Errorf("Failed to compile template: %s", err.Error())
|
||||
t.FailNow()
|
||||
}
|
||||
|
||||
for _, b := range blocks {
|
||||
fmt.Printf("block: %#v\n", b)
|
||||
}
|
||||
|
||||
matches := matcher.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
||||
|
||||
expected := []struct {
|
||||
value string
|
||||
}{
|
||||
{
|
||||
value: "author.name: Max Richter",
|
||||
},
|
||||
{
|
||||
value: "Baguette",
|
||||
},
|
||||
{
|
||||
value: "My favourite baguette recipe",
|
||||
},
|
||||
{
|
||||
value: "- Flour\n- Water\n- Salt",
|
||||
},
|
||||
{
|
||||
value: "1. Mix Flour Water and Salt\n2. Bake the bread\n",
|
||||
},
|
||||
}
|
||||
|
||||
for i, m := range matches {
|
||||
if i > len(expected)-1 {
|
||||
t.Errorf("No expected result for match: %d -> %q", i, m.GetContent())
|
||||
t.FailNow()
|
||||
}
|
||||
if expected[i].value != m.GetContent() {
|
||||
t.Errorf("Match %d did not match expected: %q", i, m.GetContent())
|
||||
}
|
||||
}
|
||||
}
|
Reference in New Issue
Block a user