Add parser package: block extraction, fuzzy matching, and tests (this should have been committed earlier, in smaller steps)
This commit is contained in:
97
parser/blocks.go
Normal file
97
parser/blocks.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package parser
|
||||
|
||||
// BlockType distinguishes the two kinds of segments a template is split into.
type BlockType string

const (
	BlockData     BlockType = "data"     // brace-delimited data segment of the template
	BlockMatching BlockType = "matching" // literal template text outside data segments
)

// Block is a typed, half-open byte range [Start, End) into a source string.
type Block struct {
	Type       BlockType
	Start, End int // byte offsets, half-open: [Start, End)
	src        *string
}

// GetContent returns the text covered by the block. It returns "" when no
// source is attached or the range does not describe a valid slice of it.
func (b Block) GetContent() string {
	if b.src == nil {
		return ""
	}
	if b.Start < 0 || b.Start > b.End || b.End > len(*b.src) {
		return ""
	}
	return (*b.src)[b.Start:b.End]
}
|
||||
|
||||
// ExtractBlocks scans once, emitting:
|
||||
// - data blocks: inner content between a line that's exactly "{" and a line that's exactly "}"
|
||||
// - matching blocks: gaps between data blocks (excluding the brace lines themselves)
|
||||
func ExtractBlocks(src string) []Block {
|
||||
var out []Block
|
||||
var curlyIndex int
|
||||
|
||||
const CLOSING = '}'
|
||||
const OPENING = '{'
|
||||
|
||||
if len(src) > 0 && src[0] == OPENING {
|
||||
curlyIndex = 1
|
||||
out = append(out, Block{
|
||||
Start: 0,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
} else {
|
||||
out = append(out, Block{
|
||||
Start: 0,
|
||||
Type: BlockMatching,
|
||||
src: &src,
|
||||
})
|
||||
}
|
||||
|
||||
for i, r := range src {
|
||||
|
||||
var nextCurlyIndex = curlyIndex
|
||||
|
||||
switch r {
|
||||
case OPENING:
|
||||
nextCurlyIndex++
|
||||
case CLOSING:
|
||||
nextCurlyIndex--
|
||||
}
|
||||
|
||||
var nextChar rune = ' '
|
||||
if i+1 < len(src) {
|
||||
nextChar = rune(src[i+1])
|
||||
}
|
||||
|
||||
if curlyIndex == 0 && nextCurlyIndex == 1 {
|
||||
out[len(out)-1].End = i
|
||||
out = append(out, Block{
|
||||
Start: i,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
} else if curlyIndex == 1 && nextCurlyIndex == 0 {
|
||||
out[len(out)-1].End = i + 1
|
||||
if nextChar == OPENING {
|
||||
out = append(out, Block{
|
||||
Start: i + 1,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
} else {
|
||||
out = append(out, Block{
|
||||
Start: i + 1,
|
||||
Type: BlockMatching,
|
||||
src: &src,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
curlyIndex = nextCurlyIndex
|
||||
}
|
||||
|
||||
var lastBlock = out[len(out)-1]
|
||||
if lastBlock.End == 0 {
|
||||
out = out[:len(out)-1]
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
51
parser/blocks_test.go
Normal file
51
parser/blocks_test.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// readFile loads a fixture from the testdata directory, failing the test
// immediately if it cannot be read.
func readFile(t *testing.T, fileName string) string {
	content, err := os.ReadFile(filepath.Join("testdata", fileName))
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(content)
}
|
||||
|
||||
// TestExtractBlocks verifies that the recipe schema template is split into
// the expected alternating sequence of matching and data blocks.
//
// In the expected Content strings, "\\n" stands for a literal newline: the
// comparison below escapes real newlines the same way so failures print on
// one line.
func TestExtractBlocks(t *testing.T) {
	src := readFile(t, "recipe.schema.md")
	blocks := ExtractBlocks(src)

	// Expected blocks in template order. Note there is no trailing matching
	// block: ExtractBlocks drops literal text after the final "}".
	expected := []struct {
		Type    BlockType
		Content string
	}{
		{BlockMatching, "---\\n"},
		{BlockData, "{ . }"},
		{BlockMatching, "\\n---\\n\\n# "},
		{BlockData, "{ name | text,required }"},
		{BlockMatching, "\\n\\n"},
		{BlockData, "{ description | text,optional }"},
		{BlockMatching, "\\n\\n## Ingredients\\n"},
		{BlockData, "{\\n path: recipeIngredient\\n codec: list\\n required: true\\n item:\\n template: \"- { . }\"\\n}"},
		{BlockMatching, "\\n\\n## Steps\\n"},
		{BlockData, "{\\n path: recipeInstructions\\n codec: list\\n required: true\\n item:\\n template: \"{ @index }. { . }\"\\n}"},
	}

	if len(blocks) != len(expected) {
		t.Fatalf("expected %d blocks, got %d", len(expected), len(blocks))
	}

	for i, b := range blocks {
		exp := expected[i]
		// Escape newlines so mismatching content prints on a single line.
		content := strings.ReplaceAll(b.GetContent(), "\n", "\\n")
		if b.Type != exp.Type || content != exp.Content {
			t.Errorf("Block %d: expected %v, got Type: %v, Start: %d, End: %d, Content: %s", i, exp, b.Type, b.Start, b.End, content)
		}
	}

}
|
5
parser/go.mod
Normal file
5
parser/go.mod
Normal file
@@ -0,0 +1,5 @@
|
||||
module git.max-richter.dev/max/marka/parser
|
||||
|
||||
go 1.24.3
|
||||
|
||||
require github.com/agext/levenshtein v1.2.3
|
2
parser/go.sum
Normal file
2
parser/go.sum
Normal file
@@ -0,0 +1,2 @@
|
||||
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo=
|
||||
github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
|
103
parser/matcher.go
Normal file
103
parser/matcher.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/agext/levenshtein"
|
||||
)
|
||||
|
||||
type MatchBlock struct {
|
||||
Start, End int
|
||||
Block Block
|
||||
src *string
|
||||
}
|
||||
|
||||
func (m MatchBlock) GetContent() string {
|
||||
if m.src == nil || m.Start < 0 || m.End > len(*m.src) || m.Start > m.End {
|
||||
return ""
|
||||
}
|
||||
return (*m.src)[m.Start:m.End]
|
||||
}
|
||||
|
||||
// MatchBlocksFuzzy finds anchor positions for all BlockMatching blocks using
|
||||
// Levenshtein distance (tolerant matching), then returns ONLY the BlockData
|
||||
// segments as gaps between those anchors.
|
||||
func MatchBlocksFuzzy(markdown string, blocks []Block, maxDist float64) []MatchBlock {
|
||||
var out []MatchBlock
|
||||
|
||||
var lastIndex = 0
|
||||
for i, b := range blocks {
|
||||
if b.Type == BlockMatching {
|
||||
start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), 0.3)
|
||||
if end != -1 {
|
||||
if i > 0 {
|
||||
previousBlock := blocks[i-1]
|
||||
if previousBlock.Type == BlockData {
|
||||
out = append(out, MatchBlock{
|
||||
Start: lastIndex,
|
||||
End: start,
|
||||
Block: previousBlock,
|
||||
src: &markdown,
|
||||
})
|
||||
}
|
||||
}
|
||||
lastIndex = end
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the last block
|
||||
lastBlock := blocks[len(blocks)-1]
|
||||
if lastBlock.Type == BlockData {
|
||||
out = append(out, MatchBlock{
|
||||
Start: lastIndex,
|
||||
End: len(markdown),
|
||||
Block: lastBlock,
|
||||
src: &markdown,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func FuzzyFind(haystack string, from int, needle string, maxDist float64) (start int, end int) {
|
||||
bestStart, bestEnd, bestDist := -1, -1, math.MaxFloat64
|
||||
needleLen := len(needle)
|
||||
minWindow := max(1, needleLen-int(float64(needleLen)*maxDist)-1)
|
||||
maxWindow := needleLen + int(float64(needleLen)*maxDist) + 1
|
||||
|
||||
for i := from; i < len(haystack); i++ {
|
||||
for windowSize := minWindow; windowSize <= maxWindow && i+windowSize <= len(haystack); windowSize++ {
|
||||
sub := haystack[i : i+windowSize]
|
||||
dist := levenshtein.Distance(sub, needle, nil)
|
||||
maxLen := max(needleLen, windowSize)
|
||||
norm := float64(dist)/float64(maxLen) + float64(abs(windowSize-needleLen))*0.01/float64(maxLen)
|
||||
|
||||
if norm < bestDist {
|
||||
bestStart, bestEnd, bestDist = i, i+windowSize, norm
|
||||
}
|
||||
}
|
||||
if bestDist <= 0.05 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if bestStart >= 0 && bestDist <= maxDist+0.01 {
|
||||
return bestStart, bestEnd
|
||||
}
|
||||
return -1, -1
|
||||
}
|
||||
|
||||
// abs returns the absolute value of x.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
|
||||
|
||||
// max returns the larger of a and b.
//
// NOTE(review): Go 1.21+ has a built-in max; this helper shadows it and can
// be removed once callers rely on the built-in.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
|
83
parser/matcher_test.go
Normal file
83
parser/matcher_test.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
)
|
||||
|
||||
// readFile loads the named fixture from ./testdata; any read error aborts
// the calling test.
func readFile(t *testing.T, fileName string) string {
	raw, err := os.ReadFile(filepath.Join("testdata", fileName))
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(raw)
}
|
||||
|
||||
// TestFuzzyFindAll checks FuzzyFind against exact and misspelled needles,
// including a search that starts past an earlier occurrence of the needle.
func TestFuzzyFindAll(t *testing.T) {
	recipeMd := readFile(t, "baguette.md")

	// Start/End are the expected byte offsets into baguette.md; StartIndex
	// is the offset the search begins at.
	tests := []struct {
		Needle                 string
		Start, End, StartIndex int
	}{
		{StartIndex: 0, Needle: "# Ingredients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "# Ingrdients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "# Inrdients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "---\n", Start: 0, End: 4},
		{StartIndex: 4, Needle: "---\n", Start: 24, End: 28}, // skips the first fence, finds the second
		{StartIndex: 0, Needle: "# Steps\n", Start: 111, End: 119},
		{StartIndex: 0, Needle: "# Stps\n", Start: 111, End: 119},
		{StartIndex: 0, Needle: "# Step\n", Start: 111, End: 119},
	}

	for _, test := range tests {
		start, end := parser.FuzzyFind(recipeMd, test.StartIndex, test.Needle, 0.3) // allow up to 30% normalized edit distance

		if start != test.Start || end != test.End {
			t.Errorf("Start or end do not match: Needle=%q Start=%d/%d End=%d/%d", test.Needle, test.Start, start, test.End, end)
		}
	}

}
|
||||
|
||||
func TestFuzzyBlockMatch(t *testing.T) {
|
||||
recipeMd := readFile(t, "baguette.md")
|
||||
schemaMd := readFile(t, "recipe.schema.md")
|
||||
blocks := parser.ExtractBlocks(schemaMd)
|
||||
matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
||||
|
||||
expected := []struct {
|
||||
value string
|
||||
}{
|
||||
{
|
||||
value: "author: Max Richter",
|
||||
},
|
||||
{
|
||||
value: "Baguette",
|
||||
},
|
||||
{
|
||||
value: "My favourite baguette recipe",
|
||||
},
|
||||
{
|
||||
value: "- Flour\n- Water\n- Salt",
|
||||
},
|
||||
{
|
||||
value: "1. Mix Flour Water and Salt\n2. Bake the bread",
|
||||
},
|
||||
}
|
||||
|
||||
for i, m := range matches {
|
||||
if i > len(expected)-1 {
|
||||
t.Errorf("No expected result for match: %d -> %q", i, m.GetContent())
|
||||
t.FailNow()
|
||||
}
|
||||
if expected[i].value != m.GetContent() {
|
||||
t.Errorf("Match %d did not match expected: %q", i, m.GetContent())
|
||||
}
|
||||
}
|
||||
|
||||
}
|
17
parser/parser.go
Normal file
17
parser/parser.go
Normal file
@@ -0,0 +1,17 @@
|
||||
// Package parser provides functions for parsing Markdown templates into
|
||||
// structured JSON objects that conform to a JSON Schema.
|
||||
package parser
|
||||
|
||||
// ParseFile parses a Markdown document into a structured map.
//
// NOTE(review): currently a stub — it always returns an empty map and a nil
// error. The comments below sketch the intended block-extraction/matching
// pipeline (see ExtractBlocks and MatchBlocksFuzzy).
func ParseFile(markdownContent string) (map[string]any, error) {

	// _schema, err := registry.GetTemplate("Recipe")
	// if err != nil {
	// return nil, fmt.Errorf("could not get schema: %w", err)
	// }

	// Idea is to split the template into blocks, either "matching" blocks which are simple strings.
	// Or "data" blocks which match the content. Then i want to soft match the "matching" blocks and "data" blocks to the template.
	// The "matching" blocks should soft match with a levenshtein distance

	return map[string]any{}, nil
}
|
43
parser/parser_test.go
Normal file
43
parser/parser_test.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
)
|
||||
|
||||
func TestParseRecipe_Golden(t *testing.T) {
|
||||
td := filepath.Join("testdata", "recipe_salad")
|
||||
input := filepath.Join(td, "input.md")
|
||||
output := filepath.Join(td, "output.json")
|
||||
|
||||
inputContent, err := os.ReadFile(input)
|
||||
if err != nil {
|
||||
t.Fatalf("read input.md: %v", err)
|
||||
}
|
||||
|
||||
got, err := parser.ParseFile(string(inputContent))
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile: %v", err)
|
||||
}
|
||||
|
||||
var want map[string]any
|
||||
b, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
t.Fatalf("read expected.json: %v", err)
|
||||
}
|
||||
if err := json.Unmarshal(b, &want); err != nil {
|
||||
t.Fatalf("unmarshal expected.json: %v", err)
|
||||
}
|
||||
|
||||
// Deep structural compare
|
||||
if !reflect.DeepEqual(want, got) {
|
||||
gb, _ := json.MarshalIndent(got, "", " ")
|
||||
wb, _ := json.MarshalIndent(want, "", " ")
|
||||
t.Fatalf("parsed JSON mismatch\n--- got ---\n%s\n--- want ---\n%s", string(gb), string(wb))
|
||||
}
|
||||
}
|
16
parser/testdata/baguette.md
vendored
Normal file
16
parser/testdata/baguette.md
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
author: Max Richter
|
||||
---
|
||||
|
||||
# Baguette
|
||||
|
||||
My favourite baguette recipe
|
||||
|
||||
## Ingredients
|
||||
- Flour
|
||||
- Water
|
||||
- Salt
|
||||
|
||||
## Steps
|
||||
1. Mix Flour Water and Salt
|
||||
2. Bake the bread
|
25
parser/testdata/recipe.schema.md
vendored
Normal file
25
parser/testdata/recipe.schema.md
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
---
|
||||
{ . }
|
||||
---
|
||||
|
||||
# { name | text,required }
|
||||
|
||||
{ description | text,optional }
|
||||
|
||||
## Ingredients
|
||||
{
|
||||
path: recipeIngredient
|
||||
codec: list
|
||||
required: true
|
||||
item:
|
||||
template: "- { . }"
|
||||
}
|
||||
|
||||
## Steps
|
||||
{
|
||||
path: recipeInstructions
|
||||
codec: list
|
||||
required: true
|
||||
item:
|
||||
template: "{ @index }. { . }"
|
||||
}
|
25
parser/testdata/recipe_salad/input.md
vendored
Normal file
25
parser/testdata/recipe_salad/input.md
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
---
|
||||
@type: Recipe
|
||||
image: https://example.com/salad.jpg
|
||||
author: Alex Chef
|
||||
datePublished: 2025-08-12
|
||||
prepTime: PT10M
|
||||
cookTime: PT0M
|
||||
recipeYield: 2 servings
|
||||
---
|
||||
|
||||
# Simple Salad
|
||||
|
||||
A quick green salad.
|
||||
|
||||
## Ingredients
|
||||
- 100 g lettuce
|
||||
- 5 cherry tomatoes
|
||||
- 1 tbsp olive oil
|
||||
- Pinch of salt
|
||||
|
||||
## Steps
|
||||
1. Wash and dry the lettuce.
|
||||
2. Halve the cherry tomatoes.
|
||||
3. Toss with olive oil and salt.
|
||||
|
26
parser/testdata/recipe_salad/output.json
vendored
Normal file
26
parser/testdata/recipe_salad/output.json
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"@context": "https://schema.org/",
|
||||
"@type": "Recipe",
|
||||
"name": "Simple Salad",
|
||||
"image": "https://example.com/salad.jpg",
|
||||
"author": {
|
||||
"@type": "Person",
|
||||
"name": "Alex Chef"
|
||||
},
|
||||
"datePublished": "2025-08-12",
|
||||
"description": "A quick green salad.",
|
||||
"prepTime": "PT10M",
|
||||
"cookTime": "PT0M",
|
||||
"recipeYield": "2 servings",
|
||||
"recipeIngredient": [
|
||||
"100 g lettuce",
|
||||
"5 cherry tomatoes",
|
||||
"1 tbsp olive oil",
|
||||
"Pinch of salt"
|
||||
],
|
||||
"recipeInstructions": [
|
||||
"Wash and dry the lettuce.",
|
||||
"Halve the cherry tomatoes.",
|
||||
"Toss with olive oil and salt."
|
||||
]
|
||||
}
|
Reference in New Issue
Block a user