ffs, i should have committed wayyy earlier

This commit is contained in:
Max Richter
2025-08-16 20:38:40 +02:00
commit 43644c4f40
25 changed files with 865 additions and 0 deletions

97
parser/blocks.go Normal file
View File

@@ -0,0 +1,97 @@
package parser
// BlockType identifies the kind of segment ExtractBlocks found in a template.
type BlockType string

const (
	// BlockData is a brace-delimited segment, including its outer "{" and "}".
	BlockData BlockType = "data"
	// BlockMatching is literal template text found outside data blocks.
	BlockMatching BlockType = "matching"
)

// Block is a half-open byte range [Start, End) of a template source, tagged
// with the kind of content it covers.
type Block struct {
	Type       BlockType
	Start, End int // byte offsets [Start, End)
	src        *string
}

// GetContent returns the part of the source covered by the block. It returns
// "" when no source is attached or when the offsets do not describe a valid
// range inside the source.
func (b Block) GetContent() string {
	if b.src == nil || b.Start < 0 || b.End > len(*b.src) || b.Start > b.End {
		return ""
	}
	return (*b.src)[b.Start:b.End]
}

// ExtractBlocks splits src into consecutive blocks in a single pass:
//   - data blocks: every outermost "{" ... "}" pair, braces included. Braces
//     nested inside a data block only change the tracked depth and remain
//     part of the enclosing block.
//   - matching blocks: the non-empty text in front of the first data block
//     and between two data blocks.
//
// A "}" that closes nothing is treated as ordinary text. Text after the last
// complete data block (including an unterminated "{...") is not emitted, so a
// source without any complete data block yields no blocks at all.
func ExtractBlocks(src string) []Block {
	var out []Block

	depth := 0 // number of currently open "{"
	start := 0 // offset at which the segment being scanned began

	// Both delimiters are single-byte ASCII, so scanning bytes is safe for
	// UTF-8 input: they can never occur inside a multi-byte sequence.
	for i := 0; i < len(src); i++ {
		switch src[i] {
		case '{':
			if depth == 0 {
				// Leaving literal text: emit it unless it is empty, which
				// happens at the very start of src and between "}{".
				if i > start {
					out = append(out, Block{Type: BlockMatching, Start: start, End: i, src: &src})
				}
				start = i
			}
			depth++
		case '}':
			if depth == 0 {
				// Unbalanced closing brace: keep it as literal text instead
				// of letting the depth go negative.
				continue
			}
			depth--
			if depth == 0 {
				out = append(out, Block{Type: BlockData, Start: start, End: i + 1, src: &src})
				start = i + 1
			}
		}
	}

	return out
}

51
parser/blocks_test.go Normal file
View File

@@ -0,0 +1,51 @@
package parser
import (
"os"
"path/filepath"
"strings"
"testing"
)
// readFile returns the contents of testdata/<fileName> as a string and aborts
// the calling test when the file cannot be read.
func readFile(t *testing.T, fileName string) string {
	// Mark this function as a helper so a failure is reported at the line of
	// the test that asked for the file.
	t.Helper()

	path := filepath.Join("testdata", fileName)
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(data)
}
func TestExtractBlocks(t *testing.T) {
src := readFile(t, "recipe.schema.md")
blocks := ExtractBlocks(src)
expected := []struct {
Type BlockType
Content string
}{
{BlockMatching, "---\\n"},
{BlockData, "{ . }"},
{BlockMatching, "\\n---\\n\\n# "},
{BlockData, "{ name | text,required }"},
{BlockMatching, "\\n\\n"},
{BlockData, "{ description | text,optional }"},
{BlockMatching, "\\n\\n## Ingredients\\n"},
{BlockData, "{\\n path: recipeIngredient\\n codec: list\\n required: true\\n item:\\n template: \"- { . }\"\\n}"},
{BlockMatching, "\\n\\n## Steps\\n"},
{BlockData, "{\\n path: recipeInstructions\\n codec: list\\n required: true\\n item:\\n template: \"{ @index }. { . }\"\\n}"},
}
if len(blocks) != len(expected) {
t.Fatalf("expected %d blocks, got %d", len(expected), len(blocks))
}
for i, b := range blocks {
exp := expected[i]
content := strings.ReplaceAll(b.GetContent(), "\n", "\\n")
if b.Type != exp.Type || content != exp.Content {
t.Errorf("Block %d: expected %v, got Type: %v, Start: %d, End: %d, Content: %s", i, exp, b.Type, b.Start, b.End, content)
}
}
}

5
parser/go.mod Normal file
View File

@@ -0,0 +1,5 @@
module git.max-richter.dev/max/marka/parser
go 1.24.3
require github.com/agext/levenshtein v1.2.3

2
parser/go.sum Normal file
View File

@@ -0,0 +1,2 @@
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo=
github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=

103
parser/matcher.go Normal file
View File

@@ -0,0 +1,103 @@
package parser
import (
"math"
"github.com/agext/levenshtein"
)
// MatchBlock is a half-open byte range [Start, End) of a matched document
// together with the template Block that range is associated with.
type MatchBlock struct {
	Start int
	End   int
	Block Block
	src   *string
}

// GetContent returns the part of the matched document covered by the range.
// It returns "" when no document is attached or when the offsets do not
// describe a valid range inside it.
func (m MatchBlock) GetContent() string {
	if m.src == nil {
		return ""
	}
	doc := *m.src
	if m.Start < 0 || m.End > len(doc) || m.Start > m.End {
		return ""
	}
	return doc[m.Start:m.End]
}
// MatchBlocksFuzzy finds anchor positions for all BlockMatching blocks using
// Levenshtein distance (tolerant matching), then returns ONLY the BlockData
// segments as gaps between those anchors.
//
// markdown is the document to search, blocks is the template split produced
// by ExtractBlocks, and maxDist is the tolerance handed to FuzzyFind for
// every anchor. Anchors are searched left to right, each one starting where
// the previous anchor ended.
//
// When an anchor cannot be found, the data block directly in front of it is
// not emitted and the search position stays where it was. A trailing data
// block (one with no anchor after it) extends to the end of markdown. An
// empty blocks slice yields no matches.
func MatchBlocksFuzzy(markdown string, blocks []Block, maxDist float64) []MatchBlock {
	var out []MatchBlock
	if len(blocks) == 0 {
		// Nothing to match; also protects the trailing-block lookup below.
		return out
	}

	lastIndex := 0 // end offset of the most recently located anchor
	for i, b := range blocks {
		if b.Type != BlockMatching {
			continue
		}

		// Honour the caller's tolerance instead of a fixed value.
		start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), maxDist)
		if end == -1 {
			continue
		}

		// The data block in front of this anchor spans from the previous
		// anchor's end to this anchor's start.
		if i > 0 && blocks[i-1].Type == BlockData {
			out = append(out, MatchBlock{
				Start: lastIndex,
				End:   start,
				Block: blocks[i-1],
				src:   &markdown,
			})
		}
		lastIndex = end
	}

	// A data block at the very end has no closing anchor, so it takes
	// everything that is left of the document.
	if last := blocks[len(blocks)-1]; last.Type == BlockData {
		out = append(out, MatchBlock{
			Start: lastIndex,
			End:   len(markdown),
			Block: last,
			src:   &markdown,
		})
	}

	return out
}
// FuzzyFind searches haystack, beginning at byte offset from, for the window
// that most closely resembles needle and returns its byte range [start, end).
// It returns (-1, -1) when no window is close enough.
//
// For every start offset, windows whose length lies within roughly
// len(needle) ± len(needle)*maxDist (plus one byte of slack on each side,
// and never shorter than one byte) are scored as
//
//	levenshtein(window, needle)/maxLen + |len(window)-len(needle)|*0.01/maxLen
//
// where maxLen is the larger of the two lengths. The lowest score wins; on a
// tie the window found first is kept. Scanning stops early once the best
// score is at most 0.05, and the best window is accepted only if its score
// is at most maxDist+0.01.
//
// A negative from is treated as 0; a from at or beyond len(haystack) finds
// nothing.
func FuzzyFind(haystack string, from int, needle string, maxDist float64) (start int, end int) {
	// A negative offset would make the slice expression below panic.
	if from < 0 {
		from = 0
	}

	bestStart, bestEnd, bestDist := -1, -1, math.MaxFloat64

	needleLen := len(needle)
	slack := int(float64(needleLen) * maxDist) // length tolerance in bytes
	minWindow := max(1, needleLen-slack-1)
	maxWindow := needleLen + slack + 1

	for i := from; i < len(haystack); i++ {
		for windowSize := minWindow; windowSize <= maxWindow && i+windowSize <= len(haystack); windowSize++ {
			sub := haystack[i : i+windowSize]
			dist := levenshtein.Distance(sub, needle, nil)

			maxLen := max(needleLen, windowSize)
			// The small length penalty prefers windows whose size is closest
			// to the needle when edit distances are otherwise equal.
			norm := float64(dist)/float64(maxLen) + float64(abs(windowSize-needleLen))*0.01/float64(maxLen)

			if norm < bestDist {
				bestStart, bestEnd, bestDist = i, i+windowSize, norm
			}
		}
		// Good enough: stop scanning further start offsets.
		if bestDist <= 0.05 {
			break
		}
	}

	if bestStart >= 0 && bestDist <= maxDist+0.01 {
		return bestStart, bestEnd
	}
	return -1, -1
}
// abs returns the magnitude of x.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
// max returns the larger of a and b. Within this package it takes the place
// of the predeclared max builtin.
func max(a, b int) int {
	if b >= a {
		return b
	}
	return a
}

83
parser/matcher_test.go Normal file
View File

@@ -0,0 +1,83 @@
package parser_test
import (
"os"
"path/filepath"
"testing"
"git.max-richter.dev/max/marka/parser"
)
// readFile returns the contents of testdata/<fileName> as a string and aborts
// the calling test when the file cannot be read.
func readFile(t *testing.T, fileName string) string {
	// Mark this function as a helper so a failure is reported at the line of
	// the test that asked for the file.
	t.Helper()

	path := filepath.Join("testdata", fileName)
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(data)
}
// TestFuzzyFindAll checks that FuzzyFind locates exact and slightly
// misspelled needles in the baguette fixture at the expected byte offsets.
func TestFuzzyFindAll(t *testing.T) {
	type findCase struct {
		Needle                 string
		Start, End, StartIndex int
	}
	cases := []findCase{
		{StartIndex: 0, Needle: "# Ingredients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "# Ingrdients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "# Inrdients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "---\n", Start: 0, End: 4},
		{StartIndex: 4, Needle: "---\n", Start: 24, End: 28},
		{StartIndex: 0, Needle: "# Steps\n", Start: 111, End: 119},
		{StartIndex: 0, Needle: "# Stps\n", Start: 111, End: 119},
		{StartIndex: 0, Needle: "# Step\n", Start: 111, End: 119},
	}

	doc := readFile(t, "baguette.md")
	for i := range cases {
		tc := cases[i]
		// 0.3 is the maxDist tolerance handed to FuzzyFind.
		gotStart, gotEnd := parser.FuzzyFind(doc, tc.StartIndex, tc.Needle, 0.3)
		if gotStart == tc.Start && gotEnd == tc.End {
			continue
		}
		t.Errorf("Start or end do not match: Needle=%q Start=%d/%d End=%d/%d", tc.Needle, tc.Start, gotStart, tc.End, gotEnd)
	}
}
// TestFuzzyBlockMatch extracts the blocks of the recipe schema fixture,
// matches them against the baguette fixture and checks that the data segments
// come back with the expected contents, in order and in the expected number.
func TestFuzzyBlockMatch(t *testing.T) {
	recipeMd := readFile(t, "baguette.md")
	schemaMd := readFile(t, "recipe.schema.md")

	blocks := parser.ExtractBlocks(schemaMd)
	matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)

	expected := []string{
		"author: Max Richter",
		"Baguette",
		"My favourite baguette recipe",
		"- Flour\n- Water\n- Salt",
		"1. Mix Flour Water and Salt\n2. Bake the bread",
	}

	// Without this check the test would pass when fewer matches than
	// expected (or none at all) are returned.
	if len(matches) != len(expected) {
		t.Errorf("expected %d matches, got %d", len(expected), len(matches))
	}

	for i, m := range matches {
		if i >= len(expected) {
			t.Fatalf("No expected result for match: %d -> %q", i, m.GetContent())
		}
		if got := m.GetContent(); got != expected[i] {
			t.Errorf("Match %d did not match expected: got %q, want %q", i, got, expected[i])
		}
	}
}

17
parser/parser.go Normal file
View File

@@ -0,0 +1,17 @@
// Package parser provides functions for parsing Markdown templates into
// structured JSON objects that conform to a JSON Schema.
package parser
// ParseFile is currently a placeholder: it does not read markdownContent and
// always returns an empty, non-nil map together with a nil error.
func ParseFile(markdownContent string) (map[string]any, error) {
	// Schema lookup, not wired up yet:
	// _schema, err := registry.GetTemplate("Recipe")
	// if err != nil {
	// return nil, fmt.Errorf("could not get schema: %w", err)
	// }

	// Planned approach: split the template into blocks. "Matching" blocks are
	// plain strings; "data" blocks stand for the content to extract. The
	// "matching" blocks are then soft-matched against the document using a
	// Levenshtein distance, and the "data" blocks are filled from the text
	// in between.
	result := make(map[string]any)
	return result, nil
}

43
parser/parser_test.go Normal file
View File

@@ -0,0 +1,43 @@
package parser_test
import (
"encoding/json"
"os"
"path/filepath"
"reflect"
"testing"
"git.max-richter.dev/max/marka/parser"
)
// TestParseRecipe_Golden parses testdata/recipe_salad/input.md and compares
// the result structurally with the JSON stored in output.json of the same
// directory.
func TestParseRecipe_Golden(t *testing.T) {
	td := filepath.Join("testdata", "recipe_salad")
	input := filepath.Join(td, "input.md")
	output := filepath.Join(td, "output.json")

	inputContent, err := os.ReadFile(input)
	if err != nil {
		t.Fatalf("read input.md: %v", err)
	}

	got, err := parser.ParseFile(string(inputContent))
	if err != nil {
		t.Fatalf("ParseFile: %v", err)
	}

	var want map[string]any
	b, err := os.ReadFile(output)
	if err != nil {
		// Name the file that is actually read so the failure is traceable.
		t.Fatalf("read output.json: %v", err)
	}
	if err := json.Unmarshal(b, &want); err != nil {
		t.Fatalf("unmarshal output.json: %v", err)
	}

	// Deep structural compare
	if !reflect.DeepEqual(want, got) {
		// Marshal errors are ignored on purpose: the output is only used to
		// make the failure message readable.
		gb, _ := json.MarshalIndent(got, "", " ")
		wb, _ := json.MarshalIndent(want, "", " ")
		t.Fatalf("parsed JSON mismatch\n--- got ---\n%s\n--- want ---\n%s", string(gb), string(wb))
	}
}

16
parser/testdata/baguette.md vendored Normal file
View File

@@ -0,0 +1,16 @@
---
author: Max Richter
---
# Baguette
My favourite baguette recipe
## Ingredients
- Flour
- Water
- Salt
## Steps
1. Mix Flour Water and Salt
2. Bake the bread

25
parser/testdata/recipe.schema.md vendored Normal file
View File

@@ -0,0 +1,25 @@
---
{ . }
---
# { name | text,required }
{ description | text,optional }
## Ingredients
{
path: recipeIngredient
codec: list
required: true
item:
template: "- { . }"
}
## Steps
{
path: recipeInstructions
codec: list
required: true
item:
template: "{ @index }. { . }"
}

25
parser/testdata/recipe_salad/input.md vendored Normal file
View File

@@ -0,0 +1,25 @@
---
@type: Recipe
image: https://example.com/salad.jpg
author: Alex Chef
datePublished: 2025-08-12
prepTime: PT10M
cookTime: PT0M
recipeYield: 2 servings
---
# Simple Salad
A quick green salad.
## Ingredients
- 100 g lettuce
- 5 cherry tomatoes
- 1 tbsp olive oil
- Pinch of salt
## Steps
1. Wash and dry the lettuce.
2. Halve the cherry tomatoes.
3. Toss with olive oil and salt.

View File

@@ -0,0 +1,26 @@
{
"@context": "https://schema.org/",
"@type": "Recipe",
"name": "Simple Salad",
"image": "https://example.com/salad.jpg",
"author": {
"@type": "Person",
"name": "Alex Chef"
},
"datePublished": "2025-08-12",
"description": "A quick green salad.",
"prepTime": "PT10M",
"cookTime": "PT0M",
"recipeYield": "2 servings",
"recipeIngredient": [
"100 g lettuce",
"5 cherry tomatoes",
"1 tbsp olive oil",
"Pinch of salt"
],
"recipeInstructions": [
"Wash and dry the lettuce.",
"Halve the cherry tomatoes.",
"Toss with olive oil and salt."
]
}