Add parser package: block extraction, fuzzy matching, and tests (this should have been committed earlier, in smaller steps)
This commit is contained in:
97
parser/blocks.go
Normal file
97
parser/blocks.go
Normal file
@@ -0,0 +1,97 @@
|
||||
package parser
|
||||
|
||||
// BlockType distinguishes the two kinds of segments a template is split into.
type BlockType string

const (
	BlockData     BlockType = "data"     // brace-delimited data segment of the template
	BlockMatching BlockType = "matching" // literal template text outside data segments
)

// Block is a typed, half-open byte range [Start, End) into a source string.
type Block struct {
	Type       BlockType
	Start, End int // byte offsets, half-open: [Start, End)
	src        *string
}

// GetContent returns the text covered by the block. It returns "" when no
// source is attached or the range does not describe a valid slice of it.
func (b Block) GetContent() string {
	if b.src == nil {
		return ""
	}
	if b.Start < 0 || b.Start > b.End || b.End > len(*b.src) {
		return ""
	}
	return (*b.src)[b.Start:b.End]
}
|
||||
|
||||
// ExtractBlocks scans once, emitting:
|
||||
// - data blocks: inner content between a line that's exactly "{" and a line that's exactly "}"
|
||||
// - matching blocks: gaps between data blocks (excluding the brace lines themselves)
|
||||
func ExtractBlocks(src string) []Block {
|
||||
var out []Block
|
||||
var curlyIndex int
|
||||
|
||||
const CLOSING = '}'
|
||||
const OPENING = '{'
|
||||
|
||||
if len(src) > 0 && src[0] == OPENING {
|
||||
curlyIndex = 1
|
||||
out = append(out, Block{
|
||||
Start: 0,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
} else {
|
||||
out = append(out, Block{
|
||||
Start: 0,
|
||||
Type: BlockMatching,
|
||||
src: &src,
|
||||
})
|
||||
}
|
||||
|
||||
for i, r := range src {
|
||||
|
||||
var nextCurlyIndex = curlyIndex
|
||||
|
||||
switch r {
|
||||
case OPENING:
|
||||
nextCurlyIndex++
|
||||
case CLOSING:
|
||||
nextCurlyIndex--
|
||||
}
|
||||
|
||||
var nextChar rune = ' '
|
||||
if i+1 < len(src) {
|
||||
nextChar = rune(src[i+1])
|
||||
}
|
||||
|
||||
if curlyIndex == 0 && nextCurlyIndex == 1 {
|
||||
out[len(out)-1].End = i
|
||||
out = append(out, Block{
|
||||
Start: i,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
} else if curlyIndex == 1 && nextCurlyIndex == 0 {
|
||||
out[len(out)-1].End = i + 1
|
||||
if nextChar == OPENING {
|
||||
out = append(out, Block{
|
||||
Start: i + 1,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
} else {
|
||||
out = append(out, Block{
|
||||
Start: i + 1,
|
||||
Type: BlockMatching,
|
||||
src: &src,
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
curlyIndex = nextCurlyIndex
|
||||
}
|
||||
|
||||
var lastBlock = out[len(out)-1]
|
||||
if lastBlock.End == 0 {
|
||||
out = out[:len(out)-1]
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
51
parser/blocks_test.go
Normal file
51
parser/blocks_test.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// readFile loads a fixture from the testdata directory, failing the test
// immediately if it cannot be read.
func readFile(t *testing.T, fileName string) string {
	content, err := os.ReadFile(filepath.Join("testdata", fileName))
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(content)
}
|
||||
|
||||
// TestExtractBlocks verifies that the recipe schema template is split into
// the expected alternating sequence of matching and data blocks.
//
// In the expected Content strings, "\\n" stands for a literal newline: the
// comparison below escapes real newlines the same way so failures print on
// one line.
func TestExtractBlocks(t *testing.T) {
	src := readFile(t, "recipe.schema.md")
	blocks := ExtractBlocks(src)

	// Expected blocks in template order. Note there is no trailing matching
	// block: ExtractBlocks drops literal text after the final "}".
	expected := []struct {
		Type    BlockType
		Content string
	}{
		{BlockMatching, "---\\n"},
		{BlockData, "{ . }"},
		{BlockMatching, "\\n---\\n\\n# "},
		{BlockData, "{ name | text,required }"},
		{BlockMatching, "\\n\\n"},
		{BlockData, "{ description | text,optional }"},
		{BlockMatching, "\\n\\n## Ingredients\\n"},
		{BlockData, "{\\n path: recipeIngredient\\n codec: list\\n required: true\\n item:\\n template: \"- { . }\"\\n}"},
		{BlockMatching, "\\n\\n## Steps\\n"},
		{BlockData, "{\\n path: recipeInstructions\\n codec: list\\n required: true\\n item:\\n template: \"{ @index }. { . }\"\\n}"},
	}

	if len(blocks) != len(expected) {
		t.Fatalf("expected %d blocks, got %d", len(expected), len(blocks))
	}

	for i, b := range blocks {
		exp := expected[i]
		// Escape newlines so mismatching content prints on a single line.
		content := strings.ReplaceAll(b.GetContent(), "\n", "\\n")
		if b.Type != exp.Type || content != exp.Content {
			t.Errorf("Block %d: expected %v, got Type: %v, Start: %d, End: %d, Content: %s", i, exp, b.Type, b.Start, b.End, content)
		}
	}

}
|
5
parser/go.mod
Normal file
5
parser/go.mod
Normal file
@@ -0,0 +1,5 @@
|
||||
module git.max-richter.dev/max/marka/parser
|
||||
|
||||
go 1.24.3
|
||||
|
||||
require github.com/agext/levenshtein v1.2.3
|
2
parser/go.sum
Normal file
2
parser/go.sum
Normal file
@@ -0,0 +1,2 @@
|
||||
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo=
|
||||
github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
|
103
parser/matcher.go
Normal file
103
parser/matcher.go
Normal file
@@ -0,0 +1,103 @@
|
||||
package parser
|
||||
|
||||
import (
|
||||
"math"
|
||||
|
||||
"github.com/agext/levenshtein"
|
||||
)
|
||||
|
||||
type MatchBlock struct {
|
||||
Start, End int
|
||||
Block Block
|
||||
src *string
|
||||
}
|
||||
|
||||
func (m MatchBlock) GetContent() string {
|
||||
if m.src == nil || m.Start < 0 || m.End > len(*m.src) || m.Start > m.End {
|
||||
return ""
|
||||
}
|
||||
return (*m.src)[m.Start:m.End]
|
||||
}
|
||||
|
||||
// MatchBlocksFuzzy finds anchor positions for all BlockMatching blocks using
|
||||
// Levenshtein distance (tolerant matching), then returns ONLY the BlockData
|
||||
// segments as gaps between those anchors.
|
||||
func MatchBlocksFuzzy(markdown string, blocks []Block, maxDist float64) []MatchBlock {
|
||||
var out []MatchBlock
|
||||
|
||||
var lastIndex = 0
|
||||
for i, b := range blocks {
|
||||
if b.Type == BlockMatching {
|
||||
start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), 0.3)
|
||||
if end != -1 {
|
||||
if i > 0 {
|
||||
previousBlock := blocks[i-1]
|
||||
if previousBlock.Type == BlockData {
|
||||
out = append(out, MatchBlock{
|
||||
Start: lastIndex,
|
||||
End: start,
|
||||
Block: previousBlock,
|
||||
src: &markdown,
|
||||
})
|
||||
}
|
||||
}
|
||||
lastIndex = end
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the last block
|
||||
lastBlock := blocks[len(blocks)-1]
|
||||
if lastBlock.Type == BlockData {
|
||||
out = append(out, MatchBlock{
|
||||
Start: lastIndex,
|
||||
End: len(markdown),
|
||||
Block: lastBlock,
|
||||
src: &markdown,
|
||||
})
|
||||
}
|
||||
|
||||
return out
|
||||
}
|
||||
|
||||
func FuzzyFind(haystack string, from int, needle string, maxDist float64) (start int, end int) {
|
||||
bestStart, bestEnd, bestDist := -1, -1, math.MaxFloat64
|
||||
needleLen := len(needle)
|
||||
minWindow := max(1, needleLen-int(float64(needleLen)*maxDist)-1)
|
||||
maxWindow := needleLen + int(float64(needleLen)*maxDist) + 1
|
||||
|
||||
for i := from; i < len(haystack); i++ {
|
||||
for windowSize := minWindow; windowSize <= maxWindow && i+windowSize <= len(haystack); windowSize++ {
|
||||
sub := haystack[i : i+windowSize]
|
||||
dist := levenshtein.Distance(sub, needle, nil)
|
||||
maxLen := max(needleLen, windowSize)
|
||||
norm := float64(dist)/float64(maxLen) + float64(abs(windowSize-needleLen))*0.01/float64(maxLen)
|
||||
|
||||
if norm < bestDist {
|
||||
bestStart, bestEnd, bestDist = i, i+windowSize, norm
|
||||
}
|
||||
}
|
||||
if bestDist <= 0.05 {
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if bestStart >= 0 && bestDist <= maxDist+0.01 {
|
||||
return bestStart, bestEnd
|
||||
}
|
||||
return -1, -1
|
||||
}
|
||||
|
||||
// abs returns the absolute value of x.
func abs(x int) int {
	if x >= 0 {
		return x
	}
	return -x
}
|
||||
|
||||
// max returns the larger of a and b.
//
// NOTE(review): Go 1.21+ has a built-in max; this helper shadows it and can
// be removed once callers rely on the built-in.
func max(a, b int) int {
	if b > a {
		return b
	}
	return a
}
|
83
parser/matcher_test.go
Normal file
83
parser/matcher_test.go
Normal file
@@ -0,0 +1,83 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
)
|
||||
|
||||
// readFile loads the named fixture from ./testdata; any read error aborts
// the calling test.
func readFile(t *testing.T, fileName string) string {
	raw, err := os.ReadFile(filepath.Join("testdata", fileName))
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(raw)
}
|
||||
|
||||
// TestFuzzyFindAll checks FuzzyFind against exact and misspelled needles,
// including a search that starts past an earlier occurrence of the needle.
func TestFuzzyFindAll(t *testing.T) {
	recipeMd := readFile(t, "baguette.md")

	// Start/End are the expected byte offsets into baguette.md; StartIndex
	// is the offset the search begins at.
	tests := []struct {
		Needle                 string
		Start, End, StartIndex int
	}{
		{StartIndex: 0, Needle: "# Ingredients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "# Ingrdients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "# Inrdients\n", Start: 72, End: 86},
		{StartIndex: 0, Needle: "---\n", Start: 0, End: 4},
		{StartIndex: 4, Needle: "---\n", Start: 24, End: 28}, // skips the first fence, finds the second
		{StartIndex: 0, Needle: "# Steps\n", Start: 111, End: 119},
		{StartIndex: 0, Needle: "# Stps\n", Start: 111, End: 119},
		{StartIndex: 0, Needle: "# Step\n", Start: 111, End: 119},
	}

	for _, test := range tests {
		start, end := parser.FuzzyFind(recipeMd, test.StartIndex, test.Needle, 0.3) // allow up to 30% normalized edit distance

		if start != test.Start || end != test.End {
			t.Errorf("Start or end do not match: Needle=%q Start=%d/%d End=%d/%d", test.Needle, test.Start, start, test.End, end)
		}
	}

}
|
||||
|
||||
func TestFuzzyBlockMatch(t *testing.T) {
|
||||
recipeMd := readFile(t, "baguette.md")
|
||||
schemaMd := readFile(t, "recipe.schema.md")
|
||||
blocks := parser.ExtractBlocks(schemaMd)
|
||||
matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
||||
|
||||
expected := []struct {
|
||||
value string
|
||||
}{
|
||||
{
|
||||
value: "author: Max Richter",
|
||||
},
|
||||
{
|
||||
value: "Baguette",
|
||||
},
|
||||
{
|
||||
value: "My favourite baguette recipe",
|
||||
},
|
||||
{
|
||||
value: "- Flour\n- Water\n- Salt",
|
||||
},
|
||||
{
|
||||
value: "1. Mix Flour Water and Salt\n2. Bake the bread",
|
||||
},
|
||||
}
|
||||
|
||||
for i, m := range matches {
|
||||
if i > len(expected)-1 {
|
||||
t.Errorf("No expected result for match: %d -> %q", i, m.GetContent())
|
||||
t.FailNow()
|
||||
}
|
||||
if expected[i].value != m.GetContent() {
|
||||
t.Errorf("Match %d did not match expected: %q", i, m.GetContent())
|
||||
}
|
||||
}
|
||||
|
||||
}
|
17
parser/parser.go
Normal file
17
parser/parser.go
Normal file
@@ -0,0 +1,17 @@
|
||||
// Package parser provides functions for parsing Markdown templates into
|
||||
// structured JSON objects that conform to a JSON Schema.
|
||||
package parser
|
||||
|
||||
// ParseFile parses a Markdown document into a structured map.
//
// NOTE(review): currently a stub — it always returns an empty map and a nil
// error. The comments below sketch the intended block-extraction/matching
// pipeline (see ExtractBlocks and MatchBlocksFuzzy).
func ParseFile(markdownContent string) (map[string]any, error) {

	// _schema, err := registry.GetTemplate("Recipe")
	// if err != nil {
	// return nil, fmt.Errorf("could not get schema: %w", err)
	// }

	// Idea is to split the template into blocks, either "matching" blocks which are simple strings.
	// Or "data" blocks which match the content. Then i want to soft match the "matching" blocks and "data" blocks to the template.
	// The "matching" blocks should soft match with a levenshtein distance

	return map[string]any{}, nil
}
|
43
parser/parser_test.go
Normal file
43
parser/parser_test.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
)
|
||||
|
||||
func TestParseRecipe_Golden(t *testing.T) {
|
||||
td := filepath.Join("testdata", "recipe_salad")
|
||||
input := filepath.Join(td, "input.md")
|
||||
output := filepath.Join(td, "output.json")
|
||||
|
||||
inputContent, err := os.ReadFile(input)
|
||||
if err != nil {
|
||||
t.Fatalf("read input.md: %v", err)
|
||||
}
|
||||
|
||||
got, err := parser.ParseFile(string(inputContent))
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile: %v", err)
|
||||
}
|
||||
|
||||
var want map[string]any
|
||||
b, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
t.Fatalf("read expected.json: %v", err)
|
||||
}
|
||||
if err := json.Unmarshal(b, &want); err != nil {
|
||||
t.Fatalf("unmarshal expected.json: %v", err)
|
||||
}
|
||||
|
||||
// Deep structural compare
|
||||
if !reflect.DeepEqual(want, got) {
|
||||
gb, _ := json.MarshalIndent(got, "", " ")
|
||||
wb, _ := json.MarshalIndent(want, "", " ")
|
||||
t.Fatalf("parsed JSON mismatch\n--- got ---\n%s\n--- want ---\n%s", string(gb), string(wb))
|
||||
}
|
||||
}
|
16
parser/testdata/baguette.md
vendored
Normal file
16
parser/testdata/baguette.md
vendored
Normal file
@@ -0,0 +1,16 @@
|
||||
---
|
||||
author: Max Richter
|
||||
---
|
||||
|
||||
# Baguette
|
||||
|
||||
My favourite baguette recipe
|
||||
|
||||
## Ingredients
|
||||
- Flour
|
||||
- Water
|
||||
- Salt
|
||||
|
||||
## Steps
|
||||
1. Mix Flour Water and Salt
|
||||
2. Bake the bread
|
25
parser/testdata/recipe.schema.md
vendored
Normal file
25
parser/testdata/recipe.schema.md
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
---
|
||||
{ . }
|
||||
---
|
||||
|
||||
# { name | text,required }
|
||||
|
||||
{ description | text,optional }
|
||||
|
||||
## Ingredients
|
||||
{
|
||||
path: recipeIngredient
|
||||
codec: list
|
||||
required: true
|
||||
item:
|
||||
template: "- { . }"
|
||||
}
|
||||
|
||||
## Steps
|
||||
{
|
||||
path: recipeInstructions
|
||||
codec: list
|
||||
required: true
|
||||
item:
|
||||
template: "{ @index }. { . }"
|
||||
}
|
25
parser/testdata/recipe_salad/input.md
vendored
Normal file
25
parser/testdata/recipe_salad/input.md
vendored
Normal file
@@ -0,0 +1,25 @@
|
||||
---
|
||||
@type: Recipe
|
||||
image: https://example.com/salad.jpg
|
||||
author: Alex Chef
|
||||
datePublished: 2025-08-12
|
||||
prepTime: PT10M
|
||||
cookTime: PT0M
|
||||
recipeYield: 2 servings
|
||||
---
|
||||
|
||||
# Simple Salad
|
||||
|
||||
A quick green salad.
|
||||
|
||||
## Ingredients
|
||||
- 100 g lettuce
|
||||
- 5 cherry tomatoes
|
||||
- 1 tbsp olive oil
|
||||
- Pinch of salt
|
||||
|
||||
## Steps
|
||||
1. Wash and dry the lettuce.
|
||||
2. Halve the cherry tomatoes.
|
||||
3. Toss with olive oil and salt.
|
||||
|
26
parser/testdata/recipe_salad/output.json
vendored
Normal file
26
parser/testdata/recipe_salad/output.json
vendored
Normal file
@@ -0,0 +1,26 @@
|
||||
{
|
||||
"@context": "https://schema.org/",
|
||||
"@type": "Recipe",
|
||||
"name": "Simple Salad",
|
||||
"image": "https://example.com/salad.jpg",
|
||||
"author": {
|
||||
"@type": "Person",
|
||||
"name": "Alex Chef"
|
||||
},
|
||||
"datePublished": "2025-08-12",
|
||||
"description": "A quick green salad.",
|
||||
"prepTime": "PT10M",
|
||||
"cookTime": "PT0M",
|
||||
"recipeYield": "2 servings",
|
||||
"recipeIngredient": [
|
||||
"100 g lettuce",
|
||||
"5 cherry tomatoes",
|
||||
"1 tbsp olive oil",
|
||||
"Pinch of salt"
|
||||
],
|
||||
"recipeInstructions": [
|
||||
"Wash and dry the lettuce.",
|
||||
"Halve the cherry tomatoes.",
|
||||
"Toss with olive oil and salt."
|
||||
]
|
||||
}
|
Reference in New Issue
Block a user