feat: refactor some shit
This commit is contained in:
@@ -1,51 +1,32 @@
|
|||||||
package parser
|
package parser
|
||||||
|
|
||||||
type BlockType string
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
const (
|
"git.max-richter.dev/max/marka/parser/blocks"
|
||||||
BlockData BlockType = "data" // content between lines "{" and "}"
|
|
||||||
BlockMatching BlockType = "matching" // everything outside data blocks
|
|
||||||
)
|
)
|
||||||
|
|
||||||
type Block struct {
|
|
||||||
Type BlockType
|
|
||||||
Start, End int // byte offsets [Start, End)
|
|
||||||
src *string
|
|
||||||
}
|
|
||||||
|
|
||||||
func (b Block) GetContent() string {
|
|
||||||
if b.src == nil || b.Start < 0 || b.End > len(*b.src) || b.Start > b.End {
|
|
||||||
return ""
|
|
||||||
}
|
|
||||||
return (*b.src)[b.Start:b.End]
|
|
||||||
}
|
|
||||||
|
|
||||||
// ExtractBlocks scans once, emitting:
|
// ExtractBlocks scans once, emitting:
|
||||||
// - data blocks: inner content between a line that's exactly "{" and a line that's exactly "}"
|
// - data blocks: inner content between a line that's exactly "{" and a line that's exactly "}"
|
||||||
// - matching blocks: gaps between data blocks (excluding the brace lines themselves)
|
// - matching blocks: gaps between data blocks (excluding the brace lines themselves)
|
||||||
func ExtractBlocks(src string) []Block {
|
func ExtractBlocks(template string) ([]blocks.TemplateBlock, error) {
|
||||||
var out []Block
|
var out []blocks.TemplateBlock
|
||||||
var curlyIndex int
|
var curlyIndex int
|
||||||
|
|
||||||
const CLOSING = '}'
|
const CLOSING = '}'
|
||||||
const OPENING = '{'
|
const OPENING = '{'
|
||||||
|
|
||||||
if len(src) > 0 && src[0] == OPENING {
|
var start int
|
||||||
|
var blockType blocks.BlockType
|
||||||
|
|
||||||
|
if len(template) > 0 && template[0] == OPENING {
|
||||||
curlyIndex = 1
|
curlyIndex = 1
|
||||||
out = append(out, Block{
|
blockType = blocks.DataBlock
|
||||||
Start: 0,
|
|
||||||
Type: BlockData,
|
|
||||||
src: &src,
|
|
||||||
})
|
|
||||||
} else {
|
} else {
|
||||||
out = append(out, Block{
|
blockType = blocks.MatchingBlock
|
||||||
Start: 0,
|
|
||||||
Type: BlockMatching,
|
|
||||||
src: &src,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, r := range src {
|
for i, r := range template {
|
||||||
|
|
||||||
var nextCurlyIndex = curlyIndex
|
var nextCurlyIndex = curlyIndex
|
||||||
|
|
||||||
@@ -57,41 +38,42 @@ func ExtractBlocks(src string) []Block {
|
|||||||
}
|
}
|
||||||
|
|
||||||
var nextChar rune = ' '
|
var nextChar rune = ' '
|
||||||
if i+1 < len(src) {
|
if i+1 < len(template) {
|
||||||
nextChar = rune(src[i+1])
|
nextChar = rune(template[i+1])
|
||||||
}
|
}
|
||||||
|
|
||||||
if curlyIndex == 0 && nextCurlyIndex == 1 {
|
if curlyIndex == 0 && nextCurlyIndex == 1 {
|
||||||
out[len(out)-1].End = i
|
block, err := blocks.ParseTemplateBlock(template[start:i], blockType)
|
||||||
out = append(out, Block{
|
if err != nil {
|
||||||
Start: i,
|
return nil, fmt.Errorf("Failed to parse block: %w", err)
|
||||||
Type: BlockData,
|
}
|
||||||
src: &src,
|
out = append(out, block)
|
||||||
})
|
start = i
|
||||||
|
blockType = blocks.DataBlock
|
||||||
} else if curlyIndex == 1 && nextCurlyIndex == 0 {
|
} else if curlyIndex == 1 && nextCurlyIndex == 0 {
|
||||||
out[len(out)-1].End = i + 1
|
|
||||||
|
block, err := blocks.ParseTemplateBlock(template[start:i+1], blockType)
|
||||||
|
if err != nil {
|
||||||
|
return nil, fmt.Errorf("Failed to parse block: %w", err)
|
||||||
|
}
|
||||||
|
out = append(out, block)
|
||||||
|
|
||||||
if nextChar == OPENING {
|
if nextChar == OPENING {
|
||||||
out = append(out, Block{
|
start = i + 1
|
||||||
Start: i + 1,
|
blockType = blocks.DataBlock
|
||||||
Type: BlockData,
|
|
||||||
src: &src,
|
|
||||||
})
|
|
||||||
} else {
|
} else {
|
||||||
out = append(out, Block{
|
start = i + 1
|
||||||
Start: i + 1,
|
blockType = blocks.MatchingBlock
|
||||||
Type: BlockMatching,
|
|
||||||
src: &src,
|
|
||||||
})
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
curlyIndex = nextCurlyIndex
|
curlyIndex = nextCurlyIndex
|
||||||
}
|
}
|
||||||
|
|
||||||
var lastBlock = out[len(out)-1]
|
// var lastBlock = out[len(out)-1]
|
||||||
if lastBlock.End == 0 {
|
// if lastBlock.End == 0 {
|
||||||
out = out[:len(out)-1]
|
// out = out[:len(out)-1]
|
||||||
}
|
// }
|
||||||
|
|
||||||
return out
|
return out, nil
|
||||||
}
|
}
|
||||||
|
98
parser/blocks/blocks.go
Normal file
98
parser/blocks/blocks.go
Normal file
@@ -0,0 +1,98 @@
|
|||||||
|
package blocks
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
)
|
||||||
|
|
||||||
|
// TemplateType represents whether a template is short, long, or invalid.
type TemplateType int

const (
	// InvalidTemplate is the zero value: the input matches neither form.
	InvalidTemplate TemplateType = iota
	// ShortTemplate is a single-line, brace-wrapped definition.
	ShortTemplate
	// LongTemplate is a multi-line, brace-wrapped definition with keys.
	LongTemplate
)

// DetectTemplateType checks if the template is short or long.
//
// Leading and trailing whitespace is ignored. Both forms must start with "{"
// and end with "}"; anything else is InvalidTemplate.
//
// Short type: the braces sit on a single line, for example
//
//	{ name | text,required }
//
// Long type: the text spans several lines and contains a "path:" or
// "codec:" key, for example
//
//	{
//	  path: name
//	  codec: text
//	  required: true
//	}
func DetectTemplateType(tmpl string) TemplateType {
	trimmed := strings.TrimSpace(tmpl)

	// Every valid template is wrapped in braces. Checking this once keeps
	// unbraced multi-line text from being mistaken for a long template.
	if !strings.HasPrefix(trimmed, "{") || !strings.HasSuffix(trimmed, "}") {
		return InvalidTemplate
	}

	if !strings.Contains(trimmed, "\n") {
		return ShortTemplate
	}

	if strings.Contains(trimmed, "path:") || strings.Contains(trimmed, "codec:") {
		return LongTemplate
	}

	return InvalidTemplate
}
|
||||||
|
|
||||||
|
// CodecType represents the type of codec used to encode/render a value
type CodecType string

const (
	CodecText   CodecType = "text"
	CodecNumber CodecType = "number"
	CodecYaml   CodecType = "yaml"
	CodecList   CodecType = "list"
)

// parseCodecType converts the textual codec name used in templates into a
// CodecType. Only the exact names "text", "number", "yaml" and "list" are
// accepted; any other input (including the empty string) yields an error and
// the zero CodecType, so a caller that forgets to check the error cannot
// silently end up with the text codec.
func parseCodecType(input string) (CodecType, error) {
	switch codec := CodecType(input); codec {
	case CodecText, CodecNumber, CodecYaml, CodecList:
		return codec, nil
	}
	return "", fmt.Errorf("unknown codec: '%s'", input)
}
|
||||||
|
|
||||||
|
type BlockType string
|
||||||
|
|
||||||
|
const (
|
||||||
|
DataBlock BlockType = "data" // content between lines "{" and "}"
|
||||||
|
MatchingBlock BlockType = "matching" // everything outside data blocks
|
||||||
|
)
|
||||||
|
|
||||||
|
type TemplateBlock struct {
|
||||||
|
Type BlockType
|
||||||
|
Path string
|
||||||
|
Codec CodecType
|
||||||
|
Required bool
|
||||||
|
content string
|
||||||
|
}
|
||||||
|
|
||||||
|
func (b TemplateBlock) GetContent() string {
|
||||||
|
return b.content
|
||||||
|
}
|
||||||
|
|
||||||
|
func (p *TemplateBlock) Parse(input string) (key string, value any, err error) {
|
||||||
|
switch p.Codec {
|
||||||
|
case CodecText:
|
||||||
|
return p.Path, input, nil
|
||||||
|
case CodecYaml:
|
||||||
|
return p.ParseYamlBlock(input)
|
||||||
|
case CodecList:
|
||||||
|
return p.ParseListBlock(input)
|
||||||
|
}
|
||||||
|
return p.Path, "", nil
|
||||||
|
}
|
10
parser/blocks/list_block.go
Normal file
10
parser/blocks/list_block.go
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
package blocks
|
||||||
|
|
||||||
|
import "fmt"
|
||||||
|
|
||||||
|
func (b TemplateBlock) ParseListBlock(input string) (key string, value any, error error) {
|
||||||
|
|
||||||
|
fmt.Printf("Parsing List: '%q'", input)
|
||||||
|
|
||||||
|
return "", nil, nil
|
||||||
|
}
|
118
parser/blocks/template.go
Normal file
118
parser/blocks/template.go
Normal file
@@ -0,0 +1,118 @@
|
|||||||
|
package blocks
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
"strings"
|
||||||
|
|
||||||
|
"go.yaml.in/yaml/v4"
|
||||||
|
)
|
||||||
|
|
||||||
|
// cleanTemplate returns the inner text of a brace-wrapped template: it trims
// surrounding whitespace from input, then drops one leading "{" and one
// trailing "}" when present. Whitespace inside the braces is left untouched.
func cleanTemplate(input string) string {
	trimmed := strings.TrimSpace(input)
	return strings.TrimSuffix(strings.TrimPrefix(trimmed, "{"), "}")
}
|
||||||
|
|
||||||
|
func parseShortTemplate(input string) (TemplateBlock, error) {
|
||||||
|
|
||||||
|
var split = strings.Split(cleanTemplate(input), "|")
|
||||||
|
if len(split) < 1 {
|
||||||
|
return TemplateBlock{}, fmt.Errorf("Invalid Short Template")
|
||||||
|
}
|
||||||
|
|
||||||
|
block := TemplateBlock{
|
||||||
|
Type: DataBlock,
|
||||||
|
Path: strings.TrimSpace(split[0]),
|
||||||
|
Codec: CodecText,
|
||||||
|
content: input,
|
||||||
|
}
|
||||||
|
|
||||||
|
if len(split) > 1 {
|
||||||
|
var optionSplit = strings.Split(split[1], ",")
|
||||||
|
for _, option := range optionSplit {
|
||||||
|
switch strings.TrimSpace(option) {
|
||||||
|
case "required":
|
||||||
|
block.Required = true
|
||||||
|
case "number":
|
||||||
|
block.Codec = CodecNumber
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return block, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
type yamlBlock struct {
|
||||||
|
Path string `yaml:"path"`
|
||||||
|
Codec string `yaml:"codec"`
|
||||||
|
Required bool `yaml:"required,omitempty"`
|
||||||
|
Fields []yamlField `yaml:"fields"`
|
||||||
|
Item *struct {
|
||||||
|
Template string `yaml:"template,omitempty"`
|
||||||
|
} `yaml:"item,omitempty"`
|
||||||
|
Template string `yaml:"template,omitempty"`
|
||||||
|
}
|
||||||
|
|
||||||
|
type yamlField struct {
|
||||||
|
Path string `yaml:"path"`
|
||||||
|
Value any `yaml:"value,omitempty"`
|
||||||
|
Codec string `yaml:"codec"`
|
||||||
|
Required bool `yaml:"required"`
|
||||||
|
}
|
||||||
|
|
||||||
|
func parseYamlTemplate(input string) (block TemplateBlock, err error) {
|
||||||
|
|
||||||
|
var blk yamlBlock
|
||||||
|
|
||||||
|
cleaned := cleanTemplate(input)
|
||||||
|
|
||||||
|
dec := yaml.NewDecoder(strings.NewReader(cleaned))
|
||||||
|
dec.KnownFields(true)
|
||||||
|
|
||||||
|
if err := dec.Decode(&blk); err != nil {
|
||||||
|
fmt.Printf("Failed to parse:\n---\n%s\n---\n", cleaned)
|
||||||
|
return block, err
|
||||||
|
}
|
||||||
|
|
||||||
|
if blk.Path == "" {
|
||||||
|
return block, fmt.Errorf("missing top-level 'path'")
|
||||||
|
}
|
||||||
|
|
||||||
|
codec, err := parseCodecType(blk.Codec)
|
||||||
|
if err != nil {
|
||||||
|
return block, fmt.Errorf("failed to parse codec: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return TemplateBlock{
|
||||||
|
Type: DataBlock,
|
||||||
|
Path: blk.Path,
|
||||||
|
Codec: codec,
|
||||||
|
content: input,
|
||||||
|
}, nil
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
func ParseTemplateBlock(template string, blockType BlockType) (block TemplateBlock, err error) {
|
||||||
|
|
||||||
|
if blockType == MatchingBlock {
|
||||||
|
return TemplateBlock{
|
||||||
|
Type: MatchingBlock,
|
||||||
|
content: template,
|
||||||
|
}, nil
|
||||||
|
}
|
||||||
|
|
||||||
|
block.Type = DataBlock
|
||||||
|
block.content = template
|
||||||
|
|
||||||
|
templateType := DetectTemplateType(template)
|
||||||
|
if templateType == InvalidTemplate {
|
||||||
|
return block, fmt.Errorf("Invalid Template")
|
||||||
|
}
|
||||||
|
|
||||||
|
if templateType == ShortTemplate {
|
||||||
|
return parseShortTemplate(template)
|
||||||
|
}
|
||||||
|
|
||||||
|
return parseYamlTemplate(template)
|
||||||
|
}
|
18
parser/blocks/yaml_block.go
Normal file
18
parser/blocks/yaml_block.go
Normal file
@@ -0,0 +1,18 @@
|
|||||||
|
package blocks
|
||||||
|
|
||||||
|
import (
|
||||||
|
"fmt"
|
||||||
|
|
||||||
|
"go.yaml.in/yaml/v4"
|
||||||
|
)
|
||||||
|
|
||||||
|
func (b TemplateBlock) ParseYamlBlock(input string) (key string, value any, error error) {
|
||||||
|
|
||||||
|
res := make(map[string]any)
|
||||||
|
err := yaml.Unmarshal([]byte(input), &res)
|
||||||
|
if err != nil {
|
||||||
|
return "", nil, fmt.Errorf("failed to parse yaml: %w", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
return "", nil, nil
|
||||||
|
}
|
@@ -1,50 +1,54 @@
|
|||||||
package parser
|
package parser_test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"strings"
|
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
|
"git.max-richter.dev/max/marka/parser"
|
||||||
|
"git.max-richter.dev/max/marka/parser/blocks"
|
||||||
|
"git.max-richter.dev/max/marka/registry"
|
||||||
)
|
)
|
||||||
|
|
||||||
func readFile(t *testing.T, fileName string) string {
|
func TestExtractBlocks(t *testing.T) {
|
||||||
path := filepath.Join("testdata", fileName)
|
src, err := registry.GetTemplate("recipe")
|
||||||
data, err := os.ReadFile(path)
|
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("failed to read test data file: %v", err)
|
t.Errorf("Failed to extract blocks: %s", err.Error())
|
||||||
}
|
t.FailNow()
|
||||||
return string(data)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
func TestExtractBlocks(t *testing.T) {
|
templateBlocks, err := parser.ExtractBlocks(src)
|
||||||
src := readFile(t, "recipe.schema.md")
|
if err != nil {
|
||||||
blocks := ExtractBlocks(src)
|
t.Errorf("Failed to extract blocks: %s", err.Error())
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
|
||||||
expected := []struct {
|
expected := []struct {
|
||||||
Type BlockType
|
Type blocks.BlockType
|
||||||
Content string
|
Content string
|
||||||
}{
|
}{
|
||||||
{BlockMatching, "---\\n"},
|
{blocks.MatchingBlock, "---\n"},
|
||||||
{BlockData, "{ . }"},
|
{blocks.DataBlock, "{\n path: .\n codec: yaml\n fields:\n - path: name\n codec: text\n required: true\n - path: image\n codec: text\n required: true\n - path: author.@type\n codec: const\n value: Person\n - path: author.name\n codec: text\n - path: datePublished\n codec: text\n - path: description\n codec: text\n - path: prepTime\n codec: text\n - path: cookTime\n codec: text\n - path: recipeYield\n codec: text\n}"},
|
||||||
{BlockMatching, "\\n---\\n\\n# "},
|
{blocks.MatchingBlock, "\n---\n\n# "},
|
||||||
{BlockData, "{ name | text,required }"},
|
{blocks.DataBlock, "{ name | text,required }"},
|
||||||
{BlockMatching, "\\n\\n"},
|
{blocks.MatchingBlock, "\n\n"},
|
||||||
{BlockData, "{ description | text,optional }"},
|
{blocks.DataBlock, "{ description | text }"},
|
||||||
{BlockMatching, "\\n\\n## Ingredients\\n"},
|
{blocks.MatchingBlock, "\n\n## Ingredients\n"},
|
||||||
{BlockData, "{\\n path: recipeIngredient\\n codec: list\\n required: true\\n item:\\n template: \"- { . }\"\\n}"},
|
{blocks.DataBlock, "{\n path: recipeIngredient\n codec: list\n required: true\n item:\n template: \"- { . }\"\n}"},
|
||||||
{BlockMatching, "\\n\\n## Steps\\n"},
|
{blocks.MatchingBlock, "\n\n## Steps\n"},
|
||||||
{BlockData, "{\\n path: recipeInstructions\\n codec: list\\n required: true\\n item:\\n template: \"{ @index }. { . }\"\\n}"},
|
{blocks.DataBlock, "{\n path: recipeInstructions\n codec: list\n required: true\n item:\n template: \"{ @index }. { . }\"\n}"},
|
||||||
}
|
}
|
||||||
|
|
||||||
if len(blocks) != len(expected) {
|
if len(templateBlocks) != len(expected) {
|
||||||
t.Fatalf("expected %d blocks, got %d", len(expected), len(blocks))
|
t.Fatalf("expected %d blocks, got %d", len(expected), len(templateBlocks))
|
||||||
}
|
}
|
||||||
|
|
||||||
for i, b := range blocks {
|
for i, b := range templateBlocks {
|
||||||
exp := expected[i]
|
exp := expected[i]
|
||||||
content := strings.ReplaceAll(b.GetContent(), "\n", "\\n")
|
if b.Type != exp.Type {
|
||||||
if b.Type != exp.Type || content != exp.Content {
|
t.Errorf("Block#%d Type '%s' did not match expected type '%s'", i, b.Type, exp.Type)
|
||||||
t.Errorf("Block %d: expected %v, got Type: %v, Start: %d, End: %d, Content: %s", i, exp, b.Type, b.Start, b.End, content)
|
}
|
||||||
|
content := b.GetContent()
|
||||||
|
if content != exp.Content {
|
||||||
|
t.Errorf("Block#%d Content '%s' did not match expected Content: '%s'", i, content, exp.Content)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -3,3 +3,5 @@ module git.max-richter.dev/max/marka/parser
|
|||||||
go 1.24.3
|
go 1.24.3
|
||||||
|
|
||||||
require github.com/agext/levenshtein v1.2.3
|
require github.com/agext/levenshtein v1.2.3
|
||||||
|
|
||||||
|
require go.yaml.in/yaml/v4 v4.0.0-rc.1 // indirect
|
||||||
|
@@ -1,2 +1,4 @@
|
|||||||
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo=
|
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo=
|
||||||
github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
|
github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
|
||||||
|
go.yaml.in/yaml/v4 v4.0.0-rc.1 h1:4J1+yLKUIPGexM/Si+9d3pij4hdc7aGO04NhrElqXbY=
|
||||||
|
go.yaml.in/yaml/v4 v4.0.0-rc.1/go.mod h1:CBdeces52/nUXndfQ5OY8GEQuNR9uEEOJPZj/Xq5IzU=
|
||||||
|
17
parser/main.go
Normal file
17
parser/main.go
Normal file
@@ -0,0 +1,17 @@
|
|||||||
|
// Package parser provides functions for parsing Markdown templates into
|
||||||
|
// structured JSON objects that conform to a JSON Schema.
|
||||||
|
package parser
|
||||||
|
|
||||||
|
// ParseFile is a placeholder entry point: it does not inspect
// markdownContent yet and always returns an empty, non-nil map and a nil
// error.
func ParseFile(markdownContent string) (map[string]any, error) {
	// Not wired up yet:
	//
	// _schema, err := registry.GetTemplate("Recipe")
	// if err != nil {
	//   return nil, fmt.Errorf("could not get schema: %w", err)
	// }

	// Plan: split the template into blocks. "Matching" blocks are plain
	// strings; "data" blocks capture content. Both kinds are then soft-matched
	// against the markdown, with the "matching" blocks compared by
	// Levenshtein distance.

	result := make(map[string]any)
	return result, nil
}
|
43
parser/main_test.go
Normal file
43
parser/main_test.go
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
package parser_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"encoding/json"
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"reflect"
|
||||||
|
"testing"
|
||||||
|
|
||||||
|
"git.max-richter.dev/max/marka/parser"
|
||||||
|
)
|
||||||
|
|
||||||
|
func TestParseRecipe_Golden(t *testing.T) {
|
||||||
|
td := filepath.Join("testdata", "recipe_salad")
|
||||||
|
input := filepath.Join(td, "input.md")
|
||||||
|
output := filepath.Join(td, "output.json")
|
||||||
|
|
||||||
|
inputContent, err := os.ReadFile(input)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read input.md: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
got, err := parser.ParseFile(string(inputContent))
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("ParseFile: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
var want map[string]any
|
||||||
|
b, err := os.ReadFile(output)
|
||||||
|
if err != nil {
|
||||||
|
t.Fatalf("read expected.json: %v", err)
|
||||||
|
}
|
||||||
|
if err := json.Unmarshal(b, &want); err != nil {
|
||||||
|
t.Fatalf("unmarshal expected.json: %v", err)
|
||||||
|
}
|
||||||
|
|
||||||
|
// Deep structural compare
|
||||||
|
if !reflect.DeepEqual(want, got) {
|
||||||
|
gb, _ := json.MarshalIndent(got, "", " ")
|
||||||
|
wb, _ := json.MarshalIndent(want, "", " ")
|
||||||
|
t.Fatalf("parsed JSON mismatch\n--- got ---\n%s\n--- want ---\n%s", string(gb), string(wb))
|
||||||
|
}
|
||||||
|
}
|
@@ -3,12 +3,13 @@ package parser
|
|||||||
import (
|
import (
|
||||||
"math"
|
"math"
|
||||||
|
|
||||||
|
"git.max-richter.dev/max/marka/parser/blocks"
|
||||||
"github.com/agext/levenshtein"
|
"github.com/agext/levenshtein"
|
||||||
)
|
)
|
||||||
|
|
||||||
type MatchBlock struct {
|
type MatchBlock struct {
|
||||||
Start, End int
|
Start, End int
|
||||||
Block Block
|
Block blocks.TemplateBlock
|
||||||
src *string
|
src *string
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -22,17 +23,17 @@ func (m MatchBlock) GetContent() string {
|
|||||||
// MatchBlocksFuzzy finds anchor positions for all BlockMatching blocks using
|
// MatchBlocksFuzzy finds anchor positions for all MatchingBlock blocks using
|
||||||
// Levenshtein distance (tolerant matching), then returns ONLY the BlockData
|
// Levenshtein distance (tolerant matching), then returns ONLY the DataBlock
|
||||||
// segments as gaps between those anchors.
|
// segments as gaps between those anchors.
|
||||||
func MatchBlocksFuzzy(markdown string, blocks []Block, maxDist float64) []MatchBlock {
|
func MatchBlocksFuzzy(markdown string, templateBlocks []blocks.TemplateBlock, maxDist float64) []MatchBlock {
|
||||||
var out []MatchBlock
|
var out []MatchBlock
|
||||||
|
|
||||||
var lastIndex = 0
|
var lastIndex = 0
|
||||||
for i, b := range blocks {
|
for i, b := range templateBlocks {
|
||||||
if b.Type == BlockMatching {
|
if b.Type == blocks.MatchingBlock {
|
||||||
start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), 0.3)
|
start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), 0.3)
|
||||||
if end != -1 {
|
if end != -1 {
|
||||||
if i > 0 {
|
if i > 0 {
|
||||||
previousBlock := blocks[i-1]
|
previousBlock := templateBlocks[i-1]
|
||||||
if previousBlock.Type == BlockData {
|
if previousBlock.Type == blocks.DataBlock {
|
||||||
out = append(out, MatchBlock{
|
out = append(out, MatchBlock{
|
||||||
Start: lastIndex,
|
Start: lastIndex,
|
||||||
End: start,
|
End: start,
|
||||||
@@ -47,8 +48,8 @@ func MatchBlocksFuzzy(markdown string, blocks []Block, maxDist float64) []MatchB
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Handle the last block
|
// Handle the last block
|
||||||
lastBlock := blocks[len(blocks)-1]
|
lastBlock := templateBlocks[len(templateBlocks)-1]
|
||||||
if lastBlock.Type == BlockData {
|
if lastBlock.Type == blocks.DataBlock {
|
||||||
out = append(out, MatchBlock{
|
out = append(out, MatchBlock{
|
||||||
Start: lastIndex,
|
Start: lastIndex,
|
||||||
End: len(markdown),
|
End: len(markdown),
|
||||||
|
@@ -1,37 +1,27 @@
|
|||||||
package parser_test
|
package parser_test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"git.max-richter.dev/max/marka/parser"
|
"git.max-richter.dev/max/marka/parser"
|
||||||
|
"git.max-richter.dev/max/marka/registry"
|
||||||
)
|
)
|
||||||
|
|
||||||
func readFile(t *testing.T, fileName string) string {
|
|
||||||
path := filepath.Join("testdata", fileName)
|
|
||||||
data, err := os.ReadFile(path)
|
|
||||||
if err != nil {
|
|
||||||
t.Fatalf("failed to read test data file: %v", err)
|
|
||||||
}
|
|
||||||
return string(data)
|
|
||||||
}
|
|
||||||
|
|
||||||
func TestFuzzyFindAll(t *testing.T) {
|
func TestFuzzyFindAll(t *testing.T) {
|
||||||
recipeMd := readFile(t, "baguette.md")
|
recipeMd := readTestDataFile(t, "baguette.md")
|
||||||
|
|
||||||
tests := []struct {
|
tests := []struct {
|
||||||
Needle string
|
Needle string
|
||||||
Start, End, StartIndex int
|
Start, End, StartIndex int
|
||||||
}{
|
}{
|
||||||
{StartIndex: 0, Needle: "# Ingredients\n", Start: 72, End: 86},
|
{StartIndex: 0, Needle: "# Ingredients\n", Start: 77, End: 91},
|
||||||
{StartIndex: 0, Needle: "# Ingrdients\n", Start: 72, End: 86},
|
{StartIndex: 0, Needle: "# Ingrdients\n", Start: 77, End: 91},
|
||||||
{StartIndex: 0, Needle: "# Inrdients\n", Start: 72, End: 86},
|
{StartIndex: 0, Needle: "# Inrdients\n", Start: 77, End: 91},
|
||||||
{StartIndex: 0, Needle: "---\n", Start: 0, End: 4},
|
{StartIndex: 0, Needle: "---\n", Start: 0, End: 4},
|
||||||
{StartIndex: 4, Needle: "---\n", Start: 24, End: 28},
|
{StartIndex: 4, Needle: "---\n", Start: 29, End: 33},
|
||||||
{StartIndex: 0, Needle: "# Steps\n", Start: 111, End: 119},
|
{StartIndex: 0, Needle: "# Steps\n", Start: 116, End: 124},
|
||||||
{StartIndex: 0, Needle: "# Stps\n", Start: 111, End: 119},
|
{StartIndex: 0, Needle: "# Stps\n", Start: 116, End: 124},
|
||||||
{StartIndex: 0, Needle: "# Step\n", Start: 111, End: 119},
|
{StartIndex: 0, Needle: "# Step\n", Start: 116, End: 124},
|
||||||
}
|
}
|
||||||
|
|
||||||
for _, test := range tests {
|
for _, test := range tests {
|
||||||
@@ -45,16 +35,20 @@ func TestFuzzyFindAll(t *testing.T) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
func TestFuzzyBlockMatch(t *testing.T) {
|
func TestFuzzyBlockMatch(t *testing.T) {
|
||||||
recipeMd := readFile(t, "baguette.md")
|
recipeMd := readTestDataFile(t, "baguette.md")
|
||||||
schemaMd := readFile(t, "recipe.schema.md")
|
schemaMd, err := registry.GetTemplate("recipe")
|
||||||
blocks := parser.ExtractBlocks(schemaMd)
|
if err != nil {
|
||||||
|
t.Errorf("Failed to load template: %s", err.Error())
|
||||||
|
t.FailNow()
|
||||||
|
}
|
||||||
|
blocks, _ := parser.ExtractBlocks(schemaMd)
|
||||||
matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
||||||
|
|
||||||
expected := []struct {
|
expected := []struct {
|
||||||
value string
|
value string
|
||||||
}{
|
}{
|
||||||
{
|
{
|
||||||
value: "author: Max Richter",
|
value: "author.name: Max Richter",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
value: "Baguette",
|
value: "Baguette",
|
||||||
@@ -66,7 +60,7 @@ func TestFuzzyBlockMatch(t *testing.T) {
|
|||||||
value: "- Flour\n- Water\n- Salt",
|
value: "- Flour\n- Water\n- Salt",
|
||||||
},
|
},
|
||||||
{
|
{
|
||||||
value: "1. Mix Flour Water and Salt\n2. Bake the bread",
|
value: "1. Mix Flour Water and Salt\n2. Bake the bread\n",
|
||||||
},
|
},
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@@ -1,17 +1,15 @@
|
|||||||
// Package parser provides functions for parsing Markdown templates into
|
|
||||||
// structured JSON objects that conform to a JSON Schema.
|
|
||||||
package parser
|
package parser
|
||||||
|
|
||||||
func ParseFile(markdownContent string) (map[string]any, error) {
|
func Parse(blocks []MatchBlock) map[string]any {
|
||||||
|
|
||||||
// _schema, err := registry.GetTemplate("Recipe")
|
result := make(map[string]any)
|
||||||
// if err != nil {
|
|
||||||
// return nil, fmt.Errorf("could not get schema: %w", err)
|
|
||||||
// }
|
|
||||||
|
|
||||||
// Idea is to split the template into blocks, either "matching" blocks which are simple strings.
|
for _, b := range blocks {
|
||||||
// Or "data" blocks which match the content. Then i want to soft match the "matching" blocks and "data" blocks to the template.
|
input := b.GetContent()
|
||||||
// The "matching" blocks should soft match with a levenshtein distance
|
|
||||||
|
|
||||||
return map[string]any{}, nil
|
key, value, _ := b.Block.Parse(input)
|
||||||
|
result[key] = value
|
||||||
|
}
|
||||||
|
|
||||||
|
return result
|
||||||
}
|
}
|
||||||
|
@@ -1,43 +1,42 @@
|
|||||||
package parser_test
|
package parser_test
|
||||||
|
|
||||||
import (
|
import (
|
||||||
"encoding/json"
|
"fmt"
|
||||||
"os"
|
|
||||||
"path/filepath"
|
|
||||||
"reflect"
|
|
||||||
"testing"
|
"testing"
|
||||||
|
|
||||||
"git.max-richter.dev/max/marka/parser"
|
"git.max-richter.dev/max/marka/parser"
|
||||||
|
"git.max-richter.dev/max/marka/registry"
|
||||||
)
|
)
|
||||||
|
|
||||||
func TestParseRecipe_Golden(t *testing.T) {
|
func TestParseBaguette(t *testing.T) {
|
||||||
td := filepath.Join("testdata", "recipe_salad")
|
recipeMd := readTestDataFile(t, "baguette.md")
|
||||||
input := filepath.Join(td, "input.md")
|
|
||||||
output := filepath.Join(td, "output.json")
|
|
||||||
|
|
||||||
inputContent, err := os.ReadFile(input)
|
template, err := registry.GetTemplate("recipe")
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("read input.md: %v", err)
|
t.Fatalf("Err: %s", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
got, err := parser.ParseFile(string(inputContent))
|
blocks, err := parser.ExtractBlocks(template)
|
||||||
if err != nil {
|
if err != nil {
|
||||||
t.Fatalf("ParseFile: %v", err)
|
t.Fatalf("Err: %s", err)
|
||||||
}
|
}
|
||||||
|
|
||||||
var want map[string]any
|
matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
||||||
b, err := os.ReadFile(output)
|
parsed := parser.Parse(matches)
|
||||||
if err != nil {
|
expected := map[string]any{
|
||||||
t.Fatalf("read expected.json: %v", err)
|
"name": "Baguette",
|
||||||
}
|
"description": "My favourite baguette recipe",
|
||||||
if err := json.Unmarshal(b, &want); err != nil {
|
"recipeIngredient": []string{"Flour", "Water", "Salt"},
|
||||||
t.Fatalf("unmarshal expected.json: %v", err)
|
// "recipeInstructions": []string{
|
||||||
|
// "Mix Flour Water and Salt",
|
||||||
|
// "Bake the bread",
|
||||||
|
// },
|
||||||
}
|
}
|
||||||
|
|
||||||
// Deep structural compare
|
for k, v := range expected {
|
||||||
if !reflect.DeepEqual(want, got) {
|
if fmt.Sprintf("%v", parsed[k]) != fmt.Sprintf("%v", v) {
|
||||||
gb, _ := json.MarshalIndent(got, "", " ")
|
t.Errorf("Expected %v but got %v", v, parsed[k])
|
||||||
wb, _ := json.MarshalIndent(want, "", " ")
|
|
||||||
t.Fatalf("parsed JSON mismatch\n--- got ---\n%s\n--- want ---\n%s", string(gb), string(wb))
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
}
|
||||||
|
2
parser/testdata/baguette.md
vendored
2
parser/testdata/baguette.md
vendored
@@ -1,5 +1,5 @@
|
|||||||
---
|
---
|
||||||
author: Max Richter
|
author.name: Max Richter
|
||||||
---
|
---
|
||||||
|
|
||||||
# Baguette
|
# Baguette
|
||||||
|
25
parser/testdata/recipe.schema.md
vendored
25
parser/testdata/recipe.schema.md
vendored
@@ -1,25 +0,0 @@
|
|||||||
---
|
|
||||||
{ . }
|
|
||||||
---
|
|
||||||
|
|
||||||
# { name | text,required }
|
|
||||||
|
|
||||||
{ description | text,optional }
|
|
||||||
|
|
||||||
## Ingredients
|
|
||||||
{
|
|
||||||
path: recipeIngredient
|
|
||||||
codec: list
|
|
||||||
required: true
|
|
||||||
item:
|
|
||||||
template: "- { . }"
|
|
||||||
}
|
|
||||||
|
|
||||||
## Steps
|
|
||||||
{
|
|
||||||
path: recipeInstructions
|
|
||||||
codec: list
|
|
||||||
required: true
|
|
||||||
item:
|
|
||||||
template: "{ @index }. { . }"
|
|
||||||
}
|
|
16
parser/testutils_test.go
Normal file
16
parser/testutils_test.go
Normal file
@@ -0,0 +1,16 @@
|
|||||||
|
package parser_test
|
||||||
|
|
||||||
|
import (
|
||||||
|
"os"
|
||||||
|
"path/filepath"
|
||||||
|
"testing"
|
||||||
|
)
|
||||||
|
|
||||||
|
// readTestDataFile returns the contents of testdata/<fileName> as a string.
// If the file cannot be read, the calling test is failed immediately.
func readTestDataFile(t *testing.T, fileName string) string {
	// Mark this as a helper so a failure is attributed to the calling test's
	// line rather than to this function.
	t.Helper()

	path := filepath.Join("testdata", fileName)
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(data)
}
|
@@ -1,6 +0,0 @@
|
|||||||
description: "Core capture aliases for Marka"
|
|
||||||
patterns:
|
|
||||||
text: ".+"
|
|
||||||
word: "\\S+"
|
|
||||||
num: "(?:\\d+(?:[.,]\\d+)?(?:\\s?\\d+/\\d+)?)" # 3 | 1.5 | 1 1/2
|
|
||||||
indexMarker: "\\d+[.)]" # 1. / 1)
|
|
@@ -28,9 +28,6 @@ var templates embed.FS
|
|||||||
//go:embed schema-org/*
|
//go:embed schema-org/*
|
||||||
var schemas embed.FS
|
var schemas embed.FS
|
||||||
|
|
||||||
//go:embed aliases/*
|
|
||||||
var aliases embed.FS
|
|
||||||
|
|
||||||
func GetTemplates() Source {
|
func GetTemplates() Source {
|
||||||
return src{fsys: templates}
|
return src{fsys: templates}
|
||||||
}
|
}
|
||||||
@@ -49,11 +46,3 @@ func GetTemplate(name string) (string, error) {
|
|||||||
|
|
||||||
return string(templateBytes), nil
|
return string(templateBytes), nil
|
||||||
}
|
}
|
||||||
|
|
||||||
func GetSchemas() Source {
|
|
||||||
return src{fsys: schemas}
|
|
||||||
}
|
|
||||||
|
|
||||||
func GetAliases() Source {
|
|
||||||
return src{fsys: aliases}
|
|
||||||
}
|
|
||||||
|
@@ -2,10 +2,6 @@
|
|||||||
{
|
{
|
||||||
path: .
|
path: .
|
||||||
codec: yaml
|
codec: yaml
|
||||||
required: true
|
|
||||||
assert:
|
|
||||||
"@context": https://schema.org/
|
|
||||||
"@type": Recipe
|
|
||||||
fields:
|
fields:
|
||||||
- path: name
|
- path: name
|
||||||
codec: text
|
codec: text
|
||||||
@@ -18,28 +14,22 @@
|
|||||||
value: Person
|
value: Person
|
||||||
- path: author.name
|
- path: author.name
|
||||||
codec: text
|
codec: text
|
||||||
required: true
|
|
||||||
- path: datePublished
|
- path: datePublished
|
||||||
codec: text
|
codec: text
|
||||||
optional: true
|
|
||||||
- path: description
|
- path: description
|
||||||
codec: text
|
codec: text
|
||||||
optional: true
|
|
||||||
- path: prepTime
|
- path: prepTime
|
||||||
codec: text
|
codec: text
|
||||||
optional: true
|
|
||||||
- path: cookTime
|
- path: cookTime
|
||||||
codec: text
|
codec: text
|
||||||
optional: true
|
|
||||||
- path: recipeYield
|
- path: recipeYield
|
||||||
codec: text
|
codec: text
|
||||||
optional: true
|
|
||||||
}
|
}
|
||||||
---
|
---
|
||||||
|
|
||||||
# { name | text,required }
|
# { name | text,required }
|
||||||
|
|
||||||
{ description | text,optional }
|
{ description | text }
|
||||||
|
|
||||||
## Ingredients
|
## Ingredients
|
||||||
{
|
{
|
Reference in New Issue
Block a user