feat: refactor some shit
This commit is contained in:
@@ -1,51 +1,32 @@
|
||||
package parser
|
||||
|
||||
type BlockType string
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
const (
|
||||
BlockData BlockType = "data" // content between lines "{" and "}"
|
||||
BlockMatching BlockType = "matching" // everything outside data blocks
|
||||
"git.max-richter.dev/max/marka/parser/blocks"
|
||||
)
|
||||
|
||||
type Block struct {
|
||||
Type BlockType
|
||||
Start, End int // byte offsets [Start, End)
|
||||
src *string
|
||||
}
|
||||
|
||||
func (b Block) GetContent() string {
|
||||
if b.src == nil || b.Start < 0 || b.End > len(*b.src) || b.Start > b.End {
|
||||
return ""
|
||||
}
|
||||
return (*b.src)[b.Start:b.End]
|
||||
}
|
||||
|
||||
// ExtractBlocks scans once, emitting:
|
||||
// - data blocks: inner content between a line that's exactly "{" and a line that's exactly "}"
|
||||
// - matching blocks: gaps between data blocks (excluding the brace lines themselves)
|
||||
func ExtractBlocks(src string) []Block {
|
||||
var out []Block
|
||||
func ExtractBlocks(template string) ([]blocks.TemplateBlock, error) {
|
||||
var out []blocks.TemplateBlock
|
||||
var curlyIndex int
|
||||
|
||||
const CLOSING = '}'
|
||||
const OPENING = '{'
|
||||
|
||||
if len(src) > 0 && src[0] == OPENING {
|
||||
var start int
|
||||
var blockType blocks.BlockType
|
||||
|
||||
if len(template) > 0 && template[0] == OPENING {
|
||||
curlyIndex = 1
|
||||
out = append(out, Block{
|
||||
Start: 0,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
blockType = blocks.DataBlock
|
||||
} else {
|
||||
out = append(out, Block{
|
||||
Start: 0,
|
||||
Type: BlockMatching,
|
||||
src: &src,
|
||||
})
|
||||
blockType = blocks.MatchingBlock
|
||||
}
|
||||
|
||||
for i, r := range src {
|
||||
for i, r := range template {
|
||||
|
||||
var nextCurlyIndex = curlyIndex
|
||||
|
||||
@@ -57,41 +38,42 @@ func ExtractBlocks(src string) []Block {
|
||||
}
|
||||
|
||||
var nextChar rune = ' '
|
||||
if i+1 < len(src) {
|
||||
nextChar = rune(src[i+1])
|
||||
if i+1 < len(template) {
|
||||
nextChar = rune(template[i+1])
|
||||
}
|
||||
|
||||
if curlyIndex == 0 && nextCurlyIndex == 1 {
|
||||
out[len(out)-1].End = i
|
||||
out = append(out, Block{
|
||||
Start: i,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
block, err := blocks.ParseTemplateBlock(template[start:i], blockType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Failed to parse block: %w", err)
|
||||
}
|
||||
out = append(out, block)
|
||||
start = i
|
||||
blockType = blocks.DataBlock
|
||||
} else if curlyIndex == 1 && nextCurlyIndex == 0 {
|
||||
out[len(out)-1].End = i + 1
|
||||
|
||||
block, err := blocks.ParseTemplateBlock(template[start:i+1], blockType)
|
||||
if err != nil {
|
||||
return nil, fmt.Errorf("Failed to parse block: %w", err)
|
||||
}
|
||||
out = append(out, block)
|
||||
|
||||
if nextChar == OPENING {
|
||||
out = append(out, Block{
|
||||
Start: i + 1,
|
||||
Type: BlockData,
|
||||
src: &src,
|
||||
})
|
||||
start = i + 1
|
||||
blockType = blocks.DataBlock
|
||||
} else {
|
||||
out = append(out, Block{
|
||||
Start: i + 1,
|
||||
Type: BlockMatching,
|
||||
src: &src,
|
||||
})
|
||||
start = i + 1
|
||||
blockType = blocks.MatchingBlock
|
||||
}
|
||||
}
|
||||
|
||||
curlyIndex = nextCurlyIndex
|
||||
}
|
||||
|
||||
var lastBlock = out[len(out)-1]
|
||||
if lastBlock.End == 0 {
|
||||
out = out[:len(out)-1]
|
||||
}
|
||||
// var lastBlock = out[len(out)-1]
|
||||
// if lastBlock.End == 0 {
|
||||
// out = out[:len(out)-1]
|
||||
// }
|
||||
|
||||
return out
|
||||
return out, nil
|
||||
}
|
||||
|
98
parser/blocks/blocks.go
Normal file
98
parser/blocks/blocks.go
Normal file
@@ -0,0 +1,98 @@
|
||||
package blocks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
)
|
||||
|
||||
// TemplateType classifies the textual form of a data-block template.
type TemplateType int

const (
	// InvalidTemplate is the zero value and marks text that is neither a
	// short nor a long template.
	InvalidTemplate TemplateType = iota
	// ShortTemplate is a single-line template such as "{ name | text,required }".
	ShortTemplate
	// LongTemplate is a multi-line template carrying "path:" / "codec:" keys.
	LongTemplate
)

// DetectTemplateType reports which template form tmpl uses.
//
// Surrounding whitespace is ignored. Both forms must be enclosed in braces:
//
//   - Short: everything on one line, for example "{ . }" or
//     "{ name | text,required }".
//   - Long: spans several lines and mentions "path:" or "codec:", for example
//
//     {
//     path: name
//     codec: text
//     required: true
//     }
//
// Anything else yields InvalidTemplate.
func DetectTemplateType(tmpl string) TemplateType {
	trimmed := strings.TrimSpace(tmpl)

	// Without the enclosing braces the text cannot be a template of either
	// form; rejecting it here keeps brace-less multi-line text that merely
	// mentions "path:" from being treated as a long template.
	if !strings.HasPrefix(trimmed, "{") || !strings.HasSuffix(trimmed, "}") {
		return InvalidTemplate
	}

	if !strings.Contains(trimmed, "\n") {
		return ShortTemplate
	}

	if strings.Contains(trimmed, "path:") || strings.Contains(trimmed, "codec:") {
		return LongTemplate
	}

	return InvalidTemplate
}
|
||||
|
||||
// CodecType represents the type of codec used to encode/render a value.
type CodecType string

const (
	CodecText   CodecType = "text"
	CodecNumber CodecType = "number"
	CodecYaml   CodecType = "yaml"
	CodecList   CodecType = "list"
)

// parseCodecType converts the textual codec name found in a template into a
// CodecType.
//
// It accepts exactly "text", "number", "yaml" and "list". Any other input
// yields an error and the zero CodecType, so a caller that forgets to check
// the error cannot mistake the failure for a valid text codec.
func parseCodecType(input string) (CodecType, error) {
	switch codec := CodecType(input); codec {
	case CodecText, CodecNumber, CodecYaml, CodecList:
		return codec, nil
	default:
		return "", fmt.Errorf("unknown codec: '%s'", input)
	}
}
|
||||
|
||||
// BlockType distinguishes the two kinds of segment a template is split into.
type BlockType string

const (
	// DataBlock marks a brace-delimited segment ("{ ... }") of the template.
	DataBlock BlockType = "data"
	// MatchingBlock marks everything outside data blocks.
	MatchingBlock BlockType = "matching"
)
|
||||
|
||||
type TemplateBlock struct {
|
||||
Type BlockType
|
||||
Path string
|
||||
Codec CodecType
|
||||
Required bool
|
||||
content string
|
||||
}
|
||||
|
||||
func (b TemplateBlock) GetContent() string {
|
||||
return b.content
|
||||
}
|
||||
|
||||
func (p *TemplateBlock) Parse(input string) (key string, value any, err error) {
|
||||
switch p.Codec {
|
||||
case CodecText:
|
||||
return p.Path, input, nil
|
||||
case CodecYaml:
|
||||
return p.ParseYamlBlock(input)
|
||||
case CodecList:
|
||||
return p.ParseListBlock(input)
|
||||
}
|
||||
return p.Path, "", nil
|
||||
}
|
10
parser/blocks/list_block.go
Normal file
10
parser/blocks/list_block.go
Normal file
@@ -0,0 +1,10 @@
|
||||
package blocks
|
||||
|
||||
import "fmt"
|
||||
|
||||
func (b TemplateBlock) ParseListBlock(input string) (key string, value any, error error) {
|
||||
|
||||
fmt.Printf("Parsing List: '%q'", input)
|
||||
|
||||
return "", nil, nil
|
||||
}
|
118
parser/blocks/template.go
Normal file
118
parser/blocks/template.go
Normal file
@@ -0,0 +1,118 @@
|
||||
package blocks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"strings"
|
||||
|
||||
"go.yaml.in/yaml/v4"
|
||||
)
|
||||
|
||||
// cleanTemplate strips surrounding whitespace from input and then removes at
// most one leading "{" and one trailing "}". Whitespace just inside the
// braces is left untouched.
func cleanTemplate(input string) string {
	trimmed := strings.TrimSpace(input)
	return strings.TrimSuffix(strings.TrimPrefix(trimmed, "{"), "}")
}
|
||||
|
||||
func parseShortTemplate(input string) (TemplateBlock, error) {
|
||||
|
||||
var split = strings.Split(cleanTemplate(input), "|")
|
||||
if len(split) < 1 {
|
||||
return TemplateBlock{}, fmt.Errorf("Invalid Short Template")
|
||||
}
|
||||
|
||||
block := TemplateBlock{
|
||||
Type: DataBlock,
|
||||
Path: strings.TrimSpace(split[0]),
|
||||
Codec: CodecText,
|
||||
content: input,
|
||||
}
|
||||
|
||||
if len(split) > 1 {
|
||||
var optionSplit = strings.Split(split[1], ",")
|
||||
for _, option := range optionSplit {
|
||||
switch strings.TrimSpace(option) {
|
||||
case "required":
|
||||
block.Required = true
|
||||
case "number":
|
||||
block.Codec = CodecNumber
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return block, nil
|
||||
}
|
||||
|
||||
// yamlBlock mirrors the YAML body of a long (multi-line) template.
type yamlBlock struct {
	// Path and Codec are the top-level "path" and "codec" keys.
	Path  string `yaml:"path"`
	Codec string `yaml:"codec"`
	// Required is the optional top-level "required" flag.
	Required bool `yaml:"required,omitempty"`
	// Fields lists the entries under "fields".
	Fields []yamlField `yaml:"fields"`
	// Item is the optional "item" mapping with its nested "template".
	Item *struct {
		Template string `yaml:"template,omitempty"`
	} `yaml:"item,omitempty"`
	// Template is the optional top-level "template" key.
	Template string `yaml:"template,omitempty"`
}

// yamlField mirrors one entry of a long template's "fields" list.
type yamlField struct {
	Path     string `yaml:"path"`
	Value    any    `yaml:"value,omitempty"`
	Codec    string `yaml:"codec"`
	Required bool   `yaml:"required"`
}
|
||||
|
||||
func parseYamlTemplate(input string) (block TemplateBlock, err error) {
|
||||
|
||||
var blk yamlBlock
|
||||
|
||||
cleaned := cleanTemplate(input)
|
||||
|
||||
dec := yaml.NewDecoder(strings.NewReader(cleaned))
|
||||
dec.KnownFields(true)
|
||||
|
||||
if err := dec.Decode(&blk); err != nil {
|
||||
fmt.Printf("Failed to parse:\n---\n%s\n---\n", cleaned)
|
||||
return block, err
|
||||
}
|
||||
|
||||
if blk.Path == "" {
|
||||
return block, fmt.Errorf("missing top-level 'path'")
|
||||
}
|
||||
|
||||
codec, err := parseCodecType(blk.Codec)
|
||||
if err != nil {
|
||||
return block, fmt.Errorf("failed to parse codec: %w", err)
|
||||
}
|
||||
|
||||
return TemplateBlock{
|
||||
Type: DataBlock,
|
||||
Path: blk.Path,
|
||||
Codec: codec,
|
||||
content: input,
|
||||
}, nil
|
||||
|
||||
}
|
||||
|
||||
func ParseTemplateBlock(template string, blockType BlockType) (block TemplateBlock, err error) {
|
||||
|
||||
if blockType == MatchingBlock {
|
||||
return TemplateBlock{
|
||||
Type: MatchingBlock,
|
||||
content: template,
|
||||
}, nil
|
||||
}
|
||||
|
||||
block.Type = DataBlock
|
||||
block.content = template
|
||||
|
||||
templateType := DetectTemplateType(template)
|
||||
if templateType == InvalidTemplate {
|
||||
return block, fmt.Errorf("Invalid Template")
|
||||
}
|
||||
|
||||
if templateType == ShortTemplate {
|
||||
return parseShortTemplate(template)
|
||||
}
|
||||
|
||||
return parseYamlTemplate(template)
|
||||
}
|
18
parser/blocks/yaml_block.go
Normal file
18
parser/blocks/yaml_block.go
Normal file
@@ -0,0 +1,18 @@
|
||||
package blocks
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
|
||||
"go.yaml.in/yaml/v4"
|
||||
)
|
||||
|
||||
func (b TemplateBlock) ParseYamlBlock(input string) (key string, value any, error error) {
|
||||
|
||||
res := make(map[string]any)
|
||||
err := yaml.Unmarshal([]byte(input), &res)
|
||||
if err != nil {
|
||||
return "", nil, fmt.Errorf("failed to parse yaml: %w", err)
|
||||
}
|
||||
|
||||
return "", nil, nil
|
||||
}
|
@@ -1,50 +1,54 @@
|
||||
package parser
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
"git.max-richter.dev/max/marka/parser/blocks"
|
||||
"git.max-richter.dev/max/marka/registry"
|
||||
)
|
||||
|
||||
func readFile(t *testing.T, fileName string) string {
|
||||
path := filepath.Join("testdata", fileName)
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read test data file: %v", err)
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func TestExtractBlocks(t *testing.T) {
|
||||
src := readFile(t, "recipe.schema.md")
|
||||
blocks := ExtractBlocks(src)
|
||||
src, err := registry.GetTemplate("recipe")
|
||||
if err != nil {
|
||||
t.Errorf("Failed to extract blocks: %s", err.Error())
|
||||
t.FailNow()
|
||||
}
|
||||
|
||||
templateBlocks, err := parser.ExtractBlocks(src)
|
||||
if err != nil {
|
||||
t.Errorf("Failed to extract blocks: %s", err.Error())
|
||||
t.FailNow()
|
||||
}
|
||||
|
||||
expected := []struct {
|
||||
Type BlockType
|
||||
Type blocks.BlockType
|
||||
Content string
|
||||
}{
|
||||
{BlockMatching, "---\\n"},
|
||||
{BlockData, "{ . }"},
|
||||
{BlockMatching, "\\n---\\n\\n# "},
|
||||
{BlockData, "{ name | text,required }"},
|
||||
{BlockMatching, "\\n\\n"},
|
||||
{BlockData, "{ description | text,optional }"},
|
||||
{BlockMatching, "\\n\\n## Ingredients\\n"},
|
||||
{BlockData, "{\\n path: recipeIngredient\\n codec: list\\n required: true\\n item:\\n template: \"- { . }\"\\n}"},
|
||||
{BlockMatching, "\\n\\n## Steps\\n"},
|
||||
{BlockData, "{\\n path: recipeInstructions\\n codec: list\\n required: true\\n item:\\n template: \"{ @index }. { . }\"\\n}"},
|
||||
{blocks.MatchingBlock, "---\n"},
|
||||
{blocks.DataBlock, "{\n path: .\n codec: yaml\n fields:\n - path: name\n codec: text\n required: true\n - path: image\n codec: text\n required: true\n - path: author.@type\n codec: const\n value: Person\n - path: author.name\n codec: text\n - path: datePublished\n codec: text\n - path: description\n codec: text\n - path: prepTime\n codec: text\n - path: cookTime\n codec: text\n - path: recipeYield\n codec: text\n}"},
|
||||
{blocks.MatchingBlock, "\n---\n\n# "},
|
||||
{blocks.DataBlock, "{ name | text,required }"},
|
||||
{blocks.MatchingBlock, "\n\n"},
|
||||
{blocks.DataBlock, "{ description | text }"},
|
||||
{blocks.MatchingBlock, "\n\n## Ingredients\n"},
|
||||
{blocks.DataBlock, "{\n path: recipeIngredient\n codec: list\n required: true\n item:\n template: \"- { . }\"\n}"},
|
||||
{blocks.MatchingBlock, "\n\n## Steps\n"},
|
||||
{blocks.DataBlock, "{\n path: recipeInstructions\n codec: list\n required: true\n item:\n template: \"{ @index }. { . }\"\n}"},
|
||||
}
|
||||
|
||||
if len(blocks) != len(expected) {
|
||||
t.Fatalf("expected %d blocks, got %d", len(expected), len(blocks))
|
||||
if len(templateBlocks) != len(expected) {
|
||||
t.Fatalf("expected %d blocks, got %d", len(expected), len(templateBlocks))
|
||||
}
|
||||
|
||||
for i, b := range blocks {
|
||||
for i, b := range templateBlocks {
|
||||
exp := expected[i]
|
||||
content := strings.ReplaceAll(b.GetContent(), "\n", "\\n")
|
||||
if b.Type != exp.Type || content != exp.Content {
|
||||
t.Errorf("Block %d: expected %v, got Type: %v, Start: %d, End: %d, Content: %s", i, exp, b.Type, b.Start, b.End, content)
|
||||
if b.Type != exp.Type {
|
||||
t.Errorf("Block#%d Type '%s' did not match expected type '%s'", i, b.Type, exp.Type)
|
||||
}
|
||||
content := b.GetContent()
|
||||
if content != exp.Content {
|
||||
t.Errorf("Block#%d Content '%s' did not match expected Content: '%s'", i, content, exp.Content)
|
||||
}
|
||||
}
|
||||
|
||||
|
@@ -3,3 +3,5 @@ module git.max-richter.dev/max/marka/parser
|
||||
go 1.24.3
|
||||
|
||||
require github.com/agext/levenshtein v1.2.3
|
||||
|
||||
require go.yaml.in/yaml/v4 v4.0.0-rc.1 // indirect
|
||||
|
@@ -1,2 +1,4 @@
|
||||
github.com/agext/levenshtein v1.2.3 h1:YB2fHEn0UJagG8T1rrWknE3ZQzWM06O8AMAatNn7lmo=
|
||||
github.com/agext/levenshtein v1.2.3/go.mod h1:JEDfjyjHDjOF/1e4FlBE/PkbqA9OfWu2ki2W0IB5558=
|
||||
go.yaml.in/yaml/v4 v4.0.0-rc.1 h1:4J1+yLKUIPGexM/Si+9d3pij4hdc7aGO04NhrElqXbY=
|
||||
go.yaml.in/yaml/v4 v4.0.0-rc.1/go.mod h1:CBdeces52/nUXndfQ5OY8GEQuNR9uEEOJPZj/Xq5IzU=
|
||||
|
17
parser/main.go
Normal file
17
parser/main.go
Normal file
@@ -0,0 +1,17 @@
|
||||
// Package parser provides functions for parsing Markdown templates into
|
||||
// structured JSON objects that conform to a JSON Schema.
|
||||
package parser
|
||||
|
||||
// ParseFile is a placeholder for parsing a Markdown document into a
// structured object. It does not inspect markdownContent yet and always
// returns an empty, non-nil map and a nil error.
func ParseFile(markdownContent string) (map[string]any, error) {
	// Schema lookup, not wired up yet:
	//
	//	_schema, err := registry.GetTemplate("Recipe")
	//	if err != nil {
	//		return nil, fmt.Errorf("could not get schema: %w", err)
	//	}
	//
	// Planned approach: split the template into blocks, either "matching"
	// blocks, which are plain strings, or "data" blocks, which capture
	// content. The matching blocks are then soft-matched against the
	// document using a Levenshtein distance, and the data blocks are taken
	// from what lies between them.
	result := make(map[string]any)
	return result, nil
}
|
43
parser/main_test.go
Normal file
43
parser/main_test.go
Normal file
@@ -0,0 +1,43 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
)
|
||||
|
||||
func TestParseRecipe_Golden(t *testing.T) {
|
||||
td := filepath.Join("testdata", "recipe_salad")
|
||||
input := filepath.Join(td, "input.md")
|
||||
output := filepath.Join(td, "output.json")
|
||||
|
||||
inputContent, err := os.ReadFile(input)
|
||||
if err != nil {
|
||||
t.Fatalf("read input.md: %v", err)
|
||||
}
|
||||
|
||||
got, err := parser.ParseFile(string(inputContent))
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile: %v", err)
|
||||
}
|
||||
|
||||
var want map[string]any
|
||||
b, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
t.Fatalf("read expected.json: %v", err)
|
||||
}
|
||||
if err := json.Unmarshal(b, &want); err != nil {
|
||||
t.Fatalf("unmarshal expected.json: %v", err)
|
||||
}
|
||||
|
||||
// Deep structural compare
|
||||
if !reflect.DeepEqual(want, got) {
|
||||
gb, _ := json.MarshalIndent(got, "", " ")
|
||||
wb, _ := json.MarshalIndent(want, "", " ")
|
||||
t.Fatalf("parsed JSON mismatch\n--- got ---\n%s\n--- want ---\n%s", string(gb), string(wb))
|
||||
}
|
||||
}
|
@@ -3,12 +3,13 @@ package parser
|
||||
import (
|
||||
"math"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser/blocks"
|
||||
"github.com/agext/levenshtein"
|
||||
)
|
||||
|
||||
type MatchBlock struct {
|
||||
Start, End int
|
||||
Block Block
|
||||
Block blocks.TemplateBlock
|
||||
src *string
|
||||
}
|
||||
|
||||
@@ -22,17 +23,17 @@ func (m MatchBlock) GetContent() string {
|
||||
// MatchBlocksFuzzy finds anchor positions for all blocks.MatchingBlock blocks using
|
||||
// Levenshtein distance (tolerant matching), then returns ONLY the blocks.DataBlock
|
||||
// segments as gaps between those anchors.
|
||||
func MatchBlocksFuzzy(markdown string, blocks []Block, maxDist float64) []MatchBlock {
|
||||
func MatchBlocksFuzzy(markdown string, templateBlocks []blocks.TemplateBlock, maxDist float64) []MatchBlock {
|
||||
var out []MatchBlock
|
||||
|
||||
var lastIndex = 0
|
||||
for i, b := range blocks {
|
||||
if b.Type == BlockMatching {
|
||||
for i, b := range templateBlocks {
|
||||
if b.Type == blocks.MatchingBlock {
|
||||
start, end := FuzzyFind(markdown, lastIndex, b.GetContent(), 0.3)
|
||||
if end != -1 {
|
||||
if i > 0 {
|
||||
previousBlock := blocks[i-1]
|
||||
if previousBlock.Type == BlockData {
|
||||
previousBlock := templateBlocks[i-1]
|
||||
if previousBlock.Type == blocks.DataBlock {
|
||||
out = append(out, MatchBlock{
|
||||
Start: lastIndex,
|
||||
End: start,
|
||||
@@ -47,8 +48,8 @@ func MatchBlocksFuzzy(markdown string, blocks []Block, maxDist float64) []MatchB
|
||||
}
|
||||
|
||||
// Handle the last block
|
||||
lastBlock := blocks[len(blocks)-1]
|
||||
if lastBlock.Type == BlockData {
|
||||
lastBlock := templateBlocks[len(templateBlocks)-1]
|
||||
if lastBlock.Type == blocks.DataBlock {
|
||||
out = append(out, MatchBlock{
|
||||
Start: lastIndex,
|
||||
End: len(markdown),
|
||||
|
@@ -1,37 +1,27 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
"git.max-richter.dev/max/marka/registry"
|
||||
)
|
||||
|
||||
func readFile(t *testing.T, fileName string) string {
|
||||
path := filepath.Join("testdata", fileName)
|
||||
data, err := os.ReadFile(path)
|
||||
if err != nil {
|
||||
t.Fatalf("failed to read test data file: %v", err)
|
||||
}
|
||||
return string(data)
|
||||
}
|
||||
|
||||
func TestFuzzyFindAll(t *testing.T) {
|
||||
recipeMd := readFile(t, "baguette.md")
|
||||
recipeMd := readTestDataFile(t, "baguette.md")
|
||||
|
||||
tests := []struct {
|
||||
Needle string
|
||||
Start, End, StartIndex int
|
||||
}{
|
||||
{StartIndex: 0, Needle: "# Ingredients\n", Start: 72, End: 86},
|
||||
{StartIndex: 0, Needle: "# Ingrdients\n", Start: 72, End: 86},
|
||||
{StartIndex: 0, Needle: "# Inrdients\n", Start: 72, End: 86},
|
||||
{StartIndex: 0, Needle: "# Ingredients\n", Start: 77, End: 91},
|
||||
{StartIndex: 0, Needle: "# Ingrdients\n", Start: 77, End: 91},
|
||||
{StartIndex: 0, Needle: "# Inrdients\n", Start: 77, End: 91},
|
||||
{StartIndex: 0, Needle: "---\n", Start: 0, End: 4},
|
||||
{StartIndex: 4, Needle: "---\n", Start: 24, End: 28},
|
||||
{StartIndex: 0, Needle: "# Steps\n", Start: 111, End: 119},
|
||||
{StartIndex: 0, Needle: "# Stps\n", Start: 111, End: 119},
|
||||
{StartIndex: 0, Needle: "# Step\n", Start: 111, End: 119},
|
||||
{StartIndex: 4, Needle: "---\n", Start: 29, End: 33},
|
||||
{StartIndex: 0, Needle: "# Steps\n", Start: 116, End: 124},
|
||||
{StartIndex: 0, Needle: "# Stps\n", Start: 116, End: 124},
|
||||
{StartIndex: 0, Needle: "# Step\n", Start: 116, End: 124},
|
||||
}
|
||||
|
||||
for _, test := range tests {
|
||||
@@ -45,16 +35,20 @@ func TestFuzzyFindAll(t *testing.T) {
|
||||
}
|
||||
|
||||
func TestFuzzyBlockMatch(t *testing.T) {
|
||||
recipeMd := readFile(t, "baguette.md")
|
||||
schemaMd := readFile(t, "recipe.schema.md")
|
||||
blocks := parser.ExtractBlocks(schemaMd)
|
||||
recipeMd := readTestDataFile(t, "baguette.md")
|
||||
schemaMd, err := registry.GetTemplate("recipe")
|
||||
if err != nil {
|
||||
t.Errorf("Failed to load template: %s", err.Error())
|
||||
t.FailNow()
|
||||
}
|
||||
blocks, _ := parser.ExtractBlocks(schemaMd)
|
||||
matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
||||
|
||||
expected := []struct {
|
||||
value string
|
||||
}{
|
||||
{
|
||||
value: "author: Max Richter",
|
||||
value: "author.name: Max Richter",
|
||||
},
|
||||
{
|
||||
value: "Baguette",
|
||||
@@ -66,7 +60,7 @@ func TestFuzzyBlockMatch(t *testing.T) {
|
||||
value: "- Flour\n- Water\n- Salt",
|
||||
},
|
||||
{
|
||||
value: "1. Mix Flour Water and Salt\n2. Bake the bread",
|
||||
value: "1. Mix Flour Water and Salt\n2. Bake the bread\n",
|
||||
},
|
||||
}
|
||||
|
||||
|
@@ -1,17 +1,15 @@
|
||||
// Package parser provides functions for parsing Markdown templates into
|
||||
// structured JSON objects that conform to a JSON Schema.
|
||||
package parser
|
||||
|
||||
func ParseFile(markdownContent string) (map[string]any, error) {
|
||||
func Parse(blocks []MatchBlock) map[string]any {
|
||||
|
||||
// _schema, err := registry.GetTemplate("Recipe")
|
||||
// if err != nil {
|
||||
// return nil, fmt.Errorf("could not get schema: %w", err)
|
||||
// }
|
||||
result := make(map[string]any)
|
||||
|
||||
// Idea is to split the template into blocks, either "matching" blocks which are simple strings.
|
||||
// Or "data" blocks which match the content. Then i want to soft match the "matching" blocks and "data" blocks to the template.
|
||||
// The "matching" blocks should soft match with a levenshtein distance
|
||||
for _, b := range blocks {
|
||||
input := b.GetContent()
|
||||
|
||||
return map[string]any{}, nil
|
||||
key, value, _ := b.Block.Parse(input)
|
||||
result[key] = value
|
||||
}
|
||||
|
||||
return result
|
||||
}
|
||||
|
@@ -1,43 +1,42 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"encoding/json"
|
||||
"os"
|
||||
"path/filepath"
|
||||
"reflect"
|
||||
"fmt"
|
||||
"testing"
|
||||
|
||||
"git.max-richter.dev/max/marka/parser"
|
||||
"git.max-richter.dev/max/marka/registry"
|
||||
)
|
||||
|
||||
func TestParseRecipe_Golden(t *testing.T) {
|
||||
td := filepath.Join("testdata", "recipe_salad")
|
||||
input := filepath.Join(td, "input.md")
|
||||
output := filepath.Join(td, "output.json")
|
||||
func TestParseBaguette(t *testing.T) {
|
||||
recipeMd := readTestDataFile(t, "baguette.md")
|
||||
|
||||
inputContent, err := os.ReadFile(input)
|
||||
template, err := registry.GetTemplate("recipe")
|
||||
if err != nil {
|
||||
t.Fatalf("read input.md: %v", err)
|
||||
t.Fatalf("Err: %s", err)
|
||||
}
|
||||
|
||||
got, err := parser.ParseFile(string(inputContent))
|
||||
blocks, err := parser.ExtractBlocks(template)
|
||||
if err != nil {
|
||||
t.Fatalf("ParseFile: %v", err)
|
||||
t.Fatalf("Err: %s", err)
|
||||
}
|
||||
|
||||
var want map[string]any
|
||||
b, err := os.ReadFile(output)
|
||||
if err != nil {
|
||||
t.Fatalf("read expected.json: %v", err)
|
||||
}
|
||||
if err := json.Unmarshal(b, &want); err != nil {
|
||||
t.Fatalf("unmarshal expected.json: %v", err)
|
||||
matches := parser.MatchBlocksFuzzy(recipeMd, blocks, 0.3)
|
||||
parsed := parser.Parse(matches)
|
||||
expected := map[string]any{
|
||||
"name": "Baguette",
|
||||
"description": "My favourite baguette recipe",
|
||||
"recipeIngredient": []string{"Flour", "Water", "Salt"},
|
||||
// "recipeInstructions": []string{
|
||||
// "Mix Flour Water and Salt",
|
||||
// "Bake the bread",
|
||||
// },
|
||||
}
|
||||
|
||||
// Deep structural compare
|
||||
if !reflect.DeepEqual(want, got) {
|
||||
gb, _ := json.MarshalIndent(got, "", " ")
|
||||
wb, _ := json.MarshalIndent(want, "", " ")
|
||||
t.Fatalf("parsed JSON mismatch\n--- got ---\n%s\n--- want ---\n%s", string(gb), string(wb))
|
||||
for k, v := range expected {
|
||||
if fmt.Sprintf("%v", parsed[k]) != fmt.Sprintf("%v", v) {
|
||||
t.Errorf("Expected %v but got %v", v, parsed[k])
|
||||
}
|
||||
}
|
||||
|
||||
}
|
||||
|
4
parser/testdata/baguette.md
vendored
4
parser/testdata/baguette.md
vendored
@@ -1,5 +1,5 @@
|
||||
---
|
||||
author: Max Richter
|
||||
author.name: Max Richter
|
||||
---
|
||||
|
||||
# Baguette
|
||||
@@ -13,4 +13,4 @@ My favourite baguette recipe
|
||||
|
||||
## Steps
|
||||
1. Mix Flour Water and Salt
|
||||
2. Bake the bread
|
||||
2. Bake the bread
|
||||
|
25
parser/testdata/recipe.schema.md
vendored
25
parser/testdata/recipe.schema.md
vendored
@@ -1,25 +0,0 @@
|
||||
---
|
||||
{ . }
|
||||
---
|
||||
|
||||
# { name | text,required }
|
||||
|
||||
{ description | text,optional }
|
||||
|
||||
## Ingredients
|
||||
{
|
||||
path: recipeIngredient
|
||||
codec: list
|
||||
required: true
|
||||
item:
|
||||
template: "- { . }"
|
||||
}
|
||||
|
||||
## Steps
|
||||
{
|
||||
path: recipeInstructions
|
||||
codec: list
|
||||
required: true
|
||||
item:
|
||||
template: "{ @index }. { . }"
|
||||
}
|
16
parser/testutils_test.go
Normal file
16
parser/testutils_test.go
Normal file
@@ -0,0 +1,16 @@
|
||||
package parser_test
|
||||
|
||||
import (
|
||||
"os"
|
||||
"path/filepath"
|
||||
"testing"
|
||||
)
|
||||
|
||||
// readTestDataFile returns the contents of testdata/<fileName> as a string.
// It fails the test immediately when the file cannot be read.
func readTestDataFile(t *testing.T, fileName string) string {
	// Mark this function as a helper so failures are reported at the caller's
	// line rather than inside this function.
	t.Helper()

	path := filepath.Join("testdata", fileName)
	data, err := os.ReadFile(path)
	if err != nil {
		t.Fatalf("failed to read test data file: %v", err)
	}
	return string(data)
}
|
@@ -1,6 +0,0 @@
|
||||
description: "Core capture aliases for Marka"
|
||||
patterns:
|
||||
text: ".+"
|
||||
word: "\\S+"
|
||||
num: "(?:\\d+(?:[.,]\\d+)?(?:\\s?\\d+/\\d+)?)" # 3 | 1.5 | 1 1/2
|
||||
indexMarker: "\\d+[.)]" # 1. / 1)
|
@@ -28,9 +28,6 @@ var templates embed.FS
|
||||
//go:embed schema-org/*
|
||||
var schemas embed.FS
|
||||
|
||||
//go:embed aliases/*
|
||||
var aliases embed.FS
|
||||
|
||||
func GetTemplates() Source {
|
||||
return src{fsys: templates}
|
||||
}
|
||||
@@ -49,11 +46,3 @@ func GetTemplate(name string) (string, error) {
|
||||
|
||||
return string(templateBytes), nil
|
||||
}
|
||||
|
||||
func GetSchemas() Source {
|
||||
return src{fsys: schemas}
|
||||
}
|
||||
|
||||
func GetAliases() Source {
|
||||
return src{fsys: aliases}
|
||||
}
|
||||
|
@@ -2,10 +2,6 @@
|
||||
{
|
||||
path: .
|
||||
codec: yaml
|
||||
required: true
|
||||
assert:
|
||||
"@context": https://schema.org/
|
||||
"@type": Recipe
|
||||
fields:
|
||||
- path: name
|
||||
codec: text
|
||||
@@ -18,28 +14,22 @@
|
||||
value: Person
|
||||
- path: author.name
|
||||
codec: text
|
||||
required: true
|
||||
- path: datePublished
|
||||
codec: text
|
||||
optional: true
|
||||
- path: description
|
||||
codec: text
|
||||
optional: true
|
||||
- path: prepTime
|
||||
codec: text
|
||||
optional: true
|
||||
- path: cookTime
|
||||
codec: text
|
||||
optional: true
|
||||
- path: recipeYield
|
||||
codec: text
|
||||
optional: true
|
||||
}
|
||||
---
|
||||
|
||||
# { name | text,required }
|
||||
|
||||
{ description | text,optional }
|
||||
{ description | text }
|
||||
|
||||
## Ingredients
|
||||
{
|
Reference in New Issue
Block a user