refactor: simplify parse ingredients code

This commit is contained in:
2025-01-19 19:22:19 +01:00
parent f106460502
commit 78e94ccf82
14 changed files with 201 additions and 137 deletions

View File

@ -109,7 +109,7 @@ export async function getDocument(name: string): Promise<string | undefined> {
/**
 * Builds an update on `documentTable` that sets `content` for the row(s)
 * whose `name` column equals `name`, and returns the result of the builder chain.
 *
 * NOTE(review): two variants of the closing `.where(...)` line are present
 * below; the second one appends `.run()`. They look like the before/after
 * versions of the same statement — only one of them should remain. Confirm
 * which is intended (presumably the `.run()` variant, which would actually
 * execute the statement — verify against the query builder's API).
 */
export function updateDocument(name: string, content: string) {
return db.update(documentTable).set({
content,
}).where(eq(documentTable.name, name));
}).where(eq(documentTable.name, name)).run();
}
export function transformDocument(input: string, cb: (r: Root) => Root) {

View File

@ -3,7 +3,7 @@ import { zodResponseFormat } from "https://deno.land/x/openai@v4.69.0/helpers/zo
import { OPENAI_API_KEY } from "@lib/env.ts";
import { hashString } from "@lib/helpers.ts";
import { createCache } from "@lib/cache.ts";
import recipeSchema from "@lib/recipeSchema.ts";
import recipeSchema, { recipeResponseSchema } from "@lib/recipeSchema.ts";
const openAI = OPENAI_API_KEY && new OpenAI({ apiKey: OPENAI_API_KEY });
@ -223,7 +223,7 @@ export async function extractRecipe(content: string) {
},
{ role: "user", content },
],
response_format: zodResponseFormat(recipeSchema, "recipe-v2"),
response_format: zodResponseFormat(recipeResponseSchema, "recipe-v2"),
});
return recipeSchema.parse(completion.choices[0].message.parsed);

View File

@ -1,35 +1,94 @@
import { parseIngredient as _parseIngredient } from "https://esm.sh/parse-ingredient@1.0.1";
import {
parseIngredient,
unitsOfMeasure as _unitsOfMeasure,
} from "https://esm.sh/parse-ingredient@1.2.1";
import { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts";
import { removeMarkdownFormatting } from "@lib/string.ts";
export function parseIngredient(text: string) {
const ing = _parseIngredient(text, {
additionalUOMs: {
tableSpoon: {
short: "EL",
plural: "Table Spoons",
alternates: ["el", "EL", "Tbsp", "tbsp"],
},
teaSpoon: {
short: "TL",
plural: "Tea Spoon",
alternates: ["tl", "TL", "Tsp", "tsp", "teaspoon"],
},
litre: {
short: "L",
plural: "liters",
alternates: ["L", "l"],
},
paket: {
short: "Paket",
plural: "Pakets",
alternates: ["Paket", "paket"],
},
},
/**
 * Extra units of measure passed to the ingredient parser in addition to the
 * ones it ships with. Each entry provides the abbreviated form (`short`),
 * the plural form (`plural`) and the spellings that should be recognised in
 * ingredient text (`alternates`).
 */
const customUnits = {
  tableSpoon: {
    short: "EL",
    plural: "Table Spoons",
    alternates: ["el", "EL", "Tbsp", "tbsp"],
  },
  dose: {
    short: "Dose",
    plural: "Dosen",
    alternates: ["Dose", "dose", "Dose(n)"],
  },
  pound: {
    short: "lb",
    plural: "pounds",
    alternates: ["lb", "lbs", "pound", "pounds"],
  },
  teaSpoon: {
    short: "TL",
    // Was "Tea Spoon" (singular); now a real plural, consistent with "Table Spoons".
    plural: "Tea Spoons",
    alternates: ["tl", "TL", "Tsp", "tsp", "teaspoon"],
  },
  litre: {
    short: "L",
    plural: "liters",
    alternates: ["L", "l"],
  },
  paket: {
    short: "Paket",
    plural: "Pakets",
    alternates: ["Paket", "paket"],
  },
};
/**
 * Lookup table of all known units: the units exported by the parsing library
 * merged with `customUnits`. Because `customUnits` is spread last, a custom
 * entry replaces a library entry that has the same key.
 */
export const unitsOfMeasure = {
..._unitsOfMeasure,
...customUnits,
} as const;
/**
 * Parses a block of ingredient text into a list of ingredients and
 * ingredient groups.
 *
 * The text is first passed through `removeMarkdownFormatting`, then handed to
 * the library's `parseIngredient` with unit normalisation enabled and
 * `customUnits` registered as additional units. Entries the library flags as
 * group headers open a new group; every following ingredient is added to that
 * group until the next header. Ingredients seen before any header are
 * returned as top-level entries.
 *
 * @param text Raw ingredient text.
 * @returns Top-level ingredients and groups, in the order they were encountered.
 */
export function parseIngredients(
text: string,
): (Ingredient | IngredientGroup)[] {
const cleanText = removeMarkdownFormatting(text);
const ingredients = parseIngredient(cleanText, {
normalizeUOM: true,
additionalUOMs: customUnits,
});
// NOTE(review): the `return { ... }` statement below references `ing[0]`,
// which is not defined at this point, and would make everything after it
// unreachable. It looks like a leftover of the previous single-ingredient
// implementation — confirm and remove.
return {
name: ing[0].description,
unit: ing[0].unitOfMeasure || "",
quantity: ing[0].quantity?.toString() || "",
note: "",
};
const results: (Ingredient | IngredientGroup)[] = [];
// Group currently being filled; undefined until the first group header is seen.
let currentGroup: IngredientGroup | undefined;
for (const ing of ingredients) {
if (ing.isGroupHeader) {
// A new header closes the previous group (if any) before opening the next one.
if (currentGroup) {
results.push(currentGroup);
}
currentGroup = {
// Drop a trailing colon from the header text.
name: ing.description.replace(/:$/, ""),
items: [],
};
} else {
const ingredient = {
// Strip a leading "-" (optionally preceded by one whitespace character).
name: ing.description.replace(/^\s?-/, "").trim(),
unit: ing.unitOfMeasure || "",
// Falls back to `quantity2` when `quantity` is missing.
quantity: ing.quantity?.toString() || ing.quantity2?.toString() || "",
note: "",
};
// Replace the unit with its short form when the lower-cased unit is a key of
// `unitsOfMeasure`; "cup" is deliberately left untouched.
// NOTE(review): the key is lower-cased before the lookup, so the camelCase
// custom keys (`tableSpoon`, `teaSpoon`) can never match here; whether the
// library table has lower-case equivalents is not visible from this file — confirm.
const unit = ingredient.unit.toLowerCase() as keyof typeof unitsOfMeasure;
if (unit in unitsOfMeasure && unit !== "cup") {
ingredient.unit = unitsOfMeasure[unit].short;
}
if (!currentGroup) {
results.push(ingredient);
} else {
currentGroup.items.push(ingredient);
}
}
}
// Flush the last open group.
if (currentGroup) {
results.push(currentGroup);
}
return results;
}

View File

@ -16,6 +16,7 @@ export async function fetchHtmlWithPlaywright(
streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string> {
streamResponse.enqueue("booting up playwright");
// Launch the Playwright browser
const browser = await firefox.launch();

View File

@ -36,4 +36,12 @@ const recipeSchema = z.object({
notes: z.array(z.string()).describe("Optional notes about the recipe"),
});
/** Shape of the response when no recipe could be extracted. */
const noRecipeSchema = z.object({
  errorMessages: z
    .string()
    .array()
    .describe("List of error messages, if no recipe was found"),
});

/** A response is either a recipe or a list of error messages. */
export const recipeResponseSchema = recipeSchema.or(noRecipeSchema);
export default recipeSchema;

View File

@ -1,6 +1,5 @@
import {
type DocumentChild,
getTextOfChild,
getTextOfRange,
parseDocument,
} from "@lib/documents.ts";
@ -9,7 +8,7 @@ import { createCrud } from "@lib/crud.ts";
import { extractHashTags } from "@lib/string.ts";
import { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts";
import { fixRenderedMarkdown } from "@lib/helpers.ts";
import { parseIngredient } from "@lib/parseIngredient.ts";
import { parseIngredients } from "@lib/parseIngredient.ts";
export type Recipe = {
type: "recipe";
@ -33,72 +32,14 @@ export type Recipe = {
};
};
/**
 * Converts one list-item node into an ingredient by joining the text of its
 * child nodes and parsing the resulting line.
 *
 * @returns The parsed ingredient, or `undefined` when the node is not a list item.
 */
function parseIngredientItem(listItem: DocumentChild): Ingredient | undefined {
  if (listItem.type !== "listItem") {
    return undefined;
  }

  // Use the children of the first child when it has any, otherwise the
  // item's own children.
  const firstChild = listItem.children[0];
  const nodes: DocumentChild[] = firstChild?.children || listItem.children;

  const pieces = nodes.map((node) => getTextOfChild(node));
  const line = pieces.join(" ").trim();
  return parseIngredient(line);
}
/** Type guard: true for any truthy value, narrowing it to `Ingredient`. */
const isIngredient = (item: Ingredient | undefined): item is Ingredient =>
  Boolean(item);
/**
 * Parses every item of a list node into an ingredient, dropping items that
 * could not be parsed. Any node that is not a list with children yields an
 * empty array.
 */
function parseIngredientsList(list: DocumentChild): Ingredient[] {
  if (list.type !== "list" || !("children" in list)) {
    return [];
  }

  const parsed: Ingredient[] = [];
  for (const listItem of list.children) {
    const candidate = parseIngredientItem(listItem);
    if (isIngredient(candidate)) {
      parsed.push(candidate);
    }
  }
  return parsed;
}
/**
 * Walks the nodes of the ingredients section and collects ingredients.
 *
 * - A paragraph directly followed by a list becomes a named group whose items
 *   are the parsed list entries; the list is consumed together with the paragraph.
 * - A paragraph not followed by a list is ignored.
 * - A stand-alone list contributes its entries as top-level ingredients.
 */
function parseIngredients(children: DocumentChild[]): Recipe["ingredients"] {
  if (!children) return [];

  const collected: (Ingredient | IngredientGroup)[] = [];
  let index = 0;

  while (index < children.length) {
    const node = children[index];
    const following = children[index + 1];

    if (node.type === "paragraph" && following?.type === "list") {
      collected.push({
        name: getTextOfChild(node) || "",
        items: parseIngredientsList(following),
      });
      // Step over both the heading paragraph and the list it introduces.
      index += 2;
      continue;
    }

    if (node.type === "list") {
      collected.push(...parseIngredientsList(node));
    }
    index += 1;
  }

  return collected;
}
/**
 * Splits instruction text into individual steps.
 *
 * @param content Raw instruction text.
 * @param seperator Pattern used to split the text; by default a newline that
 *   is immediately followed by "<digits>." (the start of a numbered step).
 * @returns The non-empty step texts.
 */
function extractSteps(
content: string,
seperator: RegExp = /\n(?=\d+\.)/g,
): string[] {
const steps = content.split(seperator).map((step) => {
// Capture the text after the leading "<digits>." prefix. `.` does not match
// newlines, so only the remainder of the first line is captured.
const match = step.match(/^(\d+)\.\s*(.*)/);
// NOTE(review): two alternative endings are present below. The first three
// lines drop chunks without a numeric prefix; the last two (unreachable as
// written) would instead keep such chunks unchanged. This looks like the old
// and new version of the same logic side by side — confirm which is intended
// and remove the other.
if (!match) return;
const [, , text] = match;
return text;
if (match) return match[2];
return step;
// Discard undefined and empty results.
}).filter((step) => !!step);
return steps as string[];
}
@ -141,7 +82,14 @@ export function parseRecipe(original: string, id: string): Recipe {
let description = getTextOfRange(groups[0], original);
const ingredients = parseIngredients(groups[1]);
let ingredientsText = getTextOfRange(groups[1], original);
if (ingredientsText) {
ingredientsText = ingredientsText.replace(/#+\s?Ingredients?/, "");
} else {
ingredientsText = "";
}
const ingredients = parseIngredients(ingredientsText);
const instructionText = getTextOfRange(groups[2], original);
let instructions = extractSteps(instructionText || "");

View File

@ -55,8 +55,6 @@ const isResource = (
export async function searchResource(
{ q, tags = [], types, authors, rating }: SearchParams,
): Promise<GenericResource[]> {
console.log("searchResource", { q, tags, types, authors, rating });
let resources = (await Promise.all([
(!types || types.includes("movie")) && getAllMovies(),
(!types || types.includes("series")) && getAllSeries(),

View File

@ -133,3 +133,41 @@ export function parseTimeCacheKey(key: string) {
/**
 * Formats three colour channels as a "#"-prefixed string by concatenating the
 * result of `componentToHex` for red, green and blue, in that order.
 */
export function rgbToHex(r: number, g: number, b: number) {
  const red = componentToHex(r);
  const green = componentToHex(g);
  const blue = componentToHex(b);
  return `#${red}${green}${blue}`;
}
/**
 * Strips common Markdown syntax from `text` and returns the plain text.
 *
 * Fenced code blocks, images and horizontal rules (`---`) are removed
 * entirely. Inline code, links, bold/italic/strikethrough, headings,
 * blockquotes and ordered-list numbers keep their text but lose their markup.
 * Unordered list bullets (`-`, `*`, `+` followed by whitespace) are not
 * removed but normalised to a single leading `-` with no following space.
 *
 * @param text Markdown source.
 * @returns The text with the formatting described above removed.
 */
export function removeMarkdownFormatting(text: string): string {
  // Remove code blocks
  text = text.replace(/```[\s\S]*?```/g, "");

  // Remove inline code (keep its content)
  text = text.replace(/`([^`]+)`/g, "$1");

  // Remove images
  text = text.replace(/!\[.*?\]\(.*?\)/g, "");

  // Remove links (keep the link text)
  text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, "$1");

  // Remove bold and italic formatting
  text = text.replace(/(\*\*|__)(.*?)\1/g, "$2"); // Bold
  // An emphasis opener must be followed by a non-whitespace character.
  // Without this check a "* " list bullet (or a free-standing "*" such as in
  // "2 * 3") was taken as the start of italic text and swallowed together
  // with the next "*" on the line.
  text = text.replace(/(\*|_)(?=\S)(.*?)\1/g, "$2"); // Italic

  // Remove strikethrough
  text = text.replace(/~~(.*?)~~/g, "$1");

  // Remove heading markers
  text = text.replace(/^#{1,6}\s*(.+)$/gm, "$1");

  // Remove blockquote markers
  text = text.replace(/^>\s*/gm, "");

  // Normalise unordered list markers ("- ", "* ", "+ ") to a bare "-"
  text = text.replace(/^[-*+]\s+/gm, "-");

  // Remove ordered list markers
  text = text.replace(/^\d+\.\s+/gm, "");

  // Remove horizontal rules
  text = text.replace(/^---+$/gm, "");

  return text;
}