refactor: simplify parse ingredients code

This commit is contained in:
max_richter 2025-01-19 19:22:19 +01:00
parent f106460502
commit 78e94ccf82
14 changed files with 201 additions and 137 deletions

View File

@ -1,21 +1,25 @@
FROM denoland/deno:2.1.4 AS build FROM denoland/deno:2.1.4 AS build
RUN apt-get update && apt-get install -y --no-install-recommends \
curl && \
deno run -A npm:playwright install --with-deps firefox &&\
apt-get clean && \
rm -rf /var/lib/apt/lists/*
WORKDIR /app WORKDIR /app
COPY deno.json /app/
COPY . . COPY . .
ENV DATA_DIR=/app/data ENV DATA_DIR=/app/data
RUN apt-get update && apt install -y curl && \ RUN mkdir -p $DATA_DIR && \
deno run -A npm:playwright install --with-deps firefox && \
deno install --allow-import --allow-ffi --allow-scripts=npm:sharp@0.33.5-rc.1 -e main.ts &&\ deno install --allow-import --allow-ffi --allow-scripts=npm:sharp@0.33.5-rc.1 -e main.ts &&\
sed -i -e 's/"deno"/"no-deno"/' node_modules/@libsql/client/package.json &&\ sed -i -e 's/"deno"/"no-deno"/' node_modules/@libsql/client/package.json &&\
mkdir -p $DATA_DIR &&\
deno task build deno task build
EXPOSE 8000 EXPOSE 8000
# Start the application
CMD ["run", "-A", "main.ts"] CMD ["run", "-A", "main.ts"]

View File

@ -1,13 +1,23 @@
import { Signal } from "@preact/signals"; import { Signal } from "@preact/signals";
import type { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts"; import type { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts";
import { FunctionalComponent } from "preact"; import { FunctionalComponent } from "preact";
import { unitsOfMeasure } from "@lib/parseIngredient.ts";
function numberToString(num: number) { function formatAmount(num: number) {
if (num === 0) return "";
return (Math.floor(num * 4) / 4).toString(); return (Math.floor(num * 4) / 4).toString();
} }
function stringToNumber(str: string) { function formatUnit(unit: string, amount: number) {
return parseFloat(str); const unitKey = unit.toLowerCase() as keyof typeof unitsOfMeasure;
if (unitKey in unitsOfMeasure) {
if (amount > 1 && unitsOfMeasure[unitKey].plural !== undefined) {
return unitsOfMeasure[unitKey].plural;
}
return unitsOfMeasure[unitKey].short;
} else {
return unit;
}
} }
const Ingredient = ( const Ingredient = (
@ -20,7 +30,7 @@ const Ingredient = (
) => { ) => {
const { name, quantity, unit } = ingredient; const { name, quantity, unit } = ingredient;
const parsedQuantity = stringToNumber(quantity); const parsedQuantity = parseFloat(quantity);
const finalAmount = (typeof parsedQuantity === "number" && amount) const finalAmount = (typeof parsedQuantity === "number" && amount)
? (parsedQuantity / portion) * (amount?.value || 1) ? (parsedQuantity / portion) * (amount?.value || 1)
@ -29,8 +39,10 @@ const Ingredient = (
return ( return (
<tr key={key}> <tr key={key}>
<td class="pr-4 py-2"> <td class="pr-4 py-2">
{numberToString(finalAmount || 0) + {formatAmount(finalAmount || 0)}
(typeof unit === "string" ? unit : "")} <span class="ml-0.5 opacity-50">
{formatUnit(unit, finalAmount || 0)}
</span>
</td> </td>
<td class="px-4 py-2">{name}</td> <td class="px-4 py-2">{name}</td>
</tr> </tr>

View File

@ -109,7 +109,7 @@ export async function getDocument(name: string): Promise<string | undefined> {
export function updateDocument(name: string, content: string) { export function updateDocument(name: string, content: string) {
return db.update(documentTable).set({ return db.update(documentTable).set({
content, content,
}).where(eq(documentTable.name, name)); }).where(eq(documentTable.name, name)).run();
} }
export function transformDocument(input: string, cb: (r: Root) => Root) { export function transformDocument(input: string, cb: (r: Root) => Root) {

View File

@ -3,7 +3,7 @@ import { zodResponseFormat } from "https://deno.land/x/openai@v4.69.0/helpers/zo
import { OPENAI_API_KEY } from "@lib/env.ts"; import { OPENAI_API_KEY } from "@lib/env.ts";
import { hashString } from "@lib/helpers.ts"; import { hashString } from "@lib/helpers.ts";
import { createCache } from "@lib/cache.ts"; import { createCache } from "@lib/cache.ts";
import recipeSchema from "@lib/recipeSchema.ts"; import recipeSchema, { recipeResponseSchema } from "@lib/recipeSchema.ts";
const openAI = OPENAI_API_KEY && new OpenAI({ apiKey: OPENAI_API_KEY }); const openAI = OPENAI_API_KEY && new OpenAI({ apiKey: OPENAI_API_KEY });
@ -223,7 +223,7 @@ export async function extractRecipe(content: string) {
}, },
{ role: "user", content }, { role: "user", content },
], ],
response_format: zodResponseFormat(recipeSchema, "recipe-v2"), response_format: zodResponseFormat(recipeResponseSchema, "recipe-v2"),
}); });
return recipeSchema.parse(completion.choices[0].message.parsed); return recipeSchema.parse(completion.choices[0].message.parsed);

View File

@ -1,13 +1,26 @@
import { parseIngredient as _parseIngredient } from "https://esm.sh/parse-ingredient@1.0.1"; import {
parseIngredient,
unitsOfMeasure as _unitsOfMeasure,
} from "https://esm.sh/parse-ingredient@1.2.1";
import { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts";
import { removeMarkdownFormatting } from "@lib/string.ts";
export function parseIngredient(text: string) { const customUnits = {
const ing = _parseIngredient(text, {
additionalUOMs: {
tableSpoon: { tableSpoon: {
short: "EL", short: "EL",
plural: "Table Spoons", plural: "Table Spoons",
alternates: ["el", "EL", "Tbsp", "tbsp"], alternates: ["el", "EL", "Tbsp", "tbsp"],
}, },
dose: {
short: "Dose",
plural: "Dosen",
alternates: ["Dose", "dose", "Dose(n)"],
},
pound: {
short: "lb",
plural: "pounds",
alternates: ["lb", "lbs", "pound", "pounds"],
},
teaSpoon: { teaSpoon: {
short: "TL", short: "TL",
plural: "Tea Spoon", plural: "Tea Spoon",
@ -23,13 +36,59 @@ export function parseIngredient(text: string) {
plural: "Pakets", plural: "Pakets",
alternates: ["Paket", "paket"], alternates: ["Paket", "paket"],
}, },
}, };
export const unitsOfMeasure = {
..._unitsOfMeasure,
...customUnits,
} as const;
export function parseIngredients(
text: string,
): (Ingredient | IngredientGroup)[] {
const cleanText = removeMarkdownFormatting(text);
const ingredients = parseIngredient(cleanText, {
normalizeUOM: true,
additionalUOMs: customUnits,
}); });
return { const results: (Ingredient | IngredientGroup)[] = [];
name: ing[0].description, let currentGroup: IngredientGroup | undefined;
unit: ing[0].unitOfMeasure || "",
quantity: ing[0].quantity?.toString() || "", for (const ing of ingredients) {
if (ing.isGroupHeader) {
if (currentGroup) {
results.push(currentGroup);
}
currentGroup = {
name: ing.description.replace(/:$/, ""),
items: [],
};
} else {
const ingredient = {
name: ing.description.replace(/^\s?-/, "").trim(),
unit: ing.unitOfMeasure || "",
quantity: ing.quantity?.toString() || ing.quantity2?.toString() || "",
note: "", note: "",
}; };
const unit = ingredient.unit.toLowerCase() as keyof typeof unitsOfMeasure;
if (unit in unitsOfMeasure && unit !== "cup") {
ingredient.unit = unitsOfMeasure[unit].short;
}
if (!currentGroup) {
results.push(ingredient);
} else {
currentGroup.items.push(ingredient);
}
}
}
if (currentGroup) {
results.push(currentGroup);
}
return results;
} }

View File

@ -16,6 +16,7 @@ export async function fetchHtmlWithPlaywright(
streamResponse: ReturnType<typeof createStreamResponse>, streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string> { ): Promise<string> {
streamResponse.enqueue("booting up playwright"); streamResponse.enqueue("booting up playwright");
// Launch the Playwright browser // Launch the Playwright browser
const browser = await firefox.launch(); const browser = await firefox.launch();

View File

@ -36,4 +36,12 @@ const recipeSchema = z.object({
notes: z.array(z.string()).describe("Optional notes about the recipe"), notes: z.array(z.string()).describe("Optional notes about the recipe"),
}); });
// Alternative response shape: carries only a list of error messages and none
// of the recipe fields.
const noRecipeSchema = z.object({
errorMessages: z.array(z.string()).describe(
"List of error messages, if no recipe was found",
),
});
// Union of the two shapes above: either a full recipe or the error-only object.
// NOTE(review): if this is passed as an OpenAI structured-output
// `response_format`, confirm that a root-level union (emitted as `anyOf`) is
// accepted — the API may require the root schema to be a plain object.
// NOTE(review): consumers that validate the result with `recipeSchema` alone
// will presumably throw on the `errorMessages` variant — verify against callers.
export const recipeResponseSchema = z.union([recipeSchema, noRecipeSchema]);
export default recipeSchema; export default recipeSchema;

View File

@ -1,6 +1,5 @@
import { import {
type DocumentChild, type DocumentChild,
getTextOfChild,
getTextOfRange, getTextOfRange,
parseDocument, parseDocument,
} from "@lib/documents.ts"; } from "@lib/documents.ts";
@ -9,7 +8,7 @@ import { createCrud } from "@lib/crud.ts";
import { extractHashTags } from "@lib/string.ts"; import { extractHashTags } from "@lib/string.ts";
import { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts"; import { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts";
import { fixRenderedMarkdown } from "@lib/helpers.ts"; import { fixRenderedMarkdown } from "@lib/helpers.ts";
import { parseIngredient } from "@lib/parseIngredient.ts"; import { parseIngredients } from "@lib/parseIngredient.ts";
export type Recipe = { export type Recipe = {
type: "recipe"; type: "recipe";
@ -33,72 +32,14 @@ export type Recipe = {
}; };
}; };
function parseIngredientItem(listItem: DocumentChild): Ingredient | undefined {
if (listItem.type === "listItem") {
const children: DocumentChild[] = listItem.children[0]?.children ||
listItem.children;
const text = children.map((c) => getTextOfChild(c)).join(" ").trim();
return parseIngredient(text);
}
}
const isIngredient = (item: Ingredient | undefined): item is Ingredient => {
return !!item;
};
function parseIngredientsList(list: DocumentChild): Ingredient[] {
if (list.type === "list" && "children" in list) {
return list.children.map((listItem) => {
return parseIngredientItem(listItem);
}).filter(isIngredient);
}
return [];
}
function parseIngredients(children: DocumentChild[]): Recipe["ingredients"] {
const ingredients: (Ingredient | IngredientGroup)[] = [];
if (!children) return [];
let skip = false;
for (let i = 0; i < children.length; i++) {
if (skip) {
skip = false;
continue;
}
const child = children[i];
if (child.type === "paragraph") {
const nextChild = children[i + 1];
if (!nextChild || nextChild.type !== "list") continue;
const name = getTextOfChild(child);
ingredients.push({
name: name || "",
items: parseIngredientsList(nextChild),
});
skip = true;
continue;
}
if (child.type === "list") {
ingredients.push(...parseIngredientsList(child));
}
}
return ingredients;
}
function extractSteps( function extractSteps(
content: string, content: string,
seperator: RegExp = /\n(?=\d+\.)/g, seperator: RegExp = /\n(?=\d+\.)/g,
): string[] { ): string[] {
const steps = content.split(seperator).map((step) => { const steps = content.split(seperator).map((step) => {
const match = step.match(/^(\d+)\.\s*(.*)/); const match = step.match(/^(\d+)\.\s*(.*)/);
if (!match) return; if (match) return match[2];
const [, , text] = match; return step;
return text;
}).filter((step) => !!step); }).filter((step) => !!step);
return steps as string[]; return steps as string[];
} }
@ -141,7 +82,14 @@ export function parseRecipe(original: string, id: string): Recipe {
let description = getTextOfRange(groups[0], original); let description = getTextOfRange(groups[0], original);
const ingredients = parseIngredients(groups[1]); let ingredientsText = getTextOfRange(groups[1], original);
if (ingredientsText) {
ingredientsText = ingredientsText.replace(/#+\s?Ingredients?/, "");
} else {
ingredientsText = "";
}
const ingredients = parseIngredients(ingredientsText);
const instructionText = getTextOfRange(groups[2], original); const instructionText = getTextOfRange(groups[2], original);
let instructions = extractSteps(instructionText || ""); let instructions = extractSteps(instructionText || "");

View File

@ -55,8 +55,6 @@ const isResource = (
export async function searchResource( export async function searchResource(
{ q, tags = [], types, authors, rating }: SearchParams, { q, tags = [], types, authors, rating }: SearchParams,
): Promise<GenericResource[]> { ): Promise<GenericResource[]> {
console.log("searchResource", { q, tags, types, authors, rating });
let resources = (await Promise.all([ let resources = (await Promise.all([
(!types || types.includes("movie")) && getAllMovies(), (!types || types.includes("movie")) && getAllMovies(),
(!types || types.includes("series")) && getAllSeries(), (!types || types.includes("series")) && getAllSeries(),

View File

@ -133,3 +133,41 @@ export function parseTimeCacheKey(key: string) {
export function rgbToHex(r: number, g: number, b: number) { export function rgbToHex(r: number, g: number, b: number) {
return "#" + componentToHex(r) + componentToHex(g) + componentToHex(b); return "#" + componentToHex(r) + componentToHex(g) + componentToHex(b);
} }
/**
 * Strips common Markdown syntax from `text` and returns the plain-text result.
 *
 * Fenced code blocks and images are dropped entirely. Inline code, links,
 * emphasis, strikethrough, headings and blockquotes are reduced to their inner
 * text. Ordered list markers are removed, and unordered list markers
 * (`-`, `*`, `+`) are normalised to a bare `-` prefix with no trailing space.
 *
 * Line-level constructs (horizontal rules, headings, blockquotes, list
 * markers) are processed before inline emphasis. Otherwise a `*` bullet would
 * be paired with an emphasis asterisk on the same line and both would be
 * consumed as if they were an italic span.
 *
 * @param text Markdown source; may be empty.
 * @returns The text with Markdown formatting removed.
 */
export function removeMarkdownFormatting(text: string): string {
  // Remove code blocks
  text = text.replace(/```[\s\S]*?```/g, "");

  // Remove inline code
  text = text.replace(/`([^`]+)`/g, "$1");

  // Remove images (must run before links, which share the bracket syntax)
  text = text.replace(/!\[.*?\]\(.*?\)/g, "");

  // Remove links, keeping the link text
  text = text.replace(/\[([^\]]+)\]\([^\)]+\)/g, "$1");

  // Remove horizontal rules (`---`, `***`, `___`) before emphasis handling,
  // which would otherwise partially eat the asterisk/underscore variants.
  text = text.replace(/^(?:-{3,}|\*{3,}|_{3,})[ \t]*$/gm, "");

  // Remove headings
  text = text.replace(/^#{1,6}\s*(.+)$/gm, "$1");

  // Remove blockquotes
  text = text.replace(/^>\s*/gm, "");

  // Normalise unordered list markers to a bare "-". Only spaces/tabs are
  // consumed so an empty item cannot swallow the newline and merge with the
  // following line.
  text = text.replace(/^[-*+][ \t]+/gm, "-");

  // Remove ordered list markers (same whitespace restriction as above)
  text = text.replace(/^\d+\.[ \t]+/gm, "");

  // Remove bold and italic formatting. Underscore emphasis is only recognised
  // when the delimiters are not glued to a letter, digit or underscore, so
  // identifiers such as snake_case_name are left intact.
  text = text.replace(/\*\*(.*?)\*\*/g, "$1"); // Bold (asterisks)
  text = text.replace(
    /(?<![\p{L}\p{N}_])__(.*?)__(?![\p{L}\p{N}_])/gu,
    "$1",
  ); // Bold (underscores)
  text = text.replace(/\*(.*?)\*/g, "$1"); // Italic (asterisks)
  text = text.replace(
    /(?<![\p{L}\p{N}_])_(.*?)_(?![\p{L}\p{N}_])/gu,
    "$1",
  ); // Italic (underscores)

  // Remove strikethrough
  text = text.replace(/~~(.*?)~~/g, "$1");

  return text;
}

View File

@ -90,14 +90,8 @@ async function extractUsingAI(
const markdown = service.turndown(cleanDocument); const markdown = service.turndown(cleanDocument);
streamResponse.enqueue("extracting recipe with openai"); streamResponse.enqueue("extracting recipe with openai");
console.log("------- MARKDOWN ------");
console.log(markdown);
console.log("-----------------------");
const recipe = await openai.extractRecipe(markdown); const recipe = await openai.extractRecipe(markdown);
console.log("------- EXTRACTED ------");
console.log(JSON.stringify(recipe, null, 2));
console.log("-----------------------");
return recipe; return recipe;
} }
@ -142,7 +136,6 @@ async function processCreateRecipeFromUrl(
let recipe: z.infer<typeof recipeSchema> | undefined = undefined; let recipe: z.infer<typeof recipeSchema> | undefined = undefined;
if (jsonLds.length > 0) { if (jsonLds.length > 0) {
for (const jsonLd of jsonLds) { for (const jsonLd of jsonLds) {
console.log({ content: jsonLd.textContent });
recipe = parseJsonLdToRecipeSchema(jsonLd.textContent || ""); recipe = parseJsonLdToRecipeSchema(jsonLd.textContent || "");
if (recipe) break; if (recipe) break;
} }
@ -152,7 +145,7 @@ async function processCreateRecipeFromUrl(
recipe = await extractUsingAI(url, document, streamResponse); recipe = await extractUsingAI(url, document, streamResponse);
} }
const id = (recipe?.title || title || "").replaceAll(" ", "-"); const id = (recipe?.title || title || "").replace(/--+/, "-");
if (!recipe) { if (!recipe) {
streamResponse.enqueue("failed to parse recipe"); streamResponse.enqueue("failed to parse recipe");
@ -226,10 +219,6 @@ async function processCreateRecipeFromUrl(
streamResponse.enqueue("finished processing, creating file"); streamResponse.enqueue("finished processing, creating file");
console.log("------- CREATING ------");
console.log(JSON.stringify(recipe, null, 2));
console.log("-----------------------");
await createRecipe(newRecipe.id, newRecipe); await createRecipe(newRecipe.id, newRecipe);
streamResponse.enqueue("id: " + newRecipe.id); streamResponse.enqueue("id: " + newRecipe.id);
@ -254,6 +243,7 @@ export const handler: Handlers = {
processCreateRecipeFromUrl({ fetchUrl, streamResponse }).then((article) => { processCreateRecipeFromUrl({ fetchUrl, streamResponse }).then((article) => {
log.debug("created article from link", { article }); log.debug("created article from link", { article });
}).catch((err) => { }).catch((err) => {
streamResponse.enqueue(`error creating article: ${err}`);
log.error(err); log.error(err);
}).finally(() => { }).finally(() => {
streamResponse.cancel(); streamResponse.cancel();

View File

@ -1,5 +1,5 @@
import recipeSchema from "@lib/recipeSchema.ts"; import recipeSchema from "@lib/recipeSchema.ts";
import { parseIngredient } from "@lib/parseIngredient.ts"; import { parseIngredients } from "@lib/parseIngredient.ts";
export function parseJsonLdToRecipeSchema(jsonLdContent: string) { export function parseJsonLdToRecipeSchema(jsonLdContent: string) {
try { try {
@ -20,8 +20,8 @@ export function parseJsonLdToRecipeSchema(jsonLdContent: string) {
} }
// Map and parse ingredients into the new schema // Map and parse ingredients into the new schema
const ingredients = (data.recipeIngredient || []).map( const ingredients = parseIngredients(
parseIngredient, data?.recipeIngredient?.join("\n") || "",
); );
const instructions = Array.isArray(data.recipeInstructions) const instructions = Array.isArray(data.recipeInstructions)

View File

@ -46,6 +46,7 @@ function ValidRecipe({
portion={portion} portion={portion}
/> />
<h3 class="text-3xl my-5">Preparation</h3> <h3 class="text-3xl my-5">Preparation</h3>
<div class="pl-2">
<ol class="list-decimal grid gap-4"> <ol class="list-decimal grid gap-4">
{recipe.instructions && (recipe.instructions.map((instruction) => { {recipe.instructions && (recipe.instructions.map((instruction) => {
return ( return (
@ -57,6 +58,7 @@ function ValidRecipe({
); );
}))} }))}
</ol> </ol>
</div>
</> </>
); );
} }

View File

@ -116,3 +116,7 @@ input[type=number] {
.highlight>pre { .highlight>pre {
text-wrap: wrap; text-wrap: wrap;
} }
/* Colour the list-item markers inside .list-decimal lists with #8a898c. */
.list-decimal li::marker {
color: #8a898c;
}