feat: url scraper to recipe

This commit is contained in:
2025-01-18 00:46:05 +01:00
parent 6112d007c2
commit d4cccacc28
24 changed files with 1349 additions and 137 deletions

View File

@ -85,6 +85,9 @@ export function createCrud<T extends GenericResource>(
}
const content = await getDocument(path);
if (!content) {
return;
}
const parsed = parse(content, id);
@ -92,12 +95,13 @@ export function createCrud<T extends GenericResource>(
return addThumbnailToResource(parsed);
}
const doc = { ...parsed, content };
cache.set(path, doc);
cache.set(path, doc, { expires: 10 * 1000 });
return doc;
}
function create(id: string, content: string | ArrayBuffer | T) {
const path = pathFromId(id);
cache.set("all", undefined);
if (
typeof content === "string" || content instanceof ArrayBuffer
) {
@ -105,7 +109,9 @@ export function createCrud<T extends GenericResource>(
}
if (render) {
return createDocument(path, render(content));
const rendered = render(content);
cache.set(path, content);
return createDocument(path, rendered);
}
throw new Error("No renderer defined for " + prefix + " CRUD");
@ -114,7 +120,11 @@ export function createCrud<T extends GenericResource>(
async function update(id: string, updater: (r: Root) => Root) {
const path = pathFromId(id);
const content = await getDocument(path);
if (!content) {
return;
}
const newDoc = transformDocument(content, updater);
cache.set("all", undefined);
await createDocument(path, newDoc);
}
@ -132,7 +142,8 @@ export function createCrud<T extends GenericResource>(
const id = doc.name.replace(prefix, "").replace(/\.md$/, "");
return read(id);
}),
)).sort(sortFunction<T>(sort));
)).sort(sortFunction<T>(sort)).filter((v) => !!v);
cache.set("all", parsed);
return parsed;
}

View File

@ -58,6 +58,10 @@ export function createDocument(
log.info("creating document", { name });
if (typeof content === "string") {
updateDocument(name, content).catch(log.error);
}
return fetch(SILVERBULLET_SERVER + "/" + name, {
body: content,
method: "PUT",
@ -65,25 +69,49 @@ export function createDocument(
});
}
export async function getDocument(name: string): Promise<string> {
const documents = await db.select().from(documentTable).where(
eq(documentTable.name, name),
).limit(1);
if (documents[0]?.content) return documents[0].content;
async function fetchDocument(name: string) {
log.debug("fetching document", { name });
const headers = new Headers();
headers.append("X-Sync-Mode", "true");
const response = await fetch(SILVERBULLET_SERVER + "/" + name, { headers });
const text = await response.text();
if (response.status === 404) {
return;
}
return response.text();
}
await db.update(documentTable).set({
content: text,
}).where(eq(documentTable.name, name));
export async function getDocument(name: string): Promise<string | undefined> {
const documents = await db.select().from(documentTable).where(
eq(documentTable.name, name),
).limit(1);
// This updates the document in the background
fetchDocument(name).then((content) => {
if (content) {
updateDocument(name, content);
} else {
db.delete(documentTable).where(eq(documentTable.name, name));
}
}).catch(
log.error,
);
if (documents[0]?.content) return documents[0].content;
const text = await fetchDocument(name);
if (!text) {
db.delete(documentTable).where(eq(documentTable.name, name));
return;
}
await updateDocument(name, text);
return text;
}
/**
 * Overwrites the stored `content` of the document row whose name equals `name`.
 *
 * Issues an UPDATE only: rows are matched by name and nothing is inserted
 * here, so a name with no existing row results in no change.
 *
 * @param name    Document name used to select the row.
 * @param content New content to store.
 * @returns The update query built from `db`, for the caller to await.
 */
export function updateDocument(name: string, content: string) {
  const matchesName = eq(documentTable.name, name);
  const patch = { content };
  return db.update(documentTable).set(patch).where(matchesName);
}
export function transformDocument(input: string, cb: (r: Root) => Root) {
const out = unified()
.use(remarkParse)

View File

@ -1,7 +1,9 @@
import OpenAI from "https://deno.land/x/openai@v4.52.0/mod.ts";
import OpenAI from "https://deno.land/x/openai@v4.69.0/mod.ts";
import { zodResponseFormat } from "https://deno.land/x/openai@v4.69.0/helpers/zod.ts";
import { OPENAI_API_KEY } from "@lib/env.ts";
import { hashString } from "@lib/helpers.ts";
import { createCache } from "@lib/cache.ts";
import recipeSchema from "@lib/recipeSchema.ts";
const openAI = OPENAI_API_KEY && new OpenAI({ apiKey: OPENAI_API_KEY });
@ -208,3 +210,21 @@ export async function createTags(content: string) {
return extractListFromResponse(res).map((v) => v.replaceAll(" ", "-"));
}
/**
 * Asks the OpenAI chat completion API to extract a structured recipe from
 * markdown text, using `recipeSchema` as the response format.
 *
 * @param content Markdown to extract the recipe from; sent as the user message.
 * @returns The first choice's parsed message, validated with `recipeSchema`,
 *          or `undefined` when no OpenAI client is configured.
 * @throws Whatever `recipeSchema.parse` throws when the parsed message does
 *         not satisfy the schema; request errors propagate as well.
 */
export async function extractRecipe(content: string) {
  // Without an API key there is no client; callers get `undefined`.
  if (!openAI) {
    return;
  }

  const responseFormat = zodResponseFormat(recipeSchema, "recipe-v2");

  const completion = await openAI.beta.chat.completions.parse({
    model: "gpt-4o-2024-08-06",
    temperature: 0.1,
    messages: [
      {
        role: "system",
        content: "Extract the recipe information from the provided markdown.",
      },
      { role: "user", content },
    ],
    response_format: responseFormat,
  });

  // Validate the parsed payload once more against the schema before returning.
  const [firstChoice] = completion.choices;
  return recipeSchema.parse(firstChoice.message.parsed);
}

35
lib/parseIngredient.ts Normal file
View File

@ -0,0 +1,35 @@
import { parseIngredient as _parseIngredient } from "https://esm.sh/parse-ingredient@1.0.1";
/**
 * Parses one free-text ingredient line (e.g. "2 EL sugar") into the
 * `{ name, unit, quantity, note }` shape used for recipe ingredients.
 *
 * The underlying `parse-ingredient` parser is extended with extra units of
 * measure (EL/TL, litre, Paket). Only the first parsed entry is used.
 *
 * @param text Ingredient text, expected to be a single line.
 * @returns An object with `name`, `unit`, `quantity` (stringified) and an
 *          always-empty `note`. When the parser yields no entry (e.g. blank
 *          input), the trimmed input is returned as the name with empty
 *          unit and quantity instead of throwing.
 */
export function parseIngredient(text: string) {
  const [first] = _parseIngredient(text, {
    additionalUOMs: {
      tableSpoon: {
        short: "EL",
        plural: "Table Spoons",
        alternates: ["el", "EL", "Tbsp", "tbsp"],
      },
      teaSpoon: {
        short: "TL",
        plural: "Tea Spoon",
        alternates: ["tl", "TL", "Tsp", "tsp", "teaspoon"],
      },
      litre: {
        short: "L",
        plural: "liters",
        alternates: ["L", "l"],
      },
      paket: {
        short: "Paket",
        plural: "Pakets",
        alternates: ["Paket", "paket"],
      },
    },
  });

  // The parser returns an empty array when there is nothing to parse;
  // indexing it blindly would throw a TypeError on `undefined`.
  if (!first) {
    return {
      name: text.trim(),
      unit: "",
      quantity: "",
      note: "",
    };
  }

  return {
    name: first.description,
    unit: first.unitOfMeasure || "",
    quantity: first.quantity?.toString() || "",
    note: "",
  };
}

55
lib/playwright.ts Normal file
View File

@ -0,0 +1,55 @@
import { firefox } from "npm:playwright-extra";
import { createStreamResponse } from "@lib/helpers.ts";
import StealthPlugin from "npm:puppeteer-extra-plugin-stealth";
// Pool of desktop user-agent strings; fetchHtmlWithPlaywright picks one at
// random for every new browser context.
// NOTE(review): these are Chrome user agents while the browser launched below
// is Firefox — confirm the mismatch is intentional.
const userAgentStrings = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.2227.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.3497.92 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
];
// Register the stealth plugin on the Firefox launcher. This runs once, as a
// module-level side effect when this file is imported.
firefox.use(StealthPlugin());
/**
 * Loads a URL in a headless Firefox instance and returns the page's HTML.
 *
 * Progress messages are pushed to `streamResponse` along the way. The browser
 * is always closed, whether the navigation succeeds or fails.
 *
 * @param fetchUrl       Address of the page to load.
 * @param streamResponse Stream used to report progress to the caller.
 * @returns The serialized HTML of the page, or an empty string when anything
 *          inside the navigation step throws (the error is logged).
 *          A failure to launch the browser itself is not caught and rejects.
 */
export async function fetchHtmlWithPlaywright(
  fetchUrl: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string> {
  streamResponse.enqueue("booting up playwright");
  const browser = await firefox.launch();

  streamResponse.enqueue("fetching html");
  try {
    // Fresh context with a randomly chosen user agent from the pool.
    const randomIndex = Math.floor(Math.random() * userAgentStrings.length);
    const context = await browser.newContext({
      userAgent: userAgentStrings[randomIndex],
    });

    // Hide the `navigator.webdriver` flag before any page script runs.
    await context.addInitScript(
      "Object.defineProperty(navigator, 'webdriver', {get: () => undefined})",
    );

    const page = await context.newPage();
    // Only wait for the DOM to be parsed, not for every sub-resource.
    await page.goto(fetchUrl, { waitUntil: "domcontentloaded" });

    // `await` is required here so a rejection is handled by the catch below.
    return await page.content();
  } catch (error) {
    streamResponse.enqueue("error fetching html");
    console.error(error);
    return "";
  } finally {
    await browser.close();
  }
}

39
lib/recipeSchema.ts Normal file
View File

@ -0,0 +1,39 @@
import { z } from "npm:zod";
// Zod schema for a single ingredient. All four fields are required strings;
// the `.describe()` texts document the expected content of each field.
// NOTE(review): `note` is described as "optional" but is a required string in
// the schema — presumably an empty string stands for "no note"; confirm.
export const IngredientSchema = z.object({
quantity: z.string().describe(
"e.g., '2', '1/2', or an empty string for 'to taste'",
),
unit: z.string().describe('e.g., "g", "tbsp", "cup"'),
name: z.string().describe('e.g., "sugar", "flour"'),
note: z.string().describe('optional, e.g., "sifted", "chopped finely"'),
});
// Static type inferred from IngredientSchema.
export type Ingredient = z.infer<typeof IngredientSchema>;
// Zod schema for a named group of ingredients: a group `name` plus its
// `items`, each validated by IngredientSchema.
export const IngredientGroupSchema = z.object({
name: z.string(),
items: z.array(IngredientSchema),
});
// Static type inferred from IngredientGroupSchema.
export type IngredientGroup = z.infer<typeof IngredientGroupSchema>;
// Zod schema describing a complete recipe. Every field is required by the
// schema; the `.describe()` texts explain the expected content of each field.
// NOTE(review): several descriptions say "optional" (author, description,
// notes) although the fields are not marked optional — presumably empty
// values are expected in that case; confirm against the consumers.
const recipeSchema = z.object({
title: z.string().describe(
"Title of the Recipe, without the name of the website or author",
),
image: z.string().describe("URL of the main image of the recipe"),
author: z.string().describe("author of the Recipe (optional)"),
description: z.string().describe("Optional, short description of the recipe"),
// An entry is either a single ingredient or a named group of ingredients.
ingredients: z.array(z.union([IngredientSchema, IngredientGroupSchema]))
.describe("List of ingredients"),
instructions: z.array(z.string()).describe("List of instructions"),
servings: z.number().describe("Amount of Portions"),
// All durations are numbers of minutes, per their descriptions.
prepTime: z.number().describe("Preparation time in minutes"),
cookTime: z.number().describe("Cooking time in minutes"),
totalTime: z.number().describe("Total time in minutes"),
tags: z.array(z.string()).describe(
"List of tags (e.g., ['vegan', 'dessert'])",
),
notes: z.array(z.string()).describe("Optional notes about the recipe"),
});
export default recipeSchema;

View File

@ -4,31 +4,22 @@ import {
getTextOfRange,
parseDocument,
} from "@lib/documents.ts";
import { parse } from "yaml";
import { parseIngredient } from "https://esm.sh/parse-ingredient@1.0.1";
import { parse, stringify } from "yaml";
import { createCrud } from "@lib/crud.ts";
import { extractHashTags } from "@lib/string.ts";
export type IngredientGroup = {
name: string;
ingredients: Ingredient[];
};
export type Ingredient = {
type: string;
unit?: string;
amount?: string;
};
export type Ingredients = (Ingredient | IngredientGroup)[];
import { Ingredient, IngredientGroup } from "@lib/recipeSchema.ts";
import { fixRenderedMarkdown } from "@lib/helpers.ts";
import { parseIngredient } from "@lib/parseIngredient.ts";
export type Recipe = {
type: "recipe";
id: string;
name: string;
description?: string;
ingredients: Ingredients;
preparation?: string;
markdown?: string;
ingredients: (Ingredient | IngredientGroup)[];
instructions?: string[];
notes?: string[];
tags: string[];
meta?: {
time?: string;
@ -49,38 +40,8 @@ function parseIngredientItem(listItem: DocumentChild): Ingredient | undefined {
const text = children.map((c) => getTextOfChild(c)).join(" ").trim();
const ing = parseIngredient(text, {
additionalUOMs: {
tableSpoon: {
short: "EL",
plural: "Table Spoons",
alternates: ["el", "EL", "Tbsp", "tbsp"],
},
teaSpoon: {
short: "TL",
plural: "Tea Spoon",
alternates: ["tl", "TL", "Tsp", "tsp", "teaspoon"],
},
litre: {
short: "L",
plural: "liters",
alternates: ["L", "l"],
},
paket: {
short: "Paket",
plural: "Pakets",
alternates: ["Paket", "paket"],
},
},
});
return {
type: ing[0].description,
unit: ing[0].unitOfMeasure,
amount: ing[0].quantity,
};
return parseIngredient(text);
}
return;
}
const isIngredient = (item: Ingredient | undefined): item is Ingredient => {
@ -112,9 +73,10 @@ function parseIngredients(children: DocumentChild[]): Recipe["ingredients"] {
if (!nextChild || nextChild.type !== "list") continue;
const name = getTextOfChild(child);
ingredients.push({
name: getTextOfChild(child) || "",
ingredients: parseIngredientsList(nextChild),
name: name || "",
items: parseIngredientsList(nextChild),
});
skip = true;
continue;
@ -128,6 +90,19 @@ function parseIngredients(children: DocumentChild[]): Recipe["ingredients"] {
return ingredients;
}
/**
 * Extracts the text of numbered steps ("1. …", "2. …") from a block of text.
 *
 * The content is split with `seperator`; every resulting chunk that starts
 * with `<digits>.` becomes one step, other chunks are ignored. Continuation
 * lines that belong to a step (a wrapped list item) are kept and joined with
 * a single space; anything after the first blank line inside a chunk is not
 * part of the step. Steps without text are dropped.
 *
 * @param content   Text containing the numbered list.
 * @param seperator Pattern used to split `content` into chunks. Defaults to
 *                  a line break that is directly followed by `<digits>.`.
 * @returns The step texts in order, without their leading numbers.
 */
function extractSteps(
  content: string,
  seperator: RegExp = /\n(?=\d+\.)/g,
): string[] {
  const steps: string[] = [];

  for (const chunk of content.split(seperator)) {
    // `[\s\S]*` instead of `.*`: `.` stops at the first line break and would
    // silently drop the wrapped lines of a multi-line step.
    const match = chunk.match(/^\d+\.\s*([\s\S]*)$/);
    if (!match) continue;

    // A blank line ends the list item; what follows is unrelated content.
    const body = (match[1] ?? "").split(/\n[ \t\r]*\n/)[0] ?? "";

    const text = body
      .split(/\r?\n/)
      .map((line) => line.trim())
      .filter((line) => line.length > 0)
      .join(" ");

    if (text) steps.push(text);
  }

  return steps;
}
export function parseRecipe(original: string, id: string): Recipe {
const doc = parseDocument(original);
@ -140,8 +115,8 @@ export function parseRecipe(original: string, id: string): Recipe {
if (child.type === "yaml") {
try {
meta = parse(child.value) as Recipe["meta"];
} catch (_) {
// console.log("Error parsing YAML", err);
} catch (err) {
console.log("Error parsing YAML", err);
}
continue;
}
@ -168,7 +143,14 @@ export function parseRecipe(original: string, id: string): Recipe {
const ingredients = parseIngredients(groups[1]);
const preparation = getTextOfRange(groups[2], original);
const instructionText = getTextOfRange(groups[2], original);
let instructions = extractSteps(instructionText || "");
if (instructions.length <= 1) {
const d = extractSteps(instructionText || "", /\n/g);
if (d.length > instructions.length) {
instructions = d;
}
}
const tags = extractHashTags(description || "");
if (description) {
@ -183,15 +165,88 @@ export function parseRecipe(original: string, id: string): Recipe {
meta,
name,
tags,
markdown: original,
notes: getTextOfRange(groups[3], original)?.split("\n"),
description,
ingredients,
preparation,
instructions,
};
}
/**
 * Builds a shallow copy of `obj` that omits every own enumerable property
 * whose value is exactly `undefined`. Other falsy values (`null`, `0`, `""`,
 * `false`) are kept, and the original key order is preserved.
 *
 * @param obj Object to copy; it is not modified.
 * @returns A new plain object containing only the defined entries.
 */
function filterUndefinedFromObject<T extends { [key: string]: unknown }>(
  obj: T,
) {
  const definedEntries: [string, unknown][] = [];
  for (const [key, value] of Object.entries(obj)) {
    if (value === undefined) continue;
    definedEntries.push([key, value]);
  }
  return Object.fromEntries(definedEntries);
}
/**
 * Formats one ingredient as a markdown list item.
 *
 * The amount (quantity directly followed by unit) is rendered in bold when at
 * least one of the two is present, so "2 eggs" keeps its quantity even though
 * it has no unit. The ingredient's `note` is not rendered.
 */
function renderIngredientLine(ing: Ingredient): string {
  const amount = `${ing.quantity?.trim() ?? ""}${ing.unit?.trim() ?? ""}`;
  return amount ? `- **${amount}** ${ing.name}` : `- ${ing.name}`;
}

/**
 * Renders a recipe as a markdown document: YAML front matter, title, optional
 * image, hashtags, description, ingredient list, numbered instructions and
 * notes. The assembled text is passed through `fixRenderedMarkdown`.
 *
 * @param recipe Recipe to render. `recipe.meta` is copied, not mutated;
 *               `thumbnail` and `average` are left out of the front matter.
 * @returns The result of `fixRenderedMarkdown` for the assembled markdown.
 */
export function renderRecipe(recipe: Recipe) {
  // Work on a copy without undefined values so they do not end up in the YAML.
  const meta = filterUndefinedFromObject(recipe.meta || {});

  // These two properties are not written back into the front matter.
  delete meta.thumbnail;
  delete meta.average;

  const recipeImage = meta.image ? `![](${meta.image})` : "";

  // Groups get an italic heading line followed by their items; standalone
  // ingredients are rendered as plain list items.
  const ingredients = recipe.ingredients
    .map((item) =>
      "items" in item
        ? `\n*${item.name}*\n${
          item.items.map((ing) => renderIngredientLine(ing)).join("\n")
        }`
        : renderIngredientLine(item)
    )
    .join("\n");

  // Instructions become a numbered list starting at 1.
  const instructions = recipe.instructions
    ? recipe.instructions.map((step, i) => `${i + 1}. ${step}`).join("\n")
    : "";

  // The template lines are deliberately kept at column 0: leading whitespace
  // inside the template literal would become part of the markdown.
  return fixRenderedMarkdown(`${
    Object.keys(meta).length
      ? `---
${stringify(meta)}
---`
      : `---
---`
  }
# ${recipe.name}
${recipe.meta?.image ? recipeImage : ""}
${recipe.tags.map((t) => `#${t.replaceAll(" ", "-")}`).join(" ")}
${recipe.description || ""}
---
${ingredients ? `## Ingredients\n\n${ingredients}\n\n---\n` : ""}
${instructions ? `${instructions}\n\n---` : ""}
${recipe.notes?.length ? `\n${recipe.notes.join("\n")}` : ""}
`);
}
// CRUD helpers for recipes, created for documents under the `Recipes/` prefix.
// `parseRecipe` turns a stored markdown document into a Recipe and
// `renderRecipe` turns a Recipe back into markdown; thumbnails are enabled.
const crud = createCrud<Recipe>({
prefix: `Recipes/`,
parse: parseRecipe,
render: renderRecipe,
hasThumbnails: true,
});

View File

@ -6,17 +6,10 @@ export function formatDate(date: Date): string {
}
export function safeFileName(inputString: string): string {
// Convert the string to lowercase
let fileName = inputString.toLowerCase();
// Replace spaces with underscores
fileName = fileName.replace(/ /g, "_");
// Remove characters that are not safe for file names
fileName = fileName.replace(/[^\w.-]/g, "");
fileName = fileName.replaceAll(":", "");
return fileName;
}