feat: url scraper to recipe
This commit is contained in:
@ -1,6 +1,6 @@
|
||||
import { Handlers } from "$fresh/server.ts";
|
||||
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
|
||||
import { DOMParser } from "https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm.ts";
|
||||
import { DOMParser } from "domparser";
|
||||
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
|
||||
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
|
||||
import * as openai from "@lib/openai.ts";
|
||||
|
12
routes/api/cache.ts
Normal file
12
routes/api/cache.ts
Normal file
@ -0,0 +1,12 @@
|
||||
import { Handlers } from "$fresh/server.ts";
|
||||
import { documentTable } from "@lib/db/schema.ts";
|
||||
import { db } from "@lib/db/sqlite.ts";
|
||||
import { json } from "@lib/helpers.ts";
|
||||
|
||||
/**
 * Route handlers for the cache endpoint.
 *
 * DELETE issues an unfiltered delete against `documentTable` and answers
 * with `{ status: "ok" }` once the statement has run.
 *
 * NOTE(review): no session check is visible in this handler — confirm that
 * access control is enforced elsewhere (e.g. middleware) before exposing it.
 */
export const handler: Handlers = {
  DELETE: async () => {
    await db.delete(documentTable).run();

    const payload = { status: "ok" };
    return json(payload);
  },
};
|
||||
|
@ -71,7 +71,6 @@ const POST = async (
|
||||
if (posterPath && !movie.meta?.image) {
|
||||
const poster = await tmdb.getMoviePoster(posterPath);
|
||||
const extension = fileExtension(posterPath);
|
||||
|
||||
finalPath = `Media/movies/images/${safeFileName(name)}_cover.${extension}`;
|
||||
await createDocument(finalPath, poster);
|
||||
movie.meta = movie.meta || {};
|
||||
|
264
routes/api/recipes/create/index.ts
Normal file
264
routes/api/recipes/create/index.ts
Normal file
@ -0,0 +1,264 @@
|
||||
import { Handlers } from "$fresh/server.ts";
|
||||
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
|
||||
import { DOMParser } from "domparser";
|
||||
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
|
||||
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
|
||||
import * as openai from "@lib/openai.ts";
|
||||
import tds from "https://cdn.skypack.dev/turndown@7.2.0";
|
||||
import { createLogger } from "@lib/log.ts";
|
||||
import { createRecipe, Recipe } from "@lib/resource/recipes.ts";
|
||||
import recipeSchema from "@lib/recipeSchema.ts";
|
||||
import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
|
||||
import { safeFileName } from "@lib/string.ts";
|
||||
import { createDocument } from "@lib/documents.ts";
|
||||
import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
|
||||
import z from "npm:zod";
|
||||
import { fetchHtmlWithPlaywright } from "@lib/playwright.ts";
|
||||
|
||||
// Module-wide DOMParser instance, reused for every HTML string parsed in this file.
const parser = new DOMParser();
|
||||
|
||||
// Module logger.
// NOTE(review): the namespace is "api/article" although this file lives under
// routes/api/recipes/create — looks copied from another route; confirm before renaming.
const log = createLogger("api/article");
|
||||
|
||||
/**
 * Turns a reference found on a page (image `src`, link `href`, ...) into an
 * absolute URL.
 *
 * - References that already start with `http://` or `https://` are returned
 *   untouched, so callers get exactly what the page declared.
 * - Everything else (root-relative `/a.png`, protocol-relative `//cdn/a.png`,
 *   path-relative `img/a.png`, `../a.png`) is resolved against the page URL
 *   using the standard URL resolution algorithm.
 *
 * @param url The URL of the page the reference was found on (resolution base).
 * @param src The raw reference as written in the page.
 * @returns The absolute URL, or `src` unchanged when it cannot be resolved.
 */
function makeUrlAbsolute(url: URL, src: string) {
  if (src.startsWith("https://") || src.startsWith("http://")) {
    return src;
  }

  try {
    // The URL constructor handles every relative form correctly; the previous
    // hand-rolled concatenation appended a stray ")" and mis-resolved
    // protocol-relative and path-relative references.
    return new URL(src, url).href;
  } catch {
    // Unparseable reference: hand it back as-is rather than throwing.
    return src;
  }
}
|
||||
|
||||
/**
 * Fallback recipe extraction: reduces the page to its readable content,
 * converts that content to markdown and passes the markdown to
 * `openai.extractRecipe`.
 *
 * @param url URL of the page; used as the base for resolving relative
 *   image and link references in the generated markdown.
 * @param document Parsed page document, or `null` when parsing failed.
 * @param streamResponse Progress stream; receives a status message before
 *   the OpenAI call.
 * @returns Whatever `openai.extractRecipe` returns, or `undefined` when the
 *   page has no document or no readable content to work with.
 */
async function extractUsingAI(
  url: URL,
  document: Parameters<typeof Readability>[0] | null,
  streamResponse: ReturnType<typeof createStreamResponse>,
) {
  // Without a document there is nothing to extract from.
  if (!document) return undefined;

  // `parse()` yields null when Readability cannot find article content.
  const article = new Readability(document).parse();
  if (!article?.content) return undefined;

  // Resolves a page reference against the page URL; absolute http(s)
  // references are kept verbatim, unparseable ones are passed through.
  const toAbsolute = (ref: string): string => {
    if (ref.startsWith("https://") || ref.startsWith("http://")) return ref;
    try {
      return new URL(ref, url).href;
    } catch {
      return ref;
    }
  };

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      // Inline data images only bloat the prompt; drop them.
      if (!src || src.startsWith("data:image")) return "";

      return `![${alt}](${toAbsolute(src)})`;
    },
  });

  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: function (content: string, node: HTMLAnchorElement) {
      const href = node.getAttribute("href");
      if (!href) return content;

      // Fragment links with (almost) no text are decoration such as "#"
      // permalink anchors; leave them out of the markdown.
      if (href.startsWith("#") && content.length < 2) return "";

      return `[${content}](${toAbsolute(href)})`;
    },
  });

  const cleanDocument = parser.parseFromString(
    article.content,
    "text/html",
  );
  // The parser may fail on the cleaned HTML; turndown cannot take null.
  if (!cleanDocument) return undefined;

  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("extracting recipe with openai");
  console.log("------- MARKDOWN ------");
  console.log(markdown);
  console.log("-----------------------");

  const recipe = await openai.extractRecipe(markdown);
  console.log("------- EXTRACTED ------");
  console.log(JSON.stringify(recipe, null, 2));
  console.log("-----------------------");

  return recipe;
}
|
||||
|
||||
/**
 * Downloads the page at `fetchUrl`, extracts a recipe from it and stores it.
 *
 * Steps, each announced on `streamResponse`:
 * 1. Fetch the HTML via `fetchHtmlWithPlaywright` and parse it.
 * 2. Try every `application/ld+json` script with `parseJsonLdToRecipeSchema`;
 *    if none yields a recipe, fall back to `extractUsingAI`.
 * 3. If the recipe has no image, pick the page image with the largest
 *    declared width + height.
 * 4. Download the cover image and store it with `createDocument`
 *    (best effort: on failure the remote URL is kept).
 * 5. Persist the recipe with `createRecipe` and emit its id.
 *
 * @param fetchUrl URL of the page to scrape.
 * @param streamResponse Progress stream; it is cancelled here when no recipe
 *   could be extracted.
 * @throws If `fetchUrl` is not a parseable URL, or if downloading the page or
 *   persisting the recipe fails.
 */
async function processCreateRecipeFromUrl(
  { fetchUrl, streamResponse }: {
    fetchUrl: string;
    streamResponse: ReturnType<typeof createStreamResponse>;
  },
) {
  log.info("create article from url", { url: fetchUrl });
  const url = new URL(fetchUrl);

  streamResponse.enqueue("downloading article");

  const html = await fetchHtmlWithPlaywright(fetchUrl, streamResponse);

  streamResponse.enqueue("download success");

  // Debug dump of the fetched page. Deliberately not awaited, but a failed
  // write must not surface as an unhandled rejection.
  Deno.writeTextFile("article.html", html).catch((err) => {
    log.error(err);
  });

  const document = parser.parseFromString(html, "text/html");

  const title = document?.querySelector("title")?.innerText;

  // `?? []` keeps both collections valid when the document failed to parse.
  const images = Array.from(
    document?.querySelectorAll("img") ?? [],
  ) as unknown as HTMLImageElement[];

  const metaAuthor =
    document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document?.querySelector('meta[name="author"]')?.getAttribute("content");

  const jsonLds = Array.from(
    document?.querySelectorAll("script[type='application/ld+json']") ?? [],
  ) as unknown as HTMLScriptElement[];

  let recipe: z.infer<typeof recipeSchema> | undefined = undefined;
  for (const jsonLd of jsonLds) {
    console.log({ content: jsonLd.textContent });
    recipe = parseJsonLdToRecipeSchema(jsonLd.textContent || "");
    if (recipe) break;
  }

  if (!recipe) {
    recipe = await extractUsingAI(url, document, streamResponse);
  }

  if (!recipe) {
    console.error("Failed to parse recipe");
    streamResponse.enqueue("failed to parse recipe");
    streamResponse.cancel();
    return;
  }

  const id = (recipe.title || title || "").replaceAll(" ", "-");

  if (!recipe.image) {
    // Declared size of an image; tolerates missing or unit-suffixed values
    // such as "100px" by falling back to 0 / the leading number.
    const declaredSize = (img: HTMLImageElement) =>
      (Number.parseFloat(img.getAttribute("width") ?? "") || 0) +
      (Number.parseFloat(img.getAttribute("height") ?? "") || 0);

    let largestImage: HTMLImageElement | undefined;
    let largestSize = -1;
    for (const img of images) {
      const candidateSrc = img.getAttribute("src");
      if (!candidateSrc || candidateSrc.startsWith("data:")) continue;
      const size = declaredSize(img);
      // Strict comparison keeps the first image among equally sized ones.
      if (size > largestSize) {
        largestImage = img;
        largestSize = size;
      }
    }

    // The page may contain no usable image at all.
    const src = largestImage?.getAttribute("src");
    if (src) {
      recipe.image = makeUrlAbsolute(url, src);
    }
  }

  const newRecipe: Recipe = {
    type: "recipe",
    id,
    name: recipe.title || title || "",
    description: recipe.description,
    ingredients: recipe.ingredients || [],
    instructions: recipe.instructions || [],
    notes: recipe.notes,
    tags: recipe.tags || [],
    meta: {
      image: recipe.image,
      time: recipe.totalTime
        ? `${recipe.totalTime.toString()} minutes`
        : undefined,
      link: fetchUrl,
      portion: recipe.servings,
      // `||` so an empty meta tag does not hide the author found in the recipe.
      author: metaAuthor || recipe.author,
    },
  };

  if (newRecipe.meta?.image) {
    // Everything image-related is best effort: a bad URL or a failed download
    // must not prevent the recipe itself from being stored.
    try {
      const src = makeUrlAbsolute(url, newRecipe.meta.image);
      if (src?.length > 5) {
        const extension = fileExtension(new URL(src).pathname);
        const finalPath = `Media/articles/images/${
          safeFileName(id)
        }_cover.${extension}`;

        streamResponse.enqueue("downloading image");
        const res = await fetch(src);
        if (!res.ok) {
          // Release the connection and avoid storing an error page as image.
          await res.body?.cancel();
          throw new Error(`image request failed with status ${res.status}`);
        }

        streamResponse.enqueue("saving image");
        const buffer = await res.arrayBuffer();
        await createDocument(finalPath, buffer);
        newRecipe.meta.image = finalPath;
      }
    } catch (err) {
      console.log("Failed to save image", err);
    }
  }

  streamResponse.enqueue("finished processing, creating file");

  console.log("------- CREATING ------");
  console.log(JSON.stringify(recipe, null, 2));
  console.log("-----------------------");

  await createRecipe(newRecipe.id, newRecipe);

  streamResponse.enqueue("id: " + newRecipe.id);
}
|
||||
|
||||
/**
 * Route handlers for creating a recipe from a URL.
 *
 * GET reads the target from the `url` query parameter, starts the scraping
 * job in the background and immediately returns the streaming response that
 * the job reports its progress on.
 *
 * Throws `AccessDeniedError` when the request has no session and
 * `BadRequestError` when `url` is missing or fails `isValidUrl`.
 */
export const handler: Handlers = {
  GET(req, ctx) {
    if (!ctx.state.session) {
      throw new AccessDeniedError();
    }

    const fetchUrl = new URL(req.url).searchParams.get("url");
    const hasUsableUrl = !!fetchUrl && isValidUrl(fetchUrl);
    if (!fetchUrl || !hasUsableUrl) {
      throw new BadRequestError();
    }

    const streamResponse = createStreamResponse();

    // Background job: the response below is returned while this still runs.
    void (async () => {
      try {
        const article = await processCreateRecipeFromUrl({
          fetchUrl,
          streamResponse,
        });
        log.debug("created article from link", { article });
      } catch (err) {
        log.error(err);
      } finally {
        // Always end the stream, whether the job succeeded or failed.
        streamResponse.cancel();
      }
    })();

    return streamResponse.response;
  },
};
|
103
routes/api/recipes/create/parseJsonLd.ts
Normal file
103
routes/api/recipes/create/parseJsonLd.ts
Normal file
@ -0,0 +1,103 @@
|
||||
import recipeSchema from "@lib/recipeSchema.ts";
|
||||
import { parseIngredient } from "@lib/parseIngredient.ts";
|
||||
|
||||
// JSON-LD payloads are untyped external data; fields are validated by
// `recipeSchema` at the end, so a loose record type is used while mapping.
// deno-lint-ignore no-explicit-any
type JsonLdNode = Record<string, any>;

/** True when a JSON-LD `@type` value (string or list of strings) names a Recipe. */
function isRecipeType(type: unknown): boolean {
  return Array.isArray(type) ? type.includes("Recipe") : type === "Recipe";
}

/**
 * Finds the first Recipe node in a JSON-LD value, looking through top-level
 * arrays, `mainEntity` and `@graph`. Depth is capped to guard against
 * pathological nesting.
 */
function findRecipeNode(value: unknown, depth = 0): JsonLdNode | undefined {
  if (depth > 5 || value === null || typeof value !== "object") {
    return undefined;
  }

  if (Array.isArray(value)) {
    for (const entry of value) {
      const found = findRecipeNode(entry, depth + 1);
      if (found) return found;
    }
    return undefined;
  }

  const node = value as JsonLdNode;
  // `mainEntity` is consulted first, as before; the node itself and its
  // `@graph` are the fallbacks.
  return findRecipeNode(node["mainEntity"], depth + 1) ??
    (isRecipeType(node["@type"]) ? node : undefined) ??
    findRecipeNode(node["@graph"], depth + 1);
}

/** Flattens `recipeInstructions` (strings, steps with `text`, sections with `itemListElement`) into step texts. */
function collectInstructionTexts(value: unknown): string[] {
  if (typeof value === "string") return [value];
  if (Array.isArray(value)) {
    return value.flatMap((entry) => collectInstructionTexts(entry));
  }
  if (value !== null && typeof value === "object") {
    const step = value as JsonLdNode;
    if (typeof step.text === "string") return [step.text];
    if (Array.isArray(step.itemListElement)) {
      return collectInstructionTexts(step.itemListElement);
    }
  }
  return [];
}

/** Reads an author given as a plain string, an object with `name`, or a list of either. */
function authorName(author: unknown): string {
  if (typeof author === "string") return author;
  if (Array.isArray(author)) {
    return author.map((entry) => authorName(entry)).filter(Boolean).join(", ");
  }
  if (author !== null && typeof author === "object") {
    const name = (author as JsonLdNode).name;
    return typeof name === "string" ? name : "";
  }
  return "";
}

/**
 * Converts the text of an `application/ld+json` script into a recipe that
 * passes `recipeSchema`.
 *
 * @param jsonLdContent Raw JSON text of the script element.
 * @returns The validated recipe, or `undefined` when the text is not valid
 *   JSON, contains no Recipe node, or fails schema validation (the error is
 *   logged in the latter cases).
 */
export function parseJsonLdToRecipeSchema(jsonLdContent: string) {
  try {
    const parsed = JSON.parse(jsonLdContent);

    // An image declared on the outermost object wins over the recipe's own.
    const outerImage = parsed !== null && typeof parsed === "object" &&
        !Array.isArray(parsed)
      ? parsed.image
      : undefined;

    const data = findRecipeNode(parsed);
    if (!data) {
      return;
    }

    // Map and parse ingredients into the new schema
    const ingredients =
      (Array.isArray(data.recipeIngredient) ? data.recipeIngredient : []).map(
        parseIngredient,
      );

    const instructions = collectInstructionTexts(data.recipeInstructions)
      .filter((text) => text.trim() !== "");

    // Parse servings
    const servings = parseServings(data.recipeYield);

    // Parse times
    const prepTime = parseDuration(data.prepTime);
    const cookTime = parseDuration(data.cookTime);
    const totalTime = parseDuration(data.totalTime);

    // Extract tags: `keywords` is either a list or a comma-separated string.
    const keywords: unknown = data.keywords;
    const rawTags: unknown[] = Array.isArray(keywords)
      ? keywords
      : typeof keywords === "string"
      ? keywords.split(",")
      : [];
    const tags = rawTags
      .filter((tag): tag is string => typeof tag === "string")
      .map((tag) => tag.trim())
      .filter((tag) => tag !== "");

    // Build the recipe object
    const recipe = {
      title: data.name || "Unnamed Recipe",
      image: pickImage(outerImage || data.image || ""),
      author: authorName(data.author),
      description: data.description || "",
      ingredients,
      instructions,
      servings,
      prepTime,
      cookTime,
      totalTime,
      tags,
      notes: data.notes || [],
    };

    // Validate against the schema
    return recipeSchema.parse(recipe);
  } catch (error) {
    console.error("Invalid JSON-LD content or parsing error:", error);
    return undefined;
  }
}
|
||||
|
||||
/**
 * Picks a single image URL out of a JSON-LD `image` value.
 *
 * Accepts a URL string, an object carrying the URL in `url` or `contentUrl`,
 * or a list of either; the first entry that yields a non-empty URL wins.
 *
 * @param images The raw `image` value.
 * @returns The chosen URL, or `""` when none is available.
 */
function pickImage(images: unknown): string {
  const candidates: unknown[] = Array.isArray(images) ? images : [images];

  for (const candidate of candidates) {
    if (typeof candidate === "string") {
      if (candidate !== "") return candidate;
      continue;
    }
    if (candidate !== null && typeof candidate === "object") {
      const { url, contentUrl } = candidate as {
        url?: unknown;
        contentUrl?: unknown;
      };
      if (typeof url === "string" && url !== "") return url;
      if (typeof contentUrl === "string" && contentUrl !== "") {
        return contentUrl;
      }
    }
  }

  // Always a string, including for empty lists.
  return "";
}
|
||||
|
||||
/**
 * Extracts a servings count from a JSON-LD `recipeYield` value.
 *
 * Accepts a number, a string containing a number ("Serves 4", "4-6"), or a
 * list of such values; the first entry that yields a number is used.
 *
 * @param servingsData The raw `recipeYield` value.
 * @returns The servings count, or `1` when none can be determined.
 */
function parseServings(servingsData: unknown): number {
  const candidates: unknown[] = Array.isArray(servingsData)
    ? servingsData
    : [servingsData];

  for (const candidate of candidates) {
    // NaN / Infinity are not usable counts; fall through to the default.
    if (typeof candidate === "number" && Number.isFinite(candidate)) {
      return candidate;
    }
    if (typeof candidate === "string") {
      const match = candidate.match(/\d+/);
      if (match) return parseInt(match[0], 10);
    }
  }

  return 1;
}
|
||||
|
||||
/**
 * Converts an ISO 8601 duration (e.g. "PT30M", "PT1H30M", "P0DT1H30M") into
 * whole minutes.
 *
 * Weeks, days, hours, minutes and seconds are counted. Year and month
 * components are accepted but ignored because they have no fixed length.
 *
 * @param duration The raw duration value.
 * @returns The duration in minutes (rounded), or `0` when the value is
 *   missing, not a string, or not an ISO 8601 duration.
 */
function parseDuration(duration: unknown): number {
  if (typeof duration !== "string") return 0;

  // Anchored so the date part ("P0D", "P1D", ...) is parsed instead of
  // preventing a match, as it did when only a literal "PT" was searched for.
  const match = duration.trim().match(
    /^P(?:\d+Y)?(?:\d+M)?(?:(\d+)W)?(?:(\d+)D)?(?:T(?:(\d+)H)?(?:(\d+)M)?(?:(\d+(?:\.\d+)?)S)?)?$/i,
  );
  if (!match) return 0;

  const [, weeks, days, hours, minutes, seconds] = match;
  const totalMinutes = Number(weeks ?? 0) * 7 * 24 * 60 +
    Number(days ?? 0) * 24 * 60 +
    Number(hours ?? 0) * 60 +
    Number(minutes ?? 0) +
    Number(seconds ?? 0) / 60;

  return Math.round(totalMinutes);
}
|
Reference in New Issue
Block a user