import { Handlers } from "$fresh/server.ts";
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
import { DOMParser } from "domparser";
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
import * as openai from "@lib/openai.ts";
import tds from "https://cdn.skypack.dev/turndown@7.2.0";
import { createLogger } from "@lib/log/index.ts";
import { createRecipe, Recipe } from "@lib/resource/recipes.ts";
import recipeSchema, { isValidRecipe } from "@lib/recipeSchema.ts";
import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
import { safeFileName } from "@lib/string.ts";
import { createDocument } from "@lib/documents.ts";
import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
import z from "npm:zod";
import { fetchHtmlWithPlaywright } from "@lib/playwright.ts";

/** Handle returned by `createStreamResponse`, used to push progress messages. */
type StreamResponse = ReturnType<typeof createStreamResponse>;

/** Recipe shape described by the imported `recipeSchema`. */
type ParsedRecipe = z.infer<typeof recipeSchema>;

/**
 * Document type accepted by the `Readability` constructor.
 * NOTE(review): derived from the constructor signature; confirm it matches
 * the intended parameter type.
 */
type ReadableDocument = ConstructorParameters<typeof Readability>[0];

const parser = new DOMParser();
const log = createLogger("api/article");

/**
 * Resolves `src` against the page URL `url`.
 *
 * Values that already start with `http://` or `https://` are returned
 * untouched. Everything else (root-relative, path-relative,
 * protocol-relative, fragment-only, other schemes) is resolved with the
 * WHATWG `URL` constructor. If resolution fails the input is returned
 * unchanged.
 *
 * @param url Absolute URL of the page the reference was found on.
 * @param src Reference as written in the page (e.g. an `src` or `href`).
 * @returns The absolute URL string, or `src` itself when it cannot be resolved.
 */
function makeUrlAbsolute(url: URL, src: string) {
  if (src.startsWith("https://") || src.startsWith("http://")) {
    return src;
  }
  try {
    return new URL(src, url).href;
  } catch {
    return src;
  }
}

/**
 * Reads a numeric attribute from an element.
 *
 * @returns The attribute parsed with `Number`, or 0 when it is missing or
 *   not a finite number (e.g. `"100%"`).
 */
function numericAttribute(element: HTMLImageElement, name: string): number {
  const value = Number(element.getAttribute(name) ?? "");
  return Number.isFinite(value) ? value : 0;
}

/**
 * Picks the image with the largest `width + height` attribute sum, ignoring
 * images without a `src` and images whose `src` starts with `data:`.
 *
 * @returns The best candidate (first one wins on ties), or `undefined` when
 *   no image qualifies.
 */
function pickLargestImage(
  images: HTMLImageElement[],
): HTMLImageElement | undefined {
  let best: HTMLImageElement | undefined;
  let bestSize = -1;
  for (const img of images) {
    const src = img.getAttribute("src");
    if (!src || src.startsWith("data:")) continue;
    const size = numericAttribute(img, "width") +
      numericAttribute(img, "height");
    if (size > bestSize) {
      best = img;
      bestSize = size;
    }
  }
  return best;
}

/**
 * Fallback extraction: runs Readability over the document, converts the
 * resulting HTML to Markdown with Turndown and hands the Markdown to
 * `openai.extractRecipe`.
 *
 * @param url URL of the page; used to make image and link targets absolute.
 * @param document Parsed page, or `null` when parsing failed.
 * @param streamResponse Progress channel; receives a status message before
 *   the OpenAI call.
 * @returns The extracted recipe, or `undefined` when there is no document,
 *   Readability yields no content, or `openai.extractRecipe` returns a falsy
 *   value.
 * @throws Error when the extraction result contains `errorMessages`.
 */
async function extractUsingAI(
  url: URL,
  document: ReadableDocument | null,
  streamResponse: StreamResponse,
) {
  if (!document) return undefined;

  const readable = new Readability(document);
  const result = readable.parse();
  // Readability's parse() may return null when it finds no article content.
  if (!result?.content) return undefined;

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      // Inline data images are dropped from the Markdown entirely.
      if (!src || src.startsWith("data:image")) return "";
      return `![${alt}](${makeUrlAbsolute(url, src)})`;
    },
  });

  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: function (content: string, node: HTMLAnchorElement) {
      const href = node.getAttribute("href");
      if (!href) return content;
      // In-page anchors with (almost) no text are dropped.
      if (href.startsWith("#") && content.length < 2) return "";
      return `[${content}](${makeUrlAbsolute(url, href)})`;
    },
  });

  const cleanDocument = parser.parseFromString(
    result.content,
    "text/html",
  );
  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("extracting recipe with openai");
  const recipe = await openai.extractRecipe(markdown);
  if (!recipe) return undefined;
  if ("errorMessages" in recipe) {
    throw new Error("Failed to extract recipe: " + recipe.errorMessages[0]);
  }
  return recipe;
}

/**
 * Downloads the page at `fetchUrl`, extracts a recipe from it (JSON-LD
 * first, AI extraction as fallback), stores a cover image when one can be
 * downloaded, and persists the recipe via `createRecipe`.
 *
 * Progress and the final `id: …` line are pushed through `streamResponse`.
 * The stream is not closed here; the caller is expected to do that.
 *
 * @param fetchUrl Absolute URL of the page to import.
 * @param streamResponse Progress channel for status messages.
 */
async function processCreateRecipeFromUrl(
  { fetchUrl, streamResponse }: {
    fetchUrl: string;
    streamResponse: StreamResponse;
  },
) {
  log.info("create article from url", { url: fetchUrl });

  const url = new URL(fetchUrl);

  streamResponse.enqueue("downloading article");
  const html = await fetchHtmlWithPlaywright(fetchUrl, streamResponse);
  streamResponse.enqueue("download success");

  // Best-effort dump of the fetched HTML; it is not awaited, so a rejection
  // must be handled here to avoid an unhandled promise rejection.
  Deno.writeTextFile("article.html", html).catch((err) => {
    log.debug("failed to write article.html", { err });
  });

  const document = parser.parseFromString(html, "text/html");

  const title = document?.querySelector("title")?.innerText;

  const images: HTMLImageElement[] = [];
  document?.querySelectorAll("img").forEach((img) => {
    images.push(img as unknown as HTMLImageElement);
  });

  const metaAuthor =
    document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document?.querySelector('meta[name="author"]')?.getAttribute("content");

  // `?? []` keeps Array.from from throwing when the document failed to parse.
  const jsonLds = Array.from(
    document?.querySelectorAll(
      "script[type='application/ld+json']",
    ) ?? [],
  ) as unknown as HTMLScriptElement[];

  let recipe: ParsedRecipe | undefined = undefined;
  for (const jsonLd of jsonLds) {
    recipe = parseJsonLdToRecipeSchema(jsonLd.textContent || "");
    if (recipe) break;
  }

  if (!recipe) {
    recipe = await extractUsingAI(url, document, streamResponse);
  }

  if (!recipe) {
    // The stream is closed by the caller's `finally`, so it is not cancelled
    // here a second time.
    streamResponse.enqueue("failed to parse recipe");
    return;
  }

  // Collapse every run of dashes, not only the first one.
  // NOTE(review): `id` is empty when neither the recipe nor the page has a
  // title — confirm `createRecipe` copes with that.
  const id = (recipe.title || title || "").replace(/--+/g, "-");

  if (!recipe.image) {
    const src = pickLargestImage(images)?.getAttribute("src");
    if (src) {
      recipe.image = makeUrlAbsolute(url, src);
    }
  }

  const newRecipe: Recipe = {
    type: "recipe",
    id,
    name: recipe.title || title || "",
    description: recipe.description,
    ingredients: recipe.ingredients || [],
    instructions: recipe.instructions || [],
    notes: recipe.notes,
    tags: recipe.tags || [],
    meta: {
      image: recipe.image,
      time: recipe.totalTime
        ? `${recipe.totalTime.toString()} minutes`
        : undefined,
      link: fetchUrl,
      portion: recipe.servings,
      // `||` so an empty meta tag still falls back to the recipe's author.
      author: metaAuthor || recipe.author,
    },
  };

  if (newRecipe.meta?.image) {
    const src = makeUrlAbsolute(url, newRecipe.meta.image);
    if (src.length > 5) {
      streamResponse.enqueue("downloading image");
      try {
        // Inside the try: `new URL` throws on an unresolvable image reference.
        const extension = fileExtension(new URL(src).pathname);
        const finalPath = `Media/articles/images/${
          safeFileName(id)
        }_cover.${extension}`;

        const res = await fetch(src);
        if (!res.ok) {
          // Release the body before bailing out so the connection is freed.
          await res.body?.cancel();
          throw new Error(`image request failed with status ${res.status}`);
        }

        streamResponse.enqueue("saving image");
        const buffer = await res.arrayBuffer();
        await createDocument(finalPath, buffer);
        newRecipe.meta.image = finalPath;
      } catch (err) {
        // Best effort: on failure the recipe keeps the remote image URL.
        console.log("Failed to save image", err);
      }
    }
  }

  streamResponse.enqueue("finished processing, creating file");

  await createRecipe(newRecipe.id, newRecipe);

  streamResponse.enqueue("id: " + newRecipe.id);
}

/**
 * Route handlers.
 *
 * `GET ?url=<page>` starts importing a recipe from the given page and
 * immediately returns the streaming response; processing continues in the
 * background and the stream is cancelled once it settles.
 *
 * Throws `AccessDeniedError` when `ctx.state.session` is missing and
 * `BadRequestError` when the `url` query parameter is absent or rejected by
 * `isValidUrl`.
 */
export const handler: Handlers = {
  GET(req, ctx) {
    const session = ctx.state.session;
    if (!session) {
      throw new AccessDeniedError();
    }

    const url = new URL(req.url);
    const fetchUrl = url.searchParams.get("url");
    if (!fetchUrl || !isValidUrl(fetchUrl)) {
      throw new BadRequestError();
    }

    const streamResponse = createStreamResponse();

    // Deliberately not awaited: progress is reported through the stream.
    processCreateRecipeFromUrl({ fetchUrl, streamResponse }).then((article) => {
      log.debug("created article from link", { article });
    }).catch((err) => {
      streamResponse.enqueue(`error creating recipe: ${err}`);
      log.error(err);
    }).finally(() => {
      streamResponse.cancel();
    });

    return streamResponse.response;
  },
};