2025-01-18 00:46:05 +01:00
|
|
|
import { Handlers } from "$fresh/server.ts";
|
|
|
|
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
|
|
|
|
import { DOMParser } from "domparser";
|
|
|
|
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
|
|
|
|
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
|
|
|
|
import * as openai from "@lib/openai.ts";
|
|
|
|
import tds from "https://cdn.skypack.dev/turndown@7.2.0";
|
2025-01-19 16:43:00 +01:00
|
|
|
import { createLogger } from "@lib/log/index.ts";
|
2025-01-18 00:46:05 +01:00
|
|
|
import { createRecipe, Recipe } from "@lib/resource/recipes.ts";
|
2025-01-19 19:49:24 +01:00
|
|
|
import recipeSchema, { isValidRecipe } from "@lib/recipeSchema.ts";
|
2025-01-18 00:46:05 +01:00
|
|
|
import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
|
|
|
|
import { safeFileName } from "@lib/string.ts";
|
|
|
|
import { createDocument } from "@lib/documents.ts";
|
|
|
|
import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
|
|
|
|
import z from "npm:zod";
|
|
|
|
import { fetchHtmlWithPlaywright } from "@lib/playwright.ts";
|
|
|
|
|
|
|
|
// Shared DOMParser instance; used in this module to parse both the downloaded
// page HTML and the cleaned-up HTML produced by Readability.
const parser = new DOMParser();
|
|
|
|
|
|
|
|
// Module logger, created with the name "api/article".
const log = createLogger("api/article");
|
|
|
|
|
|
|
|
/**
 * Turns a URL reference found inside a page (e.g. an `src` or `href`
 * attribute value) into an absolute URL.
 *
 * References that already start with `http://` or `https://` are returned
 * untouched. Everything else (root-relative `/a.png`, protocol-relative
 * `//cdn.host/a.png`, document-relative `img/a.png`, `../a.png`, ...) is
 * resolved against the page URL using the standard URL resolution algorithm.
 *
 * @param url - URL of the document the reference was found in.
 * @param src - Raw reference as it appears in the markup.
 * @returns The absolute URL, or `src` unchanged if it cannot be resolved.
 */
function makeUrlAbsolute(url: URL, src: string): string {
  // Already absolute: return exactly what the page contained.
  if (src.startsWith("https://") || src.startsWith("http://")) {
    return src;
  }

  try {
    // Let the URL parser do the resolution instead of concatenating strings;
    // this also covers protocol-relative and "../" style references.
    return new URL(src, url).href;
  } catch {
    // Unparseable reference: best effort, hand it back as-is.
    return src;
  }
}
|
|
|
|
|
|
|
|
/**
 * Fallback recipe extraction: reduces the page to its readable content with
 * Readability, converts that content to Markdown with Turndown and passes the
 * Markdown to `openai.extractRecipe`.
 *
 * @param url - URL of the page; used to make image and link targets absolute.
 * @param document - Parsed page document, or `null` if parsing failed.
 * @param streamResponse - Progress stream; receives a status message before
 *   the OpenAI call.
 * @returns The extracted recipe, or `undefined` when there is no document, no
 *   readable content, or `openai.extractRecipe` returned nothing.
 * @throws Error when the extraction result carries `errorMessages`.
 */
async function extractUsingAI(
  url: URL,
  document: Parameters<typeof Readability>[0] | null,
  streamResponse: ReturnType<typeof createStreamResponse>,
) {
  // Nothing to extract from if the page could not be parsed.
  if (!document) return undefined;

  const readable = new Readability(document);
  const result = readable.parse();
  // Readability's parse() may yield no article; avoid dereferencing it blindly.
  if (!result?.content) return undefined;

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      // Inline data URIs would bloat the prompt without adding information.
      if (!src || src.startsWith("data:image")) return "";

      return `![${alt}](${makeUrlAbsolute(url, src)})`;
    },
  });

  // Page URL without any fragment, so in-page anchors can be appended cleanly.
  const pageUrl = url.href.replace(/#.*$/, "");
  // Matches references that carry their own scheme (https:, mailto:, tel:, ...).
  const hasScheme = /^[a-z][a-z0-9+.-]*:/i;

  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: function (content: string, node: HTMLAnchorElement) {
      const href = node.getAttribute("href");
      if (!href) return content;

      if (href.startsWith("#")) {
        // Drop in-page anchors with (almost) no text, e.g. permalink icons.
        if (content.length < 2) return "";
        return `[${content}](${pageUrl}${href})`;
      }

      // Protocol-relative link: only the scheme is missing.
      if (href.startsWith("//")) {
        return `[${content}](${url.protocol}${href})`;
      }

      // Root-relative link.
      if (href.startsWith("/")) {
        return `[${content}](${url.origin}${href})`;
      }

      // Already absolute (http, https, mailto, tel, ...): keep as-is.
      if (hasScheme.test(href)) {
        return `[${content}](${href})`;
      }

      // Plain relative link: anchor it on the site origin.
      return `[${content}](${url.origin}/${href})`;
    },
  });

  const cleanDocument = parser.parseFromString(
    result.content,
    "text/html",
  );

  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("extracting recipe with openai");

  const recipe = await openai.extractRecipe(markdown);
  if (!recipe) return undefined;

  if ("errorMessages" in recipe) {
    throw new Error("Failed to extract recipe: " + recipe.errorMessages[0]);
  }

  return recipe;
}
|
|
|
|
|
|
|
|
/**
 * Downloads a page, extracts a recipe from it and stores it.
 *
 * Steps, each reported on `streamResponse`:
 *  1. fetch the HTML through `fetchHtmlWithPlaywright`;
 *  2. try every `application/ld+json` script via `parseJsonLdToRecipeSchema`,
 *     falling back to `extractUsingAI` when none yields a recipe;
 *  3. if the recipe has no image, pick the `<img>` with the largest declared
 *     width + height;
 *  4. download the cover image and store it with `createDocument`
 *     (best effort, failures are only logged);
 *  5. persist the recipe with `createRecipe` and emit its id.
 *
 * When no recipe can be extracted, a failure message is enqueued, the stream
 * is cancelled and the function returns without creating anything.
 *
 * @param fetchUrl - Absolute URL of the page to import.
 * @param streamResponse - Progress stream the status messages are written to.
 */
async function processCreateRecipeFromUrl(
  { fetchUrl, streamResponse }: {
    fetchUrl: string;
    streamResponse: ReturnType<typeof createStreamResponse>;
  },
) {
  log.info("create article from url", { url: fetchUrl });
  const url = new URL(fetchUrl);

  streamResponse.enqueue("downloading article");

  const html = await fetchHtmlWithPlaywright(fetchUrl, streamResponse);

  streamResponse.enqueue("download success");
  // Copy of the downloaded page for inspection. Intentionally not awaited,
  // but a failed write must not surface as an unhandled rejection.
  Deno.writeTextFile("article.html", html).catch((err) => log.error(err));

  const document = parser.parseFromString(html, "text/html");

  const title = document?.querySelector("title")?.innerText;

  const images = Array.from(
    document?.querySelectorAll("img") ?? [],
  ) as unknown as HTMLImageElement[];

  const metaAuthor =
    document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document?.querySelector('meta[name="author"]')?.getAttribute("content");

  // `?? []` keeps Array.from from throwing when the document failed to parse.
  const jsonLds = Array.from(
    document?.querySelectorAll(
      "script[type='application/ld+json']",
    ) ?? [],
  ) as unknown as HTMLScriptElement[];

  let recipe: z.infer<typeof recipeSchema> | undefined = undefined;
  // Structured data first: the first JSON-LD block that parses wins.
  for (const jsonLd of jsonLds) {
    recipe = parseJsonLdToRecipeSchema(jsonLd.textContent || "");
    if (recipe) break;
  }

  if (!recipe) {
    recipe = await extractUsingAI(url, document, streamResponse);
  }

  if (!recipe) {
    streamResponse.enqueue("failed to parse recipe");
    streamResponse.cancel();
    return;
  }

  // Collapse every run of dashes (not just the first one).
  const id = (recipe.title || title || "").replace(/--+/g, "-");

  if (!recipe.image) {
    // Declared size of an image; missing or non-numeric attributes count as 0.
    const declaredSize = (img: HTMLImageElement) =>
      (Number.parseInt(img.getAttribute("width") ?? "", 10) || 0) +
      (Number.parseInt(img.getAttribute("height") ?? "", 10) || 0);

    const largestImage = images
      .filter((img) => {
        const src = img.getAttribute("src");
        return !!src && !src.startsWith("data:");
      })
      .sort((a, b) => declaredSize(b) - declaredSize(a))[0];

    // The page may not contain any usable image at all.
    const src = largestImage?.getAttribute("src");
    if (src) {
      recipe.image = makeUrlAbsolute(url, src);
    }
  }

  const newRecipe: Recipe = {
    type: "recipe",
    id,
    name: recipe.title || title || "",
    description: recipe.description,
    ingredients: recipe.ingredients || [],
    instructions: recipe.instructions || [],
    notes: recipe.notes,
    tags: recipe.tags || [],
    meta: {
      image: recipe.image,
      time: recipe.totalTime ? `${recipe.totalTime} minutes` : undefined,
      link: fetchUrl,
      portion: recipe.servings,
      // `||` so an empty meta tag does not mask the recipe's own author.
      author: metaAuthor || recipe.author,
    },
  };

  if (newRecipe.meta?.image) {
    const src = makeUrlAbsolute(url, newRecipe.meta.image);
    if (src.length > 5) {
      streamResponse.enqueue("downloading image");
      try {
        // Inside the try: an unparseable image URL must not abort the import.
        const extension = fileExtension(new URL(src).pathname);
        const finalPath = `Media/articles/images/${
          safeFileName(id)
        }_cover.${extension}`;

        const res = await fetch(src);
        if (!res.ok) {
          // Do not store an error page as the cover image.
          throw new Error(`image request failed with status ${res.status}`);
        }
        streamResponse.enqueue("saving image");
        const buffer = await res.arrayBuffer();
        await createDocument(finalPath, buffer);
        newRecipe.meta.image = finalPath;
      } catch (err) {
        // Best effort: the recipe is still created, pointing at the remote image.
        console.log("Failed to save image", err);
      }
    }
  }

  streamResponse.enqueue("finished processing, creating file");

  await createRecipe(newRecipe.id, newRecipe);

  streamResponse.enqueue("id: " + newRecipe.id);
}
|
|
|
|
|
|
|
|
/**
 * Route handler for importing a recipe from an external page.
 *
 * GET expects a `url` query parameter. The import runs in the background
 * while the returned streaming response carries its progress messages.
 * Throws `AccessDeniedError` without a session and `BadRequestError` when the
 * `url` parameter is missing or rejected by `isValidUrl`.
 */
export const handler: Handlers = {
  GET(req, ctx) {
    // Only requests with a session may trigger an import.
    if (!ctx.state.session) {
      throw new AccessDeniedError();
    }

    const fetchUrl = new URL(req.url).searchParams.get("url");
    if (!fetchUrl || !isValidUrl(fetchUrl)) {
      throw new BadRequestError();
    }

    const streamResponse = createStreamResponse();

    // Not awaited on purpose: the response is returned right away and the
    // work reports through the stream, which is cancelled once it settles.
    processCreateRecipeFromUrl({ fetchUrl, streamResponse })
      .then((article) => {
        log.debug("created article from link", { article });
      })
      .catch((err) => {
        streamResponse.enqueue(`error creating recipe: ${err}`);
        log.error(err);
      })
      .finally(() => {
        streamResponse.cancel();
      });

    return streamResponse.response;
  },
};
|