258 lines
7.4 KiB
TypeScript

import { Handlers } from "$fresh/server.ts";
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
import { DOMParser } from "domparser";
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
import * as openai from "@lib/openai.ts";
import tds from "https://cdn.skypack.dev/turndown@7.2.0";
import { createLogger } from "@lib/log/index.ts";
import { createRecipe, Recipe } from "@lib/resource/recipes.ts";
import recipeSchema, { isValidRecipe } from "@lib/recipeSchema.ts";
import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
import { safeFileName } from "@lib/string.ts";
import { createDocument } from "@lib/documents.ts";
import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
import z from "npm:zod";
import { fetchHtmlWithPlaywright } from "@lib/playwright.ts";
// Shared HTML parser: used for the downloaded page and again for the
// cleaned-up article markup that Readability produces.
const parser = new DOMParser();
// Logger for this route, created under the name "api/article".
const log = createLogger("api/article");
/**
 * Turns a possibly relative resource reference into an absolute URL.
 *
 * - Absolute `http://` / `https://` references are returned unchanged.
 * - Everything else (root-relative `/a.png`, protocol-relative `//cdn/a.png`,
 *   document-relative `a.png` or `../a.png`) is resolved against the page URL
 *   with the standard `URL` resolution algorithm, i.e. the way a browser
 *   would resolve it on that page.
 *
 * @param url URL of the page the reference was found on.
 * @param src Raw reference, e.g. the value of a `src` attribute.
 * @returns The absolute URL, or `src` unchanged when it cannot be resolved.
 */
function makeUrlAbsolute(url: URL, src: string): string {
  if (src.startsWith("https://") || src.startsWith("http://")) {
    // Already absolute: hand it back verbatim (no normalisation).
    return src;
  }
  try {
    return new URL(src, url).href;
  } catch {
    // Unresolvable reference: best effort, leave it as it was.
    return src;
  }
}
/**
 * Fallback extraction path: reduces the page to its readable article with
 * Readability, converts that article to Markdown with Turndown and passes the
 * Markdown to `openai.extractRecipe`.
 *
 * @param url URL of the page; used to make image and link targets absolute.
 * @param document Parsed page, or `null` when parsing produced nothing.
 * @param streamResponse Progress stream; receives a status line before the
 *   OpenAI call.
 * @returns The extracted recipe when `isValidRecipe` accepts it, otherwise
 *   `undefined` (also when there is no document or no readable article).
 */
async function extractUsingAI(
  url: URL,
  document: Parameters<typeof Readability>[0] | null,
  streamResponse: ReturnType<typeof createStreamResponse>,
) {
  // Nothing to extract from when the page could not be parsed.
  if (!document) return;

  // Readability's parse() yields null when it cannot find an article; bail
  // out instead of dereferencing it (and skip the OpenAI call for empty
  // content).
  const article = new Readability(document).parse();
  if (!article?.content) return;

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  // Images: drop inline data URIs, make every other source absolute.
  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      if (!src || src.startsWith("data:image")) return "";
      return `![${alt}](${makeUrlAbsolute(url, src)})`;
    },
  });

  // Links: make every target absolute relative to the page.
  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: function (content: string, node: HTMLAnchorElement) {
      const href = node.getAttribute("href");
      if (!href) return content;
      // In-page anchors with (almost) no text are dropped entirely.
      if (href.startsWith("#") && content.length < 2) return "";
      if (href.startsWith("https://") || href.startsWith("http://")) {
        return `[${content}](${href})`;
      }
      try {
        // Handles "/path", "#fragment", "//host/path" and "relative/path".
        return `[${content}](${new URL(href, url).href})`;
      } catch {
        // Unresolvable target: keep the link as written.
        return `[${content}](${href})`;
      }
    },
  });

  const cleanDocument = parser.parseFromString(
    article.content,
    "text/html",
  );
  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("extracting recipe with openai");
  const recipe = await openai.extractRecipe(markdown);
  if (isValidRecipe(recipe)) {
    return recipe;
  }
  return;
}
/**
 * Picks the image with the largest declared dimensions.
 *
 * Size is the sum of the `width` and `height` attributes; missing or
 * non-numeric attributes count as 0. Images without a `src` and inline
 * `data:` images are skipped. On a tie the first image in the list wins.
 *
 * @returns The raw `src` of the chosen image, or `undefined` if none qualifies.
 */
function findLargestImageSrc(images: HTMLImageElement[]): string | undefined {
  let bestSrc: string | undefined;
  let bestSize = -1;
  for (const img of images) {
    const src = img.getAttribute("src");
    if (!src || src.startsWith("data:")) continue;
    const size = (Number(img.getAttribute("width")) || 0) +
      (Number(img.getAttribute("height")) || 0);
    if (size > bestSize) {
      bestSrc = src;
      bestSize = size;
    }
  }
  return bestSrc;
}

/**
 * Downloads the cover image and stores it under `Media/articles/images/`.
 *
 * This is best effort: any failure (invalid URL, network error, non-2xx
 * response, storage error) is logged and reported as `undefined`.
 *
 * @returns The stored document path, or `undefined` when nothing was saved.
 */
async function saveCoverImage(
  src: string,
  id: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string | undefined> {
  // Same minimum-length guard the inline code used before downloading.
  if (src.length <= 5) return undefined;
  try {
    const extension = fileExtension(new URL(src).pathname);
    const finalPath = `Media/articles/images/${
      safeFileName(id)
    }_cover.${extension}`;
    streamResponse.enqueue("downloading image");
    const res = await fetch(src);
    if (!res.ok) {
      // Do not store an error page as the cover image.
      throw new Error(`unexpected response ${res.status} for ${src}`);
    }
    streamResponse.enqueue("saving image");
    const buffer = await res.arrayBuffer();
    await createDocument(finalPath, buffer);
    return finalPath;
  } catch (err) {
    console.error("Failed to save image", err);
    return undefined;
  }
}

/**
 * Downloads a page, extracts a recipe from it and stores the recipe.
 *
 * Extraction first tries every JSON-LD script on the page and falls back to
 * `extractUsingAI` when none of them yields a recipe. If the recipe has no
 * image, the largest `<img>` on the page is used. The image is then
 * downloaded and stored; if that fails, the original image URL is kept.
 *
 * Progress is reported through `streamResponse`. This function does not
 * close the stream; the route handler does that once the returned promise
 * settles.
 *
 * @param fetchUrl Absolute URL of the page to import.
 * @param streamResponse Progress stream written to while processing.
 * @returns The stored recipe, or `undefined` when no recipe could be
 *   extracted from the page.
 */
async function processCreateRecipeFromUrl(
  { fetchUrl, streamResponse }: {
    fetchUrl: string;
    streamResponse: ReturnType<typeof createStreamResponse>;
  },
): Promise<Recipe | undefined> {
  log.info("create article from url", { url: fetchUrl });
  const url = new URL(fetchUrl);

  streamResponse.enqueue("downloading article");
  const html = await fetchHtmlWithPlaywright(fetchUrl, streamResponse);
  streamResponse.enqueue("download success");

  // NOTE(review): this looks like a leftover debug dump into the working
  // directory — confirm it is still wanted. It is intentionally not awaited,
  // but a failed write must not surface as an unhandled rejection.
  Deno.writeTextFile("article.html", html).catch((err) => {
    log.error(err);
  });

  const document = parser.parseFromString(html, "text/html");
  const title = document?.querySelector("title")?.innerText;

  const images = Array.from(
    document?.querySelectorAll("img") ?? [],
  ) as unknown as HTMLImageElement[];

  const metaAuthor =
    document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document?.querySelector('meta[name="author"]')?.getAttribute("content");

  // `?? []` keeps Array.from from throwing when the page could not be parsed.
  const jsonLds = Array.from(
    document?.querySelectorAll(
      "script[type='application/ld+json']",
    ) ?? [],
  ) as unknown as HTMLScriptElement[];

  // Structured data first: the first JSON-LD block that parses wins.
  let recipe: z.infer<typeof recipeSchema> | undefined = undefined;
  for (const jsonLd of jsonLds) {
    recipe = parseJsonLdToRecipeSchema(jsonLd.textContent || "");
    if (recipe) break;
  }
  if (!recipe) {
    recipe = await extractUsingAI(url, document, streamResponse);
  }
  if (!recipe) {
    streamResponse.enqueue("failed to parse recipe");
    return;
  }

  // Collapse every run of dashes, not just the first one.
  const id = (recipe.title || title || "").replace(/--+/g, "-");

  if (!recipe.image) {
    const largestSrc = findLargestImageSrc(images);
    if (largestSrc) {
      recipe.image = makeUrlAbsolute(url, largestSrc);
    }
  }

  const newRecipe: Recipe = {
    type: "recipe",
    id,
    name: recipe.title || title || "",
    description: recipe.description,
    ingredients: recipe.ingredients || [],
    instructions: recipe.instructions || [],
    notes: recipe.notes,
    tags: recipe.tags || [],
    meta: {
      image: recipe.image,
      time: recipe.totalTime
        ? `${recipe.totalTime.toString()} minutes`
        : undefined,
      link: fetchUrl,
      portion: recipe.servings,
      // `||` so that an empty author meta tag does not hide the recipe's author.
      author: metaAuthor || recipe.author,
    },
  };

  if (newRecipe.meta?.image) {
    const storedPath = await saveCoverImage(
      makeUrlAbsolute(url, newRecipe.meta.image),
      id,
      streamResponse,
    );
    // On failure the original (remote) image reference is kept.
    if (storedPath) {
      newRecipe.meta.image = storedPath;
    }
  }

  streamResponse.enqueue("finished processing, creating file");
  await createRecipe(newRecipe.id, newRecipe);
  streamResponse.enqueue("id: " + newRecipe.id);
  return newRecipe;
}
export const handler: Handlers = {
  /**
   * Starts a recipe import for the page given in the `url` query parameter
   * and immediately returns the streaming response that reports progress.
   *
   * Throws `AccessDeniedError` when the request has no session and
   * `BadRequestError` when `url` is missing or rejected by `isValidUrl`.
   */
  GET(req, ctx) {
    if (!ctx.state.session) {
      throw new AccessDeniedError();
    }

    const fetchUrl = new URL(req.url).searchParams.get("url");
    if (!(fetchUrl && isValidUrl(fetchUrl))) {
      throw new BadRequestError();
    }

    const streamResponse = createStreamResponse();

    // The import keeps running after the response has been handed back; its
    // outcome is only reported through the stream and the log.
    const job = processCreateRecipeFromUrl({ fetchUrl, streamResponse });
    job
      .then((article) => {
        log.debug("created article from link", { article });
      })
      .catch((reason) => {
        streamResponse.enqueue(`error creating recipe: ${reason}`);
        log.error(reason);
      })
      .finally(() => {
        // Close the stream regardless of how the import ended.
        streamResponse.cancel();
      });

    return streamResponse.response;
  },
};