254 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			TypeScript
		
	
	
	
	
	
			
		
		
	
	
			254 lines
		
	
	
		
			7.3 KiB
		
	
	
	
		
			TypeScript
		
	
	
	
	
	
| import { Handlers } from "$fresh/server.ts";
 | |
| import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
 | |
| import { DOMParser } from "domparser";
 | |
| import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
 | |
| import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
 | |
| import * as openai from "@lib/openai.ts";
 | |
| import tds from "https://cdn.skypack.dev/turndown@7.2.0";
 | |
| import { createLogger } from "@lib/log/index.ts";
 | |
| import { Recipe } from "@lib/resource/recipes.ts";
 | |
| import recipeSchema from "@lib/recipeSchema.ts";
 | |
| import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
 | |
| import { safeFileName } from "@lib/string.ts";
 | |
| import { createDocument } from "@lib/documents.ts";
 | |
| import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
 | |
| import z from "npm:zod";
 | |
| import { fetchHtmlWithPlaywright } from "@lib/playwright.ts";
 | |
| 
 | |
// Shared HTML parser. Used in this file to parse the downloaded page and to
// re-parse the cleaned-up article HTML produced by Readability.
const parser = new DOMParser();

// Module-scoped logger for this route.
const log = createLogger("api/article");
 | |
| 
 | |
/**
 * Turns a reference found in a page (e.g. an `<img src>`) into an absolute URL.
 *
 * - `http://` / `https://` references are returned unchanged.
 * - Everything else (root-relative `/a.png`, path-relative `a.png`,
 *   protocol-relative `//cdn/a.png`, other schemes such as `data:`) is
 *   resolved against `url` with the standard URL resolution algorithm.
 * - Blank or unparsable references are returned as-is so callers can decide
 *   what to do with them.
 *
 * @param url The URL of the page the reference was found on.
 * @param src The raw reference as written in the page.
 * @returns The absolute URL, or `src` unchanged if it cannot be resolved.
 */
function makeUrlAbsolute(url: URL, src: string): string {
  // Nothing to resolve; resolving "" would silently yield the page URL itself.
  if (src.trim() === "") {
    return src;
  }

  // Already absolute: keep the author's exact spelling (no normalisation).
  if (src.startsWith("https://") || src.startsWith("http://")) {
    return src;
  }

  try {
    return new URL(src, url).href;
  } catch {
    // Malformed reference (e.g. invalid host in a protocol-relative URL).
    return src;
  }
}
 | |
| 
 | |
/**
 * Extracts a recipe from a page by reducing it to readable markdown and
 * handing that markdown to `openai.extractRecipe`.
 *
 * Steps: Readability isolates the main content, Turndown converts it to
 * markdown (with custom rules that make image and link targets absolute),
 * and the markdown is sent to the OpenAI helper.
 *
 * @param url The URL the page was fetched from; used to resolve relative
 *   image and link targets.
 * @param document The parsed page, or `null` if parsing failed.
 * @param streamResponse Progress stream; a status message is enqueued before
 *   the OpenAI call.
 * @returns The extracted recipe, or `undefined` if the OpenAI helper returned
 *   nothing.
 * @throws Error if the document is missing, no readable content is found, or
 *   the OpenAI helper reports `errorMessages`.
 */
async function extractUsingAI(
  url: URL,
  document: Parameters<typeof Readability>[0] | null,
  streamResponse: ReturnType<typeof createStreamResponse>,
) {
  if (!document) {
    throw new Error("Failed to extract recipe: document could not be parsed");
  }

  const readable = new Readability(document);
  const result = readable.parse();

  // Readability yields null when it cannot identify an article body.
  if (!result?.content) {
    throw new Error("Failed to extract recipe: no readable content found");
  }

  // Resolves a link target against the page URL; leaves it untouched when it
  // is already an absolute http(s) URL or cannot be parsed at all.
  const resolveHref = (href: string): string => {
    if (href.startsWith("https://") || href.startsWith("http://")) {
      return href;
    }
    try {
      return new URL(href, url).href;
    } catch {
      return href;
    }
  };

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      // Inline data URIs would bloat the markdown without helping extraction.
      if (!src || src.startsWith("data:image")) return "";

      return `![${alt}](${makeUrlAbsolute(url, src)})`;
    },
  });

  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: function (content: string, node: HTMLAnchorElement) {
      const href = node.getAttribute("href");
      if (!href) return content;

      // In-page anchors with (almost) no text are dropped entirely.
      if (href.startsWith("#") && content.length < 2) return "";

      return `[${content}](${resolveHref(href)})`;
    },
  });

  const cleanDocument = parser.parseFromString(
    result.content,
    "text/html",
  );
  if (!cleanDocument) {
    throw new Error("Failed to extract recipe: readable content is not valid HTML");
  }

  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("extracting recipe with openai");

  const recipe = await openai.extractRecipe(markdown);
  if (!recipe) {
    return undefined;
  }

  if ("errorMessages" in recipe) {
    throw new Error("Failed to extract recipe: " + recipe.errorMessages[0]);
  }

  return recipe;
}
 | |
| 
 | |
/**
 * Downloads a page, extracts a recipe from it and reports progress on the
 * given stream.
 *
 * Extraction order: JSON-LD `<script type="application/ld+json">` blocks are
 * tried first; if none yields a recipe, `extractUsingAI` is used. When the
 * recipe has no image, the largest `<img>` on the page (by width + height
 * attributes) is used as a fallback.
 *
 * @param fetchUrl The page to import; must be a valid absolute URL.
 * @param streamResponse Progress stream that status messages are enqueued on.
 * @returns The assembled recipe, or `undefined` if no recipe could be
 *   extracted (in which case a failure message is enqueued and the stream is
 *   cancelled).
 */
async function processCreateRecipeFromUrl(
  { fetchUrl, streamResponse }: {
    fetchUrl: string;
    streamResponse: ReturnType<typeof createStreamResponse>;
  },
) {
  log.info("create article from url", { url: fetchUrl });
  const url = new URL(fetchUrl);

  streamResponse.enqueue("downloading article");

  const html = await fetchHtmlWithPlaywright(fetchUrl, streamResponse);

  streamResponse.enqueue("download success");

  // NOTE(review): this looks like a debugging dump into the working directory
  // (overwritten on every request) - consider removing. It is deliberately not
  // awaited; the catch keeps a failed write from becoming an unhandled
  // rejection.
  Deno.writeTextFile("article.html", html).catch((err) => {
    log.error(err);
  });

  const document = parser.parseFromString(html, "text/html");

  const title = document?.querySelector("title")?.innerText;

  const images = Array.from(
    document?.querySelectorAll("img") ?? [],
  ) as unknown as HTMLImageElement[];

  // `||` rather than `??` so an empty meta value falls through to the next
  // candidate.
  const metaAuthor =
    document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document?.querySelector('meta[name="author"]')?.getAttribute("content");

  const jsonLds = Array.from(
    document?.querySelectorAll(
      "script[type='application/ld+json']",
    ) ?? [],
  ) as unknown as HTMLScriptElement[];

  // Prefer structured data embedded in the page; fall back to AI extraction.
  let recipe: z.infer<typeof recipeSchema> | undefined = undefined;
  for (const jsonLd of jsonLds) {
    recipe = parseJsonLdToRecipeSchema(jsonLd.textContent || "");
    if (recipe) break;
  }

  if (!recipe) {
    recipe = await extractUsingAI(url, document, streamResponse);
  }

  if (!recipe) {
    streamResponse.enqueue("failed to parse recipe");
    // NOTE(review): the GET handler in this file also cancels the stream in
    // its `finally`; confirm `cancel()` tolerates being called twice.
    streamResponse.cancel();
    return;
  }

  // Collapse every run of dashes, not just the first one.
  const id = (recipe.title || title || "").replace(/--+/g, "-");

  if (!recipe.image) {
    // Size proxy from the declared attributes; non-numeric values count as 0.
    const declaredSize = (img: HTMLImageElement) =>
      (Number(img.getAttribute("width")) || 0) +
      (Number(img.getAttribute("height")) || 0);

    const largestImage = images
      .filter((img) => {
        const src = img.getAttribute("src");
        return !!src && !src.startsWith("data:");
      })
      .sort((a, b) => declaredSize(b) - declaredSize(a))[0];

    // The page may contain no usable image at all.
    const src = largestImage?.getAttribute("src");
    if (src) {
      recipe.image = makeUrlAbsolute(url, src);
    }
  }

  const newRecipe: Recipe = {
    type: "recipe",
    id,
    name: recipe.title || title || "",
    description: recipe.description,
    ingredients: recipe.ingredients || [],
    instructions: recipe.instructions || [],
    notes: recipe.notes,
    tags: recipe.tags || [],
    meta: {
      image: recipe.image,
      time: recipe.totalTime
        ? `${recipe.totalTime.toString()} minutes`
        : undefined,
      link: fetchUrl,
      portion: recipe.servings,
      author: metaAuthor || recipe.author,
    },
  };

  if (newRecipe.meta?.image) {
    const src = makeUrlAbsolute(url, newRecipe.meta.image);
    if (src.length > 5) {
      // Best effort: a bad image URL must not abort the whole import.
      try {
        const extension = fileExtension(new URL(src).pathname);
        const finalPath = `Media/articles/images/${
          safeFileName(id)
        }_cover.${extension}`;
        streamResponse.enqueue("downloading image");
        // NOTE(review): download and persistence are currently disabled, so
        // `meta.image` ends up pointing at a path that is never written.
        // const res = await fetch(src);
        streamResponse.enqueue("saving image");
        // const buffer = await res.arrayBuffer();
        // await createDocument(finalPath, buffer);
        newRecipe.meta.image = finalPath;
      } catch (err) {
        console.log("Failed to save image", err);
      }
    }
  }

  streamResponse.enqueue("finished processing, creating file");

  // await createRecipe(newRecipe.id, newRecipe);

  streamResponse.enqueue("id: " + newRecipe.id);

  return newRecipe;
}
 | |
| 
 | |
/**
 * Route handlers for importing a recipe from a URL.
 *
 * `GET ?url=<page>` requires a session, validates the `url` query parameter,
 * starts the import in the background and immediately returns a streaming
 * response that carries the progress messages.
 */
export const handler: Handlers = {
  GET(req, ctx) {
    if (!ctx.state.session) {
      throw new AccessDeniedError();
    }

    const fetchUrl = new URL(req.url).searchParams.get("url");
    if (!(fetchUrl && isValidUrl(fetchUrl))) {
      throw new BadRequestError();
    }

    const streamResponse = createStreamResponse();

    // Runs detached from the request: the response below is returned right
    // away while this task keeps feeding the stream.
    const runImport = async () => {
      try {
        const article = await processCreateRecipeFromUrl({
          fetchUrl,
          streamResponse,
        });
        log.debug("created article from link", { article });
      } catch (err) {
        streamResponse.enqueue(`error creating recipe: ${err}`);
        log.error(err);
      } finally {
        // Always close the stream, whether the import succeeded or failed.
        streamResponse.cancel();
      }
    };
    runImport();

    return streamResponse.response;
  },
};
 |