import { Handlers } from "$fresh/server.ts";
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
import { DOMParser } from "https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm.ts";
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
import * as openai from "@lib/openai.ts";
import tds from "https://cdn.skypack.dev/turndown@7.1.1";
import { Article, createArticle } from "@lib/resource/articles.ts";
import { getYoutubeVideoDetails } from "@lib/youtube.ts";
import { extractYoutubeId, isYoutubeLink } from "@lib/string.ts";

const parser = new DOMParser();

//service.use(gfm);

/** Progress stream handed from the request handler to the processors. */
type StreamResponse = ReturnType<typeof createStreamResponse>;

/** Arguments shared by both article processors. */
type ProcessOptions = {
  fetchUrl: string;
  streamResponse: StreamResponse;
};

/**
 * Resolves `ref` against `base` using standard URL resolution.
 *
 * This covers root-relative (`/a`), protocol-relative (`//host/a`),
 * path-relative (`a/b`), fragment-only (`#x`) and already-absolute references
 * (including non-http schemes such as `mailto:`), which remain absolute.
 *
 * @param ref  The raw `src`/`href` attribute value.
 * @param base The URL of the page the reference was found on.
 * @returns The absolute URL, or `ref` unchanged when it cannot be parsed.
 */
function toAbsoluteUrl(ref: string, base: URL): string {
  try {
    return new URL(ref, base).href;
  } catch {
    return ref;
  }
}

/**
 * Size heuristic for an image: the sum of its numeric `width` and `height`
 * attributes. Missing or non-numeric values (e.g. `"100%"`) count as 0.
 */
function imageSize(img: HTMLImageElement): number {
  const width = Number(img.getAttribute("width")) || 0;
  const height = Number(img.getAttribute("height")) || 0;
  return width + height;
}

/**
 * Returns the `src` of the largest image according to {@link imageSize},
 * skipping images without a `src` and inline `data:` images.
 * On ties the first image in the given order wins.
 */
function findLargestImageSrc(
  images: readonly HTMLImageElement[],
): string | undefined {
  let best: { src: string; size: number } | undefined;
  for (const img of images) {
    const src = img.getAttribute("src");
    if (!src || src.startsWith("data:")) continue;

    const size = imageSize(img);
    if (!best || size > best.size) {
      best = { src, size };
    }
  }
  return best?.src;
}

/**
 * Downloads the page at `fetchUrl`, extracts its readable content with
 * Readability, converts it to markdown with turndown, asks openai for tags,
 * a short title and (if no author meta tag exists) an author name, and stores
 * the result via `createArticle`. Progress messages are pushed onto
 * `streamResponse`; the final message is `"id: <article id>"`.
 *
 * @throws Error when the download fails with a non-OK status, or when the
 *   HTML cannot be parsed or no readable content can be extracted.
 */
async function processCreateArticle(
  { fetchUrl, streamResponse }: ProcessOptions,
): Promise<void> {
  console.log("[api/article] create article from url", { url: fetchUrl });

  streamResponse.enqueue("downloading article");

  const response = await fetch(fetchUrl);
  if (!response.ok) {
    // Do not store an error page as if it were the article.
    throw new Error(
      `failed to download article: ${response.status} ${response.statusText}`,
    );
  }
  const html = await response.text();

  streamResponse.enqueue("download success");

  const document = parser.parseFromString(html, "text/html");
  if (!document) {
    throw new Error("could not parse downloaded html");
  }

  const title = document.querySelector("title")?.innerText;

  const images: HTMLImageElement[] = [];
  document.querySelectorAll("img").forEach((img) => {
    // deno-dom yields generic nodes; only getAttribute() is used on them.
    images.push(img as unknown as HTMLImageElement);
  });

  const metaAuthor =
    document.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document.querySelector('meta[name="author"]')?.getAttribute("content");

  // Images are collected above, before Readability processes the document.
  const readable = new Readability(document);
  const result = readable.parse();
  if (!result) {
    throw new Error("could not extract readable content from page");
  }

  console.log("[api/article] parsed ", {
    url: fetchUrl,
    content: result.textContent,
  });

  const cleanDocument = parser.parseFromString(result.content, "text/html");
  if (!cleanDocument) {
    throw new Error("could not parse extracted article content");
  }

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  const url = new URL(fetchUrl);

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      // Inline data images would bloat the markdown; drop them.
      if (!src || src.startsWith("data:image")) return "";
      return `![${alt}](${toAbsoluteUrl(src, url)})`;
    },
  });

  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: function (content: string, node: HTMLAnchorElement) {
      const href = node.getAttribute("href");
      if (!href) return content;
      // Fragment links with (almost) no text are dropped entirely.
      if (href.startsWith("#") && content.length < 2) return "";
      return `[${content}](${toAbsoluteUrl(href, url)})`;
    },
  });

  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("parsed article, creating tags with openai");

  const [tags, shortTitle, author] = await Promise.all([
    openai.createTags(markdown),
    title && openai.shortenTitle(title),
    metaAuthor || openai.extractAuthorName(markdown),
  ]);

  const id = shortTitle || title || "";

  const meta: Article["meta"] = {
    author: (author || "").replace("@", "twitter:"),
    link: fetchUrl,
    status: "not-finished",
    date: new Date(),
  };

  const largestImageSrc = findLargestImageSrc(images);
  if (largestImageSrc) {
    meta.image = toAbsoluteUrl(largestImageSrc, url);
  }

  const newArticle = {
    type: "article",
    id,
    name: title || "",
    content: markdown,
    tags: tags || [],
    meta,
  } as const;

  streamResponse.enqueue("finished processing");

  await createArticle(newArticle);

  streamResponse.enqueue("id: " + newArticle.id);
}

/**
 * Creates an article from a YouTube link: looks the video up through
 * `getYoutubeVideoDetails`, shortens its title with openai to build the id,
 * and stores title, description, up to five tags and channel/publish metadata
 * via `createArticle`. Progress messages are pushed onto `streamResponse`;
 * the final message is `"id: <article id>"`.
 */
async function processCreateYoutubeVideo(
  { fetchUrl, streamResponse }: ProcessOptions,
): Promise<void> {
  console.log("[api/article] create youtube article from url", {
    url: fetchUrl,
  });

  streamResponse.enqueue("getting video infos from youtube api");

  const id = extractYoutubeId(fetchUrl);

  const video = await getYoutubeVideoDetails(id);

  streamResponse.enqueue("shortening title with openai");

  const newId = await openai.shortenTitle(video.snippet.title);

  const newArticle: Article = {
    name: video.snippet.title,
    id: newId || video.snippet.title,
    content: video.snippet.description,
    // NOTE(review): the YouTube API presumably omits `tags` for untagged
    // videos; guard so a missing list does not abort the import.
    tags: video.snippet.tags?.slice(0, 5) ?? [],
    meta: {
      status: "not-finished",
      link: fetchUrl,
      author: video.snippet.channelTitle,
      date: new Date(video.snippet.publishedAt),
    },
  };

  streamResponse.enqueue("creating article");

  await createArticle(newArticle);

  streamResponse.enqueue("finished");

  streamResponse.enqueue("id: " + newArticle.id);
}

/**
 * `GET ?url=<link>`: requires a session, validates the `url` query parameter,
 * starts the matching processor in the background and immediately returns the
 * streaming response that the processor reports its progress on.
 *
 * @throws AccessDeniedError when the request has no session.
 * @throws BadRequestError when `url` is missing or not a valid URL.
 */
export const handler: Handlers = {
  GET(req, ctx) {
    const session = ctx.state.session;
    if (!session) {
      throw new AccessDeniedError();
    }

    const url = new URL(req.url);
    const fetchUrl = url.searchParams.get("url");
    if (!fetchUrl || !isValidUrl(fetchUrl)) {
      throw new BadRequestError();
    }

    const streamResponse = createStreamResponse();

    const processUrl = isYoutubeLink(fetchUrl)
      ? processCreateYoutubeVideo
      : processCreateArticle;

    // Deliberately not awaited: the response must be returned right away so
    // the client can read progress messages while processing continues.
    processUrl({ fetchUrl, streamResponse })
      .catch((err: unknown) => {
        console.log(err);
      })
      .finally(() => {
        streamResponse.cancel();
      });

    return streamResponse.response;
  },
};