252 lines
6.7 KiB
TypeScript
252 lines
6.7 KiB
TypeScript
import { Handlers } from "$fresh/server.ts";
|
|
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
|
|
import { DOMParser } from "https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm.ts";
|
|
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
|
|
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
|
|
import * as openai from "@lib/openai.ts";
|
|
|
|
import tds from "https://cdn.skypack.dev/turndown@7.2.0";
|
|
import { Article, createArticle } from "@lib/resource/articles.ts";
|
|
import { getYoutubeVideoDetails } from "@lib/youtube.ts";
|
|
import { extractYoutubeId, isYoutubeLink } from "@lib/string.ts";
|
|
import { createLogger } from "@lib/log.ts";
|
|
|
|
// Shared deno-dom parser instance; processCreateArticle uses it to parse both
// the downloaded page and the cleaned-up HTML produced by Readability.
const parser = new DOMParser();
|
|
|
|
// Logger for this route, created under the name "api/article".
const log = createLogger("api/article");
|
|
|
|
/** Minimal view of a DOM element: this module only ever reads attributes. */
type AttributeSource = { getAttribute(name: string): string | null };

/**
 * Resolves `src` against `baseUrl` using standard URL resolution rules.
 *
 * Handles root-relative (`/a`), path-relative (`a/b`), protocol-relative
 * (`//host/a`), fragment-only (`#a`) and already-absolute references
 * (including non-http schemes such as `mailto:`).
 *
 * @param src The reference found in the document (an `href` or `src` value).
 * @param baseUrl Absolute URL of the page the reference was found on.
 * @returns The absolute URL, or `src` unchanged when it cannot be resolved.
 */
function toAbsoluteUrl(src: string, baseUrl: string): string {
  try {
    return new URL(src, baseUrl).href;
  } catch {
    // Malformed reference: keep the author's original text rather than failing
    // the whole conversion.
    return src;
  }
}

/** Parses a `width`/`height` attribute value; missing or invalid values count as 0. */
function parseImageDimension(value: string | null): number {
  return Math.max(0, Number.parseInt(value ?? "", 10) || 0);
}

/**
 * Picks the `src` of the image with the largest declared `width + height`.
 *
 * Images without a `src`, or with an inline `data:` source, are ignored.
 * On ties (for example when no image declares a size) the first image in
 * document order wins.
 *
 * @returns The raw `src` attribute of the chosen image, or `undefined` if
 * there is no usable image.
 */
function pickLargestImageSrc(
  images: readonly AttributeSource[],
): string | undefined {
  let bestSrc: string | undefined;
  let bestSize = -1;

  for (const img of images) {
    const src = img.getAttribute("src");
    if (!src || src.startsWith("data:")) continue;

    const size = parseImageDimension(img.getAttribute("width")) +
      parseImageDimension(img.getAttribute("height"));
    if (size > bestSize) {
      bestSize = size;
      bestSrc = src;
    }
  }

  return bestSrc;
}

/**
 * Turns a leading `@handle` into `twitter:handle`.
 *
 * Only a leading `@` is rewritten, so values such as e-mail addresses are
 * left intact.
 */
function normalizeAuthor(author: string): string {
  const trimmed = author.trim();
  return trimmed.startsWith("@") ? `twitter:${trimmed.slice(1)}` : trimmed;
}

/**
 * Downloads the page at `fetchUrl`, extracts its readable content, converts
 * it to markdown, enriches it via the openai helpers (tags, short title,
 * author) and stores it with `createArticle`.
 *
 * Progress messages are pushed to `streamResponse` along the way; the final
 * message is `"id: <article id>"`.
 *
 * @param fetchUrl Absolute URL of the page to import.
 * @param streamResponse Stream used to report progress to the client.
 * @returns The article that was stored.
 * @throws Error when the download fails (non-2xx status) or when the page
 * cannot be parsed into readable content.
 */
async function processCreateArticle(
  { fetchUrl, streamResponse }: {
    fetchUrl: string;
    streamResponse: ReturnType<typeof createStreamResponse>;
  },
): Promise<Article> {
  log.info("create article from url", { url: fetchUrl });

  streamResponse.enqueue("downloading article");

  const response = await fetch(fetchUrl);
  if (!response.ok) {
    // Release the unread body before bailing out.
    await response.body?.cancel();
    throw new Error(
      `failed to download ${fetchUrl}: HTTP ${response.status}`,
    );
  }
  const html = await response.text();

  streamResponse.enqueue("download success");

  // Relative references must be resolved against the URL the content was
  // actually served from, which differs from fetchUrl after redirects.
  const baseUrl = response.url || fetchUrl;

  const document = parser.parseFromString(html, "text/html");
  if (!document) {
    throw new Error(`could not parse html from ${fetchUrl}`);
  }

  const title = document.querySelector("title")?.innerText;

  // Collect images up front: Readability modifies the document it parses.
  const images: AttributeSource[] = [];
  document.querySelectorAll("img").forEach((img) => {
    images.push(img as unknown as AttributeSource);
  });

  const metaAuthor =
    document.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document.querySelector('meta[name="author"]')?.getAttribute("content");

  const result = new Readability(document).parse();
  if (!result) {
    throw new Error(`could not extract readable content from ${fetchUrl}`);
  }

  log.debug("parsed", {
    url: fetchUrl,
    content: result.textContent,
  });

  const cleanDocument = parser.parseFromString(
    result.content,
    "text/html",
  );

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: (_: string, node: AttributeSource) => {
      const src = node.getAttribute("src");
      // Inline data images would bloat the markdown; drop them.
      if (!src || src.startsWith("data:image")) return "";

      const alt = node.getAttribute("alt") || "";
      return `![${alt}](${toAbsoluteUrl(src, baseUrl)})`;
    },
  });

  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: (content: string, node: AttributeSource) => {
      const href = node.getAttribute("href");
      if (!href) return content;

      // In-page anchors with (almost) no text are dropped entirely.
      if (href.startsWith("#") && content.length < 2) return "";

      return `[${content}](${toAbsoluteUrl(href, baseUrl)})`;
    },
  });

  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("parsed article, creating tags with openai");

  const [tags, shortTitle, author] = await Promise.all([
    openai.createTags(markdown),
    title && openai.shortenTitle(title),
    metaAuthor || openai.extractAuthorName(markdown),
  ]);

  const meta: Article["meta"] = {
    author: normalizeAuthor(author || ""),
    link: fetchUrl,
    done: false,
    date: new Date(),
  };

  const largestImageSrc = pickLargestImageSrc(images);
  if (largestImageSrc) {
    meta.image = toAbsoluteUrl(largestImageSrc, baseUrl);
  }

  const newArticle: Article = {
    type: "article",
    id: shortTitle || title || "",
    name: title || "",
    content: markdown,
    tags: tags || [],
    meta,
  };

  streamResponse.enqueue("finished processing");

  await createArticle(newArticle.id, newArticle);

  streamResponse.enqueue("id: " + newArticle.id);

  return newArticle;
}
|
|
|
|
/**
 * Creates an article entry from a YouTube link.
 *
 * Looks up the video via `getYoutubeVideoDetails`, asks openai for a
 * shortened title to use as the article id (falling back to the full title),
 * and stores the result with `createArticle`. The video description becomes
 * the article content and at most the first five video tags are kept.
 *
 * Progress messages are pushed to `streamResponse`; the last one is
 * `"id: <article id>"`.
 *
 * @param fetchUrl The YouTube URL to import.
 * @param streamResponse Stream used to report progress to the client.
 */
async function processCreateYoutubeVideo(
  { fetchUrl, streamResponse }: {
    fetchUrl: string;
    streamResponse: ReturnType<typeof createStreamResponse>;
  },
) {
  log.info("create youtube article from url", { url: fetchUrl });

  streamResponse.enqueue("getting video infos from youtube api");
  const video = await getYoutubeVideoDetails(extractYoutubeId(fetchUrl));

  streamResponse.enqueue("shortening title with openai");
  const snippet = video.snippet;
  const shortenedTitle = await openai.shortenTitle(snippet.title);

  const meta: Article["meta"] = {
    done: false,
    link: fetchUrl,
    author: snippet.channelTitle,
    date: new Date(snippet.publishedAt),
  };

  const newArticle: Article = {
    type: "article",
    name: snippet.title,
    id: shortenedTitle || snippet.title,
    content: snippet.description,
    tags: snippet.tags?.slice(0, 5) || [],
    meta,
  };

  streamResponse.enqueue("creating article");
  await createArticle(newArticle.id, newArticle);

  for (const message of ["finished", "id: " + newArticle.id]) {
    streamResponse.enqueue(message);
  }
}
|
|
|
|
/**
 * GET handler: imports the page given in the `url` query parameter as an
 * article and streams progress messages back to the client.
 *
 * Throws `AccessDeniedError` when there is no session and `BadRequestError`
 * when `url` is missing or fails `isValidUrl`. The import itself runs in the
 * background: its errors are logged, not thrown, and the stream is cancelled
 * once the import settles.
 */
export const handler: Handlers = {
  GET(req, ctx) {
    if (!ctx.state.session) {
      throw new AccessDeniedError();
    }

    const fetchUrl = new URL(req.url).searchParams.get("url");
    if (!fetchUrl || !isValidUrl(fetchUrl)) {
      throw new BadRequestError();
    }

    const streamResponse = createStreamResponse();

    // YouTube links get their metadata from the YouTube API; everything else
    // is downloaded and parsed as a regular web page.
    const fromYoutube = isYoutubeLink(fetchUrl);
    const successMessage = fromYoutube
      ? "created article from youtube"
      : "created article from link";
    const job: Promise<unknown> = fromYoutube
      ? processCreateYoutubeVideo({ fetchUrl, streamResponse })
      : processCreateArticle({ fetchUrl, streamResponse });

    // Deliberately not awaited: the response must be returned right away so
    // the client can start reading progress messages.
    job
      .then((article) => {
        log.debug(successMessage, { article });
      })
      .catch((err) => {
        log.error(err);
      })
      .finally(() => {
        streamResponse.cancel();
      });

    return streamResponse.response;
  },
};
|