feat: allow creating articles with marka

This commit is contained in:
Max Richter
2025-10-31 15:26:34 +01:00
parent dfa3826ec5
commit 7e60327940
12 changed files with 332 additions and 210 deletions

View File

@@ -1,11 +1,9 @@
import { Handlers } from "$fresh/server.ts";
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
import { DOMParser } from "domparser";
import { Defuddle } from "defuddle/node";
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
import * as openai from "@lib/openai.ts";
import tds from "https://cdn.skypack.dev/turndown@7.2.0";
import { Article } from "@lib/resource/articles.ts";
import { getYoutubeVideoDetails } from "@lib/youtube.ts";
import {
@@ -14,8 +12,8 @@ import {
toUrlSafeString,
} from "@lib/string.ts";
import { createLogger } from "@lib/log/index.ts";
const parser = new DOMParser();
import { createResource } from "@lib/resources.ts";
import { webScrape } from "@lib/webScraper.ts";
const log = createLogger("api/article");
@@ -29,150 +27,49 @@ async function processCreateArticle(
streamResponse.enqueue("downloading article");
const request = await fetch(fetchUrl);
const html = await request.text();
const doc = await webScrape(fetchUrl, streamResponse);
streamResponse.enqueue("download success");
const document = parser.parseFromString(html, "text/html");
const title = document?.querySelector("title")?.innerText;
const images: HTMLImageElement[] = [];
document?.querySelectorAll("img").forEach((img) => {
images.push(img as unknown as HTMLImageElement);
const result = await Defuddle(doc, fetchUrl, {
markdown: true,
});
const metaAuthor =
document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
"content",
) ||
document?.querySelector('meta[name="author"]')?.getAttribute("content");
const readable = new Readability(document);
const result = readable.parse();
log.debug("parsed", {
log.debug("downloaded and parse parsed", {
url: fetchUrl,
content: result.textContent,
content: result.content,
});
const cleanDocument = parser.parseFromString(
result.content,
"text/html",
);
const service = new tds({
headingStyle: "atx",
codeBlockStyle: "fenced",
hr: "---",
bulletListMarker: "-",
});
const url = new URL(fetchUrl);
function makeUrlAbsolute(src: string) {
if (src.startsWith("//")) {
return "https:" + src;
}
if (src.startsWith("/")) {
return `${url.origin}${src.replace(/$\//, "")}`;
}
if (!src.startsWith("https://") && !src.startsWith("http://")) {
return `${url.origin.replace(/\/$/, "")}/${src.replace(/^\//, "")})`;
}
return src;
}
service.addRule("fix image links", {
filter: ["img"],
replacement: function (_: string, node: HTMLImageElement) {
const src = node.getAttribute("src");
const alt = node.getAttribute("alt") || "";
if (!src || src.startsWith("data:image")) return "";
return `![${alt}](${makeUrlAbsolute(src)})`;
},
});
service.addRule("fix normal links", {
filter: ["a"],
replacement: function (content: string, node: HTMLImageElement) {
const href = node.getAttribute("href");
if (!href) return content;
if (href.startsWith("/")) {
return `[${content}](${url.origin}${href.replace(/$\//, "")})`;
} else if (href.startsWith("//")) {
return `[${content}](https:${href})`;
} else if (href.startsWith("#")) {
if (content.length < 2) return "";
return `[${content}](${url.href}#${href})`.replace("##", "#");
} else {
return `[${content}](${url.origin.replace(/\/$/, "")}/${
href.replace(/^\//, "")
})`;
}
},
});
const markdown = service.turndown(cleanDocument);
streamResponse.enqueue("parsed article, creating tags with openai");
const [tags, shortTitle, author] = await Promise.all([
openai.createTags(markdown),
title && openai.shortenTitle(title),
metaAuthor || openai.extractAuthorName(markdown),
]);
const aiMeta = await openai.extractArticleMetadata(result.content);
console.log({ tags, shortTitle, author });
streamResponse.enqueue("postprocessing article");
const id = toUrlSafeString(shortTitle || title || "");
const title = result?.title || aiMeta?.headline || "";
const id = toUrlSafeString(title);
const meta: Article["meta"] = {
author: (author || "").replace("@", "twitter:"),
link: fetchUrl,
done: false,
date: new Date(),
};
const largestImage = images.filter((img) => {
const src = img.getAttribute("src");
return !!src && !src.startsWith("data:");
}).sort((a, b) => {
const aSize = +(a.getAttribute("width") || 0) +
+(a.getAttribute("height") || 0);
const bSize = +(b.getAttribute("width") || 0) +
+(b.getAttribute("height") || 0);
return aSize > bSize ? -1 : 1;
})[0];
const newArticle = {
type: "article",
id,
name: title || "",
content: markdown,
tags: tags || [],
meta,
const newArticle: Article = {
_type: "Article",
headline: title,
articleBody: result.content,
url: fetchUrl,
datePublished: result?.published || aiMeta?.datePublished ||
new Date().toISOString(),
image: result?.image,
author: {
_type: "Person",
name: (result.schemaOrgData?.author?.name || aiMeta?.author || "")
.replace(
"@",
"twitter:",
),
},
} as const;
if (largestImage) {
const src = makeUrlAbsolute(largestImage.getAttribute("src") || "");
if (src) {
meta.image = src;
}
}
streamResponse.enqueue("writing to disk");
// await createArticle(newArticle.id, newArticle);
await createResource(`articles/${id}.md`, newArticle);
streamResponse.enqueue("id: " + newArticle.id);
streamResponse.enqueue("id: " + id);
}
async function processCreateYoutubeVideo(
@@ -187,34 +84,34 @@ async function processCreateYoutubeVideo(
streamResponse.enqueue("getting video infos from youtube api");
const id = extractYoutubeId(fetchUrl);
const youtubeId = extractYoutubeId(fetchUrl);
const video = await getYoutubeVideoDetails(id);
const video = await getYoutubeVideoDetails(youtubeId);
streamResponse.enqueue("shortening title with openai");
const newId = await openai.shortenTitle(video.snippet.title);
const id = newId || youtubeId;
const newArticle: Article = {
type: "article",
name: video.snippet.title,
id: newId || video.snippet.title,
content: video.snippet.description,
tags: video.snippet?.tags?.slice(0, 5) || [],
meta: {
done: false,
link: fetchUrl,
author: video.snippet.channelTitle,
date: new Date(video.snippet.publishedAt),
_type: "Article",
headline: video.snippet.title,
articleBody: video.snippet.description,
url: fetchUrl,
datePublished: new Date(video.snippet.publishedAt).toISOString(),
author: {
_type: "Person",
name: video.snippet.channelTitle,
},
};
streamResponse.enqueue("creating article");
// await createArticle(newArticle.id, newArticle);
await createResource(`articles/${id}.md`, newArticle);
streamResponse.enqueue("finished");
streamResponse.enqueue("id: " + newArticle.id);
streamResponse.enqueue("id: " + id);
}
export const handler: Handlers = {