feat: allow creating articles with marka
This commit is contained in:
@@ -1,11 +1,9 @@
|
||||
import { Handlers } from "$fresh/server.ts";
|
||||
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
|
||||
import { DOMParser } from "domparser";
|
||||
import { Defuddle } from "defuddle/node";
|
||||
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
|
||||
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
|
||||
import * as openai from "@lib/openai.ts";
|
||||
|
||||
import tds from "https://cdn.skypack.dev/turndown@7.2.0";
|
||||
import { Article } from "@lib/resource/articles.ts";
|
||||
import { getYoutubeVideoDetails } from "@lib/youtube.ts";
|
||||
import {
|
||||
@@ -14,8 +12,8 @@ import {
|
||||
toUrlSafeString,
|
||||
} from "@lib/string.ts";
|
||||
import { createLogger } from "@lib/log/index.ts";
|
||||
|
||||
const parser = new DOMParser();
|
||||
import { createResource } from "@lib/resources.ts";
|
||||
import { webScrape } from "@lib/webScraper.ts";
|
||||
|
||||
const log = createLogger("api/article");
|
||||
|
||||
@@ -29,150 +27,49 @@ async function processCreateArticle(
|
||||
|
||||
streamResponse.enqueue("downloading article");
|
||||
|
||||
const request = await fetch(fetchUrl);
|
||||
const html = await request.text();
|
||||
const doc = await webScrape(fetchUrl, streamResponse);
|
||||
|
||||
streamResponse.enqueue("download success");
|
||||
|
||||
const document = parser.parseFromString(html, "text/html");
|
||||
|
||||
const title = document?.querySelector("title")?.innerText;
|
||||
|
||||
const images: HTMLImageElement[] = [];
|
||||
document?.querySelectorAll("img").forEach((img) => {
|
||||
images.push(img as unknown as HTMLImageElement);
|
||||
const result = await Defuddle(doc, fetchUrl, {
|
||||
markdown: true,
|
||||
});
|
||||
|
||||
const metaAuthor =
|
||||
document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
|
||||
"content",
|
||||
) ||
|
||||
document?.querySelector('meta[name="author"]')?.getAttribute("content");
|
||||
|
||||
const readable = new Readability(document);
|
||||
|
||||
const result = readable.parse();
|
||||
|
||||
log.debug("parsed", {
|
||||
log.debug("downloaded and parse parsed", {
|
||||
url: fetchUrl,
|
||||
content: result.textContent,
|
||||
content: result.content,
|
||||
});
|
||||
|
||||
const cleanDocument = parser.parseFromString(
|
||||
result.content,
|
||||
"text/html",
|
||||
);
|
||||
|
||||
const service = new tds({
|
||||
headingStyle: "atx",
|
||||
codeBlockStyle: "fenced",
|
||||
hr: "---",
|
||||
bulletListMarker: "-",
|
||||
});
|
||||
|
||||
const url = new URL(fetchUrl);
|
||||
|
||||
function makeUrlAbsolute(src: string) {
|
||||
if (src.startsWith("//")) {
|
||||
return "https:" + src;
|
||||
}
|
||||
|
||||
if (src.startsWith("/")) {
|
||||
return `${url.origin}${src.replace(/$\//, "")}`;
|
||||
}
|
||||
|
||||
if (!src.startsWith("https://") && !src.startsWith("http://")) {
|
||||
return `${url.origin.replace(/\/$/, "")}/${src.replace(/^\//, "")})`;
|
||||
}
|
||||
|
||||
return src;
|
||||
}
|
||||
|
||||
service.addRule("fix image links", {
|
||||
filter: ["img"],
|
||||
replacement: function (_: string, node: HTMLImageElement) {
|
||||
const src = node.getAttribute("src");
|
||||
const alt = node.getAttribute("alt") || "";
|
||||
if (!src || src.startsWith("data:image")) return "";
|
||||
|
||||
return `})`;
|
||||
},
|
||||
});
|
||||
service.addRule("fix normal links", {
|
||||
filter: ["a"],
|
||||
replacement: function (content: string, node: HTMLImageElement) {
|
||||
const href = node.getAttribute("href");
|
||||
if (!href) return content;
|
||||
|
||||
if (href.startsWith("/")) {
|
||||
return `[${content}](${url.origin}${href.replace(/$\//, "")})`;
|
||||
} else if (href.startsWith("//")) {
|
||||
return `[${content}](https:${href})`;
|
||||
} else if (href.startsWith("#")) {
|
||||
if (content.length < 2) return "";
|
||||
return `[${content}](${url.href}#${href})`.replace("##", "#");
|
||||
} else {
|
||||
return `[${content}](${url.origin.replace(/\/$/, "")}/${
|
||||
href.replace(/^\//, "")
|
||||
})`;
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
const markdown = service.turndown(cleanDocument);
|
||||
|
||||
streamResponse.enqueue("parsed article, creating tags with openai");
|
||||
|
||||
const [tags, shortTitle, author] = await Promise.all([
|
||||
openai.createTags(markdown),
|
||||
title && openai.shortenTitle(title),
|
||||
metaAuthor || openai.extractAuthorName(markdown),
|
||||
]);
|
||||
const aiMeta = await openai.extractArticleMetadata(result.content);
|
||||
|
||||
console.log({ tags, shortTitle, author });
|
||||
streamResponse.enqueue("postprocessing article");
|
||||
|
||||
const id = toUrlSafeString(shortTitle || title || "");
|
||||
const title = result?.title || aiMeta?.headline || "";
|
||||
const id = toUrlSafeString(title);
|
||||
|
||||
const meta: Article["meta"] = {
|
||||
author: (author || "").replace("@", "twitter:"),
|
||||
link: fetchUrl,
|
||||
done: false,
|
||||
date: new Date(),
|
||||
};
|
||||
|
||||
const largestImage = images.filter((img) => {
|
||||
const src = img.getAttribute("src");
|
||||
return !!src && !src.startsWith("data:");
|
||||
}).sort((a, b) => {
|
||||
const aSize = +(a.getAttribute("width") || 0) +
|
||||
+(a.getAttribute("height") || 0);
|
||||
const bSize = +(b.getAttribute("width") || 0) +
|
||||
+(b.getAttribute("height") || 0);
|
||||
return aSize > bSize ? -1 : 1;
|
||||
})[0];
|
||||
|
||||
const newArticle = {
|
||||
type: "article",
|
||||
id,
|
||||
name: title || "",
|
||||
content: markdown,
|
||||
tags: tags || [],
|
||||
meta,
|
||||
const newArticle: Article = {
|
||||
_type: "Article",
|
||||
headline: title,
|
||||
articleBody: result.content,
|
||||
url: fetchUrl,
|
||||
datePublished: result?.published || aiMeta?.datePublished ||
|
||||
new Date().toISOString(),
|
||||
image: result?.image,
|
||||
author: {
|
||||
_type: "Person",
|
||||
name: (result.schemaOrgData?.author?.name || aiMeta?.author || "")
|
||||
.replace(
|
||||
"@",
|
||||
"twitter:",
|
||||
),
|
||||
},
|
||||
} as const;
|
||||
|
||||
if (largestImage) {
|
||||
const src = makeUrlAbsolute(largestImage.getAttribute("src") || "");
|
||||
if (src) {
|
||||
meta.image = src;
|
||||
}
|
||||
}
|
||||
|
||||
streamResponse.enqueue("writing to disk");
|
||||
|
||||
// await createArticle(newArticle.id, newArticle);
|
||||
await createResource(`articles/${id}.md`, newArticle);
|
||||
|
||||
streamResponse.enqueue("id: " + newArticle.id);
|
||||
streamResponse.enqueue("id: " + id);
|
||||
}
|
||||
|
||||
async function processCreateYoutubeVideo(
|
||||
@@ -187,34 +84,34 @@ async function processCreateYoutubeVideo(
|
||||
|
||||
streamResponse.enqueue("getting video infos from youtube api");
|
||||
|
||||
const id = extractYoutubeId(fetchUrl);
|
||||
const youtubeId = extractYoutubeId(fetchUrl);
|
||||
|
||||
const video = await getYoutubeVideoDetails(id);
|
||||
const video = await getYoutubeVideoDetails(youtubeId);
|
||||
|
||||
streamResponse.enqueue("shortening title with openai");
|
||||
const newId = await openai.shortenTitle(video.snippet.title);
|
||||
|
||||
const id = newId || youtubeId;
|
||||
|
||||
const newArticle: Article = {
|
||||
type: "article",
|
||||
name: video.snippet.title,
|
||||
id: newId || video.snippet.title,
|
||||
content: video.snippet.description,
|
||||
tags: video.snippet?.tags?.slice(0, 5) || [],
|
||||
meta: {
|
||||
done: false,
|
||||
link: fetchUrl,
|
||||
author: video.snippet.channelTitle,
|
||||
date: new Date(video.snippet.publishedAt),
|
||||
_type: "Article",
|
||||
headline: video.snippet.title,
|
||||
articleBody: video.snippet.description,
|
||||
url: fetchUrl,
|
||||
datePublished: new Date(video.snippet.publishedAt).toISOString(),
|
||||
author: {
|
||||
_type: "Person",
|
||||
name: video.snippet.channelTitle,
|
||||
},
|
||||
};
|
||||
|
||||
streamResponse.enqueue("creating article");
|
||||
|
||||
// await createArticle(newArticle.id, newArticle);
|
||||
await createResource(`articles/${id}.md`, newArticle);
|
||||
|
||||
streamResponse.enqueue("finished");
|
||||
|
||||
streamResponse.enqueue("id: " + newArticle.id);
|
||||
streamResponse.enqueue("id: " + id);
|
||||
}
|
||||
|
||||
export const handler: Handlers = {
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import { Handlers } from "$fresh/server.ts";
|
||||
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
|
||||
import { DOMParser } from "domparser";
|
||||
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
|
||||
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
|
||||
import * as openai from "@lib/openai.ts";
|
||||
@@ -10,13 +9,10 @@ import { Recipe } from "@lib/resource/recipes.ts";
|
||||
import recipeSchema from "@lib/recipeSchema.ts";
|
||||
import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
|
||||
import { safeFileName } from "@lib/string.ts";
|
||||
import { createDocument } from "@lib/documents.ts";
|
||||
import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
|
||||
import z from "npm:zod";
|
||||
import z from "zod";
|
||||
import { fetchHtmlWithPlaywright } from "@lib/playwright.ts";
|
||||
|
||||
const parser = new DOMParser();
|
||||
|
||||
const log = createLogger("api/article");
|
||||
|
||||
function makeUrlAbsolute(url: URL, src: string) {
|
||||
@@ -49,7 +45,7 @@ async function extractUsingAI(
|
||||
|
||||
service.addRule("fix image links", {
|
||||
filter: ["img"],
|
||||
replacement: function (_: string, node: HTMLImageElement) {
|
||||
replacement: function(_: string, node: HTMLImageElement) {
|
||||
const src = node.getAttribute("src");
|
||||
const alt = node.getAttribute("alt") || "";
|
||||
if (!src || src.startsWith("data:image")) return "";
|
||||
@@ -59,7 +55,7 @@ async function extractUsingAI(
|
||||
});
|
||||
service.addRule("fix normal links", {
|
||||
filter: ["a"],
|
||||
replacement: function (content: string, node: HTMLImageElement) {
|
||||
replacement: function(content: string, node: HTMLImageElement) {
|
||||
const href = node.getAttribute("href");
|
||||
if (!href) return content;
|
||||
|
||||
@@ -73,9 +69,8 @@ async function extractUsingAI(
|
||||
}
|
||||
|
||||
if (!href.startsWith("https://") && !href.startsWith("http://")) {
|
||||
return `[${content}](${url.origin.replace(/\/$/, "")}/${
|
||||
href.replace(/^\//, "")
|
||||
})`;
|
||||
return `[${content}](${url.origin.replace(/\/$/, "")}/${href.replace(/^\//, "")
|
||||
})`;
|
||||
}
|
||||
|
||||
return `[${content}](${href})`;
|
||||
@@ -199,9 +194,8 @@ async function processCreateRecipeFromUrl(
|
||||
const src = makeUrlAbsolute(url, newRecipe.meta.image);
|
||||
if (src?.length > 5) {
|
||||
const extension = fileExtension(new URL(src).pathname);
|
||||
const finalPath = `Media/articles/images/${
|
||||
safeFileName(id)
|
||||
}_cover.${extension}`;
|
||||
const finalPath = `Media/articles/images/${safeFileName(id)
|
||||
}_cover.${extension}`;
|
||||
streamResponse.enqueue("downloading image");
|
||||
try {
|
||||
streamResponse.enqueue("downloading image");
|
||||
|
||||
Reference in New Issue
Block a user