fix: hashtag extraction and make remote links absolute
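With this change the GET handler no longer blocks until the article is fully processed: it kicks off processCreateArticle in the background and immediately returns a plain-text progress stream built with createStreamResponse. A rough sketch of how a client could read that stream; the /api/article path is assumed from the "[api/article]" log prefix and is not spelled out in this diff:

// Hypothetical client: read the progress messages emitted via streamResponse.enqueue().
const res = await fetch(
  "/api/article?url=" + encodeURIComponent("https://example.com/some-post"),
);
const reader = res.body!.getReader();
const decoder = new TextDecoder();
while (true) {
  const { done, value } = await reader.read();
  if (done) break;
  // e.g. "downloading article", "download success", "finished processing", "id: ..."
  console.log(decoder.decode(value));
}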
@@ -2,25 +2,130 @@ import { Handlers } from "$fresh/server.ts";
 import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
 import { DOMParser } from "https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm.ts";
 import { BadRequestError } from "@lib/errors.ts";
-import { isValidUrl, json } from "@lib/helpers.ts";
+import { createStreamResponse, isValidUrl, json } from "@lib/helpers.ts";
 import * as openai from "@lib/openai.ts";
 
 import tds from "https://cdn.skypack.dev/turndown@7.1.1";
 //import { gfm } from "https://cdn.skypack.dev/@guyplusplus/turndown-plugin-gfm@1.0.7";
 import { createArticle } from "@lib/resource/articles.ts";
 
-const service = new tds({
-  headingStyle: "atx",
-  codeBlockStyle: "fenced",
-  hr: "---",
-  bulletListMarker: "-",
-});
 const parser = new DOMParser();
 
 //service.use(gfm);
 
+async function processCreateArticle(
+  { fetchUrl, streamResponse }: {
+    fetchUrl: string;
+    streamResponse: ReturnType<typeof createStreamResponse>;
+  },
+) {
+  console.log("[api/article] create article from url", { url: fetchUrl });
+
+  streamResponse.enqueue("downloading article");
+
+  const request = await fetch(fetchUrl);
+  const html = await request.text();
+
+  streamResponse.enqueue("download success");
+
+  const document = parser.parseFromString(html, "text/html");
+
+  const title = document?.querySelector("title")?.innerText;
+
+  const metaAuthor =
+    document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
+      "content",
+    ) ||
+    document?.querySelector('meta[name="author"]')?.getAttribute("content");
+
+  const readable = new Readability(document);
+
+  const result = readable.parse();
+
+  console.log("[api/article] parsed ", {
+    url: fetchUrl,
+    content: result.textContent,
+  });
+
+  const cleanDocument = parser.parseFromString(
+    result.content,
+    "text/html",
+  );
+
+  const service = new tds({
+    headingStyle: "atx",
+    codeBlockStyle: "fenced",
+    hr: "---",
+    bulletListMarker: "-",
+  });
+
+  const url = new URL(fetchUrl);
+  service.addRule("fix image links", {
+    filter: ["img"],
+    replacement: function (_: string, node: HTMLImageElement) {
+      const src = node.getAttribute("src");
+      const alt = node.getAttribute("alt") || "";
+      if (!src || src.startsWith("data:image")) return "";
+
+      if (src.startsWith("/")) {
+        return `![${alt}](${url.origin}${src})`;
+      }
+
+      return `![${alt}](${src})`;
+    },
+  });
+  service.addRule("fix normal links", {
+    filter: ["a"],
+    replacement: function (content: string, node: HTMLImageElement) {
+      const href = node.getAttribute("href");
+      if (!href) return content;
+
+      if (href.startsWith("/")) {
+        return `[${content}](${url.origin}${href.replace(/$\//, "")})`;
+      }
+
+      if (href.startsWith("#")) {
+        return `[${content}](${url.href}#${href})`.replace("##", "#");
+      }
+
+      return `[${content}](${href})`;
+    },
+  });
+
+  const markdown = service.turndown(cleanDocument);
+
+  streamResponse.enqueue("parsed article, creating tags with openai");
+
+  const [tags, shortTitle, author] = await Promise.all([
+    openai.createTags(markdown),
+    title && openai.shortenTitle(title),
+    metaAuthor || openai.extractAuthorName(markdown),
+  ]);
+
+  const id = shortTitle || title || "";
+
+  const newArticle = {
+    id,
+    name: title || "",
+    content: markdown,
+    tags: tags || [],
+    meta: {
+      author: (author || "").replace("@", "twitter:"),
+      link: fetchUrl,
+      status: "not-finished",
+      date: new Date(),
+    },
+  } as const;
+
+  streamResponse.enqueue("finished processing");
+
+  await createArticle(newArticle);
+
+  streamResponse.enqueue("id: " + newArticle.id);
+}
+
 export const handler: Handlers = {
-  async GET(req) {
+  GET(req) {
     const url = new URL(req.url);
     const fetchUrl = url.searchParams.get("url");
 
@@ -28,63 +133,16 @@ export const handler: Handlers = {
       throw new BadRequestError();
     }
 
-    console.log("[api/article] create article from url", { url: fetchUrl });
+    const streamResponse = createStreamResponse();
 
-    const request = await fetch(fetchUrl);
-    const html = await request.text();
-
-    const document = parser.parseFromString(html, "text/html");
-
-    const title = document?.querySelector("title")?.innerText;
-
-    const metaAuthor =
-      document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
-        "content",
-      ) ||
-      document?.querySelector('meta[name="author"]')?.getAttribute("content");
-
-    console.log({ metaAuthor });
-
-    const readable = new Readability(document);
-
-    const result = readable.parse();
-
-    console.log("[api/article] parsed ", {
-      url: fetchUrl,
-      content: result.textContent,
+    processCreateArticle({ fetchUrl, streamResponse }).then((article) => {
+      console.log({ article });
+    }).catch((err) => {
+      console.log(err);
+    }).finally(() => {
+      streamResponse.cancel();
     });
 
-    const cleanDocument = parser.parseFromString(
-      result.content,
-      "text/html",
-    );
-
-    const [tags, summary, shortTitle, author] = await Promise.all([
-      openai.createTags(result.textContent),
-      openai.summarize(result.textContent),
-      title && openai.shortenTitle(title),
-      metaAuthor || openai.extractAuthorName(result.textContent),
-    ]);
-
-    const markdown = service.turndown(cleanDocument);
-
-    const id = shortTitle || title || "";
-
-    const newArticle = {
-      id,
-      name: title || "",
-      content: markdown,
-      tags: tags || [],
-      meta: {
-        author: author || "",
-        link: fetchUrl,
-        status: "not-finished",
-        date: new Date(),
-      },
-    } as const;
-
-    await createArticle(newArticle);
-
-    return json(newArticle);
+    return streamResponse.response;
   },
 };
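Aside (not part of the commit): a minimal standalone sketch of what the link-fixing turndown rules above do, assuming an article fetched from https://example.com; the sample HTML and expected output are illustrative only.

import tds from "https://cdn.skypack.dev/turndown@7.1.1";
import { DOMParser } from "https://deno.land/x/deno_dom@v0.1.38/deno-dom-wasm.ts";

// Pretend the article was fetched from this address.
const url = new URL("https://example.com/posts/hello");

const service = new tds({ headingStyle: "atx", bulletListMarker: "-" });
service.addRule("fix normal links", {
  filter: ["a"],
  replacement: (content: string, node: any) => {
    const href = node.getAttribute("href");
    if (!href) return content;
    // Relative links become absolute; anchors point back into the source page.
    if (href.startsWith("/")) return `[${content}](${url.origin}${href})`;
    if (href.startsWith("#")) return `[${content}](${url.href}${href})`;
    return `[${content}](${href})`;
  },
});

const doc = new DOMParser().parseFromString(
  '<p>Read <a href="/docs">the docs</a> or jump to <a href="#setup">setup</a>.</p>',
  "text/html",
);
console.log(service.turndown(doc));
// Expected output (roughly):
// Read [the docs](https://example.com/docs) or jump to [setup](https://example.com/posts/hello#setup).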
routes/api/test.ts (new file, 30 lines)
@@ -0,0 +1,30 @@
+import { Handlers } from "$fresh/server.ts";
+
+function GET() {
+  let timer: number | undefined = undefined;
+  const body = new ReadableStream({
+    start(controller) {
+      timer = setInterval(() => {
+        const message = `It is ${new Date().toISOString()}\n`;
+        controller.enqueue(new TextEncoder().encode(message));
+      }, 1000);
+    },
+
+    cancel() {
+      if (timer !== undefined) {
+        clearInterval(timer);
+      }
+    },
+  });
+
+  return new Response(body, {
+    headers: {
+      "content-type": "text/plain",
+      "x-content-type-options": "nosniff",
+    },
+  });
+}
+
+export const handler: Handlers = {
+  GET,
+};
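The new routes/api/test.ts demonstrates the raw streaming pattern; createStreamResponse in @lib/helpers.ts is presumably a small wrapper around the same idea. A hypothetical shape, inferred only from how it is used in the article handler (enqueue, cancel, response), not from the actual helper:

// Hypothetical sketch of createStreamResponse(); the real @lib/helpers.ts may differ.
export function createStreamResponse() {
  let controller: ReadableStreamDefaultController<Uint8Array> | undefined;
  const encoder = new TextEncoder();

  const body = new ReadableStream<Uint8Array>({
    start(c) {
      controller = c;
    },
  });

  return {
    // Push one progress message to the client.
    enqueue(message: string) {
      controller?.enqueue(encoder.encode(message + "\n"));
    },
    // Close the stream once background processing has finished (called in .finally above).
    cancel() {
      controller?.close();
    },
    // Returned immediately from the GET handler.
    response: new Response(body, {
      headers: {
        "content-type": "text/plain",
        "x-content-type-options": "nosniff",
      },
    }),
  };
}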