feat: allow creating articles with marka
@@ -26,9 +26,11 @@
    "@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.5.1",
    "@std/http": "jsr:@std/http@^1.0.12",
    "@std/yaml": "jsr:@std/yaml@^1.0.5",
    "defuddle": "npm:defuddle@^0.6.6",
    "drizzle-kit": "npm:drizzle-kit@^0.30.1",
    "drizzle-orm": "npm:drizzle-orm@^0.38.3",
    "fuzzysort": "npm:fuzzysort@^3.1.0",
    "jsdom": "npm:jsdom@^24.1.3",
    "playwright": "npm:playwright@^1.49.1",
    "playwright-extra": "npm:playwright-extra@^4.3.6",
    "preact": "https://esm.sh/preact@10.22.0",
@@ -43,7 +45,7 @@
    "tsx": "npm:tsx@^4.19.2",
    "yaml": "https://deno.land/std@0.197.0/yaml/mod.ts",
    "zod": "npm:zod@^3.24.1",
    "domparser": "https://deno.land/x/deno_dom@v0.1.48/deno-dom-wasm.ts",
    "domparser": "https://deno.land/x/deno_dom@v0.1.56/deno-dom-wasm.ts",
    "fs": "https://deno.land/std/fs/mod.ts",
    "imagemagick": "https://deno.land/x/imagemagick_deno@0.0.31/mod.ts"
  },

@@ -17,6 +17,8 @@ export const GITEA_REDIRECT_URL = Deno.env.get("GITEA_REDIRECT_URL");
const duration = Deno.env.get("SESSION_DURATION");
export const SESSION_DURATION = duration ? +duration : (60 * 60 * 24);

export const MARKA_API_KEY = Deno.env.get("MARKA_API_KEY");

export const JWT_SECRET = Deno.env.get("JWT_SECRET");

export const DATA_DIR = Deno.env.has("DATA_DIR")

@@ -103,6 +103,12 @@ export function debounce<T extends (...args: Parameters<T>) => void>(

export function parseRating(rating: string | number) {
  if (typeof rating === "string") {
    const parsed = parseInt(rating);
    // parseInt does not throw; it returns NaN for non-numeric strings,
    // so fall back to counting star emoji in that case.
    if (!Number.isNaN(parsed)) return parsed;

    return [...rating.matchAll(/⭐/g)].length;
  }
  return rating;
}

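A quick usage sketch of the new parseRating helper (import path assumed from the @lib/helpers.ts alias used elsewhere in this diff):

import { parseRating } from "@lib/helpers.ts";

parseRating(4);       // 4 – numbers pass through unchanged
parseRating("3");     // 3 – numeric strings are parsed with parseInt
parseRating("⭐⭐⭐"); // 3 – non-numeric strings fall back to counting star emoji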
@@ -4,6 +4,7 @@ import { OPENAI_API_KEY } from "@lib/env.ts";
import { hashString } from "@lib/helpers.ts";
import { createCache } from "@lib/cache.ts";
import { recipeResponseSchema } from "@lib/recipeSchema.ts";
import { articleMetadataSchema } from "./resource/articles.ts";

const openAI = OPENAI_API_KEY && new OpenAI({ apiKey: OPENAI_API_KEY });

@@ -32,8 +33,7 @@ export async function summarize(content: string) {
      {
        role: "user",
        content:
          `Please summarize the article in one sentence as short as possible: ${
            content.slice(0, 2000)
          `Please summarize the article in one sentence as short as possible: ${content.slice(0, 2000)
        }`,
      },
    ],
@@ -100,8 +100,7 @@ export async function createGenres(
      {
        role: "system",
        content:
          `you create some keywords that can be used in a recommendation system. The keywords are based on a ${type} description or title. If you do not know the title, take into account the description as well. Create a range of keywords from very specific ones that describe the general vibe. ${
            title ? `The name of the ${type} is ${title}` : ""
          `you create some keywords that can be used in a recommendation system. The keywords are based on a ${type} description or title. If you do not know the title, take into account the description as well. Create a range of keywords from very specific ones that describe the general vibe. ${title ? `The name of the ${type} is ${title}` : ""
        }. Return a list of around 20 keywords separated by commas`,
      },
      {
@@ -165,8 +164,7 @@ export const getMovieRecommendations = async (

  ${keywords}

  The movies should be similar to but not include ${
    exclude.join(", ")
  The movies should be similar to but not include ${exclude.join(", ")
  } or remakes of that.

  respond with a plain unordered list each item starting with the year the movie was released and then the title of the movie separated by a -`,
@@ -229,6 +227,28 @@ export async function extractRecipe(content: string) {
  return recipeResponseSchema.parse(completion.choices[0].message.parsed);
}

export async function extractArticleMetadata(content: string) {
  if (!openAI) return;
  const completion = await openAI.beta.chat.completions.parse({
    model: "gpt-4o-2024-08-06",
    temperature: 0.1,
    messages: [
      {
        role: "system",
        content:
          "Extract the article information from the provided markdown. If the specified data is not available return undefined for the data values.",
      },
      { role: "user", content },
    ],
    response_format: zodResponseFormat(
      articleMetadataSchema,
      "article-meta-v2",
    ),
  });

  return articleMetadataSchema.parse(completion.choices[0].message.parsed);
}

export async function transcribe(
  mp3Data: Uint8Array,
): Promise<string | undefined> {

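For orientation, a minimal sketch of calling the new extractArticleMetadata helper; it returns undefined when no OpenAI key is configured, and the markdown argument here is only a placeholder:

import * as openai from "@lib/openai.ts";

const meta = await openai.extractArticleMetadata("# Example article\n\nBody text...");
if (meta) {
  // Fields that could not be extracted come back as null (see articleMetadataSchema).
  console.log(meta.headline, meta.author, meta.datePublished, meta.keywords);
}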
@@ -3,13 +3,6 @@ import { createStreamResponse } from "@lib/helpers.ts";
import StealthPlugin from "npm:puppeteer-extra-plugin-stealth";
import * as env from "@lib/env.ts";

const userAgentStrings = [
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.2227.0 Safari/537.36",
  "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.3497.92 Safari/537.36",
  "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
];

firefox.use(StealthPlugin());

export async function fetchHtmlWithPlaywright(

@@ -17,8 +17,6 @@ export const IngredientGroupSchema = z.object({
export type IngredientGroup = z.infer<typeof IngredientGroupSchema>;

const recipeSchema = z.object({
  name: z.string(),
  content: z.object({
    _type: z.literal("Recipe"),
    name: z.string().describe(
      "Title of the Recipe, without the name of the website or author",
@@ -31,13 +29,12 @@ const recipeSchema = z.object({
      _type: z.literal("Person"),
      name: z.string().describe("author of the Recipe (optional)"),
    }),
    recipeEngredient: z.array(z.string())
    recipeIngredient: z.array(z.string())
      .describe("List of ingredients"),
    recipeInstructions: z.array(z.string()).describe("List of instructions"),
    recipeYield: z.number().describe("Amount of Portions"),
    prepTime: z.number().describe("Preparation time in minutes"),
    cookTime: z.number().describe("Cooking time in minutes"),
  }),
});

export type Recipe = z.infer<typeof recipeSchema>;

@@ -1,17 +1,30 @@
import { z } from "zod";
export type Article = {
  id: string;
  type: "article";
  content: string;
  name: string;
  tags: string[];
  meta: {
    done?: boolean;
    date: Date;
    link: string;
    thumbnail?: string;
    average?: string;
  _type: "Article";
  headline?: string;
  datePublished?: string;
  articleBody?: string;
  keywords?: string[];
  image?: string;
  author?: string;
  rating?: number;
  url?: string;
  reviewRating?: {
    bestRating?: number;
    worstRating?: number;
    ratingValue?: number;
  };
  author?: {
    _type: "Person";
    name?: string;
  };
};

export const articleMetadataSchema = z.object({
  headline: z.union([z.null(), z.string()]).describe("Headline of the article"),
  author: z.union([z.null(), z.string()]).describe("Author of the article"),
  datePublished: z.union([z.null(), z.string()]).describe(
    "Date the article was published",
  ),
  keywords: z.union([z.null(), z.array(z.string())]).describe(
    "Keywords for the article",
  ),
});

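A small sketch of what articleMetadataSchema accepts; per the unions above, values that could not be extracted are expressed as null rather than omitted keys:

import { articleMetadataSchema } from "@lib/resource/articles.ts";

const meta = articleMetadataSchema.parse({
  headline: "Example headline",
  author: null,
  datePublished: "2024-01-01",
  keywords: ["deno", "fresh"],
});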
@@ -1,3 +1,5 @@
import { MARKA_API_KEY } from "./env.ts";

export const resources = {
  "home": {
    emoji: "House with Garden.png",
@@ -31,13 +33,37 @@ export const resources = {
  },
} as const;

const url = `https://marka.max-richter.dev/resources`;
//const url = "http://localhost:8080/resources";

export async function fetchResource(resource: string) {
  try {
    const response = await fetch(
      `https://marka.max-richter.dev/resources/${resource}`,
      `${url}/${resource}`,
    );
    return response.json();
  } catch (_e) {
    return [];
  }
}

export async function createResource(
  path: string,
  content: string | object | ArrayBuffer,
) {
  const isJson = typeof content === "object";
  const fetchUrl = `${url}/${path}`;
  console.log("Creating resource", { fetchUrl, content, isJson });
  const response = await fetch(fetchUrl, {
    method: "POST",
    headers: {
      "Content-Type": isJson ? "application/json" : "",
      "Authentication": MARKA_API_KEY,
    },
    body: isJson ? JSON.stringify(content) : content,
  });
  if (!response.ok) {
    throw new Error(`Failed to create resource: ${response.status}`);
  }
  return response.json();
}

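A rough usage sketch of the new marka client helpers (resource paths and payload are illustrative; createResource serializes objects as JSON and authenticates with MARKA_API_KEY):

import { createResource, fetchResource } from "@lib/resources.ts";

// List resources under "articles"; network errors resolve to an empty array.
const articles = await fetchResource("articles");

// Create a markdown resource; object bodies are JSON.stringified.
await createResource("articles/example.md", {
  name: "Example",
  content: "Some markdown...",
});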
@@ -67,7 +67,7 @@ export async function searchResource(

  for (const resource of resources) {
    if (
      !(resource.id in results) &&
      !(resource.name in results) &&
      tags?.length && resource.tags.length &&
      tags.every((t) => resource.tags.includes(t))
    ) {

@@ -1,2 +1,174 @@
export function webScrape(url: URL) {
import { JSDOM } from "jsdom";
import { fetchHtmlWithPlaywright } from "./playwright.ts";
import { createStreamResponse } from "./helpers.ts";

/**
 * Mutates the given JSDOM instance: rewrites all relevant URL-bearing attributes
 * to absolute URLs, resolving against the provided domain (e.g., "https://example.com").
 */
export function absolutizeDomUrls(dom: JSDOM, domain: string): void {
  const { document } = dom.window;
  const base = toBase(domain);

  const rewrite = (selector: string, attr: string) => {
    document.querySelectorAll<HTMLElement>(selector).forEach((el) => {
      const v = el.getAttribute(attr);
      if (!v) return;
      const abs = toAbsolute(v, base);
      if (abs !== v) el.setAttribute(attr, abs);
    });
  };

  // Common URL attributes
  rewrite("a[href]", "href");
  rewrite("area[href]", "href");
  rewrite("link[href]", "href");
  rewrite("use[href]", "href"); // SVG 2
  rewrite("use[xlink\\:href]", "xlink:href"); // legacy SVG
  rewrite("image[href]", "href"); // SVG
  rewrite("image[xlink\\:href]", "xlink:href"); // legacy SVG

  rewrite("script[src]", "src");
  rewrite("img[src]", "src");
  rewrite("source[src]", "src");
  rewrite("track[src]", "src");
  rewrite("iframe[src]", "src");
  rewrite("embed[src]", "src");
  rewrite("audio[src]", "src");
  rewrite("video[src]", "src");
  rewrite("object[data]", "data");
  rewrite("input[src]", "src");
  rewrite("form[action]", "action");
  rewrite("video[poster]", "poster");

  // srcset (img, source)
  document
    .querySelectorAll<HTMLElement>("img[srcset], source[srcset]")
    .forEach((el) => {
      const v = el.getAttribute("srcset");
      if (!v) return;
      const abs = absolutizeSrcset(v, base);
      if (abs !== v) el.setAttribute("srcset", abs);
    });

  // Inline CSS in style attributes: url(...)
  document.querySelectorAll<HTMLElement>("[style]").forEach((el) => {
    const v = el.getAttribute("style");
    if (!v) return;
    const abs = absolutizeCssUrls(v, base);
    if (abs !== v) el.setAttribute("style", abs);
  });

  // <style> blocks (inline CSS): url(...)
  document.querySelectorAll<HTMLStyleElement>("style").forEach((styleEl) => {
    const css = styleEl.textContent ?? "";
    const abs = absolutizeCssUrls(css, base);
    if (abs !== css) styleEl.textContent = abs;
  });

  // <meta http-equiv="refresh" content="5; url=/path">
  document
    .querySelectorAll<HTMLMetaElement>('meta[http-equiv="refresh" i][content]')
    .forEach((meta) => {
      const content = meta.getAttribute("content") || "";
      const abs = absolutizeMetaRefresh(content, base);
      if (abs !== content) meta.setAttribute("content", abs);
    });
}

/** Normalize the base to a valid absolute URL root. */
function toBase(domain: string): string {
  // Allow callers to pass "example.com" or "//example.com"
  let d = domain.trim();
  if (!/^[a-zA-Z][a-zA-Z0-9+\-.]*:/.test(d)) {
    d = d.startsWith("//") ? `https:${d}` : `https://${d}`;
  }
  // Ensure trailing slash does not matter for URL resolution
  try {
    // new URL('/', base) works whether base ends with slash or not
    return new URL("/", d).toString();
  } catch {
    // Fallback: if domain is irreparably bad, throw early
    throw new Error(`Invalid base domain: ${domain}`);
  }
}

/** Convert a possibly-relative URL to absolute, using the provided base. */
function toAbsolute(url: string, base: string): string {
  const trimmed = url.trim();

  // Leave already absolute or special schemes untouched by just parsing directly.
  // If it's not a valid absolute URL, resolve against base.
  try {
    // If parse succeeds without base and includes a scheme, keep as-is
    const abs = new URL(trimmed);
    return abs.toString();
  } catch {
    // Not absolute, resolve relative to base (handles #hash, ?q, //host, etc.)
    try {
      return new URL(trimmed, base).toString();
    } catch {
      // If still invalid (e.g., badly formed), return original
      return url;
    }
  }
}

/** Absolutize a srcset list. */
function absolutizeSrcset(srcset: string, base: string): string {
  // Split by commas but keep descriptors (1x, 2x, 100w, etc.)
  // Each candidate: <url> [<descriptor>]
  return srcset
    .split(",")
    .map((part) => {
      const s = part.trim();
      if (!s) return s;
      // First whitespace separates URL and descriptor
      const spaceIdx = s.search(/\s/);
      if (spaceIdx === -1) {
        return toAbsolute(s, base);
      }
      const url = s.slice(0, spaceIdx);
      const desc = s.slice(spaceIdx).trim();
      return `${toAbsolute(url, base)} ${desc}`;
    })
    .join(", ");
}

/** Replace url(...) in CSS text with absolute URLs. */
function absolutizeCssUrls(cssText: string, base: string): string {
  // Matches url("..."), url('...'), url(...)
  return cssText.replace(
    /url\(\s*(['"]?)([^'")]+)\1\s*\)/g,
    (_m, _q, rawUrl) => {
      const abs = toAbsolute(rawUrl, base);
      // Preserve quoting if present; browsers accept unquoted if safe, but keep simple.
      return `url(${abs})`;
    },
  );
}

/** Rewrite the URL in a meta refresh content value if present. */
function absolutizeMetaRefresh(content: string, base: string): string {
  // Format examples:
  // "5; url=/path", "0;URL='page.html'"
  const match = content.match(
    /^\s*([^;]+)\s*;\s*(url|URL)\s*=\s*('?)([^']+)\3\s*$/,
  );
  if (!match) return content;
  const delay = match[1].trim();
  const url = match[4].trim();
  const abs = toAbsolute(url, base);
  return `${delay}; url=${abs}`;
}

export async function webScrape(
  url: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<JSDOM> {
  const u = new URL(url);
  const html = await fetchHtmlWithPlaywright(url, streamResponse);
  const dom = new JSDOM(html);
  absolutizeDomUrls(dom, u.origin);
  return dom;
}

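A standalone sketch of absolutizeDomUrls for context; the module path follows the @lib/webScraper.ts import used in the article route below, and the bare domain gets an https:// prefix via toBase:

import { JSDOM } from "jsdom";
import { absolutizeDomUrls } from "@lib/webScraper.ts";

const dom = new JSDOM('<img src="/logo.png"><a href="about.html">About</a>');
absolutizeDomUrls(dom, "example.com");
// The img src becomes https://example.com/logo.png and the link href
// becomes https://example.com/about.html in the serialized document.
console.log(dom.serialize());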
@@ -1,11 +1,9 @@
import { Handlers } from "$fresh/server.ts";
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
import { DOMParser } from "domparser";
import { Defuddle } from "defuddle/node";
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
import * as openai from "@lib/openai.ts";

import tds from "https://cdn.skypack.dev/turndown@7.2.0";
import { Article } from "@lib/resource/articles.ts";
import { getYoutubeVideoDetails } from "@lib/youtube.ts";
import {
@@ -14,8 +12,8 @@ import {
  toUrlSafeString,
} from "@lib/string.ts";
import { createLogger } from "@lib/log/index.ts";

const parser = new DOMParser();
import { createResource } from "@lib/resources.ts";
import { webScrape } from "@lib/webScraper.ts";

const log = createLogger("api/article");

@@ -29,150 +27,49 @@ async function processCreateArticle(

  streamResponse.enqueue("downloading article");

  const request = await fetch(fetchUrl);
  const html = await request.text();
  const doc = await webScrape(fetchUrl, streamResponse);

  streamResponse.enqueue("download success");

  const document = parser.parseFromString(html, "text/html");

  const title = document?.querySelector("title")?.innerText;

  const images: HTMLImageElement[] = [];
  document?.querySelectorAll("img").forEach((img) => {
    images.push(img as unknown as HTMLImageElement);
  const result = await Defuddle(doc, fetchUrl, {
    markdown: true,
  });

  const metaAuthor =
    document?.querySelector('meta[name="twitter:creator"]')?.getAttribute(
      "content",
    ) ||
    document?.querySelector('meta[name="author"]')?.getAttribute("content");

  const readable = new Readability(document);

  const result = readable.parse();

log.debug("parsed", {
|
||||
log.debug("downloaded and parse parsed", {
|
||||
url: fetchUrl,
|
||||
content: result.textContent,
|
||||
content: result.content,
|
||||
});
|
||||
|
||||
  const cleanDocument = parser.parseFromString(
    result.content,
    "text/html",
  );

  const service = new tds({
    headingStyle: "atx",
    codeBlockStyle: "fenced",
    hr: "---",
    bulletListMarker: "-",
  });

  const url = new URL(fetchUrl);

  function makeUrlAbsolute(src: string) {
    if (src.startsWith("//")) {
      return "https:" + src;
    }

    if (src.startsWith("/")) {
      return `${url.origin}${src.replace(/$\//, "")}`;
    }

    if (!src.startsWith("https://") && !src.startsWith("http://")) {
      return `${url.origin.replace(/\/$/, "")}/${src.replace(/^\//, "")})`;
    }

    return src;
  }

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      if (!src || src.startsWith("data:image")) return "";

      return `![${alt}](${makeUrlAbsolute(src)})`;
    },
  });
service.addRule("fix normal links", {
|
||||
filter: ["a"],
|
||||
replacement: function (content: string, node: HTMLImageElement) {
|
||||
const href = node.getAttribute("href");
|
||||
if (!href) return content;
|
||||
|
||||
if (href.startsWith("/")) {
|
||||
return `[${content}](${url.origin}${href.replace(/$\//, "")})`;
|
||||
} else if (href.startsWith("//")) {
|
||||
return `[${content}](https:${href})`;
|
||||
} else if (href.startsWith("#")) {
|
||||
if (content.length < 2) return "";
|
||||
return `[${content}](${url.href}#${href})`.replace("##", "#");
|
||||
} else {
|
||||
return `[${content}](${url.origin.replace(/\/$/, "")}/${
|
||||
href.replace(/^\//, "")
|
||||
})`;
|
||||
}
|
||||
},
|
||||
});
|
||||
|
||||
  const markdown = service.turndown(cleanDocument);

  streamResponse.enqueue("parsed article, creating tags with openai");

  const [tags, shortTitle, author] = await Promise.all([
    openai.createTags(markdown),
    title && openai.shortenTitle(title),
    metaAuthor || openai.extractAuthorName(markdown),
  ]);
  const aiMeta = await openai.extractArticleMetadata(result.content);

  console.log({ tags, shortTitle, author });
  streamResponse.enqueue("postprocessing article");

  const id = toUrlSafeString(shortTitle || title || "");
  const title = result?.title || aiMeta?.headline || "";
  const id = toUrlSafeString(title);

  const meta: Article["meta"] = {
    author: (author || "").replace("@", "twitter:"),
    link: fetchUrl,
    done: false,
    date: new Date(),
  };

  const largestImage = images.filter((img) => {
    const src = img.getAttribute("src");
    return !!src && !src.startsWith("data:");
  }).sort((a, b) => {
    const aSize = +(a.getAttribute("width") || 0) +
      +(a.getAttribute("height") || 0);
    const bSize = +(b.getAttribute("width") || 0) +
      +(b.getAttribute("height") || 0);
    return aSize > bSize ? -1 : 1;
  })[0];

  const newArticle = {
    type: "article",
    id,
    name: title || "",
    content: markdown,
    tags: tags || [],
    meta,
  const newArticle: Article = {
    _type: "Article",
    headline: title,
    articleBody: result.content,
    url: fetchUrl,
    datePublished: result?.published || aiMeta?.datePublished ||
      new Date().toISOString(),
    image: result?.image,
    author: {
      _type: "Person",
      name: (result.schemaOrgData?.author?.name || aiMeta?.author || "")
        .replace(
          "@",
          "twitter:",
        ),
    },
  } as const;

  if (largestImage) {
    const src = makeUrlAbsolute(largestImage.getAttribute("src") || "");
    if (src) {
      meta.image = src;
    }
  }

  streamResponse.enqueue("writing to disk");

  // await createArticle(newArticle.id, newArticle);
  await createResource(`articles/${id}.md`, newArticle);

  streamResponse.enqueue("id: " + newArticle.id);
  streamResponse.enqueue("id: " + id);
}

async function processCreateYoutubeVideo(

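Condensed, the rewritten processCreateArticle flow above boils down to the following steps (variable names as in the hunk):

// 1. Render the page with Playwright and get a JSDOM with absolute URLs.
const doc = await webScrape(fetchUrl, streamResponse);
// 2. Let Defuddle extract the readable content as markdown.
const result = await Defuddle(doc, fetchUrl, { markdown: true });
// 3. Ask OpenAI for structured metadata (headline, author, datePublished, keywords).
const aiMeta = await openai.extractArticleMetadata(result.content);
// 4. Persist the assembled article through the marka API.
await createResource(`articles/${id}.md`, newArticle);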
@@ -187,34 +84,34 @@ async function processCreateYoutubeVideo(

  streamResponse.enqueue("getting video infos from youtube api");

  const id = extractYoutubeId(fetchUrl);
  const youtubeId = extractYoutubeId(fetchUrl);

  const video = await getYoutubeVideoDetails(id);
  const video = await getYoutubeVideoDetails(youtubeId);

  streamResponse.enqueue("shortening title with openai");
  const newId = await openai.shortenTitle(video.snippet.title);

  const id = newId || youtubeId;

  const newArticle: Article = {
    type: "article",
    name: video.snippet.title,
    id: newId || video.snippet.title,
    content: video.snippet.description,
    tags: video.snippet?.tags?.slice(0, 5) || [],
    meta: {
      done: false,
      link: fetchUrl,
      author: video.snippet.channelTitle,
      date: new Date(video.snippet.publishedAt),
    _type: "Article",
    headline: video.snippet.title,
    articleBody: video.snippet.description,
    url: fetchUrl,
    datePublished: new Date(video.snippet.publishedAt).toISOString(),
    author: {
      _type: "Person",
      name: video.snippet.channelTitle,
    },
  };

  streamResponse.enqueue("creating article");

  // await createArticle(newArticle.id, newArticle);
  await createResource(`articles/${id}.md`, newArticle);

  streamResponse.enqueue("finished");

  streamResponse.enqueue("id: " + newArticle.id);
  streamResponse.enqueue("id: " + id);
}

export const handler: Handlers = {

@@ -1,6 +1,5 @@
import { Handlers } from "$fresh/server.ts";
import { Readability } from "https://cdn.skypack.dev/@mozilla/readability";
import { DOMParser } from "domparser";
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
import * as openai from "@lib/openai.ts";
@@ -10,13 +9,10 @@ import { Recipe } from "@lib/resource/recipes.ts";
import recipeSchema from "@lib/recipeSchema.ts";
import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
import { safeFileName } from "@lib/string.ts";
import { createDocument } from "@lib/documents.ts";
import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
import z from "npm:zod";
import z from "zod";
import { fetchHtmlWithPlaywright } from "@lib/playwright.ts";

const parser = new DOMParser();

const log = createLogger("api/article");

function makeUrlAbsolute(url: URL, src: string) {
@@ -49,7 +45,7 @@ async function extractUsingAI(

  service.addRule("fix image links", {
    filter: ["img"],
    replacement: function (_: string, node: HTMLImageElement) {
    replacement: function(_: string, node: HTMLImageElement) {
      const src = node.getAttribute("src");
      const alt = node.getAttribute("alt") || "";
      if (!src || src.startsWith("data:image")) return "";
@@ -59,7 +55,7 @@ async function extractUsingAI(
  });
  service.addRule("fix normal links", {
    filter: ["a"],
    replacement: function (content: string, node: HTMLImageElement) {
    replacement: function(content: string, node: HTMLImageElement) {
      const href = node.getAttribute("href");
      if (!href) return content;

@@ -73,8 +69,7 @@ async function extractUsingAI(
      }

      if (!href.startsWith("https://") && !href.startsWith("http://")) {
        return `[${content}](${url.origin.replace(/\/$/, "")}/${
          href.replace(/^\//, "")
        return `[${content}](${url.origin.replace(/\/$/, "")}/${href.replace(/^\//, "")
        })`;
      }

@@ -199,8 +194,7 @@ async function processCreateRecipeFromUrl(
      const src = makeUrlAbsolute(url, newRecipe.meta.image);
      if (src?.length > 5) {
        const extension = fileExtension(new URL(src).pathname);
        const finalPath = `Media/articles/images/${
          safeFileName(id)
        const finalPath = `Media/articles/images/${safeFileName(id)
        }_cover.${extension}`;
        streamResponse.enqueue("downloading image");
        try {