feat: allow creating articles with marka

This commit is contained in:
Max Richter
2025-10-31 15:26:34 +01:00
parent dfa3826ec5
commit 7e60327940
12 changed files with 332 additions and 210 deletions

View File

@@ -17,6 +17,8 @@ export const GITEA_REDIRECT_URL = Deno.env.get("GITEA_REDIRECT_URL");
// Raw SESSION_DURATION value from the environment (undefined when unset).
const duration = Deno.env.get("SESSION_DURATION");
// Session lifetime: the env value coerced to a number with unary plus, otherwise
// 60 * 60 * 24 (86400 — presumably seconds, i.e. one day; confirm against the
// session code). NOTE(review): a non-numeric env value yields NaN here.
export const SESSION_DURATION = duration ? +duration : (60 * 60 * 24);
// Value of the MARKA_API_KEY environment variable; undefined when it is not set.
export const MARKA_API_KEY = Deno.env.get("MARKA_API_KEY");
// Value of the JWT_SECRET environment variable; undefined when it is not set.
export const JWT_SECRET = Deno.env.get("JWT_SECRET");
export const DATA_DIR = Deno.env.has("DATA_DIR")

View File

@@ -103,6 +103,12 @@ export function debounce<T extends (...args: Parameters<T>) => void>(
export function parseRating(rating: string | number) {
if (typeof rating === "string") {
try {
return parseInt(rating);
} catch (_e) {
// This is okay
}
return [...rating.matchAll(/⭐/g)].length;
}
return rating;

View File

@@ -4,6 +4,7 @@ import { OPENAI_API_KEY } from "@lib/env.ts";
import { hashString } from "@lib/helpers.ts";
import { createCache } from "@lib/cache.ts";
import { recipeResponseSchema } from "@lib/recipeSchema.ts";
import { articleMetadataSchema } from "./resource/articles.ts";
const openAI = OPENAI_API_KEY && new OpenAI({ apiKey: OPENAI_API_KEY });
@@ -32,8 +33,7 @@ export async function summarize(content: string) {
{
role: "user",
content:
`Please summarize the article in one sentence as short as possible: ${
content.slice(0, 2000)
`Please summarize the article in one sentence as short as possible: ${content.slice(0, 2000)
}`,
},
],
@@ -100,8 +100,7 @@ export async function createGenres(
{
role: "system",
content:
`you create some keywords that can be used in a recommendation system. The keywords are based on a ${type} description or title. If you do not know the title, take into account the description aswell. Create a range of keywords from very specific ones that describe the general vibe. ${
title ? `The name of the ${type} is ${title}` : ""
`you create some keywords that can be used in a recommendation system. The keywords are based on a ${type} description or title. If you do not know the title, take into account the description aswell. Create a range of keywords from very specific ones that describe the general vibe. ${title ? `The name of the ${type} is ${title}` : ""
}. Return a list of around 20 keywords seperated by commas`,
},
{
@@ -165,8 +164,7 @@ export const getMovieRecommendations = async (
${keywords}
The movies should be similar to but not include ${
exclude.join(", ")
The movies should be similar to but not include ${exclude.join(", ")
} or remakes of that.
respond with a plain unordered list each item starting with the year the movie was released and then the title of the movie seperated by a -`,
@@ -229,6 +227,28 @@ export async function extractRecipe(content: string) {
return recipeResponseSchema.parse(completion.choices[0].message.parsed);
}
/**
 * Asks the OpenAI structured-output endpoint to extract article metadata
 * (shaped by `articleMetadataSchema`) from markdown content.
 *
 * @param content Markdown text of the article.
 * @returns The validated metadata object, or `undefined` when no OpenAI client
 *   is configured.
 * @throws If the model response does not satisfy `articleMetadataSchema`
 *   (for example when `parsed` is null).
 */
export async function extractArticleMetadata(content: string) {
  if (!openAI) return;

  const completion = await openAI.beta.chat.completions.parse({
    model: "gpt-4o-2024-08-06",
    temperature: 0.1,
    messages: [
      {
        role: "system",
        // The schema represents a missing value as null (JSON has no
        // `undefined`), so the prompt must ask for null to keep the model from
        // inventing placeholder strings.
        content:
          "Extract the article information from the provided markdown. If the specified data is not available return null for the data values.",
      },
      { role: "user", content },
    ],
    response_format: zodResponseFormat(
      articleMetadataSchema,
      "article-meta-v2",
    ),
  });

  // Re-validate locally so callers always receive a schema-conforming object.
  return articleMetadataSchema.parse(completion.choices[0].message.parsed);
}
export async function transcribe(
mp3Data: Uint8Array,
): Promise<string | undefined> {

View File

@@ -3,13 +3,6 @@ import { createStreamResponse } from "@lib/helpers.ts";
import StealthPlugin from "npm:puppeteer-extra-plugin-stealth";
import * as env from "@lib/env.ts";
const userAgentStrings = [
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.2227.0 Safari/537.36",
"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.3497.92 Safari/537.36",
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36",
];
firefox.use(StealthPlugin());
export async function fetchHtmlWithPlaywright(

View File

@@ -17,27 +17,24 @@ export const IngredientGroupSchema = z.object({
export type IngredientGroup = z.infer<typeof IngredientGroupSchema>;
const recipeSchema = z.object({
name: z.string(),
content: z.object({
_type: z.literal("Recipe"),
name: z.string().describe(
"Title of the Recipe, without the name of the website or author",
),
description: z.string().describe(
"Optional, short description of the recipe",
),
image: z.string().describe("URL of the main image of the recipe"),
author: z.object({
_type: z.literal("Person"),
name: z.string().describe("author of the Recipe (optional)"),
}),
recipeEngredient: z.array(z.string())
.describe("List of ingredients"),
recipeInstructions: z.array(z.string()).describe("List of instructions"),
recipeYield: z.number().describe("Amount of Portions"),
prepTime: z.number().describe("Preparation time in minutes"),
cookTime: z.number().describe("Cooking time in minutes"),
_type: z.literal("Recipe"),
name: z.string().describe(
"Title of the Recipe, without the name of the website or author",
),
description: z.string().describe(
"Optional, short description of the recipe",
),
image: z.string().describe("URL of the main image of the recipe"),
author: z.object({
_type: z.literal("Person"),
name: z.string().describe("author of the Recipe (optional)"),
}),
recipeIngredient: z.array(z.string())
.describe("List of ingredients"),
recipeInstructions: z.array(z.string()).describe("List of instructions"),
recipeYield: z.number().describe("Amount of Portions"),
prepTime: z.number().describe("Preparation time in minutes"),
cookTime: z.number().describe("Cooking time in minutes"),
});
export type Recipe = z.infer<typeof recipeSchema>;

View File

@@ -1,17 +1,30 @@
import { z } from "zod";
export type Article = {
id: string;
type: "article";
content: string;
name: string;
tags: string[];
meta: {
done?: boolean;
date: Date;
link: string;
thumbnail?: string;
average?: string;
image?: string;
author?: string;
rating?: number;
_type: "Article";
headline?: string;
datePublished?: string;
articleBody?: string;
keywords?: string[];
image?: string;
url?: string;
reviewRating?: {
bestRating?: number;
worstRating?: number;
ratingValue?: number;
};
author?: {
_type: "Person";
name?: string;
};
};
/**
 * Zod schema for article metadata: headline, author, publication date and
 * keywords. Every field is a union of `null` and its value type, so a missing
 * value is expressed as `null` rather than by omitting the key. The
 * `.describe(...)` texts are attached to each field as its description.
 */
export const articleMetadataSchema = z.object({
headline: z.union([z.null(), z.string()]).describe("Headline of the article"),
author: z.union([z.null(), z.string()]).describe("Author of the article"),
// Kept as a plain string; no date format is enforced by this schema.
datePublished: z.union([z.null(), z.string()]).describe(
"Date the article was published",
),
keywords: z.union([z.null(), z.array(z.string())]).describe(
"Keywords for the article",
),
});

View File

@@ -1,3 +1,5 @@
import { MARKA_API_KEY } from "./env.ts";
export const resources = {
"home": {
emoji: "House with Garden.png",
@@ -31,13 +33,37 @@ export const resources = {
},
} as const;
const url = `https://marka.max-richter.dev/resources`;
//const url = "http://localhost:8080/resources";
/**
 * Fetches a resource from the marka API and returns its parsed JSON body.
 *
 * Best-effort: any failure (network error or a body that is not valid JSON)
 * yields an empty array instead of throwing. The HTTP status is not checked.
 *
 * @param resource Path of the resource below the API base `url`.
 */
export async function fetchResource(resource: string) {
  try {
    const response = await fetch(`${url}/${resource}`);
    // Await inside the try block so a JSON parse rejection is caught here too.
    return await response.json();
  } catch (_e) {
    return [];
  }
}
/**
 * Creates a resource on the marka API by POSTing `content` to `${url}/${path}`.
 *
 * Plain objects are sent as JSON. Strings and binary data (ArrayBuffer or a
 * typed-array view) are sent as-is, leaving the Content-Type to the runtime.
 *
 * @param path Resource path below the API base `url`.
 * @param content Body to store: JSON-serialisable object, text, or binary data.
 * @returns The parsed JSON response body.
 * @throws Error when the server answers with a non-2xx status.
 */
export async function createResource(
  path: string,
  content: string | object | ArrayBuffer,
) {
  // `typeof` reports "object" for ArrayBuffer too; binary payloads must not be
  // JSON-stringified (that would send "{}" instead of the data).
  const isBinary = content instanceof ArrayBuffer ||
    ArrayBuffer.isView(content);
  const isJson = typeof content === "object" && !isBinary;
  const fetchUrl = `${url}/${path}`;
  console.log("Creating resource", { fetchUrl, content, isJson });

  // Only send headers that carry a real value: no empty Content-Type and no
  // "undefined" credential when MARKA_API_KEY is not configured.
  const headers: Record<string, string> = {};
  if (isJson) headers["Content-Type"] = "application/json";
  if (MARKA_API_KEY) headers["Authentication"] = MARKA_API_KEY;

  const response = await fetch(fetchUrl, {
    method: "POST",
    headers,
    body: isJson ? JSON.stringify(content) : (content as BodyInit),
  });
  if (!response.ok) {
    throw new Error(`Failed to create resource: ${response.status}`);
  }
  return response.json();
}

View File

@@ -67,7 +67,7 @@ export async function searchResource(
for (const resource of resources) {
if (
!(resource.id in results) &&
!(resource.name in results) &&
tags?.length && resource.tags.length &&
tags.every((t) => resource.tags.includes(t))
) {

View File

@@ -1,2 +1,174 @@
export function webScrape(url: URL) {
import { JSDOM } from "jsdom";
import { fetchHtmlWithPlaywright } from "./playwright.ts";
import { createStreamResponse } from "./helpers.ts";
/**
 * Rewrites, in place, every URL-carrying piece of the given JSDOM document so
 * that it holds an absolute URL resolved against `domain`
 * (e.g. "https://example.com"). Covers link/media attributes, `srcset`, inline
 * `style` attributes, `<style>` blocks and `<meta http-equiv="refresh">`.
 */
export function absolutizeDomUrls(dom: JSDOM, domain: string): void {
  const doc = dom.window.document;
  const root = toBase(domain);

  // Applies `fix` to one attribute of every matching element and writes the
  // result back only when it actually changed.
  const patchAttr = (
    selector: string,
    attr: string,
    fix: (value: string, base: string) => string,
  ) => {
    doc.querySelectorAll<HTMLElement>(selector).forEach((node) => {
      const current = node.getAttribute(attr);
      if (!current) return;
      const next = fix(current, root);
      if (next !== current) node.setAttribute(attr, next);
    });
  };

  // Attributes that hold exactly one URL: [selector, attribute name].
  const singleUrlTargets: ReadonlyArray<readonly [string, string]> = [
    ["a[href]", "href"],
    ["area[href]", "href"],
    ["link[href]", "href"],
    ["use[href]", "href"], // SVG 2
    ["use[xlink\\:href]", "xlink:href"], // legacy SVG
    ["image[href]", "href"], // SVG
    ["image[xlink\\:href]", "xlink:href"], // legacy SVG
    ["script[src]", "src"],
    ["img[src]", "src"],
    ["source[src]", "src"],
    ["track[src]", "src"],
    ["iframe[src]", "src"],
    ["embed[src]", "src"],
    ["audio[src]", "src"],
    ["video[src]", "src"],
    ["object[data]", "data"],
    ["input[src]", "src"],
    ["form[action]", "action"],
    ["video[poster]", "poster"],
  ];
  singleUrlTargets.forEach(([selector, attr]) => {
    patchAttr(selector, attr, toAbsolute);
  });

  // Responsive image candidates (img, source)
  patchAttr("img[srcset], source[srcset]", "srcset", absolutizeSrcset);

  // url(...) references inside inline style attributes
  patchAttr("[style]", "style", absolutizeCssUrls);

  // url(...) references inside <style> blocks
  doc.querySelectorAll<HTMLStyleElement>("style").forEach((sheet) => {
    const before = sheet.textContent ?? "";
    const after = absolutizeCssUrls(before, root);
    if (after !== before) sheet.textContent = after;
  });

  // <meta http-equiv="refresh" content="5; url=/path">
  patchAttr(
    'meta[http-equiv="refresh" i][content]',
    "content",
    absolutizeMetaRefresh,
  );
}
/**
 * Normalizes a domain-like string to an absolute root URL ending in "/".
 *
 * Accepts full URLs ("https://example.com/a"), protocol-relative hosts
 * ("//example.com"), bare hosts ("example.com") and bare host:port pairs
 * ("example.com:8080"). Inputs without a scheme default to https. Any path,
 * query or fragment is discarded.
 *
 * @throws Error when the input cannot be turned into a valid URL.
 */
function toBase(domain: string): string {
  let d = domain.trim();
  const looksLikeScheme = /^[a-zA-Z][a-zA-Z0-9+\-.]*:/.test(d);
  // "example.com:8080" also matches the scheme pattern ("example.com:" would
  // be the scheme); a colon followed directly by a port number is a host.
  const isHostWithPort = /^[^\s/:@]+:\d+(?:[/?#]|$)/.test(d);
  if (!looksLikeScheme || isHostWithPort) {
    d = d.startsWith("//") ? `https:${d}` : `https://${d}`;
  }
  try {
    // new URL('/', base) works whether base ends with slash or not
    return new URL("/", d).toString();
  } catch {
    // If domain is irreparably bad, fail early with a descriptive error
    throw new Error(`Invalid base domain: ${domain}`);
  }
}
/**
 * Resolves `url` to an absolute URL string.
 *
 * The trimmed input is first parsed on its own (already-absolute URLs and
 * special schemes pass through, serialized by the URL parser). If that fails
 * it is resolved against `base` (covers paths, #hash, ?query, //host). If it
 * still cannot be parsed, the original, untrimmed input is returned.
 */
function toAbsolute(url: string, base: string): string {
  const candidate = url.trim();

  // Returns the serialized URL, or undefined when parsing fails.
  const attempt = (against?: string): string | undefined => {
    try {
      return new URL(candidate, against).toString();
    } catch {
      return undefined;
    }
  };

  return attempt() ?? attempt(base) ?? url;
}
/**
 * Makes every image candidate of a `srcset` value absolute.
 *
 * The value is split on commas; each candidate is `<url> [<descriptor>]`
 * (descriptor such as 1x, 2x, 100w), and only the URL part is rewritten.
 * Candidates are re-joined with ", ".
 */
function absolutizeSrcset(srcset: string, base: string): string {
  const rewritten: string[] = [];

  for (const piece of srcset.split(",")) {
    const candidate = piece.trim();

    // Empty entries are kept as-is.
    if (candidate === "") {
      rewritten.push(candidate);
      continue;
    }

    // The first whitespace character separates the URL from its descriptor.
    const gap = candidate.search(/\s/);
    if (gap === -1) {
      rewritten.push(toAbsolute(candidate, base));
      continue;
    }

    const target = candidate.slice(0, gap);
    const descriptor = candidate.slice(gap).trim();
    rewritten.push(`${toAbsolute(target, base)} ${descriptor}`);
  }

  return rewritten.join(", ");
}
/**
 * Replaces the target of every `url(...)` in a CSS text with its absolute form.
 *
 * Handles url("..."), url('...') and url(...), with the function name matched
 * case-insensitively. The original quoting is kept: an unquoted url() must not
 * contain characters such as "(" or whitespace, so dropping the quotes could
 * produce invalid CSS.
 */
function absolutizeCssUrls(cssText: string, base: string): string {
  return cssText.replace(
    /url\(\s*(['"]?)([^'")]+)\1\s*\)/gi,
    (_m: string, quote: string, rawUrl: string) => {
      const abs = toAbsolute(rawUrl, base);
      return `url(${quote}${abs}${quote})`;
    },
  );
}
/**
 * Rewrites the URL inside a `<meta http-equiv="refresh">` content value.
 *
 * Accepted forms: "5; url=/path", "0;URL='page.html'", '0; Url="page.html"'.
 * The `url` keyword is matched case-insensitively and the target may be
 * wrapped in single or double quotes. Content that does not match this shape
 * (e.g. a bare delay) is returned unchanged.
 *
 * @returns `"<delay>; url=<absolute url>"`, or the original content.
 */
function absolutizeMetaRefresh(content: string, base: string): string {
  const match = content.match(
    /^\s*([^;]+)\s*;\s*url\s*=\s*(['"]?)([^'"]+)\2\s*$/i,
  );
  if (!match) return content;

  const delay = match[1].trim();
  // Group 3 is the target without its surrounding quotes.
  const url = match[3].trim();
  const abs = toAbsolute(url, base);
  return `${delay}; url=${abs}`;
}
/**
 * Loads a page through `fetchHtmlWithPlaywright`, parses the returned HTML
 * into a JSDOM document and rewrites its URLs to absolute ones.
 *
 * NOTE(review): URLs are resolved against the page's origin, so a
 * path-relative reference such as "img/a.png" resolves from the site root
 * rather than from the page's own directory — confirm this is intended.
 *
 * @param url Absolute URL of the page to scrape.
 * @param streamResponse Stream handle forwarded to `fetchHtmlWithPlaywright`.
 * @returns The parsed DOM with absolutized URLs.
 * @throws TypeError when `url` is not a valid absolute URL.
 */
export async function webScrape(
  url: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<JSDOM> {
  const u = new URL(url);
  const html = await fetchHtmlWithPlaywright(url, streamResponse);
  const dom = new JSDOM(html);
  absolutizeDomUrls(dom, u.origin);
  return dom;
}