From 7e603279408233a54ed0747de5c2bf1e9606b306 Mon Sep 17 00:00:00 2001 From: Max Richter Date: Fri, 31 Oct 2025 15:26:34 +0100 Subject: [PATCH] feat: allow creating articles with marka --- deno.json | 4 +- lib/env.ts | 2 + lib/helpers.ts | 6 + lib/openai.ts | 32 ++++- lib/playwright.ts | 7 -- lib/recipeSchema.ts | 37 +++--- lib/resource/articles.ts | 41 +++--- lib/resources.ts | 28 ++++- lib/search.ts | 2 +- lib/webScraper.ts | 174 ++++++++++++++++++++++++- routes/api/articles/create/index.ts | 189 +++++++--------------------- routes/api/recipes/create/index.ts | 20 ++- 12 files changed, 332 insertions(+), 210 deletions(-) diff --git a/deno.json b/deno.json index f1bff04..d546198 100644 --- a/deno.json +++ b/deno.json @@ -26,9 +26,11 @@ "@preact/signals-core": "https://esm.sh/*@preact/signals-core@1.5.1", "@std/http": "jsr:@std/http@^1.0.12", "@std/yaml": "jsr:@std/yaml@^1.0.5", + "defuddle": "npm:defuddle@^0.6.6", "drizzle-kit": "npm:drizzle-kit@^0.30.1", "drizzle-orm": "npm:drizzle-orm@^0.38.3", "fuzzysort": "npm:fuzzysort@^3.1.0", + "jsdom": "npm:jsdom@^24.1.3", "playwright": "npm:playwright@^1.49.1", "playwright-extra": "npm:playwright-extra@^4.3.6", "preact": "https://esm.sh/preact@10.22.0", @@ -43,7 +45,7 @@ "tsx": "npm:tsx@^4.19.2", "yaml": "https://deno.land/std@0.197.0/yaml/mod.ts", "zod": "npm:zod@^3.24.1", - "domparser": "https://deno.land/x/deno_dom@v0.1.48/deno-dom-wasm.ts", + "domparser": "https://deno.land/x/deno_dom@v0.1.56/deno-dom-wasm.ts", "fs": "https://deno.land/std/fs/mod.ts", "imagemagick": "https://deno.land/x/imagemagick_deno@0.0.31/mod.ts" }, diff --git a/lib/env.ts b/lib/env.ts index 0ec51fe..0886bd8 100644 --- a/lib/env.ts +++ b/lib/env.ts @@ -17,6 +17,8 @@ export const GITEA_REDIRECT_URL = Deno.env.get("GITEA_REDIRECT_URL"); const duration = Deno.env.get("SESSION_DURATION"); export const SESSION_DURATION = duration ? +duration : (60 * 60 * 24); +export const MARKA_API_KEY = Deno.env.get("MARKA_API_KEY"); + export const JWT_SECRET = Deno.env.get("JWT_SECRET"); export const DATA_DIR = Deno.env.has("DATA_DIR") diff --git a/lib/helpers.ts b/lib/helpers.ts index 5d351dc..b57c695 100644 --- a/lib/helpers.ts +++ b/lib/helpers.ts @@ -103,6 +103,12 @@ export function debounce) => void>( export function parseRating(rating: string | number) { if (typeof rating === "string") { + try { + return parseInt(rating); + } catch (_e) { + // This is okay + } + return [...rating.matchAll(/⭐/g)].length; } return rating; diff --git a/lib/openai.ts b/lib/openai.ts index dcb6052..0787adb 100644 --- a/lib/openai.ts +++ b/lib/openai.ts @@ -4,6 +4,7 @@ import { OPENAI_API_KEY } from "@lib/env.ts"; import { hashString } from "@lib/helpers.ts"; import { createCache } from "@lib/cache.ts"; import { recipeResponseSchema } from "@lib/recipeSchema.ts"; +import { articleMetadataSchema } from "./resource/articles.ts"; const openAI = OPENAI_API_KEY && new OpenAI({ apiKey: OPENAI_API_KEY }); @@ -32,8 +33,7 @@ export async function summarize(content: string) { { role: "user", content: - `Please summarize the article in one sentence as short as possible: ${ - content.slice(0, 2000) + `Please summarize the article in one sentence as short as possible: ${content.slice(0, 2000) }`, }, ], @@ -100,8 +100,7 @@ export async function createGenres( { role: "system", content: - `you create some keywords that can be used in a recommendation system. The keywords are based on a ${type} description or title. If you do not know the title, take into account the description aswell. Create a range of keywords from very specific ones that describe the general vibe. ${ - title ? `The name of the ${type} is ${title}` : "" + `you create some keywords that can be used in a recommendation system. The keywords are based on a ${type} description or title. If you do not know the title, take into account the description aswell. Create a range of keywords from very specific ones that describe the general vibe. ${title ? `The name of the ${type} is ${title}` : "" }. Return a list of around 20 keywords seperated by commas`, }, { @@ -165,8 +164,7 @@ export const getMovieRecommendations = async ( ${keywords} -The movies should be similar to but not include ${ - exclude.join(", ") +The movies should be similar to but not include ${exclude.join(", ") } or remakes of that. respond with a plain unordered list each item starting with the year the movie was released and then the title of the movie seperated by a -`, @@ -229,6 +227,28 @@ export async function extractRecipe(content: string) { return recipeResponseSchema.parse(completion.choices[0].message.parsed); } +export async function extractArticleMetadata(content: string) { + if (!openAI) return; + const completion = await openAI.beta.chat.completions.parse({ + model: "gpt-4o-2024-08-06", + temperature: 0.1, + messages: [ + { + role: "system", + content: + "Extract the article information from the provided markdown. If the specified data is not available return undefined for the data values.", + }, + { role: "user", content }, + ], + response_format: zodResponseFormat( + articleMetadataSchema, + "article-meta-v2", + ), + }); + + return articleMetadataSchema.parse(completion.choices[0].message.parsed); +} + export async function transcribe( mp3Data: Uint8Array, ): Promise { diff --git a/lib/playwright.ts b/lib/playwright.ts index df4dcdc..a6de683 100644 --- a/lib/playwright.ts +++ b/lib/playwright.ts @@ -3,13 +3,6 @@ import { createStreamResponse } from "@lib/helpers.ts"; import StealthPlugin from "npm:puppeteer-extra-plugin-stealth"; import * as env from "@lib/env.ts"; -const userAgentStrings = [ - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.2227.0 Safari/537.36", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.3497.92 Safari/537.36", - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/110.0.0.0 Safari/537.36", -]; - firefox.use(StealthPlugin()); export async function fetchHtmlWithPlaywright( diff --git a/lib/recipeSchema.ts b/lib/recipeSchema.ts index a1636ad..a9830d4 100644 --- a/lib/recipeSchema.ts +++ b/lib/recipeSchema.ts @@ -17,27 +17,24 @@ export const IngredientGroupSchema = z.object({ export type IngredientGroup = z.infer; const recipeSchema = z.object({ - name: z.string(), - content: z.object({ - _type: z.literal("Recipe"), - name: z.string().describe( - "Title of the Recipe, without the name of the website or author", - ), - description: z.string().describe( - "Optional, short description of the recipe", - ), - image: z.string().describe("URL of the main image of the recipe"), - author: z.object({ - _type: z.literal("Person"), - name: z.string().describe("author of the Recipe (optional)"), - }), - recipeEngredient: z.array(z.string()) - .describe("List of ingredients"), - recipeInstructions: z.array(z.string()).describe("List of instructions"), - recipeYield: z.number().describe("Amount of Portions"), - prepTime: z.number().describe("Preparation time in minutes"), - cookTime: z.number().describe("Cooking time in minutes"), + _type: z.literal("Recipe"), + name: z.string().describe( + "Title of the Recipe, without the name of the website or author", + ), + description: z.string().describe( + "Optional, short description of the recipe", + ), + image: z.string().describe("URL of the main image of the recipe"), + author: z.object({ + _type: z.literal("Person"), + name: z.string().describe("author of the Recipe (optional)"), }), + recipeIngredient: z.array(z.string()) + .describe("List of ingredients"), + recipeInstructions: z.array(z.string()).describe("List of instructions"), + recipeYield: z.number().describe("Amount of Portions"), + prepTime: z.number().describe("Preparation time in minutes"), + cookTime: z.number().describe("Cooking time in minutes"), }); export type Recipe = z.infer; diff --git a/lib/resource/articles.ts b/lib/resource/articles.ts index 038a210..a55d47c 100644 --- a/lib/resource/articles.ts +++ b/lib/resource/articles.ts @@ -1,17 +1,30 @@ +import { z } from "zod"; export type Article = { - id: string; - type: "article"; - content: string; - name: string; - tags: string[]; - meta: { - done?: boolean; - date: Date; - link: string; - thumbnail?: string; - average?: string; - image?: string; - author?: string; - rating?: number; + _type: "Article"; + headline?: string; + datePublished?: string; + articleBody?: string; + keywords?: string[]; + image?: string; + url?: string; + reviewRating?: { + bestRating?: number; + worstRating?: number; + ratingValue?: number; + }; + author?: { + _type: "Person"; + name?: string; }; }; + +export const articleMetadataSchema = z.object({ + headline: z.union([z.null(), z.string()]).describe("Headline of the article"), + author: z.union([z.null(), z.string()]).describe("Author of the article"), + datePublished: z.union([z.null(), z.string()]).describe( + "Date the article was published", + ), + keywords: z.union([z.null(), z.array(z.string())]).describe( + "Keywords for the article", + ), +}); diff --git a/lib/resources.ts b/lib/resources.ts index c532332..dab006a 100644 --- a/lib/resources.ts +++ b/lib/resources.ts @@ -1,3 +1,5 @@ +import { MARKA_API_KEY } from "./env.ts"; + export const resources = { "home": { emoji: "House with Garden.png", @@ -31,13 +33,37 @@ export const resources = { }, } as const; +const url = `https://marka.max-richter.dev/resources`; +//const url = "http://localhost:8080/resources"; + export async function fetchResource(resource: string) { try { const response = await fetch( - `https://marka.max-richter.dev/resources/${resource}`, + `${url}/${resource}`, ); return response.json(); } catch (_e) { return []; } } + +export async function createResource( + path: string, + content: string | object | ArrayBuffer, +) { + const isJson = typeof content === "object"; + const fetchUrl = `${url}/${path}`; + console.log("Creating resource", { fetchUrl, content, isJson }); + const response = await fetch(fetchUrl, { + method: "POST", + headers: { + "Content-Type": isJson ? "application/json" : "", + "Authentication": MARKA_API_KEY, + }, + body: isJson ? JSON.stringify(content) : content, + }); + if (!response.ok) { + throw new Error(`Failed to create resource: ${response.status}`); + } + return response.json(); +} diff --git a/lib/search.ts b/lib/search.ts index d9b87c5..44e4264 100644 --- a/lib/search.ts +++ b/lib/search.ts @@ -67,7 +67,7 @@ export async function searchResource( for (const resource of resources) { if ( - !(resource.id in results) && + !(resource.name in results) && tags?.length && resource.tags.length && tags.every((t) => resource.tags.includes(t)) ) { diff --git a/lib/webScraper.ts b/lib/webScraper.ts index a811e82..1ed6e45 100644 --- a/lib/webScraper.ts +++ b/lib/webScraper.ts @@ -1,2 +1,174 @@ -export function webScrape(url: URL) { +import { JSDOM } from "jsdom"; +import { fetchHtmlWithPlaywright } from "./playwright.ts"; +import { createStreamResponse } from "./helpers.ts"; + +/** + * Mutates the given JSDOM instance: rewrites all relevant URL-bearing attributes + * to absolute URLs, resolving against the provided domain (e.g., "https://example.com"). + */ +export function absolutizeDomUrls(dom: JSDOM, domain: string): void { + const { document } = dom.window; + const base = toBase(domain); + + const rewrite = (selector: string, attr: string) => { + document.querySelectorAll(selector).forEach((el) => { + const v = el.getAttribute(attr); + if (!v) return; + const abs = toAbsolute(v, base); + if (abs !== v) el.setAttribute(attr, abs); + }); + }; + + // Common URL attributes + rewrite("a[href]", "href"); + rewrite("area[href]", "href"); + rewrite("link[href]", "href"); + rewrite("use[href]", "href"); // SVG 2 + rewrite("use[xlink\\:href]", "xlink:href"); // legacy SVG + rewrite("image[href]", "href"); // SVG + rewrite("image[xlink\\:href]", "xlink:href"); // legacy SVG + + rewrite("script[src]", "src"); + rewrite("img[src]", "src"); + rewrite("source[src]", "src"); + rewrite("track[src]", "src"); + rewrite("iframe[src]", "src"); + rewrite("embed[src]", "src"); + rewrite("audio[src]", "src"); + rewrite("video[src]", "src"); + rewrite("object[data]", "data"); + rewrite("input[src]", "src"); + rewrite("form[action]", "action"); + rewrite("video[poster]", "poster"); + + // srcset (img, source) + document + .querySelectorAll("img[srcset], source[srcset]") + .forEach((el) => { + const v = el.getAttribute("srcset"); + if (!v) return; + const abs = absolutizeSrcset(v, base); + if (abs !== v) el.setAttribute("srcset", abs); + }); + + // Inline CSS in style attributes: url(...) + document.querySelectorAll("[style]").forEach((el) => { + const v = el.getAttribute("style"); + if (!v) return; + const abs = absolutizeCssUrls(v, base); + if (abs !== v) el.setAttribute("style", abs); + }); + + //