feat: fall back to Unsplash cover when article contains no image

This commit is contained in:
Max Richter
2025-11-09 23:52:53 +01:00
parent 6c6b69a46a
commit 655fc648e6
27 changed files with 687 additions and 224 deletions

View File

@@ -7,6 +7,7 @@ export const PROXY_PASSWORD = Deno.env.get("PROXY_PASSWORD");
// Credentials and endpoints for external services, read from the process
// environment once, when this module is evaluated. Unless noted otherwise a
// constant is `undefined` when its variable is not set, so consumers have to
// check before use.
export const TMDB_API_KEY = Deno.env.get("TMDB_API_KEY");
export const OPENAI_API_KEY = Deno.env.get("OPENAI_API_KEY");
export const YOUTUBE_API_KEY = Deno.env.get("YOUTUBE_API_KEY");
// Key used for requests against the Unsplash API.
export const UNSPLASH_API_KEY = Deno.env.get("UNSPLASH_API_KEY");
// NOTE(review): the trailing `!` only narrows the static type to `string`.
// Nothing here verifies the variable at runtime, so the value is still
// `undefined` when TELEGRAM_API_KEY is missing from the environment.
export const TELEGRAM_API_KEY = Deno.env.get("TELEGRAM_API_KEY")!;
// Presumably the address of the Gitea instance to talk to — confirm the
// expected format (with or without scheme) against the consumer.
export const GITEA_SERVER = Deno.env.get("GITEA_SERVER");

View File

@@ -31,19 +31,54 @@ export const fixRenderedMarkdown = (content: string) => {
});
};
export async function fetchStream(url: string, cb: (chunk: string) => void) {
const response = await fetch(url);
const reader = response?.body?.getReader();
if (reader) {
while (true) {
const { done, value } = await reader.read();
if (done) return;
const data = new TextDecoder().decode(value);
data
.split("$")
.filter((d) => d && d.length)
.map((d) => cb(Array.isArray(d) ? d[0] : d));
}
/**
 * One message of a newline-delimited JSON stream as consumed by `fetchStream`.
 *
 * - `info` / `warning` / `error` carry a human-readable `message`.
 * - `finished` carries a `url` instead of a message.
 *
 * `type` is the discriminant: narrow on it before reading `message` or `url`.
 */
type StreamMessage =
  | { type: "info" | "error" | "warning"; message: string }
  | { type: "finished"; url: string };
/**
 * Fetches `url` and hands every newline-delimited JSON message of the
 * response body to `cb` as soon as its line is complete.
 *
 * The body is decoded as text and re-chunked into lines: a line may span
 * several network chunks, lines are trimmed (so `\r\n` works too), blank lines
 * are dropped, and a final line without a trailing newline is still delivered.
 * Each line is parsed with `JSON.parse` and passed on as-is; the parsed value
 * is not validated against `StreamMessage`.
 *
 * @param url  Endpoint that answers with newline-delimited JSON.
 * @param cb   Called once per parsed line, in arrival order.
 * @param init Optional options forwarded to `fetch`.
 * @returns A promise that resolves once the body is exhausted, or right away
 *          when the response has no body.
 * @throws Whatever `fetch`, reading the body, `JSON.parse` (a `SyntaxError`
 *         for a malformed line) or `cb` throws. The stream is cancelled
 *         before the error is rethrown.
 */
export async function fetchStream(
  url: string,
  cb: (chunk: StreamMessage) => void,
  init?: RequestInit,
) {
  const res = await fetch(url, init);
  if (!res.body) return;

  // Text received so far that has not been terminated by "\n" yet.
  let pending = "";
  const lineSplitter = new TransformStream<string, string>({
    transform(chunk, controller) {
      pending += chunk;
      let newlineAt: number;
      while ((newlineAt = pending.indexOf("\n")) >= 0) {
        const line = pending.slice(0, newlineAt).trim();
        pending = pending.slice(newlineAt + 1);
        if (line) controller.enqueue(line);
      }
    },
    flush(controller) {
      // The last message may arrive without a trailing newline.
      const line = pending.trim();
      if (line) controller.enqueue(line);
    },
  });

  const reader = res.body
    .pipeThrough(new TextDecoderStream())
    .pipeThrough(lineSplitter)
    .getReader();

  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      cb(JSON.parse(value));
    }
  } catch (err) {
    // Without this the response body stays locked and keeps downloading after
    // a malformed line or a throwing callback. Cancelling propagates back
    // through the pipe chain to the network body. A failure of the cancel
    // itself must not mask the original error.
    await reader.cancel(err).catch(() => {});
    throw err;
  }
}
@@ -58,32 +93,53 @@ export function hashString(message: string) {
}
export const createStreamResponse = () => {
let controller: ReadableStreamController<ArrayBufferView>;
const body = new ReadableStream({
start(cont) {
controller = cont;
const encoder = new TextEncoder();
let controller: ReadableStreamDefaultController<Uint8Array>;
const body = new ReadableStream<Uint8Array>({
start(c) {
controller = c;
},
});
const response = new Response(body, {
headers: {
"content-type": "text/plain",
// newline-delimited JSON
"content-type": "application/x-ndjson; charset=utf-8",
// prevent intermediaries from buffering/transforming
"cache-control": "no-cache, no-transform",
"x-content-type-options": "nosniff",
// nginx hint to disable proxy buffering
"x-accel-buffering": "no",
// if you control compression, keep it off for streams
// "content-encoding": "identity",
},
});
function cancel() {
controller.close();
const send = (obj: unknown) => {
controller.enqueue(encoder.encode(JSON.stringify(obj) + "\n")); // ← delimiter
};
const cancel = () => controller.close();
function info(message: string) {
return send({ type: "info", message });
}
function enqueue(chunk: string) {
controller?.enqueue(new TextEncoder().encode("$" + chunk));
function error(message: string) {
return send({ type: "error", message });
}
function warning(message: string) {
return send({ type: "warning", message });
}
return {
response,
cancel,
enqueue,
send,
info,
error,
warning,
};
};

View File

@@ -38,13 +38,13 @@ export function createLogger(scope: string, _options?: LoggerOptions): Logger {
export function loggerFromStream(stream: StreamResponse) {
return {
debug: (...data: unknown[]) =>
stream.enqueue(`${data.length > 1 ? data.join(" ") : data[0]}`),
stream.info(`${data.length > 1 ? data.join(" ") : data[0]}`),
info: (...data: unknown[]) =>
stream.enqueue(`${data.length > 1 ? data.join(" ") : data[0]}`),
stream.info(`${data.length > 1 ? data.join(" ") : data[0]}`),
error: (...data: unknown[]) =>
stream.enqueue(`[ERROR]: ${data.length > 1 ? data.join(" ") : data[0]}`),
stream.error(`[ERROR]: ${data.length > 1 ? data.join(" ") : data[0]}`),
warn: (...data: unknown[]) =>
stream.enqueue(`[WARN]: ${data.length > 1 ? data.join(" ") : data[0]}`),
stream.warning(`[WARN]: ${data.length > 1 ? data.join(" ") : data[0]}`),
};
}

View File

@@ -106,8 +106,11 @@ export async function createResource(
body: isJson ? JSON.stringify(content) : content,
});
if (!response.ok) {
const text = await response.text();
throw new Error(
`Failed to create resource (resources/${path}) : ${response.status}`,
`failed to create resource (resources/${path}): ${
text || response.status
}`,
);
}
return response.json();

View File

@@ -195,6 +195,23 @@ respond with a plain unordered list each item starting with the year the movie w
return recommendations;
};
/**
 * Asks the chat model for an Unsplash search term that fits an article.
 *
 * Only the first 10 000 characters of `content` are sent to the model.
 *
 * @param content Article text the search term should describe.
 * @returns The model's answer, trimmed and lower-cased. `undefined` when
 *          `openAI` is not available, when the completion contains no choice,
 *          or when the answer is empty.
 */
export async function createUnsplashSearchTerm(content: string) {
  if (!openAI) return;

  const chatCompletion = await openAI.chat.completions.create({
    model: model,
    messages: [
      {
        role: "system",
        content:
          "Please respond with a search term for unsplash for the following article",
      },
      { role: "user", content: content.slice(0, 10_000) },
    ],
  });

  // `choices` can be empty and `message.content` can be null; neither case
  // should crash the caller, which only needs "term or nothing".
  const answer = chatCompletion.choices[0]?.message?.content;

  // Models tend to pad answers with whitespace or newlines; strip them so the
  // term is usable as a query, and report a blank answer as "no term".
  const term = answer?.trim().toLowerCase();
  return term || undefined;
}
export async function createTags(content: string) {
if (!openAI) return;
const chatCompletion = await openAI.chat.completions.create({

View File

@@ -9,7 +9,7 @@ export async function fetchHtmlWithPlaywright(
fetchUrl: string,
streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string> {
streamResponse.enqueue("booting up playwright");
streamResponse.info("booting up playwright");
const config: Parameters<typeof firefox.launch>[0] = {};
if (env.PROXY_SERVER) {
@@ -24,7 +24,7 @@ export async function fetchHtmlWithPlaywright(
// Launch the Playwright browser
const browser = await firefox.launch(config);
streamResponse.enqueue("fetching html");
streamResponse.info("fetching html");
try {
// Open a new browser context and page
@@ -42,7 +42,7 @@ export async function fetchHtmlWithPlaywright(
return html;
} catch (error) {
streamResponse.enqueue("error fetching html");
streamResponse.error("error fetching html");
console.error(error);
return "";
} finally {

29
lib/unsplash.ts Normal file
View File

@@ -0,0 +1,29 @@
import { UNSPLASH_API_KEY } from "./env.ts";
const API_URL = "https://api.unsplash.com";

/**
 * Searches Unsplash for a single landscape photo matching a search term.
 *
 * @param searchTerm Free-text query sent to the `/search/photos` endpoint.
 * @returns The `urls.regular` address of the first hit, or `undefined` when
 *          the term is blank, nothing was found, or the payload does not have
 *          the expected shape.
 * @throws Error when `UNSPLASH_API_KEY` is not set, or when the API answers
 *         with a non-2xx status (the message includes the status code).
 */
export async function getImageBySearchTerm(
  searchTerm: string,
): Promise<string | undefined> {
  if (!UNSPLASH_API_KEY) {
    throw new Error("UNSPLASH_API_KEY is not set");
  }

  // A blank query cannot match anything; do not spend a request on it.
  if (!searchTerm.trim()) return undefined;

  const url = new URL("/search/photos", API_URL);
  url.searchParams.append("query", searchTerm);
  url.searchParams.append("per_page", "1");
  url.searchParams.append("orientation", "landscape");

  const response = await fetch(url.toString(), {
    headers: {
      Authorization: `Client-ID ${UNSPLASH_API_KEY}`,
    },
  });

  if (!response.ok) {
    // Release the unread body so the connection is not left dangling.
    await response.body?.cancel().catch(() => {});
    // `statusText` is empty on HTTP/2, so always include the numeric status.
    const status = [response.status, response.statusText]
      .filter(Boolean)
      .join(" ");
    throw new Error(`Unsplash API request failed: ${status}`);
  }

  // Only the fields read below are described; everything is optional because
  // the payload comes from the network and is not validated.
  type SearchPayload = {
    results?: Array<{ urls?: { regular?: unknown } } | null>;
  };
  const data = (await response.json()) as SearchPayload | null;

  const candidate = data?.results?.[0]?.urls?.regular;
  return typeof candidate === "string" ? candidate : undefined;
}

View File

@@ -1,6 +1,8 @@
import { JSDOM } from "jsdom";
import { fetchHtmlWithPlaywright } from "./playwright.ts";
import { createStreamResponse } from "./helpers.ts";
import { Defuddle } from "defuddle/node";
import TurndownService from "turndown";
/**
* Mutates the given JSDOM instance: rewrites all relevant URL-bearing attributes
@@ -164,6 +166,8 @@ function absolutizeMetaRefresh(content: string, base: string): string {
return `${delay}; url=${abs}`;
}
const turndownService = new TurndownService();
export async function webScrape(
url: string,
streamResponse: ReturnType<typeof createStreamResponse>,
@@ -172,5 +176,12 @@ export async function webScrape(
const html = await fetchHtmlWithPlaywright(url, streamResponse);
const dom = new JSDOM(html);
absolutizeDomUrls(dom, u.origin);
return dom;
const result = await Defuddle(dom, url);
return {
...result,
dom,
markdown: turndownService.turndown(result.content),
};
}