feat: fall back to unsplash cover when article contains no image

This commit is contained in:
Max Richter
2025-11-09 23:52:53 +01:00
parent 6c6b69a46a
commit 655fc648e6
27 changed files with 687 additions and 224 deletions

View File

@@ -3,6 +3,7 @@ import { Defuddle } from "defuddle/node";
import { AccessDeniedError, BadRequestError } from "@lib/errors.ts";
import { createStreamResponse, isValidUrl } from "@lib/helpers.ts";
import * as openai from "@lib/openai.ts";
import * as unsplash from "@lib/unsplash.ts";
import { getYoutubeVideoDetails } from "@lib/youtube.ts";
import {
extractYoutubeId,
@@ -19,6 +20,35 @@ import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts"
// Module-level logger created with the "api/article" scope label.
const log = createLogger("api/article");
/**
 * Looks up a fallback cover image on Unsplash for the given article text.
 *
 * A search term is first requested from `openai.createUnsplashSearchTerm`;
 * when one comes back it is passed to `unsplash.getImageBySearchTerm`.
 * Progress is reported through `streamResponse.info`.
 *
 * @param content Article text handed to the search-term generator.
 * @param streamResponse Stream used to report progress messages.
 * @returns Whatever `unsplash.getImageBySearchTerm` resolves to, or
 *   `undefined` when no search term was produced or any step threw
 *   (the error is logged, never rethrown).
 */
async function getUnsplashCoverImage(
  content: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string | undefined> {
  try {
    streamResponse.info("creating unsplash search term");
    const query = await openai.createUnsplashSearchTerm(content);
    if (query) {
      streamResponse.info(`searching for ${query}`);
      return await unsplash.getImageBySearchTerm(query);
    }
    return undefined;
  } catch (error) {
    // Best effort: a missing cover must not abort the surrounding workflow.
    log.error("Failed to get unsplash cover image", error);
    return undefined;
  }
}
/**
 * Picks the file extension to use for an image referenced by `str`.
 *
 * When `str` parses as a URL and carries an `fm` query parameter made up
 * solely of letters and digits (e.g. `?fm=jpg`), that value is returned.
 * Otherwise the extension is derived from the URL pathname via
 * `fileExtension`. If `str` is not a parseable URL, `fileExtension` is
 * applied to the raw string instead.
 *
 * The `fm` value is validated because callers interpolate the result into a
 * storage path: an empty value would yield a dangling `.` suffix and a value
 * containing `/` or `.` could inject extra path segments.
 *
 * @param str Image URL or file name.
 * @returns The extension without a leading dot.
 */
function ext(str: string) {
  try {
    const u = new URL(str);
    const format = u.searchParams.get("fm");
    // Only trust `fm` when it looks like a plain extension token.
    if (format !== null && /^[a-z0-9]+$/i.test(format)) {
      return format;
    }
    return fileExtension(u.pathname);
  } catch (_e) {
    // Not a valid absolute URL: treat the input as a plain file name.
    return fileExtension(str);
  }
}
async function fetchAndStoreCover(
imageUrl: string | undefined,
title: string,
@@ -26,12 +56,12 @@ async function fetchAndStoreCover(
): Promise<string | undefined> {
if (!imageUrl) return;
const imagePath = `articles/images/${safeFileName(title)}_cover.${
fileExtension(imageUrl)
ext(imageUrl)
}`;
try {
streamResponse?.enqueue("downloading image");
streamResponse?.info("downloading image");
const res = await fetch(imageUrl);
streamResponse?.enqueue("saving image");
streamResponse?.info("saving image");
if (!res.ok) {
console.log(`Failed to download remote image: ${imageUrl}`, res.status);
return;
@@ -53,38 +83,43 @@ async function processCreateArticle(
) {
log.info("create article from url", { url: fetchUrl });
streamResponse.enqueue("downloading article");
streamResponse.info("downloading article");
const doc = await webScrape(fetchUrl, streamResponse);
const result = await webScrape(fetchUrl, streamResponse);
const result = await Defuddle(doc, fetchUrl, {
markdown: true,
});
log.debug("downloaded and parse parsed", result);
log.debug("downloaded and parse parsed", {
...result,
url: fetchUrl,
content: result.content.slice(0, 200),
});
streamResponse.info("parsed article, creating tags with openai");
streamResponse.enqueue("parsed article, creating tags with openai");
const aiMeta = await openai.extractArticleMetadata(result.markdown);
const aiMeta = await openai.extractArticleMetadata(result.content);
streamResponse.enqueue("postprocessing article");
streamResponse.info("postprocessing article");
const title = result?.title || aiMeta?.headline || "";
const coverImagePath = await fetchAndStoreCover(
result.image,
title,
streamResponse,
);
let coverImagePath: string | undefined = undefined;
if (result?.image?.length) {
log.debug("using local image for cover image", { image: result.image });
coverImagePath = await fetchAndStoreCover(
result.image,
title,
streamResponse,
);
} else {
const urlPath = await getUnsplashCoverImage(
result.markdown,
streamResponse,
);
coverImagePath = await fetchAndStoreCover(urlPath, title, streamResponse);
log.debug("using unsplash for cover image", { image: coverImagePath });
}
const url = toUrlSafeString(title);
const newArticle: ArticleResource["content"] = {
_type: "Article",
headline: title,
articleBody: result.content,
articleBody: result.markdown,
url: fetchUrl,
datePublished: formatDate(
result?.published || aiMeta?.datePublished || undefined,
@@ -100,16 +135,16 @@ async function processCreateArticle(
},
} as const;
streamResponse.enqueue("writing to disk");
streamResponse.info("writing to disk");
log.debug("writing to disk", {
...newArticle,
articleBody: newArticle.articleBody?.slice(0, 200),
});
await createResource(`articles/${toUrlSafeString(title)}.md`, newArticle);
await createResource(`articles/${url}.md`, newArticle);
streamResponse.enqueue("id: " + title);
streamResponse.send({ type: "finished", url });
}
async function processCreateYoutubeVideo(
@@ -122,13 +157,13 @@ async function processCreateYoutubeVideo(
url: fetchUrl,
});
streamResponse.enqueue("getting video infos from youtube api");
streamResponse.info("getting video infos from youtube api");
const youtubeId = extractYoutubeId(fetchUrl);
const video = await getYoutubeVideoDetails(youtubeId);
streamResponse.enqueue("shortening title with openai");
streamResponse.info("shortening title with openai");
const videoTitle = await openai.shortenTitle(video.snippet.title) ||
video.snippet.title;
@@ -152,16 +187,18 @@ async function processCreateYoutubeVideo(
},
};
streamResponse.enqueue("creating article");
streamResponse.info("creating article");
const filename = toUrlSafeString(videoTitle);
await createResource(
`articles/${toUrlSafeString(videoTitle)}.md`,
`articles/${filename}.md`,
newArticle,
);
streamResponse.enqueue("finished");
streamResponse.info("finished");
streamResponse.enqueue("id: " + toUrlSafeString(videoTitle));
streamResponse.send({ type: "finished", url: filename });
}
export const handler: Handlers = {

View File

@@ -0,0 +1,191 @@
import { FreshContext, Handlers } from "$fresh/server.ts";
import { fileExtension } from "https://deno.land/x/file_extension@v2.1.0/mod.ts";
import { formatDate, safeFileName } from "@lib/string.ts";
import { createStreamResponse } from "@lib/helpers.ts";
import {
AccessDeniedError,
BadRequestError,
NotFoundError,
} from "@lib/errors.ts";
import { createResource, fetchResource } from "@lib/marka/index.ts";
import { ArticleResource } from "@lib/marka/schema.ts";
import { webScrape } from "@lib/webScraper.ts";
import * as openai from "@lib/openai.ts";
import * as unsplash from "@lib/unsplash.ts";
import { createLogger } from "@lib/log/index.ts";
/**
 * Resolves the file extension for an image referenced by `str`.
 *
 * For a parseable URL, an `fm` query parameter is used when it consists only
 * of letters and digits (e.g. `?fm=png`); otherwise `fileExtension` is applied
 * to the URL pathname. Input that is not a parseable URL is passed to
 * `fileExtension` unchanged.
 *
 * `fm` originates from a remote URL and the result ends up inside a storage
 * path, so empty values and values containing path characters are rejected
 * rather than returned verbatim.
 *
 * @param str Image URL or file name.
 * @returns The extension without a leading dot.
 */
function ext(str: string) {
  try {
    const u = new URL(str);
    const format = u.searchParams.get("fm");
    // Accept `fm` only if it is a simple alphanumeric token.
    if (format !== null && /^[a-z0-9]+$/i.test(format)) {
      return format;
    }
    return fileExtension(u.pathname);
  } catch (_e) {
    // `new URL` rejected the input: fall back to the raw string.
    return fileExtension(str);
  }
}
// Module-level logger created with the "api/article/enhance" scope label.
const log = createLogger("api/article/enhance");
/**
 * Tries to find an Unsplash image matching the article text.
 *
 * Generates a search term with `openai.createUnsplashSearchTerm` and, if one
 * is returned, queries `unsplash.getImageBySearchTerm` with it. Each step is
 * announced on `streamResponse`.
 *
 * @param content Article text the search term is generated from.
 * @param streamResponse Stream receiving progress messages.
 * @returns The value resolved by `unsplash.getImageBySearchTerm`, or
 *   `undefined` if no search term was produced or an error occurred
 *   (errors are logged and swallowed).
 */
async function getUnsplashCoverImage(
  content: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string | undefined> {
  try {
    streamResponse.info("creating unsplash search term");
    const term = await openai.createUnsplashSearchTerm(content);
    if (!term) {
      return undefined;
    }

    streamResponse.info(`searching for ${term}`);
    return await unsplash.getImageBySearchTerm(term);
  } catch (cause) {
    // A failed lookup only means "no cover"; callers continue without one.
    log.error("Failed to get unsplash cover image", cause);
    return undefined;
  }
}
/**
 * Downloads a cover image and stores it as a resource.
 *
 * The target path is `articles/images/<safe title>_cover.<ext>`, built from
 * `safeFileName(title)` and `ext(imageUrl)`.
 *
 * @param imageUrl Remote image location; a falsy value is a no-op.
 * @param title Article title used to derive the file name.
 * @param streamResponse Stream receiving progress messages.
 * @returns `resources/<path>` on success, or `undefined` when there is no
 *   URL, the download responds with a non-OK status, or fetching/saving
 *   throws (failures are logged).
 */
async function fetchAndStoreCover(
  imageUrl: string | undefined,
  title: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<string | undefined> {
  if (!imageUrl) {
    return undefined;
  }

  const baseName = safeFileName(title);
  const extension = ext(imageUrl);
  const imagePath = `articles/images/${baseName}_cover.${extension}`;

  try {
    streamResponse.info("downloading cover");
    const response = await fetch(imageUrl);

    if (response.ok) {
      const bytes = await response.arrayBuffer();
      streamResponse.info("saving cover");
      await createResource(imagePath, bytes);
      return `resources/${imagePath}`;
    }

    log.error(`Failed to download remote image: ${imageUrl}`, {
      status: response.status,
    });
    return undefined;
  } catch (error) {
    log.error(`Failed to save image: ${imageUrl}`, error);
    return undefined;
  }
}
/**
 * Re-scrapes the source URL of an existing article and fills in its content.
 *
 * Steps, each announced on `streamResponse`:
 * 1. Load `articles/<name>` and read its stored `url`.
 * 2. Scrape the URL with `webScrape` and extract metadata with openai.
 * 3. Overwrite `articleBody` with the scraped markdown; set `datePublished`
 *    only if it is not already set and `author` only if it has no name.
 * 4. If the article has no image, store the scraped image, or fall back to
 *    an Unsplash image when the scrape result has none.
 * 5. Write the article back and send a `finished` event.
 *
 * @param name Resource file name below `articles/` (presumably including the
 *   `.md` extension, since the extension is stripped for the finished URL —
 *   verify against the route definition).
 * @param streamResponse Stream receiving progress and the final event.
 * @throws NotFoundError when the article resource does not exist.
 * @throws BadRequestError when the article has no `url` to scrape.
 */
async function processEnhanceArticle(
  name: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
) {
  const article = await fetchResource<ArticleResource>(
    `articles/${name}`,
  );
  if (!article) {
    throw new NotFoundError();
  }

  const fetchUrl = article.content?.url;
  if (!fetchUrl) {
    throw new BadRequestError("Article has no URL to enhance from.");
  }

  log.info("enhancing article from url", { url: fetchUrl });
  streamResponse.info("scraping url");
  const result = await webScrape(fetchUrl, streamResponse);
  streamResponse.info("parsing content");
  log.debug("downloaded and parsed", result);

  streamResponse.info("extracting metadata with openai");
  const aiMeta = await openai.extractArticleMetadata(result.markdown);

  // Prefer the scraped title, then the AI headline, then the stored one.
  const title = result?.title || aiMeta?.headline ||
    article.content?.headline || "";

  article.content ??= {
    _type: "Article",
    headline: title,
    url: fetchUrl,
  };
  article.content.articleBody = result.markdown;
  // Keep an already stored publication date.
  article.content.datePublished ??= formatDate(
    result?.published || aiMeta?.datePublished || undefined,
  );

  if (!article.content.author?.name || article.content.author.name === "") {
    article.content.author = {
      _type: "Person",
      name: (result.schemaOrgData?.author?.name || aiMeta?.author || "")
        .replace(
          "@",
          "twitter:",
        ),
    };
  }

  // Only look for a cover when the article does not have one yet.
  if (!article.content.image) {
    let coverImagePath: string | undefined = undefined;
    if (result?.image?.length) {
      log.debug("using local image for cover image", { image: result.image });
      coverImagePath = await fetchAndStoreCover(
        result.image,
        title,
        streamResponse,
      );
    } else {
      const urlPath = await getUnsplashCoverImage(
        result.content,
        streamResponse,
      );
      coverImagePath = await fetchAndStoreCover(urlPath, title, streamResponse);
      log.debug("using unsplash for cover image", { image: coverImagePath });
    }
    if (coverImagePath) {
      article.content.image = coverImagePath;
    }
  }

  // Truncate the body in the log entry to keep it readable.
  log.debug("writing to disk", {
    name: name,
    article: {
      ...article,
      content: {
        ...article.content,
        articleBody: article.content.articleBody?.slice(0, 200),
      },
    },
  });
  streamResponse.info("writing to disk");
  await createResource(`articles/${name}`, article.content);

  // Strip a trailing ".md" so the reported url has no file extension.
  // The `$` anchor has to come after the literal: `/$\.md/` can never match.
  streamResponse.send({ type: "finished", url: name.replace(/\.md$/, "") });
}
/**
 * POST handler that starts enhancing the article named by `ctx.params.name`.
 *
 * The enhancement runs in the background: the streaming response is returned
 * right away without awaiting the work. Failures are logged and reported on
 * the stream, and the stream is cancelled once the work settles either way.
 *
 * @throws AccessDeniedError when the request has no session.
 */
const POST = (
  _req: Request,
  ctx: FreshContext,
): Response => {
  if (!ctx.state.session) {
    throw new AccessDeniedError();
  }

  const streamResponse = createStreamResponse();
  const job = processEnhanceArticle(ctx.params.name, streamResponse);

  job
    .catch((err) => {
      log.error(err);
      streamResponse.error(err.message);
    })
    .finally(() => {
      streamResponse.cancel();
    });

  return streamResponse.response;
};
// Route handlers exported by this module; only POST is registered.
export const handler: Handlers = {
POST,
};

View File

@@ -2,7 +2,7 @@ import { Handlers } from "$fresh/server.ts";
import { json } from "@lib/helpers.ts";
export const handler: Handlers = {
async GET() {
GET() {
return json([]);
},
};

View File

@@ -10,7 +10,6 @@ import { parseJsonLdToRecipeSchema } from "./parseJsonLd.ts";
import z from "zod";
import { createResource } from "@lib/marka/index.ts";
import { webScrape } from "@lib/webScraper.ts";
import { Defuddle } from "defuddle/node";
import { RecipeResource } from "@lib/marka/schema.ts";
const log = createLogger("api/article");
@@ -23,18 +22,14 @@ async function processCreateRecipeFromUrl(
) {
log.info("create article from url", { url: fetchUrl });
streamResponse.enqueue("downloading article");
streamResponse.info("downloading article");
const doc = await webScrape(fetchUrl, streamResponse);
const result = await webScrape(fetchUrl, streamResponse);
const result = await Defuddle(doc, fetchUrl, {
markdown: true,
});
streamResponse.enqueue("download success");
streamResponse.info("download success");
const jsonLds = Array.from(
doc?.querySelectorAll(
result.dom?.querySelectorAll(
"script[type='application/ld+json']",
),
) as unknown as HTMLScriptElement[];
@@ -48,11 +43,11 @@ async function processCreateRecipeFromUrl(
}
if (!recipe) {
const res = await openai.extractRecipe(result.content);
const res = await openai.extractRecipe(result.markdown);
if (!res || "errorMessages" in res) {
const errorMessage = res?.errorMessages?.[0] ||
"could not extract recipe";
streamResponse.enqueue(`failed to extract recipe: ${errorMessage}`);
streamResponse.error(`failed to extract recipe: ${errorMessage}`);
return;
}
recipe = res;
@@ -61,7 +56,7 @@ async function processCreateRecipeFromUrl(
const id = toUrlSafeString(recipe?.name || "");
if (!recipe) {
streamResponse.enqueue("failed to parse recipe");
streamResponse.error("failed to parse recipe");
streamResponse.cancel();
return;
}
@@ -80,11 +75,11 @@ async function processCreateRecipeFromUrl(
const finalPath = `resources/recipes/images/${
safeFileName(id)
}_cover.${extension}`;
streamResponse.enqueue("downloading image");
streamResponse.info("downloading image");
try {
streamResponse.enqueue("downloading image");
streamResponse.info("downloading image");
const res = await fetch(newRecipe.image);
streamResponse.enqueue("saving image");
streamResponse.info("saving image");
const buffer = await res.arrayBuffer();
await createResource(finalPath, buffer);
newRecipe.image = finalPath;
@@ -93,11 +88,11 @@ async function processCreateRecipeFromUrl(
}
}
streamResponse.enqueue("finished processing, creating file");
streamResponse.info("finished processing, creating file");
await createResource(`recipes/${id}.md`, newRecipe);
streamResponse.enqueue("id: " + id);
streamResponse.send({ type: "finished", url: id });
}
export const handler: Handlers = {
@@ -119,7 +114,7 @@ export const handler: Handlers = {
processCreateRecipeFromUrl({ fetchUrl, streamResponse }).then((article) => {
log.debug("created article from link", { article });
}).catch((err) => {
streamResponse.enqueue(`error creating recipe: ${err}`);
streamResponse.error(`creating recipe: ${err}`);
log.error(err);
}).finally(() => {
streamResponse.cancel();

View File

@@ -21,7 +21,7 @@ async function processUpdateRecommendations(
return true;
}) as ReviewResource[];
streamResponse.enqueue("Fetched all movies");
streamResponse.info("fetched all movies");
let done = 0;
const total = movies.length;
@@ -41,7 +41,7 @@ async function processUpdateRecommendations(
console.log(err);
}
done++;
streamResponse.enqueue(
streamResponse.info(
`${Math.floor((done / total) * 100)}% [${
done + 1
}/${total}] ${movie.name}`,
@@ -50,7 +50,7 @@ async function processUpdateRecommendations(
console.log(err);
});
streamResponse.enqueue("100% Finished");
streamResponse.info("100% Finished");
}
export const handler: Handlers = {