import { JSDOM } from "jsdom";
import { fetchHtmlWithPlaywright } from "./playwright.ts";
import { createStreamResponse } from "./helpers.ts";
import { Defuddle } from "defuddle/node";
import TurndownService from "turndown";

/**
 * `[selector, attribute]` pairs for every element/attribute combination whose
 * value is a single URL. Processed in order by {@link absolutizeDomUrls}.
 */
const URL_ATTRIBUTE_TARGETS: ReadonlyArray<readonly [string, string]> = [
  ["a[href]", "href"],
  ["area[href]", "href"],
  ["link[href]", "href"],
  ["use[href]", "href"], // SVG 2
  ["use[xlink\\:href]", "xlink:href"], // legacy SVG
  ["image[href]", "href"], // SVG
  ["image[xlink\\:href]", "xlink:href"], // legacy SVG
  ["script[src]", "src"],
  ["img[src]", "src"],
  ["source[src]", "src"],
  ["track[src]", "src"],
  ["iframe[src]", "src"],
  ["embed[src]", "src"],
  ["audio[src]", "src"],
  ["video[src]", "src"],
  ["object[data]", "data"],
  ["input[src]", "src"],
  ["form[action]", "action"],
  ["video[poster]", "poster"],
];

/** Matches a leading URL scheme such as `https:` or `mailto:`. */
const SCHEME_RE = /^[a-zA-Z][a-zA-Z0-9+\-.]*:/;

/** Matches a scheme-less `host:port` prefix, which `SCHEME_RE` would mistake for a scheme. */
const HOST_PORT_RE = /^[^:/?#]+:\d+(?:[/?#]|$)/;

/** Matches a single whitespace character. */
const WHITESPACE_RE = /\s/;

/** Matches `url("...")`, `url('...')` and `url(...)` in CSS text; group 1 is the quote, group 2 the URL. */
const CSS_URL_RE = /url\(\s*(['"]?)([^'")]+)\1\s*\)/g;

/**
 * Matches a meta-refresh `content` value of the form `<delay>; url=<target>`.
 * The `url` keyword is case-insensitive and the target may be wrapped in
 * single or double quotes. Group 1 is the delay, group 3 the target.
 */
const META_REFRESH_RE = /^\s*([^;]+)\s*;\s*url\s*=\s*(['"]?)([^'"]+)\2\s*$/i;

/**
 * Mutates the given JSDOM instance: rewrites all relevant URL-bearing attributes
 * to absolute URLs.
 *
 * Covered: the single-URL attributes listed in `URL_ATTRIBUTE_TARGETS`,
 * `srcset` on `img`/`source`, `url(...)` references in inline `style`
 * attributes and `<style>` elements, and the target of
 * `<meta http-equiv="refresh">`.
 *
 * @param dom - Document to rewrite in place.
 * @param domain - Site the document belongs to, e.g. `"https://example.com"`.
 *   `"example.com"`, `"//example.com"` and `"localhost:3000"` are accepted too.
 *   Only its root is used.
 * @param pageUrl - Optional URL of the page the DOM was loaded from, absolute
 *   or relative to `domain`. When given, relative references are resolved
 *   against it, so `next.html`, `?q=1` and `#frag` keep the page's path. When
 *   omitted (or unparsable), everything resolves against the root of `domain`.
 * @throws Error if `domain` cannot be turned into a valid absolute URL.
 */
export function absolutizeDomUrls(
  dom: JSDOM,
  domain: string,
  pageUrl?: string,
): void {
  const { document } = dom.window;
  const base = resolveBase(toBase(domain), pageUrl);

  // Reads `attr` from every match, runs it through `transform`, and writes it
  // back only when it actually changed.
  const rewriteAttribute = (
    selector: string,
    attr: string,
    transform: (value: string, base: string) => string,
  ): void => {
    document.querySelectorAll(selector).forEach((el) => {
      const current = el.getAttribute(attr);
      if (!current) return;
      const next = transform(current, base);
      if (next !== current) el.setAttribute(attr, next);
    });
  };

  for (const [selector, attr] of URL_ATTRIBUTE_TARGETS) {
    rewriteAttribute(selector, attr, toAbsolute);
  }

  rewriteAttribute("img[srcset], source[srcset]", "srcset", absolutizeSrcset);
  rewriteAttribute("[style]", "style", absolutizeCssUrls);

  document.querySelectorAll("style").forEach((styleEl) => {
    const css = styleEl.textContent ?? "";
    const next = absolutizeCssUrls(css, base);
    if (next !== css) styleEl.textContent = next;
  });

  rewriteAttribute(
    'meta[http-equiv="refresh" i][content]',
    "content",
    absolutizeMetaRefresh,
  );
}

/**
 * Picks the URL that relative references are resolved against: `pageUrl`
 * (itself resolved against `root`) when provided and parsable, else `root`.
 */
function resolveBase(root: string, pageUrl?: string): string {
  if (pageUrl === undefined) return root;
  try {
    return new URL(pageUrl.trim(), root).toString();
  } catch {
    // An unusable page URL must not disable absolutization altogether.
    return root;
  }
}

/**
 * Normalize the base to a valid absolute URL root (`scheme://host[:port]/`).
 *
 * @param domain - `"https://example.com"`, `"example.com"`, `"//example.com"`
 *   or `"host:port"`; inputs without a scheme default to `https`. Any path,
 *   query or fragment is discarded.
 * @returns The serialized root URL, always ending in `/`.
 * @throws Error if the input cannot be parsed as a URL.
 */
function toBase(domain: string): string {
  let candidate = domain.trim();
  // "localhost:3000" satisfies SCHEME_RE ("localhost:") but is really a bare
  // host with a port, so it needs the https prefix as well.
  if (!SCHEME_RE.test(candidate) || HOST_PORT_RE.test(candidate)) {
    candidate = candidate.startsWith("//")
      ? `https:${candidate}`
      : `https://${candidate}`;
  }
  try {
    // Resolving "/" yields the root whether or not the input ends with a slash.
    return new URL("/", candidate).toString();
  } catch {
    throw new Error(`Invalid base domain: ${domain}`);
  }
}

/**
 * Convert a possibly-relative URL to absolute, using the provided base.
 *
 * @param url - Attribute value to resolve; surrounding whitespace is ignored.
 * @param base - Absolute URL to resolve relative references against.
 * @returns The serialized absolute URL. Input that already parses as an
 *   absolute URL (including `mailto:`, `data:`, `javascript:` ...) is not
 *   re-based, but it is returned in the `URL` parser's normalized form, which
 *   may differ textually from the input. If the value cannot be parsed even
 *   relative to `base`, the original string is returned untouched.
 */
function toAbsolute(url: string, base: string): string {
  const trimmed = url.trim();
  try {
    // Parses on its own => already absolute.
    return new URL(trimmed).toString();
  } catch {
    try {
      // Handles path-relative, root-relative, "#hash", "?q" and "//host" forms.
      return new URL(trimmed, base).toString();
    } catch {
      return url;
    }
  }
}

/**
 * Absolutize a srcset list.
 *
 * Candidates are scanned the way the HTML `srcset` rules describe: a URL is a
 * run of non-whitespace characters, and a comma only separates candidates
 * when it ends that run or follows the descriptor. Commas inside a URL
 * (e.g. `data:image/png;base64,...` or `/w_100,h_100/a.jpg`) are therefore
 * kept. Parenthesized descriptors containing commas are not supported.
 *
 * @param srcset - Raw `srcset` attribute value.
 * @param base - Absolute URL to resolve relative candidates against.
 * @returns The candidates joined with `", "`, descriptors (`1x`, `100w`, ...)
 *   preserved.
 */
function absolutizeSrcset(srcset: string, base: string): string {
  const candidates: string[] = [];
  const length = srcset.length;
  let pos = 0;

  while (pos < length) {
    // Skip whitespace and separator commas in front of the next candidate.
    while (
      pos < length &&
      (srcset.charAt(pos) === "," || WHITESPACE_RE.test(srcset.charAt(pos)))
    ) {
      pos++;
    }
    if (pos >= length) break;

    // The URL runs up to the next whitespace character.
    const urlStart = pos;
    while (pos < length && !WHITESPACE_RE.test(srcset.charAt(pos))) pos++;
    let url = srcset.slice(urlStart, pos);
    let descriptor = "";

    if (url.endsWith(",")) {
      // "a.jpg, b.jpg": the trailing comma is the separator, no descriptor.
      url = url.replace(/,+$/, "");
    } else {
      // Everything up to the next comma is the descriptor.
      const descriptorStart = pos;
      while (pos < length && srcset.charAt(pos) !== ",") pos++;
      descriptor = srcset.slice(descriptorStart, pos).trim();
    }

    const absolute = toAbsolute(url, base);
    candidates.push(descriptor ? `${absolute} ${descriptor}` : absolute);
  }

  return candidates.join(", ");
}

/**
 * Replace url(...) in CSS text with absolute URLs.
 *
 * @param cssText - CSS source: a `style` attribute value or a stylesheet.
 * @param base - Absolute URL to resolve relative references against.
 * @returns The CSS with each matched `url(...)` resolved. The quote character
 *   used in the source (`"`, `'` or none) is kept.
 */
function absolutizeCssUrls(cssText: string, base: string): string {
  return cssText.replace(
    CSS_URL_RE,
    (_match: string, quote: string, rawUrl: string) =>
      `url(${quote}${toAbsolute(rawUrl, base)}${quote})`,
  );
}

/**
 * Rewrite the URL in a meta refresh content value if present.
 *
 * Recognized forms include `"5; url=/path"`, `"0;URL='page.html'"` and
 * `'0; Url="page.html"'`.
 *
 * @param content - Value of the `content` attribute.
 * @param base - Absolute URL to resolve a relative target against.
 * @returns `"<delay>; url=<absolute>"`, or `content` unchanged when it does
 *   not match the recognized form.
 */
function absolutizeMetaRefresh(content: string, base: string): string {
  const match = META_REFRESH_RE.exec(content);
  if (!match) return content;
  const [, delay = "", , target = ""] = match;
  return `${delay.trim()}; url=${toAbsolute(target.trim(), base)}`;
}

const turndownService = new TurndownService();

/**
 * Result of {@link webScrape}: every field Defuddle returns, plus the
 * rewritten DOM and a Markdown rendering of the extracted content.
 */
export type WebScrapeResult = Awaited<ReturnType<typeof Defuddle>> & {
  dom: JSDOM;
  markdown: string;
};

/**
 * Loads a page, makes its URLs absolute, and extracts its content.
 *
 * Steps: obtain the HTML through `fetchHtmlWithPlaywright`, parse it with
 * JSDOM, absolutize URLs against the page's own URL, run Defuddle on the DOM,
 * and convert Defuddle's `content` to Markdown with Turndown.
 *
 * @param url - Absolute URL of the page to scrape.
 * @param streamResponse - Stream handle of the type `createStreamResponse`
 *   returns; passed through to `fetchHtmlWithPlaywright`.
 * @returns The Defuddle result extended with `dom` and `markdown`.
 * @throws TypeError if `url` is not a valid absolute URL.
 */
export async function webScrape(
  url: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<WebScrapeResult> {
  // Parse first so an invalid URL fails before any page is fetched.
  const parsedUrl = new URL(url);
  const html = await fetchHtmlWithPlaywright(url, streamResponse);
  const dom = new JSDOM(html);
  // Pass the page URL too: path-relative links and "#fragment" links must
  // resolve against the page, not against the site root.
  absolutizeDomUrls(dom, parsedUrl.origin, parsedUrl.href);
  const result = await Defuddle(dom, url);
  return {
    ...result,
    dom,
    markdown: turndownService.turndown(result.content),
  };
}