177 lines
5.6 KiB
TypeScript
177 lines
5.6 KiB
TypeScript
import { JSDOM } from "jsdom";
|
|
import { fetchHtmlWithPlaywright } from "./playwright.ts";
|
|
import { createStreamResponse } from "./helpers.ts";
|
|
|
|
/**
 * Mutates the given JSDOM instance in place: rewrites URL-bearing attributes,
 * `srcset` lists, inline/embedded CSS `url(...)` references and
 * `<meta http-equiv="refresh">` targets so that they are absolute.
 *
 * @param dom    The JSDOM instance whose document is rewritten.
 * @param domain Base to resolve against (e.g. "https://example.com"); it is
 *               normalized through `toBase` first.
 * @throws Error if `domain` cannot be normalized by `toBase`.
 */
export function absolutizeDomUrls(dom: JSDOM, domain: string): void {
  const { document } = dom.window;
  const base = toBase(domain);

  // Applies `transform` to `attr` on every element matching `selector`,
  // writing the attribute back only when the value actually changed.
  // Elements are typed as plain `Element`: only get/setAttribute is needed,
  // and that is what querySelectorAll can soundly promise for these selectors.
  const rewriteAttr = (
    selector: string,
    attr: string,
    transform: (value: string, base: string) => string,
  ): void => {
    document.querySelectorAll<Element>(selector).forEach((el) => {
      const current = el.getAttribute(attr);
      if (!current) return;
      const next = transform(current, base);
      if (next !== current) el.setAttribute(attr, next);
    });
  };

  // [selector, attribute] pairs holding a single URL, processed in order.
  const singleUrlAttrs: ReadonlyArray<readonly [string, string]> = [
    // Common URL attributes
    ["a[href]", "href"],
    ["area[href]", "href"],
    ["link[href]", "href"],
    ["use[href]", "href"], // SVG 2
    ["use[xlink\\:href]", "xlink:href"], // legacy SVG
    ["image[href]", "href"], // SVG
    ["image[xlink\\:href]", "xlink:href"], // legacy SVG

    ["script[src]", "src"],
    ["img[src]", "src"],
    ["source[src]", "src"],
    ["track[src]", "src"],
    ["iframe[src]", "src"],
    ["embed[src]", "src"],
    ["audio[src]", "src"],
    ["video[src]", "src"],
    ["object[data]", "data"],
    ["input[src]", "src"],
    ["form[action]", "action"],
    ["video[poster]", "poster"],
  ];
  for (const [selector, attr] of singleUrlAttrs) {
    rewriteAttr(selector, attr, toAbsolute);
  }

  // Responsive image candidate lists.
  rewriteAttr("img[srcset], source[srcset]", "srcset", absolutizeSrcset);

  // Inline style attributes.
  rewriteAttr("[style]", "style", absolutizeCssUrls);

  // Embedded stylesheets: the CSS lives in the element's text, not an attribute.
  document.querySelectorAll("style").forEach((styleEl) => {
    const css = styleEl.textContent ?? "";
    const abs = absolutizeCssUrls(css, base);
    if (abs !== css) styleEl.textContent = abs;
  });

  // Meta refresh redirects ("5; url=/path").
  rewriteAttr(
    'meta[http-equiv="refresh" i][content]',
    "content",
    absolutizeMetaRefresh,
  );
}
|
|
|
|
/**
 * Normalize the base to a valid absolute URL root (path, query and fragment
 * are discarded; the result always ends in "/").
 *
 * Accepts full URLs ("http://example.com/x"), protocol-relative input
 * ("//example.com"), bare hosts ("example.com") and bare host:port pairs
 * ("localhost:3000"). Input without a scheme is assumed to be https.
 *
 * @param domain Caller-supplied base; surrounding whitespace is ignored.
 * @returns The root URL of the base, e.g. "https://example.com/".
 * @throws Error ("Invalid base domain: …") when no usable root can be derived.
 */
function toBase(domain: string): string {
  let d = domain.trim();

  // "localhost:3000" also matches the generic scheme pattern below (it looks
  // like scheme "localhost:"), so recognise host:port explicitly first.
  const looksLikeHostPort = /^[^\s/:@?#]+:\d{1,5}(?:[/?#]|$)/.test(d);
  const hasScheme = !looksLikeHostPort && /^[a-zA-Z][a-zA-Z0-9+\-.]*:/.test(d);

  if (!hasScheme) {
    d = d.startsWith("//") ? `https:${d}` : `https://${d}`;
  }

  try {
    // Resolving "/" against the base yields its root whether or not the base
    // ends with a slash or carries a path.
    return new URL("/", d).toString();
  } catch {
    // The input cannot serve as a base at all; fail early with the original text.
    throw new Error(`Invalid base domain: ${domain}`);
  }
}
|
|
|
|
/**
 * Convert a possibly-relative URL to absolute, using the provided base.
 *
 * Resolution order:
 *   1. the trimmed input parsed on its own (already absolute, or a special
 *      scheme such as mailto:/data:) — returned in serialized form;
 *   2. the trimmed input resolved against `base` (covers #hash, ?query,
 *      //host and path-relative references);
 *   3. the original, untouched input when neither parse succeeds.
 *
 * @param url  Raw attribute value; surrounding whitespace is ignored for parsing.
 * @param base Absolute URL used to resolve relative references.
 * @returns The serialized absolute URL, or `url` unchanged if it cannot be parsed.
 */
function toAbsolute(url: string, base: string): string {
  const candidate = url.trim();

  // Serializes `input` (optionally resolved against `against`), or yields
  // undefined when the URL parser rejects it.
  const serialize = (input: string, against?: string): string | undefined => {
    try {
      const parsed = against === undefined ? new URL(input) : new URL(input, against);
      return parsed.toString();
    } catch {
      return undefined;
    }
  };

  return serialize(candidate) ?? serialize(candidate, base) ?? url;
}
|
|
|
|
/**
 * Absolutize a srcset list.
 *
 * Candidates are scanned the way the HTML srcset parsing rules describe them,
 * rather than by a blind split on ",": a URL is a run of non-whitespace
 * characters, so commas inside a URL (data: URIs, CDN transform paths such as
 * ".../w_400,h_300/a.jpg") stay part of that URL. A comma only ends a
 * candidate when it trails the URL or follows the descriptors.
 *
 * @param srcset Raw `srcset` attribute value.
 * @param base   Absolute URL used to resolve each candidate URL.
 * @returns The candidates re-joined with ", ", each as "<absolute-url> [<descriptor>]".
 */
function absolutizeSrcset(srcset: string, base: string): string {
  const isSpace = (ch: string): boolean => /\s/.test(ch);
  const candidates: string[] = [];
  const end = srcset.length;
  let pos = 0;

  while (pos < end) {
    // Skip separators (whitespace and commas) before the next candidate.
    while (pos < end && (isSpace(srcset.charAt(pos)) || srcset.charAt(pos) === ",")) {
      pos++;
    }
    if (pos >= end) break;

    // The URL is everything up to the next whitespace character.
    const urlStart = pos;
    while (pos < end && !isSpace(srcset.charAt(pos))) pos++;
    let url = srcset.slice(urlStart, pos);

    let descriptor = "";
    if (url.endsWith(",")) {
      // Trailing commas terminate a candidate that has no descriptor.
      url = url.replace(/,+$/, "");
    } else {
      // Descriptors run up to the next comma that is not inside parentheses.
      const descStart = pos;
      let depth = 0;
      while (pos < end) {
        const ch = srcset.charAt(pos);
        if (ch === "(") depth++;
        else if (ch === ")" && depth > 0) depth--;
        else if (ch === "," && depth === 0) break;
        pos++;
      }
      descriptor = srcset.slice(descStart, pos).trim();
    }

    if (!url) continue;
    const abs = toAbsolute(url, base);
    candidates.push(descriptor ? `${abs} ${descriptor}` : abs);
  }

  return candidates.join(", ");
}
|
|
|
|
/**
 * Replace url(...) references in CSS text with absolute URLs.
 *
 * - Matches url("..."), url('...') and url(...), case-insensitively.
 * - The original quoting is kept, so values that are only valid when quoted
 *   (e.g. a data: URI containing spaces) remain valid CSS.
 * - Fragment-only references such as url(#gradient) are left untouched: they
 *   point at an element in the same document, and resolving them against
 *   another base would redirect them to a different document.
 *
 * @param cssText CSS source (a style attribute value or a stylesheet).
 * @param base    Absolute URL used to resolve relative references.
 * @returns The CSS text with every rewritable url(...) made absolute.
 */
function absolutizeCssUrls(cssText: string, base: string): string {
  return cssText.replace(
    /url\(\s*(['"]?)([^'")]+)\1\s*\)/gi,
    (whole: string, quote: string, rawUrl: string): string => {
      // Same-document reference (SVG paint server, filter, clip-path, ...).
      if (rawUrl.trim().startsWith("#")) return whole;

      const abs = toAbsolute(rawUrl, base);
      return `url(${quote}${abs}${quote})`;
    },
  );
}
|
|
|
|
/**
 * Rewrite the URL in a meta refresh content value if present.
 *
 * Recognized shapes (the "url" keyword is matched case-insensitively, and the
 * target may be unquoted, single-quoted or double-quoted):
 *   "5; url=/path", "0;URL='page.html'", '0; Url="page.html"'
 *
 * @param content Raw `content` attribute of a `<meta http-equiv="refresh">`.
 * @param base    Absolute URL used to resolve the redirect target.
 * @returns "<delay>; url=<absolute-url>", or `content` unchanged when it has
 *          no recognizable url= part.
 */
function absolutizeMetaRefresh(content: string, base: string): string {
  const match = content.match(
    /^\s*([^;]+)\s*;\s*url\s*=\s*(?:"([^"]+)"|'([^']+)'|([^'"]+))\s*$/i,
  );
  if (!match) return content;

  // Exactly one of the three target alternatives participates in a match.
  const [, rawDelay = "", doubleQuoted, singleQuoted, unquoted] = match;
  const target = (doubleQuoted ?? singleQuoted ?? unquoted ?? "").trim();
  const delay = rawDelay.trim();

  const abs = toAbsolute(target, base);
  return `${delay}; url=${abs}`;
}
|
|
|
|
/**
 * Fetch a page's HTML via Playwright, parse it into a JSDOM document and
 * rewrite its URLs to absolute form.
 *
 * @param url            Absolute URL of the page to scrape.
 * @param streamResponse Stream handle forwarded to `fetchHtmlWithPlaywright`.
 * @returns The parsed document with its URLs absolutized.
 * @throws TypeError if `url` is not a valid absolute URL (checked before any fetch).
 */
export async function webScrape(
  url: string,
  streamResponse: ReturnType<typeof createStreamResponse>,
): Promise<JSDOM> {
  // Parse first so an invalid URL fails before any browser work starts.
  const u = new URL(url);
  const html = await fetchHtmlWithPlaywright(url, streamResponse);
  const dom = new JSDOM(html);
  // NOTE(review): only the origin is passed, so document-relative references
  // ("img/a.png") resolve against the site root rather than the page's own
  // path — confirm this is intended.
  absolutizeDomUrls(dom, u.origin);
  return dom;
}
|