Download a static site
by kuligaposten 2025-02-01
How to download a static site.
Understanding the JavaScript Code for Downloading and Archiving Websites
Introduction
This blog post will break down the key functions in the provided JavaScript code. The script is built using Node.js, Express, and several third-party modules to download, process, and archive an entire website into a ZIP file. Let's explore how each function contributes to the workflow.
Modules Used
The script imports several modules to handle file operations, web requests, and DOM parsing:
import express from "express";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import archiver from "archiver";
import { JSDOM } from "jsdom";
import { URL } from "url";
import { renderWithLayout } from "../utils/renderWithLayout.js";
import { v4 as uuidv4 } from "uuid";
Purpose of Each Module:
- express: Manages HTTP routes.
- fs: Handles file system operations.
- path: Works with file and directory paths.
- fileURLToPath & URL: Handle URL transformations.
- archiver: Creates ZIP archives.
- JSDOM: Parses and manipulates HTML.
- uuid: Generates unique IDs.
- renderWithLayout: Custom function for rendering pages with a layout.
Defining Paths and Directories
Setting Up Directory Paths
The script determines the directory of the current file and sets up a downloads directory:
// Recreate the CommonJS __filename/__dirname globals in an ES module,
// since they are not available when using import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Finished ZIP archives are written here and served statically from
// /downloads (see the POST /download-site route).
const downloadsDir = path.join(__dirname, "../public/downloads");
Ensuring the downloads directory exists:
// Ensure the downloads directory exists. With { recursive: true },
// mkdirSync is a no-op when the directory is already present, so the
// former existsSync pre-check was redundant (and a TOCTOU race).
fs.mkdirSync(downloadsDir, { recursive: true });
This ensures the required directories are available for storing downloaded files.
Core Functions
1. resolveUrl
This function constructs an absolute URL from a relative path:
/**
 * Resolve a possibly-relative path against a base URL.
 *
 * @param {string} base - Absolute URL to resolve against.
 * @param {string} relativePath - Relative (or absolute) path/URL.
 * @returns {string} The fully-resolved absolute URL.
 */
function resolveUrl(base, relativePath) {
  const resolved = new URL(relativePath, base);
  return resolved.href;
}
It is used to resolve links found in HTML documents.
2. renderWithLayout
The renderWithLayout function is responsible for rendering views within a specified layout. This helps maintain consistency across different pages.
Function Breakdown:
/**
 * Render a view inside a layout template.
 *
 * First renders `options.page` to a string; on success, renders `layout`
 * with the page markup passed as `body`. On failure, logs the error and
 * responds with HTTP 500.
 *
 * @param {object} res - Express response object.
 * @param {string} layout - Layout template name.
 * @param {object} [options] - View options; `page` names the inner view,
 *   all remaining keys are forwarded to both renders.
 */
export const renderWithLayout = (res, layout, options = {}) => {
  const { page, ...layoutOptions } = options;
  const onPageRendered = (err, renderedBody) => {
    if (err) {
      console.error("Error rendering page content:", err);
      res.status(500).send("Error rendering page content");
      return;
    }
    res.render(layout, { ...layoutOptions, body: renderedBody });
  };
  res.render(page, layoutOptions, onPageRendered);
};
How It Works:
Extracts the page from the options.
Calls res.render(page, layoutOptions), rendering the page content.
If an error occurs, it logs and returns a 500 error.
If successful, it renders the layout template with the rendered body content.
This ensures that pages are consistently displayed within a layout template.
3. downloadFonts
This function finds font files referenced in a webpage's CSS and downloads them into /assets/fonts/.
Steps:
- Fetch the HTML and parse it with JSDOM.
- Fetch the HTML and parse it with JSDOM.
- Extract the <base> tag, if present, to determine absolute paths.
- Find font URLs inside <style> tags and linked external stylesheets.
- Download fonts and save them locally.
- Download fonts and save them locally.
/**
 * Download every font file referenced by a page's CSS — both inline
 * <style> blocks and linked stylesheets — into <tempDir>/assets/fonts/.
 *
 * Honors a <base href> tag when resolving relative URLs. The fonts
 * directory is only created when at least one font URL is found, so an
 * empty assets/fonts/ folder never ends up in the archive.
 *
 * @param {string} baseUrl - Absolute URL of the page to scan.
 * @param {string} tempDir - Local directory the site is downloaded into.
 * @returns {Promise<void>} Never throws; errors are logged.
 */
async function downloadFonts(baseUrl, tempDir) {
  try {
    const fontsDir = path.join(tempDir, "assets/fonts");
    const fontUrls = new Set();
    // FIX: match ".<ext>" — the previous bare endsWith("eot") etc. also
    // accepted unrelated paths that merely ended in those letters.
    const fontExtensions = [".woff", ".woff2", ".ttf", ".otf", ".eot"];

    // Collect absolute font URLs from one chunk of CSS text. Query
    // strings are stripped before the extension check so that
    // "font.woff2?v=3" is still recognized.
    const collectFontUrls = (cssText, resolveBase) => {
      for (const match of cssText.matchAll(/url\(['"]?(.*?)['"]?\)/g)) {
        const rawUrl = match[1].split("?")[0];
        if (fontExtensions.some((ext) => rawUrl.endsWith(ext))) {
          fontUrls.add(new URL(rawUrl, resolveBase).href);
        }
      }
    };

    const response = await fetch(baseUrl);
    if (response.ok) {
      const html = await response.text();
      const dom = new JSDOM(html);
      const document = dom.window.document;

      // Respect <base href="..."> when resolving relative URLs.
      const baseHref = document.querySelector("base")?.getAttribute("href");
      if (baseHref) {
        baseUrl = new URL(baseHref, baseUrl).href;
      }

      // Fonts in inline <style> blocks resolve against the page URL.
      for (const style of document.querySelectorAll("style")) {
        collectFontUrls(style.textContent, baseUrl);
      }

      // Fonts in linked stylesheets resolve against the CSS file's URL.
      const cssLinks = [
        ...document.querySelectorAll('link[rel="stylesheet"]'),
      ].map((link) => link.href);
      for (const cssUrl of cssLinks) {
        const resolvedCssUrl = new URL(cssUrl, baseUrl).href;
        const cssResponse = await fetch(resolvedCssUrl);
        if (!cssResponse.ok) continue;
        collectFontUrls(await cssResponse.text(), resolvedCssUrl);
      }
    }

    if (fontUrls.size === 0) {
      console.log(`No fonts found. Not creating empty /assets/fonts/ folder.`);
      return;
    }
    fs.mkdirSync(fontsDir, { recursive: true });

    for (const fontUrl of fontUrls) {
      const fontPath = path.join(fontsDir, path.basename(fontUrl));
      if (fs.existsSync(fontPath)) {
        console.log(`⚠️ Font already exists, skipping: ${fontPath}`);
        continue;
      }
      const fontResponse = await fetch(fontUrl);
      if (!fontResponse.ok) continue;
      fs.writeFileSync(fontPath, Buffer.from(await fontResponse.arrayBuffer()));
    }
  } catch (error) {
    console.error(`Error downloading fonts:`, error.message);
  }
}
This ensures that websites relying on external fonts can still display properly when downloaded.
4. downloadAndProcessHtml
Downloads an HTML page, saves it, and processes its assets (CSS, JS, Images).
/**
 * Fetch one HTML page, rewrite its asset references to local copies
 * (via processAssets), and save the page into tempDir.
 *
 * Directory URLs (path ending in "/") are saved as index.html. Only the
 * basename of the URL path is kept, so nested pages are flattened into
 * the top of tempDir.
 *
 * @param {string} url - Absolute URL of the page to download.
 * @param {string} baseUrl - Site root used for resolving asset URLs.
 * @param {string} tempDir - Local directory the page is written into.
 * @returns {Promise<string|null>} Saved file name, or null on failure.
 */
async function downloadAndProcessHtml(url, baseUrl, tempDir) {
  try {
    const response = await fetch(url);
    if (!response.ok) throw new Error(`Failed to fetch ${url}`);

    const dom = new JSDOM(await response.text());

    // Derive the local file name from the URL path.
    const { pathname } = new URL(url);
    const fileName = path.basename(
      pathname.endsWith("/") ? `${pathname}index.html` : pathname,
    );

    await processAssets(dom.window.document, baseUrl, tempDir);
    fs.writeFileSync(path.join(tempDir, fileName), dom.serialize());
    return fileName;
  } catch (error) {
    console.error(`Error processing HTML page ${url}:`, error.message);
    return null;
  }
}
5. downloadAndSaveFile
This function downloads CSS, JS, and images referenced in HTML and saves them inside an assets/ folder.
/**
 * Download one asset (CSS, JS, image, ...) and save it under
 * <tempDir>/assets/, returning the relative local path used to rewrite
 * the HTML reference.
 *
 * @param {string} url - Asset URL (absolute, or relative to baseUrl).
 * @param {string} baseUrl - Base used to resolve relative asset URLs.
 * @param {string} tempDir - Local directory assets are written into.
 * @returns {Promise<string|null>} Relative path (e.g. "assets/x.css"),
 *   or null on failure.
 */
async function downloadAndSaveFile(url, baseUrl, tempDir) {
  try {
    // Normalize the URL path so everything lands under assets/.
    const { pathname } = new URL(url, baseUrl);
    let cleanPath = pathname.replace(/^\/?[^/]+\/assets\//, "assets/");
    if (!cleanPath.startsWith("assets/")) {
      cleanPath = `assets/${cleanPath}`;
    }

    const localPath = path.join(tempDir, cleanPath);
    fs.mkdirSync(path.dirname(localPath), { recursive: true });

    // Already downloaded while processing an earlier page — reuse it.
    if (fs.existsSync(localPath)) return cleanPath;

    const response = await fetch(url);
    if (!response.ok) throw new Error(`Failed to fetch ${url}`);
    const fileData = Buffer.from(await response.arrayBuffer());
    fs.writeFileSync(localPath, fileData);
    return cleanPath;
  } catch (error) {
    console.error(`Error downloading ${url}:`, error.message);
    return null;
  }
}
6. findAndDownloadHtmlPages
This function recursively finds and downloads all linked HTML pages from the starting URL.
/**
 * Crawl the site starting at startUrl, downloading every reachable
 * .html/.htm page exactly once (depth-first via a stack).
 *
 * Fixes over the original: link discovery is wrapped in try/catch so a
 * single unfetchable page no longer aborts the whole crawl, and only
 * links on the same origin as baseUrl are followed, so the crawler
 * cannot wander off into external sites.
 *
 * @param {string} startUrl - First page to download.
 * @param {string} baseUrl - Site root used to resolve relative links.
 * @param {string} tempDir - Local directory pages are saved into.
 */
async function findAndDownloadHtmlPages(startUrl, baseUrl, tempDir) {
  const origin = new URL(baseUrl).origin;
  const visitedPages = new Set();
  const pagesToVisit = [startUrl];

  while (pagesToVisit.length > 0) {
    const url = pagesToVisit.pop();
    if (visitedPages.has(url)) continue;
    visitedPages.add(url);

    const fileName = await downloadAndProcessHtml(url, baseUrl, tempDir);
    if (!fileName) continue;

    // Re-fetch the raw page to discover links. Guarded so one bad page
    // cannot abort the entire crawl.
    try {
      const response = await fetch(url);
      if (!response.ok) continue;
      const dom = new JSDOM(await response.text());
      const linkedPages = [...dom.window.document.querySelectorAll("a[href]")]
        .map((a) => a.href)
        .filter((link) => link.endsWith(".html") || link.endsWith(".htm"))
        .map((link) =>
          link.startsWith("http") ? link : resolveUrl(baseUrl, link),
        )
        // Stay on-site: external .html links are not crawled.
        .filter((link) => new URL(link).origin === origin);
      pagesToVisit.push(...linkedPages);
    } catch (error) {
      console.error(`Error scanning links on ${url}:`, error.message);
    }
  }
}
This ensures that all linked pages within a website are downloaded.
7. processAssets
Function to process CSS, JS, and image assets in HTML
/**
 * Rewrite a page's stylesheet, script, and image references to point at
 * locally downloaded copies under assets/.
 *
 * FIX: reads and writes the literal markup via get/setAttribute instead
 * of the DOM property (element.href / element.src). Property getters
 * return the value resolved against the document's base URL — which for
 * JSDOM documents parsed from a string is about:blank — not the
 * attribute as authored, so the old startsWith("http") check and the
 * rewrite could both misfire. Empty values and data: URIs are skipped,
 * and a malformed URL no longer throws out of the whole loop.
 *
 * @param {Document} document - JSDOM document, rewritten in place.
 * @param {string} baseUrl - Base used to resolve relative asset URLs.
 * @param {string} tempDir - Local directory assets are saved into.
 */
async function processAssets(document, baseUrl, tempDir) {
  const assetTags = [
    { selector: 'link[rel="stylesheet"]', attr: "href" },
    { selector: "script[src]", attr: "src" },
    { selector: "img[src]", attr: "src" },
  ];
  for (const tag of assetTags) {
    for (const element of document.querySelectorAll(tag.selector)) {
      const rawUrl = element.getAttribute(tag.attr);
      // Nothing to download for empty values or inline data: URIs.
      if (!rawUrl || rawUrl.startsWith("data:")) continue;
      let assetUrl;
      try {
        assetUrl = new URL(rawUrl, baseUrl).href;
      } catch {
        continue; // malformed URL — leave the attribute untouched
      }
      const localPath = await downloadAndSaveFile(assetUrl, baseUrl, tempDir);
      if (localPath) {
        element.setAttribute(tag.attr, localPath);
      }
    }
  }
}
Express Routes
GET /download-site
Renders the Download Site page:
// GET /download-site — render the download form inside the main HTMX
// layout. zipPath is null until a ZIP has been produced by the POST.
router.get("/download-site", (req, res) => {
  const viewOptions = {
    page: "downloadSite",
    title: "Download Site",
    description: "Learn more about downloading a static site.",
    zipPath: null,
  };
  renderWithLayout(res, "layouts/mainhtmx", viewOptions);
});
POST /download-site
Handles the website download process:
- Creates a unique temp directory using uuid.
- Downloads all pages, assets, and fonts.
- Archives everything into a ZIP file.
- Cleans up temporary files after ZIP creation.
/**
 * POST /download-site — download an entire static site and respond with
 * an HTML snippet linking to a ZIP archive of it.
 *
 * Workflow: crawl all pages and assets into a per-request temp folder,
 * pull referenced fonts, zip the folder into public/downloads, then
 * delete the temp folder.
 *
 * Fixes over the original: the success anchor previously carried two
 * class attributes (browsers ignore the duplicate, so it is removed);
 * the temp folder is now also cleaned up on error paths; and error
 * responses are guarded with headersSent to avoid a double response.
 */
router.post("/download-site", async (req, res) => {
  const { url } = req.body;
  if (!url) return res.status(400).json({ error: "URL is required." });

  // Unique per-request paths so concurrent downloads never collide.
  const requestId = uuidv4();
  const tempDir = path.join(__dirname, `../temp/downloaded-site-${requestId}`);
  const zipPath = path.join(
    __dirname,
    `../public/downloads/site-${requestId}.zip`,
  );

  try {
    // Normalize so relative links resolve against the site root.
    const baseUrl = url.endsWith("/") ? url : url + "/";
    fs.mkdirSync(tempDir, { recursive: true });

    await findAndDownloadHtmlPages(url, baseUrl, tempDir);
    await downloadFonts(baseUrl, tempDir);

    // Create the ZIP archive from the temp folder.
    const output = fs.createWriteStream(zipPath);
    const archive = archiver("zip", { zlib: { level: 9 } });

    output.on("close", () => {
      fs.rmSync(tempDir, { recursive: true, force: true });
      res.send(
        `<p class="mt-2">Download ready</p><a class="btn btn-success mt-1" href="/downloads/site-${requestId}.zip" download>Click here to download</a>`,
      );
    });

    archive.on("error", (err) => {
      fs.rmSync(tempDir, { recursive: true, force: true });
      if (!res.headersSent) {
        res.status(500).json({ error: `Error creating ZIP: ${err.message}` });
      }
    });

    archive.pipe(output);
    archive.directory(tempDir, false);
    archive.finalize();
  } catch (error) {
    // Don't leave a half-written temp folder behind on failure.
    fs.rmSync(tempDir, { recursive: true, force: true });
    if (!res.headersSent) {
      res.status(500).json({ error: `Error fetching URL: ${error.message}` });
    }
  }
});
Conclusion
This script provides a robust way to download, process, and archive entire static websites. By leveraging JSDOM, fetch, and archiver, it ensures that all necessary assets are properly stored and structured for offline use.