Download a static site
by kuligaposten 2025-02-01
How to download a static site.
Understanding the JavaScript Code for Downloading and Archiving Websites
Introduction
This blog post will break down the key functions in the provided JavaScript code. The script is built using Node.js, Express, and several third-party modules to download, process, and archive an entire website into a ZIP file. Let's explore how each function contributes to the workflow.
Modules Used
The script imports several modules to handle file operations, web requests, and DOM parsing:
import express from "express";
import fs from "fs";
import path from "path";
import { fileURLToPath } from "url";
import archiver from "archiver";
import { JSDOM } from "jsdom";
import { URL } from "url";
import { renderWithLayout } from "../utils/renderWithLayout.js";
import { v4 as uuidv4 } from "uuid";
Purpose of Each Module:
- express: Manages HTTP routes.
- fs: Handles file system operations.
- path: Works with file and directory paths.
- fileURLToPath & URL: Handle URL transformations.
- archiver: Creates ZIP archives.
- JSDOM: Parses and manipulates HTML.
- uuid: Generates unique IDs.
- renderWithLayout: Custom function for rendering pages with a layout.
Defining Paths and Directories
Setting Up Directory Paths
The script determines the directory of the current file and sets up a downloads directory:
// Recreate the CommonJS __filename/__dirname globals in an ES module,
// since they are not available when using import.meta.url.
const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
// Finished ZIP archives are written here and served statically from
// /downloads (see the POST /download-site route).
const downloadsDir = path.join(__dirname, "../public/downloads");
Ensuring the downloads directory exists:
// Ensure the downloads directory exists. With { recursive: true },
// mkdirSync is a no-op when the directory is already present, so the
// former existsSync pre-check was redundant (and a TOCTOU race).
fs.mkdirSync(downloadsDir, { recursive: true });
This ensures the required directories are available for storing downloaded files.
Core Functions
1. resolveUrl
This function constructs an absolute URL from a relative path:
/**
 * Resolve a possibly-relative path against a base URL.
 *
 * @param {string} base - Absolute URL to resolve against.
 * @param {string} relativePath - Relative (or absolute) path/URL.
 * @returns {string} The fully-resolved absolute URL.
 */
function resolveUrl(base, relativePath) {
  const resolved = new URL(relativePath, base);
  return resolved.href;
}
It is used to resolve links found in HTML documents.
2. renderWithLayout
The renderWithLayout function is responsible for rendering views within a specified layout. This helps maintain consistency across different pages.
Function Breakdown:
/**
 * Render a view inside a layout template.
 *
 * First renders `options.page` to a string; on success, renders `layout`
 * with the page markup passed as `body`. On failure, logs the error and
 * responds with HTTP 500.
 *
 * @param {object} res - Express response object.
 * @param {string} layout - Layout template name.
 * @param {object} [options] - View options; `page` names the inner view,
 *   all remaining keys are forwarded to both renders.
 */
export const renderWithLayout = (res, layout, options = {}) => {
  const { page, ...layoutOptions } = options;
  const onPageRendered = (err, renderedBody) => {
    if (err) {
      console.error("Error rendering page content:", err);
      res.status(500).send("Error rendering page content");
      return;
    }
    res.render(layout, { ...layoutOptions, body: renderedBody });
  };
  res.render(page, layoutOptions, onPageRendered);
};
How It Works:
Extracts the page from the options.
Calls res.render(page, layoutOptions), rendering the page content.
If an error occurs, it logs and returns a 500 error.
If successful, it renders the layout template with the rendered body content.
This ensures that pages are consistently displayed within a layout template.
3. downloadFonts
This function finds font files referenced in a webpage's CSS and downloads them into /assets/fonts/.
Steps:
- Fetch the HTML and parse it with JSDOM.
- Fetch the HTML and parse it with JSDOM.
- Extract the <base> tag, if present, to determine absolute paths.
- Find font URLs inside <style> tags and linked external stylesheets.
- Download fonts and save them locally.
- Download fonts and save them locally.
/**
 * Download every font file referenced by a page's CSS — both inline
 * <style> blocks and linked stylesheets — into <tempDir>/assets/fonts/.
 *
 * Honors a <base href> tag when resolving relative URLs. The fonts
 * directory is only created when at least one font URL is found, so an
 * empty assets/fonts/ folder never ends up in the archive.
 *
 * @param {string} baseUrl - Absolute URL of the page to scan.
 * @param {string} tempDir - Local directory the site is downloaded into.
 * @returns {Promise<void>} Never throws; errors are logged.
 */
async function downloadFonts(baseUrl, tempDir) {
  try {
    const fontsDir = path.join(tempDir, "assets/fonts");
    const fontUrls = new Set();
    // FIX: match ".<ext>" — the previous bare endsWith("eot") etc. also
    // accepted unrelated paths that merely ended in those letters.
    const fontExtensions = [".woff", ".woff2", ".ttf", ".otf", ".eot"];

    // Collect absolute font URLs from one chunk of CSS text. Query
    // strings are stripped before the extension check so that
    // "font.woff2?v=3" is still recognized.
    const collectFontUrls = (cssText, resolveBase) => {
      for (const match of cssText.matchAll(/url\(['"]?(.*?)['"]?\)/g)) {
        const rawUrl = match[1].split("?")[0];
        if (fontExtensions.some((ext) => rawUrl.endsWith(ext))) {
          fontUrls.add(new URL(rawUrl, resolveBase).href);
        }
      }
    };

    const response = await fetch(baseUrl);
    if (response.ok) {
      const html = await response.text();
      const dom = new JSDOM(html);
      const document = dom.window.document;

      // Respect <base href="..."> when resolving relative URLs.
      const baseHref = document.querySelector("base")?.getAttribute("href");
      if (baseHref) {
        baseUrl = new URL(baseHref, baseUrl).href;
      }

      // Fonts in inline <style> blocks resolve against the page URL.
      for (const style of document.querySelectorAll("style")) {
        collectFontUrls(style.textContent, baseUrl);
      }

      // Fonts in linked stylesheets resolve against the CSS file's URL.
      const cssLinks = [
        ...document.querySelectorAll('link[rel="stylesheet"]'),
      ].map((link) => link.href);
      for (const cssUrl of cssLinks) {
        const resolvedCssUrl = new URL(cssUrl, baseUrl).href;
        const cssResponse = await fetch(resolvedCssUrl);
        if (!cssResponse.ok) continue;
        collectFontUrls(await cssResponse.text(), resolvedCssUrl);
      }
    }

    if (fontUrls.size === 0) {
      console.log(`No fonts found. Not creating empty /assets/fonts/ folder.`);
      return;
    }
    fs.mkdirSync(fontsDir, { recursive: true });

    for (const fontUrl of fontUrls) {
      const fontPath = path.join(fontsDir, path.basename(fontUrl));
      if (fs.existsSync(fontPath)) {
        console.log(`⚠️ Font already exists, skipping: ${fontPath}`);
        continue;
      }
      const fontResponse = await fetch(fontUrl);
      if (!fontResponse.ok) continue;
      fs.writeFileSync(fontPath, Buffer.from(await fontResponse.arrayBuffer()));
    }
  } catch (error) {
    console.error(`Error downloading fonts:`, error.message);
  }
}
This ensures that websites relying on external fonts can still display properly when downloaded.
4. downloadAndProcessHtml
Downloads an HTML page, saves it, and processes its assets (CSS, JS, Images).
/**
 * Fetch one HTML page, rewrite its asset references to local copies
 * (via processAssets), and save the page into tempDir.
 *
 * Directory URLs (path ending in "/") are saved as index.html. Only the
 * basename of the URL path is kept, so nested pages are flattened into
 * the top of tempDir.
 *
 * @param {string} url - Absolute URL of the page to download.
 * @param {string} baseUrl - Site root used for resolving asset URLs.
 * @param {string} tempDir - Local directory the page is written into.
 * @returns {Promise<string|null>} Saved file name, or null on failure.
 */
async function downloadAndProcessHtml(url, baseUrl, tempDir) {
  try {
    const response = await fetch(url);
    if (!response.ok) throw new Error(`Failed to fetch ${url}`);

    const dom = new JSDOM(await response.text());

    // Derive the local file name from the URL path.
    const { pathname } = new URL(url);
    const fileName = path.basename(
      pathname.endsWith("/") ? `${pathname}index.html` : pathname,
    );

    await processAssets(dom.window.document, baseUrl, tempDir);
    fs.writeFileSync(path.join(tempDir, fileName), dom.serialize());
    return fileName;
  } catch (error) {
    console.error(`Error processing HTML page ${url}:`, error.message);
    return null;
  }
}
5. downloadAndSaveFile
This function downloads CSS, JS, and images referenced in HTML and saves them inside an assets/ folder.
/**
 * Download one asset (CSS, JS, image, ...) and save it under
 * <tempDir>/assets/, returning the relative local path used to rewrite
 * the HTML reference.
 *
 * @param {string} url - Asset URL (absolute, or relative to baseUrl).
 * @param {string} baseUrl - Base used to resolve relative asset URLs.
 * @param {string} tempDir - Local directory assets are written into.
 * @returns {Promise<string|null>} Relative path (e.g. "assets/x.css"),
 *   or null on failure.
 */
async function downloadAndSaveFile(url, baseUrl, tempDir) {
  try {
    // Normalize the URL path so everything lands under assets/.
    const { pathname } = new URL(url, baseUrl);
    let cleanPath = pathname.replace(/^\/?[^/]+\/assets\//, "assets/");
    if (!cleanPath.startsWith("assets/")) {
      cleanPath = `assets/${cleanPath}`;
    }

    const localPath = path.join(tempDir, cleanPath);
    fs.mkdirSync(path.dirname(localPath), { recursive: true });

    // Already downloaded while processing an earlier page — reuse it.
    if (fs.existsSync(localPath)) return cleanPath;

    const response = await fetch(url);
    if (!response.ok) throw new Error(`Failed to fetch ${url}`);
    const fileData = Buffer.from(await response.arrayBuffer());
    fs.writeFileSync(localPath, fileData);
    return cleanPath;
  } catch (error) {
    console.error(`Error downloading ${url}:`, error.message);
    return null;
  }
}
6. findAndDownloadHtmlPages
This function recursively finds and downloads all linked HTML pages from the starting URL.
/**
 * Crawl the site starting at startUrl, downloading every reachable
 * .html/.htm page exactly once (depth-first via a stack).
 *
 * Fixes over the original: link discovery is wrapped in try/catch so a
 * single unfetchable page no longer aborts the whole crawl, and only
 * links on the same origin as baseUrl are followed, so the crawler
 * cannot wander off into external sites.
 *
 * @param {string} startUrl - First page to download.
 * @param {string} baseUrl - Site root used to resolve relative links.
 * @param {string} tempDir - Local directory pages are saved into.
 */
async function findAndDownloadHtmlPages(startUrl, baseUrl, tempDir) {
  const origin = new URL(baseUrl).origin;
  const visitedPages = new Set();
  const pagesToVisit = [startUrl];

  while (pagesToVisit.length > 0) {
    const url = pagesToVisit.pop();
    if (visitedPages.has(url)) continue;
    visitedPages.add(url);

    const fileName = await downloadAndProcessHtml(url, baseUrl, tempDir);
    if (!fileName) continue;

    // Re-fetch the raw page to discover links. Guarded so one bad page
    // cannot abort the entire crawl.
    try {
      const response = await fetch(url);
      if (!response.ok) continue;
      const dom = new JSDOM(await response.text());
      const linkedPages = [...dom.window.document.querySelectorAll("a[href]")]
        .map((a) => a.href)
        .filter((link) => link.endsWith(".html") || link.endsWith(".htm"))
        .map((link) =>
          link.startsWith("http") ? link : resolveUrl(baseUrl, link),
        )
        // Stay on-site: external .html links are not crawled.
        .filter((link) => new URL(link).origin === origin);
      pagesToVisit.push(...linkedPages);
    } catch (error) {
      console.error(`Error scanning links on ${url}:`, error.message);
    }
  }
}
This ensures that all linked pages within a website are downloaded.
7. processAssets
Function to process CSS, JS, and image assets in HTML
/**
 * Rewrite a page's stylesheet, script, and image references to point at
 * locally downloaded copies under assets/.
 *
 * FIX: reads and writes the literal markup via get/setAttribute instead
 * of the DOM property (element.href / element.src). Property getters
 * return the value resolved against the document's base URL — which for
 * JSDOM documents parsed from a string is about:blank — not the
 * attribute as authored, so the old startsWith("http") check and the
 * rewrite could both misfire. Empty values and data: URIs are skipped,
 * and a malformed URL no longer throws out of the whole loop.
 *
 * @param {Document} document - JSDOM document, rewritten in place.
 * @param {string} baseUrl - Base used to resolve relative asset URLs.
 * @param {string} tempDir - Local directory assets are saved into.
 */
async function processAssets(document, baseUrl, tempDir) {
  const assetTags = [
    { selector: 'link[rel="stylesheet"]', attr: "href" },
    { selector: "script[src]", attr: "src" },
    { selector: "img[src]", attr: "src" },
  ];
  for (const tag of assetTags) {
    for (const element of document.querySelectorAll(tag.selector)) {
      const rawUrl = element.getAttribute(tag.attr);
      // Nothing to download for empty values or inline data: URIs.
      if (!rawUrl || rawUrl.startsWith("data:")) continue;
      let assetUrl;
      try {
        assetUrl = new URL(rawUrl, baseUrl).href;
      } catch {
        continue; // malformed URL — leave the attribute untouched
      }
      const localPath = await downloadAndSaveFile(assetUrl, baseUrl, tempDir);
      if (localPath) {
        element.setAttribute(tag.attr, localPath);
      }
    }
  }
}
Express Routes
GET /download-site
Renders the Download Site page:
// GET /download-site — render the download form inside the main HTMX
// layout. zipPath is null until a ZIP has been produced by the POST.
router.get("/download-site", (req, res) => {
  const viewOptions = {
    page: "downloadSite",
    title: "Download Site",
    description: "Learn more about downloading a static site.",
    zipPath: null,
  };
  renderWithLayout(res, "layouts/mainhtmx", viewOptions);
});
POST /download-site
Handles the website download process:
- Creates a unique temp directory using uuid.
- Downloads all pages, assets, and fonts.
- Archives everything into a ZIP file.
- Cleans up temporary files after ZIP creation.
/**
 * POST /download-site — download an entire static site and respond with
 * an HTML snippet linking to a ZIP archive of it.
 *
 * Workflow: crawl all pages and assets into a per-request temp folder,
 * pull referenced fonts, zip the folder into public/downloads, then
 * delete the temp folder.
 *
 * Fixes over the original: the success anchor previously carried two
 * class attributes (browsers ignore the duplicate, so it is removed);
 * the temp folder is now also cleaned up on error paths; and error
 * responses are guarded with headersSent to avoid a double response.
 */
router.post("/download-site", async (req, res) => {
  const { url } = req.body;
  if (!url) return res.status(400).json({ error: "URL is required." });

  // Unique per-request paths so concurrent downloads never collide.
  const requestId = uuidv4();
  const tempDir = path.join(__dirname, `../temp/downloaded-site-${requestId}`);
  const zipPath = path.join(
    __dirname,
    `../public/downloads/site-${requestId}.zip`,
  );

  try {
    // Normalize so relative links resolve against the site root.
    const baseUrl = url.endsWith("/") ? url : url + "/";
    fs.mkdirSync(tempDir, { recursive: true });

    await findAndDownloadHtmlPages(url, baseUrl, tempDir);
    await downloadFonts(baseUrl, tempDir);

    // Create the ZIP archive from the temp folder.
    const output = fs.createWriteStream(zipPath);
    const archive = archiver("zip", { zlib: { level: 9 } });

    output.on("close", () => {
      fs.rmSync(tempDir, { recursive: true, force: true });
      res.send(
        `<p class="mt-2">Download ready</p><a class="btn btn-success mt-1" href="/downloads/site-${requestId}.zip" download>Click here to download</a>`,
      );
    });

    archive.on("error", (err) => {
      fs.rmSync(tempDir, { recursive: true, force: true });
      if (!res.headersSent) {
        res.status(500).json({ error: `Error creating ZIP: ${err.message}` });
      }
    });

    archive.pipe(output);
    archive.directory(tempDir, false);
    archive.finalize();
  } catch (error) {
    // Don't leave a half-written temp folder behind on failure.
    fs.rmSync(tempDir, { recursive: true, force: true });
    if (!res.headersSent) {
      res.status(500).json({ error: `Error fetching URL: ${error.message}` });
    }
  }
});
Conclusion
This script provides a robust way to download, process, and archive entire static websites. By leveraging JSDOM, fetch, and archiver, it ensures that all necessary assets are properly stored and structured for offline use.