commit e3da5b14328744c618dc31e183d97ecd4944151e
parent 74179fb37667450f2a5ff117e81e7403adf7f710
Author: Kris Yotam <krisyotam@protonmail.com>
Date: Thu, 12 Mar 2026 21:23:22 -0500
update sbot
Diffstat:
| A | .claude/CLAUDE.md | | | 149 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | .gitignore | | | 2 | +- |
| D | CLAUDE.md | | | 145 | ------------------------------------------------------------------------------- |
| M | Makefile | | | 18 | +++++++++--------- |
| A | README.md | | | 229 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| M | archiver.c | | | 72 | ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++---- |
| M | config.h | | | 6 | +++--- |
| A | detect.c | | | 411 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | detect.h | | | 42 | ++++++++++++++++++++++++++++++++++++++++++ |
| M | robots.c | | | 4 | ++-- |
10 files changed, 914 insertions(+), 164 deletions(-)
diff --git a/.claude/CLAUDE.md b/.claude/CLAUDE.md
@@ -0,0 +1,149 @@
+# sbot — CLAUDE.md
+
+## Project
+
+sbot (Simple Archiver Bot) is a suckless web archiver written in C. It
+creates self-contained archives of websites with all resources (CSS,
+images, fonts) inlined as data URIs. It supports single-page archival
+in GWTAR (Gwern Web Tar Archive) format and recursive whole-site
+archival with a navigable directory structure.
+
+## Coding Standards — Suckless C Style
+
+All code in this project MUST follow the suckless.org coding style:
+
+### Language
+- C99 (ISO/IEC 9899:1999), no extensions
+- POSIX.1-2008 (`_POSIX_C_SOURCE 200809L`)
+
+### Indentation & Whitespace
+- Tabs for indentation (1 tab = 1 level)
+- Spaces for alignment only, never for indentation
+- No tabs except at the beginning of a line
+- Maximum line length: 79 characters
+
+### Comments
+- Use `/* */` only, never `//`
+- Comment fallthrough cases in switch statements
+
+### Variables
+- All declarations at the top of the block
+- Pointer `*` adjacent to variable name: `char *p`, not `char* p`
+- No C99 `bool`; use `int` (0/1)
+- Global/static variables not used outside TU must be `static`
+
+### Functions
+- Return type on its own line
+- Function name at column 0 on next line (enables `grep ^funcname`)
+- Opening `{` on its own line for functions
+- Functions not used outside their file: `static`
+
+```c
+static void
+usage(void)
+{
+ fprintf(stderr, "usage: sbot [-v] [-r] url\n");
+ exit(1);
+}
+```
+
+### Braces
+- Opening `{` on same line for control flow (if, for, while, switch)
+- Closing `}` on its own line unless continuing (else, do-while)
+- Use braces even for single statements when sibling branches use them
+
+### Naming
+- lowercase_with_underscores for functions and variables
+- UPPERCASE for macros and constants
+- CamelCase for typedef'd struct types
+- No `_t` suffix (reserved by POSIX)
+- Prefix module functions with module name
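+
+For example (identifiers taken from detect.h and the config table; the
+struct is abbreviated here):
+
+```c
+#define MAX_DEPTH 5             /* macro: UPPERCASE */
+
+typedef struct {                /* typedef'd struct: CamelCase, no _t */
+	SiteType type;
+	char *feed_url;
+} SiteInfo;
+
+char **detect_seed_urls(SiteInfo *info, const char *domain,
+                        int *count);   /* module prefix: detect_ */
+```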
+
+### Control Flow
+- Space after `if`, `for`, `while`, `switch`
+- No space after `(` or before `)`
+- Use `goto` for cleanup/unwind, not nested ifs
+- Return/exit early on failure
+- Compare return values against 0, not -1: `if (func() < 0)`
+
+### Error Handling
+- All allocation checked; goto cleanup on failure
+- `die()` for fatal errors (prints message, exits)
+- `warn()` for recoverable errors (prints, continues)
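+
+A minimal sketch of the pattern (hypothetical function; `warn()` is
+assumed to take a printf-style format, as provided by util.c):
+
+```c
+static int
+save_file(const char *path, const char *data)
+{
+	FILE *fp;
+	int ret;
+
+	ret = -1;
+	fp = fopen(path, "w");
+	if (!fp) {
+		warn("cannot open %s", path);
+		goto out;
+	}
+	if (fputs(data, fp) == EOF)
+		goto out;
+	ret = 0;
+out:
+	if (fp)
+		fclose(fp);
+	return ret;
+}
+```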
+
+### File Organization Order
+1. License header
+2. System includes (alphabetical)
+3. Local includes
+4. Macros
+5. Type definitions
+6. Function declarations
+7. Global variables
+8. Function definitions (same order as declarations)
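+
+A hypothetical module skeleton following this order (all names are
+made up):
+
+```c
+/* See LICENSE file for copyright and license details. */
+
+/* system includes, alphabetical */
+#include <stdlib.h>
+
+/* local includes */
+#include "example.h"
+
+/* macros */
+#define EXAMPLE_MAX 64
+
+/* type definitions */
+typedef struct {
+	char *name;
+} Example;
+
+/* function declarations */
+static void example_init(Example *e);
+
+/* global variables */
+static int initialized;
+
+/* function definitions, same order as declarations */
+static void
+example_init(Example *e)
+{
+	e->name = NULL;
+	initialized = 1;
+}
+```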
+
+### Headers
+- System headers first, alphabetical
+- Local headers after blank line
+- No cyclic dependencies
+- Include only what is needed
+
+## Architecture
+
+### Module Layout
+
+| Module | Prefix | File | Responsibility |
+|--------|--------|------|----------------|
+| Main | — | archiver.c | Entry point, page archiving, CSS inlining, link rewriting, crawl orchestration |
+| Crawler | `queue_`, `visited_` | crawl.c | URL queue (BFS), visited set, URL normalization, path conversion |
+| Fetcher | `fetch_` | fetch.c | HTTP fetching via libcurl, response management |
+| Parser | `reslist_`, `parse_` | parse.c | HTML parsing, resource extraction, image inlining |
+| Robots | `robots_` | robots.c | robots.txt fetching, parsing, and rule matching |
+| Detect | `detect_`, `siteinfo_` | detect.c | CMS/framework detection (WordPress, Blogger, Hugo, Jekyll, Ghost, Drupal, MediaWiki) |
+| Utilities | `die`, `warn`, `x*`, `str_*`, `url_*` | util.c | Memory wrappers, string ops, URL helpers, base64, MIME types |
+| Config | — | config.h | Compile-time constants (timeouts, limits, user agent) |
+
+### Architecture Rules
+- **Separate compilation.** Every .c file compiles independently.
+- **No dynamic loading.** All features compiled in.
+- **libcurl only.** Single external dependency for HTTP.
+- **No `system()` calls.** Direct file I/O and libcurl only.
+- **Data URIs for inlining.** Resources encoded as base64 data URIs.
+- **Stateless functions preferred.** Minimize mutable global state.
+
+### Crawler Design Principles
+- **BFS traversal.** URL queue processes breadth-first by depth level.
+- **Same-domain only.** Never follow links to external domains.
+- **Politeness.** Rate limiting between requests (configurable).
+- **Depth control.** Hard limit on crawl depth to prevent runaway.
+- **URL normalization.** Canonical form for deduplication.
+- **Graceful degradation.** Skip failed resources, continue crawling.
+- **robots.txt compliance.** Respects Disallow/Allow rules and Crawl-delay.
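+
+A simplified sketch of how these principles fit together (the real
+loop lives in `crawl_site()` in archiver.c; `queue_pop()`,
+`robots_allowed()`, `sleep_ms()` and the local variable names are
+illustrative stand-ins, not the actual API):
+
+```c
+while (queue_pop(queue, &url, &depth) == 0) {
+	norm = url_normalize(url);
+	if (depth > max_depth || visited_contains(visited, norm)) {
+		free(norm);
+		continue;
+	}
+	visited_add(visited, norm);
+	free(norm);
+	if (!ignore_robots && !robots_allowed(robots, url))
+		continue;       /* robots.txt Disallow */
+	archive_page(url);      /* found links are queued at depth + 1 */
+	sleep_ms(RATE_LIMIT_MS); /* politeness delay */
+}
+```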
+
+## Build
+
+```sh
+make # build sbot binary
+make clean # remove build artifacts
+make install # install to /usr/local/bin
+```
+
+Dependencies: `libcurl` (via pkg-config)
+
+## Usage
+
+```sh
+# Single page archive (GWTAR format)
+sbot https://example.com/article
+
+# Whole site (recursive, depth 3)
+sbot -r -d 3 https://example.com
+
+# Verbose with custom output dir
+sbot -v -r -o ./archive https://example.com
+```
+
+## Git Conventions
+
+- No `Co-Authored-By: Claude` lines
+- Commit messages: imperative, <72 chars, no period
+- One logical change per commit
diff --git a/.gitignore b/.gitignore
@@ -1,5 +1,5 @@
# Build artifacts
-archiver-bot
+sbot
*.o
# Test output
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -1,145 +0,0 @@
-# archiver-bot — CLAUDE.md
-
-## Project
-
-archiver-bot is a suckless web archiver written in C. It creates
-self-contained archives of websites with all resources (CSS, images,
-fonts) inlined as data URIs. Supports single-page and recursive
-whole-site archival with GWTAR (Gwern Web Tar Archive) format headers.
-
-## Coding Standards — Suckless C Style
-
-All code in this project MUST follow the suckless.org coding style:
-
-### Language
-- C99 (ISO/IEC 9899:1999), no extensions
-- POSIX.1-2008 (`_POSIX_C_SOURCE 200809L`)
-
-### Indentation & Whitespace
-- Tabs for indentation (1 tab = 1 level)
-- Spaces for alignment only, never for indentation
-- No tabs except at the beginning of a line
-- Maximum line length: 79 characters
-
-### Comments
-- Use `/* */` only, never `//`
-- Comment fallthrough cases in switch statements
-
-### Variables
-- All declarations at the top of the block
-- Pointer `*` adjacent to variable name: `char *p`, not `char* p`
-- No C99 `bool`; use `int` (0/1)
-- Global/static variables not used outside TU must be `static`
-
-### Functions
-- Return type on its own line
-- Function name at column 0 on next line (enables `grep ^funcname`)
-- Opening `{` on its own line for functions
-- Functions not used outside their file: `static`
-
-```c
-static void
-usage(void)
-{
- fprintf(stderr, "usage: archiver-bot [-v] [-r] url\n");
- exit(1);
-}
-```
-
-### Braces
-- Opening `{` on same line for control flow (if, for, while, switch)
-- Closing `}` on its own line unless continuing (else, do-while)
-- Use braces even for single statements when sibling branches use them
-
-### Naming
-- lowercase_with_underscores for functions and variables
-- UPPERCASE for macros and constants
-- CamelCase for typedef'd struct types
-- No `_t` suffix (reserved by POSIX)
-- Prefix module functions with module name
-
-### Control Flow
-- Space after `if`, `for`, `while`, `switch`
-- No space after `(` or before `)`
-- Use `goto` for cleanup/unwind, not nested ifs
-- Return/exit early on failure
-- Test against 0, not -1: `if (func() < 0)`
-
-### Error Handling
-- All allocation checked; goto cleanup on failure
-- `die()` for fatal errors (prints message, exits)
-- `warn()` for recoverable errors (prints, continues)
-
-### File Organization Order
-1. License header
-2. System includes (alphabetical)
-3. Local includes
-4. Macros
-5. Type definitions
-6. Function declarations
-7. Global variables
-8. Function definitions (same order as declarations)
-
-### Headers
-- System headers first, alphabetical
-- Local headers after blank line
-- No cyclic dependencies
-- Include only what is needed
-
-## Architecture
-
-### Module Layout
-
-| Module | Prefix | File | Responsibility |
-|--------|--------|------|----------------|
-| Main | — | archiver.c | Entry point, page archiving, CSS inlining, link rewriting, crawl orchestration |
-| Crawler | `queue_`, `visited_` | crawl.c | URL queue (BFS), visited set, URL normalization, path conversion |
-| Fetcher | `fetch_` | fetch.c | HTTP fetching via libcurl, response management |
-| Parser | `reslist_`, `parse_` | parse.c | HTML parsing, resource extraction, image inlining |
-| Utilities | `die`, `warn`, `x*`, `str_*`, `url_*` | util.c | Memory wrappers, string ops, URL helpers, base64, MIME types |
-| Config | — | config.h | Compile-time constants (timeouts, limits, user agent) |
-
-### Architecture Rules
-- **Separate compilation.** Every .c file compiles independently.
-- **No dynamic loading.** All features compiled in.
-- **libcurl only.** Single external dependency for HTTP.
-- **No `system()` calls.** Direct file I/O and libcurl only.
-- **Data URIs for inlining.** Resources encoded as base64 data URIs.
-- **Stateless functions preferred.** Minimize mutable global state.
-
-### Crawler Design Principles
-- **BFS traversal.** URL queue processes breadth-first by depth level.
-- **Same-domain only.** Never follow links to external domains.
-- **Politeness.** Rate limiting between requests (configurable).
-- **Depth control.** Hard limit on crawl depth to prevent runaway.
-- **URL normalization.** Canonical form for deduplication.
-- **Graceful degradation.** Skip failed resources, continue crawling.
-
-## Build
-
-```sh
-make # build archiver-bot binary
-make clean # remove build artifacts
-make install # install to /usr/local/bin
-```
-
-Dependencies: `libcurl` (via pkg-config)
-
-## Usage
-
-```sh
-# Single page
-archiver-bot https://example.com/article
-
-# Whole site (recursive, depth 3)
-archiver-bot -r -d 3 https://example.com
-
-# Verbose with custom output dir
-archiver-bot -v -r -o ./archive https://example.com
-```
-
-## Git Conventions
-
-- No `Co-Authored-By: Claude` lines
-- Commit messages: imperative, <72 chars, no period
-- One logical change per commit
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
-# archiver-bot - suckless web archiver
+# sbot - Simple Archiver Bot
# See LICENSE file for copyright and license details.
-VERSION = 0.2.0
+VERSION = 0.3.0
# paths
PREFIX = /usr/local
@@ -20,26 +20,26 @@ LDFLAGS = $(LIBS)
CC = cc
# sources
-SRC = archiver.c crawl.c fetch.c parse.c robots.c util.c
+SRC = archiver.c crawl.c detect.c fetch.c parse.c robots.c util.c
OBJ = $(SRC:.c=.o)
-all: archiver-bot
+all: sbot
.c.o:
$(CC) $(CFLAGS) -c $<
-archiver-bot: $(OBJ)
+sbot: $(OBJ)
$(CC) -o $@ $(OBJ) $(LDFLAGS)
clean:
- rm -f archiver-bot $(OBJ)
+ rm -f sbot $(OBJ)
install: all
mkdir -p $(DESTDIR)$(PREFIX)/bin
- cp -f archiver-bot $(DESTDIR)$(PREFIX)/bin
- chmod 755 $(DESTDIR)$(PREFIX)/bin/archiver-bot
+ cp -f sbot $(DESTDIR)$(PREFIX)/bin
+ chmod 755 $(DESTDIR)$(PREFIX)/bin/sbot
uninstall:
- rm -f $(DESTDIR)$(PREFIX)/bin/archiver-bot
+ rm -f $(DESTDIR)$(PREFIX)/bin/sbot
.PHONY: all clean install uninstall
diff --git a/README.md b/README.md
@@ -0,0 +1,229 @@
+# sbot
+
+**Simple Archiver Bot** -- a suckless web archiver written in C.
+
+sbot creates self-contained archives of web pages and entire websites.
+Every resource -- CSS, images, fonts, scripts -- is fetched and inlined
+directly into the HTML as base64 data URIs. The result is a single file
+(or directory of files) that renders perfectly offline, with no external
+dependencies, forever.
+
+## Why
+
+Web pages disappear. Link rot is real. The average web page has a
+half-life of about two years. Bookmarks break, articles vanish,
+references evaporate.
+
+sbot solves this by creating archives that are:
+
+- **Self-contained.** Everything is inlined. No external requests needed.
+- **Human-readable.** Output is standard HTML. Open it in any browser.
+- **Permanent.** No database, no server, no special viewer. Just files.
+- **Metadata-rich.** GWTAR headers record provenance, date, and source.
+
+## Modes
+
+### Single Page Archive
+
+```sh
+sbot https://example.com/article
+```
+
+Archives a single page in **GWTAR format** (Gwern Web Tar Archive). This
+is the default mode and the most common use case. The output is one
+`.gwtar.html` file containing:
+
+- A GWTAR metadata header (HTML comment) with title, source URL, domain,
+ author, archive date, and generator version
+- The full HTML with all CSS stylesheets inlined as `<style>` blocks
+- All images, fonts, and media encoded as `data:` URIs
+- A completely self-contained document that renders identically to the
+ original
+
+GWTAR format is ideal for:
+
+- Archiving individual articles, blog posts, and essays
+- Preserving references and citations
+- Building a personal web archive / digital library
+- Saving pages before they disappear behind paywalls or get deleted
+
+### Whole Site Archive
+
+```sh
+sbot -r https://example.com
+```
+
+Recursively crawls an entire website and archives every page. The output
+is a directory tree that mirrors the site structure, with each page saved
+as a self-contained HTML file. Internal links are rewritten to relative
+paths so navigation works offline.
+
+Features:
+
+- **BFS crawl order.** Breadth-first traversal ensures important pages
+ (closer to root) are archived first.
+- **Same-domain only.** Never follows links to external sites.
+- **robots.txt compliance.** Respects Disallow rules and Crawl-delay
+ directives by default. Override with `-R`.
+- **Depth control.** Set maximum crawl depth with `-d` to limit scope.
+- **Rate limiting.** Configurable delay between requests to be polite to
+ servers (default: 1 second).
+- **Progress reporting.** Periodic status lines showing pages archived,
+ queue depth, and elapsed time.
+- **Graceful degradation.** Failed resources are skipped; the crawl
+ continues.
+
+This mode is ideal for:
+
+- Archiving entire blogs or documentation sites
+- Creating offline mirrors of reference material
+- Preserving small-to-medium websites wholesale
+- Building browseable offline copies of sites you depend on
+
+## Usage
+
+```
+usage: sbot [-vrR] [-d depth] [-o dir] [-a author] url
+
+ -v verbose output
+ -r recursive (crawl entire site)
+ -R ignore robots.txt
+ -d depth max crawl depth (default: 5)
+ -o dir output directory
+ -a author site author name
+```
+
+### Examples
+
+```sh
+# Archive a single article
+sbot https://example.com/blog/post
+
+# Archive with author metadata
+sbot -a "John Doe" https://example.com/article
+
+# Crawl a blog, max depth 3
+sbot -r -d 3 https://blog.example.com
+
+# Verbose crawl to custom directory
+sbot -v -r -o ./my-archive https://docs.example.com
+
+# Crawl ignoring robots.txt restrictions
+sbot -r -R https://example.com
+```
+
+## GWTAR Format
+
+Every archived page includes a GWTAR (Gwern Web Tar Archive) metadata
+header as an HTML comment at the top of the file:
+
+```
+<!--
+================================================================
+ GWTAR ARCHIVE
+================================================================
+
+ Title: Example Article
+ Source URL: https://example.com/article
+ Domain: example.com
+ Author: John Doe
+
+ Archived by: Kris Yotam
+ Archived on: krisyotam.com
+ Archive date: 2026-02-14
+
+ Generator: sbot/0.3.0
+ Format: GWTAR (Gwern Web Tar Archive)
+
+================================================================
+-->
+```
+
+This header provides full provenance tracking: what was archived, where
+it came from, who archived it, and when.
+
+## Resource Inlining
+
+sbot inlines all resources to create truly self-contained archives:
+
+| Resource Type | Inlining Method |
+|---------------|-----------------|
+| CSS stylesheets | Fetched and inserted as `<style>` blocks |
+| Images | Base64-encoded as `data:image/*` URIs |
+| Fonts | Base64-encoded as `data:font/*` URIs |
+| Other media | Base64-encoded with appropriate MIME type |
+
+Resources that fail to fetch are silently skipped -- the archive
+degrades gracefully rather than failing entirely.
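+
+For example, an image reference in the original page:
+
+```
+<img src="/images/logo.png" alt="logo">
+```
+
+is rewritten in the archive to a self-contained data URI (payload
+truncated here):
+
+```
+<img src="data:image/png;base64,iVBORw0KGgoAAAANSUhEUg..." alt="logo">
+```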
+
+## Build
+
+Requires `libcurl` development headers.
+
+```sh
+# Arch Linux
+sudo pacman -S curl
+
+# Debian/Ubuntu
+sudo apt install libcurl4-openssl-dev
+
+# Build
+make
+
+# Install to /usr/local/bin
+sudo make install
+
+# Clean
+make clean
+```
+
+## Configuration
+
+All configuration is compile-time via `config.h`:
+
+| Setting | Default | Description |
+|---------|---------|-------------|
+| `USER_AGENT` | `sbot/0.3 (+https://krisyotam.com)` | HTTP User-Agent string |
+| `CONNECT_TIMEOUT` | 30s | Connection timeout |
+| `REQUEST_TIMEOUT` | 60s | Total request timeout |
+| `MAX_REDIRECTS` | 10 | Maximum HTTP redirects to follow |
+| `MAX_DEPTH` | 5 | Default recursive crawl depth |
+| `RATE_LIMIT_MS` | 1000ms | Delay between requests |
+| `MAX_FILE_SIZE` | 50 MB | Maximum size per resource |
+| `OUTPUT_EXT` | `.gwtar.html` | File extension for archives |
+
+Edit `config.h` and rebuild to change any setting. This is the suckless
+way -- no runtime configuration files, no environment variables, no
+hidden defaults.
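+
+For example, to halve the politeness delay, edit the corresponding
+macro (name as listed in the table above) and rebuild:
+
+```c
+/* config.h: wait 500 ms instead of 1000 ms between requests */
+#define RATE_LIMIT_MS 500
+```
+
+Then run `make clean && make` to pick up the change.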
+
+## Architecture
+
+```
+archiver.c Main entry, page archiving, CSS inlining, link rewriting
+crawl.c    URL queue (BFS), visited set, URL normalization
+detect.c   CMS/framework detection, seed URL hints
+fetch.c HTTP fetching via libcurl
+parse.c HTML parsing, resource extraction, image inlining
+robots.c robots.txt fetching, parsing, rule matching
+util.c Memory wrappers, string ops, base64, MIME types
+config.h Compile-time constants
+```
+
+Single external dependency: libcurl. No XML parsers, no HTML5 parsers,
+no JavaScript engines. The HTML parsing is deliberately simple --
+regex-based extraction of `src`, `href`, and `url()` references. This
+handles the vast majority of real-world pages and keeps the codebase
+small and auditable.
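+
+As an illustration of that approach (a sketch only, not the actual
+parse.c code), extracting attribute values with POSIX regex looks
+roughly like:
+
+```c
+#include <regex.h>
+#include <stdio.h>
+
+/* print the value of every src="..." / href="..." attribute */
+static void
+print_refs(const char *html)
+{
+	regex_t re;
+	regmatch_t m[3];
+	const char *p;
+
+	if (regcomp(&re, "(src|href)=\"([^\"]*)\"",
+	    REG_EXTENDED | REG_ICASE) != 0)
+		return;
+	p = html;
+	while (regexec(&re, p, 3, m, 0) == 0) {
+		printf("%.*s\n", (int)(m[2].rm_eo - m[2].rm_so),
+		    p + m[2].rm_so);
+		p += m[0].rm_eo;
+	}
+	regfree(&re);
+}
+```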
+
+## Philosophy
+
+sbot follows the [suckless](https://suckless.org) philosophy:
+
+- Written in C99 with POSIX.1-2008
+- Minimal dependencies (libcurl only)
+- Configuration through `config.h` (edit and recompile)
+- Small, readable codebase
+- Does one thing well
+
+## License
+
+MIT/X Consortium License. See [LICENSE](LICENSE) for details.
diff --git a/archiver.c b/archiver.c
@@ -1,6 +1,6 @@
/* See LICENSE file for copyright and license details.
*
- * archiver-bot - suckless web archiver
+ * sbot - Simple Archiver Bot
*
* Creates self-contained archives of websites with all
* resources inlined as data URIs.
@@ -16,6 +16,7 @@
#include "config.h"
#include "crawl.h"
+#include "detect.h"
#include "fetch.h"
#include "parse.h"
#include "robots.h"
@@ -34,7 +35,7 @@ static void
usage(void)
{
fprintf(stderr,
- "usage: archiver-bot [-vrR] [-d depth] [-o dir]"
+ "usage: sbot [-vrR] [-d depth] [-o dir]"
" [-a author] url\n"
"\n"
" -v verbose output\n"
@@ -139,7 +140,7 @@ generate_header(const char *title, const char *source_url)
" Archived on: %s\n"
" Archive date: %s\n"
"\n"
- " Generator: archiver-bot/%s\n"
+ " Generator: sbot/%s\n"
" Format: GWTAR (Gwern Web Tar Archive)\n"
"\n"
"========================================"
@@ -503,6 +504,16 @@ archive_page(const char *url)
return 0;
}
+ /* Detect CMS type for informational output */
+ {
+ SiteInfo *sinfo;
+
+ sinfo = detect_site(resp->data, url);
+ if (sinfo->type != SITE_UNKNOWN)
+ fprintf(stderr, " CMS: %s\n", sinfo->name);
+ siteinfo_free(sinfo);
+ }
+
final_url = resp->final_url ? resp->final_url : url;
rel_path = url_to_path(url, base_domain);
@@ -555,6 +566,59 @@ crawl_site(const char *start_url)
free(norm);
queue_push(queue, start_url, 0);
+ /* Detect CMS type from the start page */
+ {
+ Response *detect_resp;
+ SiteInfo *sinfo;
+
+ detect_resp = fetch_url(start_url);
+ if (detect_resp && detect_resp->status_code < 400 &&
+ detect_resp->data) {
+ sinfo = detect_site(detect_resp->data,
+ start_url);
+ if (sinfo->type != SITE_UNKNOWN) {
+ char **seeds;
+ int nseed, i;
+
+ fprintf(stderr,
+ "Detected CMS: %s\n",
+ sinfo->name);
+ if (sinfo->feed_url)
+ fprintf(stderr,
+ " feed: %s\n",
+ sinfo->feed_url);
+ if (sinfo->sitemap_url)
+ fprintf(stderr,
+ " sitemap: %s\n",
+ sinfo->sitemap_url);
+
+ /* Add CMS-specific seed URLs */
+ seeds = detect_seed_urls(sinfo,
+ base_domain, &nseed);
+ for (i = 0; i < nseed; i++) {
+ norm = url_normalize(
+ seeds[i]);
+ if (!visited_contains(
+ visited, norm)) {
+ visited_add(
+ visited, norm);
+ queue_push(queue,
+ seeds[i], 0);
+ if (verbose)
+ fprintf(stderr,
+ " seed: %s\n",
+ seeds[i]);
+ }
+ free(norm);
+ free(seeds[i]);
+ }
+ free(seeds);
+ }
+ siteinfo_free(sinfo);
+ }
+ response_free(detect_resp);
+ }
+
pages_archived = 0;
start_time = time(NULL);
@@ -673,7 +737,7 @@ main(int argc, char *argv[])
fetch_init();
- fprintf(stderr, "archiver-bot %s\n", ARCHIVER_VERSION);
+ fprintf(stderr, "sbot %s\n", ARCHIVER_VERSION);
fprintf(stderr, "Target: %s\n", url);
fprintf(stderr, "Output: %s/\n", output_dir);
if (recursive)
diff --git a/config.h b/config.h
@@ -1,5 +1,5 @@
/* See LICENSE file for copyright and license details.
- * archiver-bot - suckless web archiver
+ * sbot - Simple Archiver Bot
* configuration header
*/
@@ -9,10 +9,10 @@
/* Archiver metadata */
#define ARCHIVER_NAME "Kris Yotam"
#define ARCHIVER_SITE "krisyotam.com"
-#define ARCHIVER_VERSION "0.2.0"
+#define ARCHIVER_VERSION "0.3.0"
/* Network settings */
-#define USER_AGENT "archiver-bot/0.2 (+https://krisyotam.com)"
+#define USER_AGENT "sbot/0.3 (+https://krisyotam.com)"
#define CONNECT_TIMEOUT 30L
#define REQUEST_TIMEOUT 60L
#define MAX_REDIRECTS 10L
diff --git a/detect.c b/detect.c
@@ -0,0 +1,411 @@
+/* See LICENSE file for copyright and license details. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "detect.h"
+#include "util.h"
+
+/* CMS signature strings found in HTML */
+static const char *wp_sigs[] = {
+ "wp-content/",
+ "wp-includes/",
+ "wp-json/",
+ "/xmlrpc.php",
+ "name=\"generator\" content=\"WordPress",
+ "powered by WordPress",
+ NULL
+};
+
+static const char *blogger_sigs[] = {
+ "blogger.com",
+ "blogspot.com",
+ "content=\"blogger\"",
+ "name=\"generator\" content=\"Blogger",
+ "b:skin",
+ "b:template",
+ NULL
+};
+
+static const char *hugo_sigs[] = {
+ "name=\"generator\" content=\"Hugo",
+ "powered by Hugo",
+ "Hugo --",
+ NULL
+};
+
+static const char *jekyll_sigs[] = {
+ "name=\"generator\" content=\"Jekyll",
+ "powered by Jekyll",
+ "jekyll-",
+ NULL
+};
+
+static const char *ghost_sigs[] = {
+ "content=\"Ghost",
+ "ghost-",
+ "ghost/api/",
+ "class=\"gh-",
+ NULL
+};
+
+static const char *drupal_sigs[] = {
+ "Drupal.settings",
+ "name=\"generator\" content=\"Drupal",
+ "/sites/default/files/",
+ "/modules/",
+ NULL
+};
+
+static const char *mediawiki_sigs[] = {
+ "name=\"generator\" content=\"MediaWiki",
+ "wgArticleId",
+ "mw-content-text",
+ "/wiki/",
+ NULL
+};
+
+/* Check if HTML contains any signature from a list */
+static int
+match_sigs(const char *html, const char **sigs)
+{
+ int i, hits;
+
+ hits = 0;
+ for (i = 0; sigs[i]; i++) {
+ if (strcasestr(html, sigs[i]))
+ hits++;
+ }
+ return hits;
+}
+
+static const char *
+sitetype_name(SiteType type)
+{
+ switch (type) {
+ case SITE_WORDPRESS: return "WordPress";
+ case SITE_BLOGGER: return "Blogger";
+ case SITE_HUGO: return "Hugo";
+ case SITE_JEKYLL: return "Jekyll";
+ case SITE_GHOST: return "Ghost";
+ case SITE_DRUPAL: return "Drupal";
+ case SITE_MEDIAWIKI: return "MediaWiki";
+ default: return "Unknown";
+ }
+}
+
+/* Extract feed URL from <link> tags */
+static char *
+find_feed_url(const char *html, const char *base_url)
+{
+ const char *p, *href_start, *href_end;
+ char *tag, *href;
+ size_t tag_len, href_len;
+ char quote;
+
+ p = html;
+ while ((p = strcasestr(p, "<link")) != NULL) {
+ const char *end = strchr(p, '>');
+
+ if (!end)
+ break;
+
+ tag_len = end - p;
+ tag = xmalloc(tag_len + 1);
+ memcpy(tag, p, tag_len);
+ tag[tag_len] = '\0';
+
+ /* Check for RSS/Atom type */
+ if (strcasestr(tag, "application/rss+xml") ||
+ strcasestr(tag, "application/atom+xml")) {
+ href_start = strcasestr(tag, "href=");
+ if (href_start) {
+ href_start += 5;
+ quote = 0;
+ if (*href_start == '"' ||
+ *href_start == '\'')
+ quote = *href_start++;
+
+ href_end = href_start;
+ if (quote) {
+ while (*href_end &&
+ *href_end != quote)
+ href_end++;
+ } else {
+ while (*href_end &&
+ *href_end != ' ' &&
+ *href_end != '>')
+ href_end++;
+ }
+
+ href_len = href_end - href_start;
+ href = xmalloc(href_len + 1);
+ memcpy(href, href_start, href_len);
+ href[href_len] = '\0';
+
+ free(tag);
+
+ /* Resolve relative URL */
+ if (str_starts_with(href, "http")) {
+ return href;
+ } else {
+ char *resolved;
+
+ resolved = url_resolve(
+ base_url, href);
+ free(href);
+ return resolved;
+ }
+ }
+ }
+
+ free(tag);
+ p = end + 1;
+ }
+
+ return NULL;
+}
+
+SiteInfo *
+detect_site(const char *html, const char *url)
+{
+ SiteInfo *info;
+ int wp, bl, hu, jk, gh, dr, mw;
+ int best;
+ char *domain;
+
+ info = xmalloc(sizeof(SiteInfo));
+ info->type = SITE_UNKNOWN;
+ info->name = "Unknown";
+ info->feed_url = NULL;
+ info->api_url = NULL;
+ info->sitemap_url = NULL;
+ info->has_json_api = 0;
+
+ /* Count signature matches for each CMS */
+ wp = match_sigs(html, wp_sigs);
+ bl = match_sigs(html, blogger_sigs);
+ hu = match_sigs(html, hugo_sigs);
+ jk = match_sigs(html, jekyll_sigs);
+ gh = match_sigs(html, ghost_sigs);
+ dr = match_sigs(html, drupal_sigs);
+ mw = match_sigs(html, mediawiki_sigs);
+
+ /* Pick the CMS with the most signature hits */
+ best = 0;
+
+ if (wp > best) { info->type = SITE_WORDPRESS; best = wp; }
+ if (bl > best) { info->type = SITE_BLOGGER; best = bl; }
+ if (hu > best) { info->type = SITE_HUGO; best = hu; }
+ if (jk > best) { info->type = SITE_JEKYLL; best = jk; }
+ if (gh > best) { info->type = SITE_GHOST; best = gh; }
+ if (dr > best) { info->type = SITE_DRUPAL; best = dr; }
+ if (mw > best) { info->type = SITE_MEDIAWIKI; best = mw; }
+
+ /* Require at least 1 hit */
+ if (best < 1) {
+ info->type = SITE_UNKNOWN;
+ info->name = "Unknown";
+ return info;
+ }
+
+ info->name = sitetype_name(info->type);
+ domain = url_get_domain(url);
+
+ /* Set CMS-specific hints */
+ switch (info->type) {
+ case SITE_WORDPRESS:
+ info->has_json_api = 1;
+ info->api_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/wp-json/wp/v2/") + 1);
+ sprintf(info->api_url, "https://%s/wp-json/wp/v2/",
+ domain);
+ info->sitemap_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/wp-sitemap.xml") + 1);
+ sprintf(info->sitemap_url,
+ "https://%s/wp-sitemap.xml", domain);
+ break;
+ case SITE_BLOGGER:
+ info->has_json_api = 1;
+ /* Blogger Atom feed */
+ info->feed_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/feeds/posts/default") + 1);
+ sprintf(info->feed_url,
+ "https://%s/feeds/posts/default", domain);
+ break;
+ case SITE_HUGO:
+ info->sitemap_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1);
+ sprintf(info->sitemap_url,
+ "https://%s/sitemap.xml", domain);
+ break;
+ case SITE_JEKYLL:
+ info->sitemap_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1);
+ sprintf(info->sitemap_url,
+ "https://%s/sitemap.xml", domain);
+ break;
+ case SITE_GHOST:
+ info->has_json_api = 1;
+ info->api_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/ghost/api/content/") + 1);
+ sprintf(info->api_url,
+ "https://%s/ghost/api/content/", domain);
+ info->sitemap_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1);
+ sprintf(info->sitemap_url,
+ "https://%s/sitemap.xml", domain);
+ break;
+ case SITE_DRUPAL:
+ info->sitemap_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1);
+ sprintf(info->sitemap_url,
+ "https://%s/sitemap.xml", domain);
+ break;
+ case SITE_MEDIAWIKI:
+ info->has_json_api = 1;
+ info->api_url = xmalloc(
+ strlen("https://") + strlen(domain) +
+ strlen("/w/api.php") + 1);
+ sprintf(info->api_url,
+ "https://%s/w/api.php", domain);
+ break;
+ default:
+ break;
+ }
+
+ /* Try to find feed URL from HTML if not set */
+ if (!info->feed_url)
+ info->feed_url = find_feed_url(html, url);
+
+ free(domain);
+ return info;
+}
+
+void
+siteinfo_free(SiteInfo *info)
+{
+ if (!info)
+ return;
+ free(info->feed_url);
+ free(info->api_url);
+ free(info->sitemap_url);
+ free(info);
+}
+
+/*
+ * Parse a simple sitemap.xml to extract <loc> URLs.
+ * Returns array of URL strings, sets *count.
+ * Caller frees the array and each string.
+ * Not yet implemented: currently returns an empty list.
+ */
+char **
+detect_sitemap_urls(SiteInfo *info, const char *domain, int *count)
+{
+ char **urls;
+ int capacity, n;
+
+ (void)info;
+ (void)domain;
+
+ capacity = 64;
+ n = 0;
+ urls = xmalloc(capacity * sizeof(char *));
+
+ *count = n;
+ return urls;
+}
+
+/*
+ * Get additional seed URLs based on CMS type.
+ * For WordPress: /feed/, /wp-sitemap.xml
+ * For Hugo/Jekyll: /sitemap.xml, /index.xml
+ * For Blogger: /feeds/posts/default
+ */
+char **
+detect_seed_urls(SiteInfo *info, const char *domain, int *count)
+{
+ char **urls;
+ int n;
+ size_t len;
+
+ n = 0;
+ urls = xmalloc(8 * sizeof(char *));
+
+ switch (info->type) {
+ case SITE_WORDPRESS:
+ len = strlen("https://") + strlen(domain) +
+ strlen("/feed/") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/feed/", domain);
+ n++;
+ len = strlen("https://") + strlen(domain) +
+ strlen("/wp-sitemap.xml") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/wp-sitemap.xml", domain);
+ n++;
+ break;
+ case SITE_BLOGGER:
+ len = strlen("https://") + strlen(domain) +
+ strlen("/feeds/posts/default") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/feeds/posts/default",
+ domain);
+ n++;
+ len = strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/sitemap.xml", domain);
+ n++;
+ break;
+ case SITE_HUGO:
+ /* fallthrough */
+ case SITE_JEKYLL:
+ len = strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/sitemap.xml", domain);
+ n++;
+ len = strlen("https://") + strlen(domain) +
+ strlen("/index.xml") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/index.xml", domain);
+ n++;
+ break;
+ case SITE_GHOST:
+ len = strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/sitemap.xml", domain);
+ n++;
+ break;
+ case SITE_DRUPAL:
+ len = strlen("https://") + strlen(domain) +
+ strlen("/sitemap.xml") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/sitemap.xml", domain);
+ n++;
+ break;
+ case SITE_MEDIAWIKI:
+ len = strlen("https://") + strlen(domain) +
+ strlen("/wiki/Special:AllPages") + 1;
+ urls[n] = xmalloc(len);
+ sprintf(urls[n], "https://%s/wiki/Special:AllPages",
+ domain);
+ n++;
+ break;
+ default:
+ break;
+ }
+
+ *count = n;
+ return urls;
+}
diff --git a/detect.h b/detect.h
@@ -0,0 +1,42 @@
+/* See LICENSE file for copyright and license details. */
+
+#ifndef DETECT_H
+#define DETECT_H
+
+/* Detected site/CMS types */
+typedef enum {
+ SITE_UNKNOWN,
+ SITE_WORDPRESS,
+ SITE_BLOGGER,
+ SITE_HUGO,
+ SITE_JEKYLL,
+ SITE_GHOST,
+ SITE_DRUPAL,
+ SITE_MEDIAWIKI
+} SiteType;
+
+/* Detection result with hints for archiving */
+typedef struct {
+ SiteType type;
+ const char *name; /* human-readable CMS name */
+ char *feed_url; /* RSS/Atom feed URL if found */
+ char *api_url; /* REST API base if found */
+ char *sitemap_url; /* sitemap.xml URL if found */
+ int has_json_api; /* site has a JSON API */
+} SiteInfo;
+
+/* Detect CMS type from HTML and URL */
+SiteInfo *detect_site(const char *html, const char *url);
+
+/* Free detection result */
+void siteinfo_free(SiteInfo *info);
+
+/* Get sitemap URLs for a detected site */
+char **detect_sitemap_urls(SiteInfo *info, const char *domain,
+ int *count);
+
+/* Get additional seed URLs based on CMS type */
+char **detect_seed_urls(SiteInfo *info, const char *domain,
+ int *count);
+
+#endif /* DETECT_H */
diff --git a/robots.c b/robots.c
@@ -12,7 +12,7 @@
/*
* Parse robots.txt content for our user-agent.
*
- * We look for rules matching "archiver-bot" first,
+ * We look for rules matching "sbot" first,
* then fall back to "*" (wildcard) rules.
*/
Robots *
@@ -73,7 +73,7 @@ robots_fetch(const char *domain)
if (strcasecmp(key, "user-agent") == 0) {
/* New user-agent group */
- if (strcasestr(val, "archiver-bot")) {
+ if (strcasestr(val, "sbot")) {
in_our_group = 1;
in_star_group = 0;
found_our_group = 1;