commit 74179fb37667450f2a5ff117e81e7403adf7f710
Author: Kris Yotam <krisyotam@protonmail.com>
Date: Sat, 14 Feb 2026 17:06:53 -0600
archiver-bot v0.2: hash table, robots.txt, retry logic
Diffstat:
| A | .gitignore | | | 7 | +++++++ |
| A | CLAUDE.md | | | 145 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | LICENSE | | | 21 | +++++++++++++++++++++ |
| A | Makefile | | | 45 | +++++++++++++++++++++++++++++++++++++++++++++ |
| A | archiver.c | | | 694 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | config.h | | | 31 | +++++++++++++++++++++++++++++++ |
| A | crawl.c | | | 296 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | crawl.h | | | 57 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | fetch.c | | | 168 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | fetch.h | | | 31 | +++++++++++++++++++++++++++++++ |
| A | parse.c | | | 380 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | parse.h | | | 51 | +++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | robots.c | | | 200 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | robots.h | | | 35 | +++++++++++++++++++++++++++++++++++ |
| A | util.c | | | 307 | +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ |
| A | util.h | | | 39 | +++++++++++++++++++++++++++++++++++++++ |
16 files changed, 2507 insertions(+), 0 deletions(-)
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+# Build artifacts
+archiver-bot
+*.o
+
+# Test output
+*.gwtar.html
+www.*/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,145 @@
+# archiver-bot — CLAUDE.md
+
+## Project
+
+archiver-bot is a suckless web archiver written in C. It creates
+self-contained archives of websites with all resources (CSS, images,
+fonts) inlined as data URIs. It supports single-page and recursive
+whole-site archival, and prepends a GWTAR (Gwern Web Tar Archive)
+header comment to each archived page.
+
+## Coding Standards — Suckless C Style
+
+All code in this project MUST follow the suckless.org coding style:
+
+### Language
+- C99 (ISO/IEC 9899:1999)
+- POSIX.1-2008 (`_POSIX_C_SOURCE 200809L`); the one extension in use
+ is `strcasestr(3)`, which requires `_GNU_SOURCE`
+
+### Indentation & Whitespace
+- Tabs for indentation (1 tab = 1 level)
+- Spaces for alignment only, never for indentation
+- No tabs except at the beginning of a line
+- Maximum line length: 79 characters
+
+### Comments
+- Use `/* */` only, never `//`
+- Comment fallthrough cases in switch statements
+
+### Variables
+- All declarations at the top of the block
+- Pointer `*` adjacent to variable name: `char *p`, not `char* p`
+- No C99 `bool`; use `int` (0/1)
+- Global/static variables not used outside TU must be `static`
+
+### Functions
+- Return type on its own line
+- Function name at column 0 on next line (enables `grep ^funcname`)
+- Opening `{` on its own line for functions
+- Functions not used outside their file: `static`
+
+```c
+static void
+usage(void)
+{
+ fprintf(stderr, "usage: archiver-bot [-v] [-r] url\n");
+ exit(1);
+}
+```
+
+### Braces
+- Opening `{` on same line for control flow (if, for, while, switch)
+- Closing `}` on its own line unless continuing (else, do-while)
+- Use braces even for single statements when sibling branches use them
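+
+A small illustration of the brace rules (hypothetical snippet):
+
+```c
+ if (len > 0) {
+ buf[len - 1] = '\0';
+ } else {
+ return -1;
+ }
+```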
+
+### Naming
+- lowercase_with_underscores for functions and variables
+- UPPERCASE for macros and constants
+- CamelCase for typedef'd struct types
+- No `_t` suffix (reserved by POSIX)
+- Prefix module functions with module name
+
+### Control Flow
+- Space after `if`, `for`, `while`, `switch`
+- No space after `(` or before `)`
+- Use `goto` for cleanup/unwind, not nested ifs
+- Return/exit early on failure
+- Test return values against 0, not -1: `if (func() < 0)`, not
+ `if (func() == -1)`
+
+### Error Handling
+- All allocation checked; goto cleanup on failure
+- `die()` for fatal errors (prints message, exits)
+- `warn()` for recoverable errors (prints, continues)
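+
+A minimal sketch of the goto-unwind pattern these rules imply
+(hypothetical function, not from this codebase):
+
+```c
+static int
+load_file(const char *path, char **out)
+{
+ FILE *fp = NULL;
+ char *buf = NULL;
+ size_t n;
+ int ret = -1;
+
+ fp = fopen(path, "r");
+ if (!fp)
+ goto out;
+ buf = malloc(BUFSIZ);
+ if (!buf)
+ goto out;
+ n = fread(buf, 1, BUFSIZ - 1, fp);
+ if (ferror(fp))
+ goto out;
+ buf[n] = '\0';
+ *out = buf;
+ buf = NULL;
+ ret = 0;
+out:
+ free(buf);
+ if (fp)
+ fclose(fp);
+ return ret;
+}
+```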
+
+### File Organization Order
+1. License header
+2. System includes (alphabetical)
+3. Local includes
+4. Macros
+5. Type definitions
+6. Function declarations
+7. Global variables
+8. Function definitions (same order as declarations)
+
+### Headers
+- System headers first, alphabetical
+- Local headers after blank line
+- No cyclic dependencies
+- Include only what is needed
+
+## Architecture
+
+### Module Layout
+
+| Module | Prefix | File | Responsibility |
+|--------|--------|------|----------------|
+| Main | — | archiver.c | Entry point, page archiving, CSS inlining, link rewriting, crawl orchestration |
+| Crawler | `queue_`, `visited_` | crawl.c | URL queue (BFS), visited set, URL normalization, path conversion |
+| Fetcher | `fetch_` | fetch.c | HTTP fetching via libcurl, response management |
+| Parser | `reslist_`, `parse_` | parse.c | HTML parsing, resource extraction, image inlining |
+| Robots | `robots_` | robots.c | robots.txt fetching, parsing, allow/disallow matching |
+| Utilities | `die`, `warn`, `x*`, `str_*`, `url_*` | util.c | Memory wrappers, string ops, URL helpers, base64, MIME types |
+| Config | — | config.h | Compile-time constants (timeouts, limits, user agent) |
+
+### Architecture Rules
+- **Separate compilation.** Every .c file compiles independently.
+- **No dynamic loading.** All features compiled in.
+- **libcurl only.** Single external dependency for HTTP.
+- **No `system()` calls.** Direct file I/O and libcurl only.
+- **Data URIs for inlining.** Resources encoded as base64 data URIs.
+- **Stateless functions preferred.** Minimize mutable global state.
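+
+For example, a reference like `<img src="/logo.png">` becomes an
+inline payload of the form
+`<img src="data:image/png;base64,iVBORw0KGgo...">` (base64 body
+abbreviated).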
+
+### Crawler Design Principles
+- **BFS traversal.** URL queue processes breadth-first by depth level.
+- **Same-domain only.** Never follow links to external domains.
+- **Politeness.** Rate limiting between requests (configurable).
+- **Depth control.** Hard limit on crawl depth to prevent runaway.
+- **URL normalization.** Canonical form for deduplication.
+- **Graceful degradation.** Skip failed resources, continue crawling.
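+
+Simplified, the crawl loop in archiver.c has this shape (robots.txt
+checks and error handling elided):
+
+```c
+ UrlQueue *q = queue_new();
+ VisitedSet *v = visited_new();
+ QueueNode *n;
+
+ queue_push(q, start_url, 0);
+ while (!queue_empty(q)) {
+ n = queue_pop(q);
+ /* fetch n->url, save it, then mark unvisited
+ * same-domain links in v and enqueue them at
+ * n->depth + 1 */
+ free(n->url);
+ free(n);
+ }
+```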
+
+## Build
+
+```sh
+make # build archiver-bot binary
+make clean # remove build artifacts
+make install # install to /usr/local/bin
+```
+
+Dependencies: `libcurl` (via pkg-config)
+
+## Usage
+
+```sh
+# Single page
+archiver-bot https://example.com/article
+
+# Whole site (recursive, depth 3)
+archiver-bot -r -d 3 https://example.com
+
+# Verbose with custom output dir
+archiver-bot -v -r -o ./archive https://example.com
+```
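+
+A recursive run writes a mirror-style tree (illustrative):
+
+```
+example.com/
+├── index.html
+├── about/index.html
+└── posts/hello.html
+```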
+
+## Git Conventions
+
+- No `Co-Authored-By: Claude` lines
+- Commit messages: imperative, <72 chars, no period
+- One logical change per commit
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT/X Consortium License
+
+(c) 2026 Kris Yotam <krisyotam@proton.me>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,45 @@
+# archiver-bot - suckless web archiver
+# See LICENSE file for copyright and license details.
+
+VERSION = 0.2.0
+
+# paths
+PREFIX = /usr/local
+MANPREFIX = $(PREFIX)/share/man
+
+# includes and libs
+INCS = `pkg-config --cflags libcurl`
+LIBS = `pkg-config --libs libcurl`
+
+# flags
+# _GNU_SOURCE is required for strcasestr(3)
+CPPFLAGS = -D_DEFAULT_SOURCE -D_GNU_SOURCE -D_POSIX_C_SOURCE=200809L -DVERSION=\"$(VERSION)\"
+CFLAGS = -std=c99 -pedantic -Wall -Wextra -Os $(INCS) $(CPPFLAGS)
+LDFLAGS = $(LIBS)
+
+# compiler
+CC = cc
+
+# sources
+SRC = archiver.c crawl.c fetch.c parse.c robots.c util.c
+OBJ = $(SRC:.c=.o)
+
+all: archiver-bot
+
+.c.o:
+ $(CC) $(CFLAGS) -c $<
+
+archiver-bot: $(OBJ)
+ $(CC) -o $@ $(OBJ) $(LDFLAGS)
+
+clean:
+ rm -f archiver-bot $(OBJ)
+
+install: all
+ mkdir -p $(DESTDIR)$(PREFIX)/bin
+ cp -f archiver-bot $(DESTDIR)$(PREFIX)/bin
+ chmod 755 $(DESTDIR)$(PREFIX)/bin/archiver-bot
+
+uninstall:
+ rm -f $(DESTDIR)$(PREFIX)/bin/archiver-bot
+
+.PHONY: all clean install uninstall
diff --git a/archiver.c b/archiver.c
@@ -0,0 +1,694 @@
+/* See LICENSE file for copyright and license details.
+ *
+ * archiver-bot - suckless web archiver
+ *
+ * Creates self-contained archives of websites with all
+ * resources inlined as data URIs.
+ */
+
+#include <errno.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "config.h"
+#include "crawl.h"
+#include "fetch.h"
+#include "parse.h"
+#include "robots.h"
+#include "util.h"
+
+/* Global options */
+static int verbose = 0;
+static int recursive = 0;
+static int max_depth = MAX_DEPTH;
+static int respect_robots = 1;
+static const char *author = "Unknown";
+static const char *output_dir = NULL;
+static char *base_domain = NULL;
+
+static void
+usage(void)
+{
+ fprintf(stderr,
+ "usage: archiver-bot [-vrR] [-d depth] [-o dir]"
+ " [-a author] url\n"
+ "\n"
+ " -v verbose output\n"
+ " -r recursive (crawl entire site)\n"
+ " -R ignore robots.txt\n"
+ " -d depth max crawl depth (default: %d)\n"
+ " -o dir output directory\n"
+ " -a author site author name\n",
+ MAX_DEPTH);
+ exit(1);
+}
+
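+/*
+ * Create each parent directory of path, like mkdir -p on the
+ * dirname; the final component is created only if path ends in
+ * '/', so callers mkdir the leaf themselves.
+ */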
+static int
+mkdirp(const char *path)
+{
+ char *p, *sep;
+
+ p = xstrdup(path);
+ for (sep = p + 1; *sep; sep++) {
+ if (*sep == '/') {
+ *sep = '\0';
+ if (mkdir(p, 0755) != 0 && errno != EEXIST) {
+ free(p);
+ return -1;
+ }
+ *sep = '/';
+ }
+ }
+ free(p);
+ return 0;
+}
+
+static char *
+fetch_and_encode(const char *url, const char *base)
+{
+ char *resolved, *mime, *b64, *data_uri, *semi;
+ Response *resp;
+ size_t b64_len, uri_len;
+
+ resolved = url_resolve(base, url);
+ if (verbose)
+ fprintf(stderr, " resource: %s\n", resolved);
+
+ resp = fetch_url(resolved);
+ if (!resp || resp->status_code >= 400 || resp->size == 0) {
+ free(resolved);
+ response_free(resp);
+ return NULL;
+ }
+
+ if (resp->content_type) {
+ mime = xstrdup(resp->content_type);
+ semi = strchr(mime, ';');
+ if (semi)
+ *semi = '\0';
+ str_trim(mime);
+ } else {
+ mime = get_mime_type(resolved);
+ }
+
+ b64 = base64_encode((unsigned char *)resp->data,
+ resp->size, &b64_len);
+
+ uri_len = 5 + strlen(mime) + 8 + b64_len + 1;
+ data_uri = xmalloc(uri_len);
+ snprintf(data_uri, uri_len, "data:%s;base64,%s", mime, b64);
+
+ free(mime);
+ free(b64);
+ free(resolved);
+ response_free(resp);
+ return data_uri;
+}
+
+static char *
+generate_header(const char *title, const char *source_url)
+{
+ char *date, *domain, *header;
+ size_t len;
+
+ date = get_iso_date();
+ domain = url_get_domain(source_url);
+
+ len = 2048 + strlen(title) + strlen(source_url) +
+ strlen(author) + strlen(domain);
+ header = xmalloc(len);
+
+ snprintf(header, len,
+ "<!--\n"
+ "========================================"
+ "========================================\n"
+ " GWTAR ARCHIVE\n"
+ "========================================"
+ "========================================\n"
+ "\n"
+ " Title: %s\n"
+ " Source URL: %s\n"
+ " Domain: %s\n"
+ " Author: %s\n"
+ "\n"
+ " Archived by: %s\n"
+ " Archived on: %s\n"
+ " Archive date: %s\n"
+ "\n"
+ " Generator: archiver-bot/%s\n"
+ " Format: GWTAR (Gwern Web Tar Archive)\n"
+ "\n"
+ "========================================"
+ "========================================\n"
+ "-->\n",
+ title, source_url, domain, author,
+ ARCHIVER_NAME, ARCHIVER_SITE, date,
+ ARCHIVER_VERSION);
+
+ free(date);
+ free(domain);
+ return header;
+}
+
+static int
+path_depth(const char *path)
+{
+ int depth = 0;
+ const char *p;
+
+ for (p = path; *p; p++)
+ if (*p == '/')
+ depth++;
+ return depth;
+}
+
+static char *
+make_relative_prefix(int depth)
+{
+ char *prefix;
+ size_t len;
+ int i;
+
+ if (depth == 0)
+ return xstrdup("");
+ len = depth * 3 + 1;
+ prefix = xmalloc(len);
+ prefix[0] = '\0';
+ for (i = 0; i < depth; i++)
+ strcat(prefix, "../");
+ return prefix;
+}
+
+static char *
+rewrite_links(char *html, const char *rel_path)
+{
+ int depth;
+ char *prefix, *result, *new_result;
+ size_t prefix_len, search_offset, pos, old_len, new_len;
+ const char *p;
+
+ depth = path_depth(rel_path);
+ prefix = make_relative_prefix(depth);
+ prefix_len = strlen(prefix);
+ result = html;
+
+ search_offset = 0;
+ while (1) {
+ p = strstr(result + search_offset, "href=\"/");
+ if (!p)
+ break;
+ if (p[7] == '/') {
+ search_offset = (p - result) + 8;
+ continue;
+ }
+ pos = p - result + 6;
+ old_len = strlen(result);
+ new_len = old_len - 1 + prefix_len;
+ new_result = xmalloc(new_len + 1);
+ memcpy(new_result, result, pos);
+ memcpy(new_result + pos, prefix, prefix_len);
+ memcpy(new_result + pos + prefix_len,
+ result + pos + 1, old_len - pos);
+ free(result);
+ result = new_result;
+ search_offset = pos + prefix_len;
+ }
+
+ search_offset = 0;
+ while (1) {
+ p = strstr(result + search_offset, "src=\"/");
+ if (!p)
+ break;
+ if (p[6] == '/') {
+ search_offset = (p - result) + 7;
+ continue;
+ }
+ pos = p - result + 5;
+ old_len = strlen(result);
+ new_len = old_len - 1 + prefix_len;
+ new_result = xmalloc(new_len + 1);
+ memcpy(new_result, result, pos);
+ memcpy(new_result + pos, prefix, prefix_len);
+ memcpy(new_result + pos + prefix_len,
+ result + pos + 1, old_len - pos);
+ free(result);
+ result = new_result;
+ search_offset = pos + prefix_len;
+ }
+
+ free(prefix);
+ return result;
+}
+
+static char *
+inline_css(char *html, const char *base)
+{
+ char *result, *tag, *href, *resolved;
+ char *new_tag, *new_result;
+ const char *link_start, *link_end;
+ const char *href_start, *href_end;
+ size_t search_offset, tag_offset, tag_len, href_len;
+ size_t new_tag_len, old_len, new_len, result_len;
+ Response *resp;
+ char quote;
+
+ result = html;
+ search_offset = 0;
+
+ while (1) {
+ link_start = strcasestr(result + search_offset,
+ "<link");
+ if (!link_start)
+ break;
+ link_end = strchr(link_start, '>');
+ if (!link_end)
+ break;
+
+ tag_offset = link_start - result;
+ tag_len = link_end - link_start;
+ tag = xmalloc(tag_len + 1);
+ memcpy(tag, link_start, tag_len);
+ tag[tag_len] = '\0';
+
+ if (!strcasestr(tag, "stylesheet")) {
+ free(tag);
+ search_offset = (link_end - result) + 1;
+ continue;
+ }
+
+ href_start = strcasestr(tag, "href=");
+ if (!href_start) {
+ free(tag);
+ search_offset = (link_end - result) + 1;
+ continue;
+ }
+ href_start += 5;
+ quote = 0;
+ if (*href_start == '"' || *href_start == '\'')
+ quote = *href_start++;
+
+ href_end = href_start;
+ if (quote) {
+ while (*href_end && *href_end != quote)
+ href_end++;
+ } else {
+ while (*href_end && *href_end != ' ' &&
+ *href_end != '>')
+ href_end++;
+ }
+
+ href_len = href_end - href_start;
+ href = xmalloc(href_len + 1);
+ memcpy(href, href_start, href_len);
+ href[href_len] = '\0';
+
+ resolved = url_resolve(base, href);
+ if (verbose)
+ fprintf(stderr, " css: %s\n", resolved);
+
+ resp = fetch_url(resolved);
+ free(resolved);
+
+ if (resp && resp->status_code < 400 &&
+ resp->size > 0) {
+ new_tag_len = 7 + resp->size + 8 + 1;
+ new_tag = xmalloc(new_tag_len);
+ snprintf(new_tag, new_tag_len,
+ "<style>%s</style>", resp->data);
+
+ old_len = (link_end + 1) - link_start;
+ new_len = strlen(new_tag);
+ result_len = strlen(result);
+
+ new_result = xmalloc(
+ result_len - old_len + new_len + 1);
+ memcpy(new_result, result, tag_offset);
+ memcpy(new_result + tag_offset,
+ new_tag, new_len);
+ memcpy(new_result + tag_offset + new_len,
+ link_end + 1,
+ result_len - tag_offset - old_len + 1);
+
+ free(result);
+ result = new_result;
+ search_offset = tag_offset + new_len;
+ free(new_tag);
+ } else {
+ search_offset = (link_end - result) + 1;
+ }
+
+ response_free(resp);
+ free(href);
+ free(tag);
+ }
+
+ return result;
+}
+
+static void
+extract_links(const char *html, const char *base_url,
+ UrlQueue *queue, VisitedSet *visited,
+ int current_depth, Robots *robots)
+{
+ ResourceList *resources;
+ Resource *r;
+ char *norm, *path;
+ const char *pstart;
+
+ if (current_depth >= max_depth)
+ return;
+
+ resources = parse_html(html, base_url);
+
+ for (r = resources->head; r; r = r->next) {
+ if (r->type != RES_PAGE)
+ continue;
+
+ if (str_starts_with(r->url, "mailto:") ||
+ str_starts_with(r->url, "tel:") ||
+ str_starts_with(r->url, "javascript:") ||
+ str_starts_with(r->url, "#"))
+ continue;
+
+ if (!url_same_domain(r->url, base_url))
+ continue;
+
+ /* Check robots.txt */
+ if (robots) {
+ pstart = r->url;
+ if (str_starts_with(pstart, "https://"))
+ pstart += 8;
+ else if (str_starts_with(pstart, "http://"))
+ pstart += 7;
+ while (*pstart && *pstart != '/')
+ pstart++;
+ path = xstrdup(pstart[0] ? pstart : "/");
+ if (!robots_allowed(robots, path)) {
+ if (verbose)
+ fprintf(stderr,
+ " robots: blocked"
+ " %s\n", r->url);
+ free(path);
+ continue;
+ }
+ free(path);
+ }
+
+ norm = url_normalize(r->url);
+ if (!visited_contains(visited, norm)) {
+ visited_add(visited, norm);
+ queue_push(queue, r->url,
+ current_depth + 1);
+ if (verbose)
+ fprintf(stderr,
+ " queued: %s\n", r->url);
+ }
+ free(norm);
+ }
+
+ reslist_free(resources);
+}
+
+static int
+save_page(const char *url, const char *final_url,
+ const char *data, const char *rel_path)
+{
+ char *html, *inlined, *title, *header;
+ char *full_path, *dir, *last_slash;
+ size_t full_path_len;
+ FILE *fp;
+
+ html = xstrdup(data);
+ title = parse_title(html);
+
+ if (verbose)
+ fprintf(stderr, " title: %s\n", title);
+
+ html = inline_css(html, final_url);
+ /* inline_resources copies its input; free the old buffer */
+ inlined = inline_resources(html, final_url,
+ fetch_and_encode);
+ free(html);
+ html = inlined;
+ html = rewrite_links(html, rel_path);
+
+ header = generate_header(title, url);
+
+ full_path_len = strlen(output_dir) + 1 +
+ strlen(rel_path) + 1;
+ full_path = xmalloc(full_path_len);
+ snprintf(full_path, full_path_len, "%s/%s",
+ output_dir, rel_path);
+
+ dir = xstrdup(full_path);
+ last_slash = strrchr(dir, '/');
+ if (last_slash) {
+ *last_slash = '\0';
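+ /* mkdirp() creates the parents; create the leaf too */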
+ mkdirp(dir);
+ mkdir(dir, 0755);
+ }
+ free(dir);
+
+ fp = fopen(full_path, "w");
+ if (!fp) {
+ warn("cannot write: %s", full_path);
+ free(full_path);
+ free(header);
+ free(html);
+ free(title);
+ return -1;
+ }
+
+ fputs(header, fp);
+ fputs(html, fp);
+ fclose(fp);
+
+ fprintf(stderr, " saved: %s\n", full_path);
+
+ free(full_path);
+ free(header);
+ free(html);
+ free(title);
+ return 0;
+}
+
+static int
+archive_page(const char *url)
+{
+ Response *resp;
+ const char *final_url;
+ char *rel_path;
+ int ret;
+
+ if (verbose)
+ fprintf(stderr, "[0] %s\n", url);
+
+ resp = fetch_url(url);
+ if (!resp) {
+ warn("failed to fetch: %s", url);
+ return -1;
+ }
+ if (resp->status_code >= 400) {
+ warn("HTTP %ld: %s", resp->status_code, url);
+ response_free(resp);
+ return -1;
+ }
+ if (resp->content_type &&
+ !strstr(resp->content_type, "text/html")) {
+ if (verbose)
+ fprintf(stderr, " skip non-HTML: %s\n",
+ resp->content_type);
+ response_free(resp);
+ return 0;
+ }
+
+ final_url = resp->final_url ? resp->final_url : url;
+ rel_path = url_to_path(url, base_domain);
+
+ ret = save_page(url, final_url, resp->data, rel_path);
+
+ free(rel_path);
+ response_free(resp);
+ return ret;
+}
+
+static void
+crawl_site(const char *start_url)
+{
+ UrlQueue *queue;
+ VisitedSet *visited;
+ Robots *robots = NULL;
+ QueueNode *node;
+ Response *resp;
+ const char *final_url;
+ char *norm, *url, *rel_path;
+ int depth, pages_archived, rate_ms;
+ time_t start_time, now;
+
+ queue = queue_new();
+ visited = visited_new();
+
+ if (respect_robots) {
+ fprintf(stderr, "Fetching robots.txt for %s...\n",
+ base_domain);
+ robots = robots_fetch(base_domain);
+ if (robots->nrules > 0)
+ fprintf(stderr, " %d rules loaded\n",
+ robots->nrules);
+ else
+ fprintf(stderr, " no restrictions\n");
+
+ if (robots_delay(robots) > 0) {
+ rate_ms = robots_delay(robots) * 1000;
+ fprintf(stderr, " crawl-delay: %ds\n",
+ robots_delay(robots));
+ } else {
+ rate_ms = RATE_LIMIT_MS;
+ }
+ } else {
+ rate_ms = RATE_LIMIT_MS;
+ }
+
+ norm = url_normalize(start_url);
+ visited_add(visited, norm);
+ free(norm);
+ queue_push(queue, start_url, 0);
+
+ pages_archived = 0;
+ start_time = time(NULL);
+
+ while (!queue_empty(queue)) {
+ node = queue_pop(queue);
+ url = node->url;
+ depth = node->depth;
+
+ resp = fetch_url(url);
+ if (!resp || resp->status_code >= 400) {
+ if (verbose)
+ fprintf(stderr, "[%d] SKIP %s\n",
+ depth, url);
+ response_free(resp);
+ free(url);
+ free(node);
+ continue;
+ }
+
+ if (resp->content_type &&
+ !strstr(resp->content_type, "text/html")) {
+ response_free(resp);
+ free(url);
+ free(node);
+ continue;
+ }
+
+ final_url = resp->final_url ?
+ resp->final_url : url;
+
+ fprintf(stderr, "[d=%d q=%zu v=%zu] %s\n",
+ depth, queue_size(queue),
+ visited_count(visited), url);
+
+ extract_links(resp->data, final_url, queue,
+ visited, depth, robots);
+
+ rel_path = url_to_path(url, base_domain);
+ save_page(url, final_url, resp->data, rel_path);
+ pages_archived++;
+
+ if (pages_archived % PROGRESS_INTERVAL == 0) {
+ now = time(NULL);
+ fprintf(stderr,
+ "\n--- %d pages, %zu queued, "
+ "%zu visited, %lds ---\n\n",
+ pages_archived,
+ queue_size(queue),
+ visited_count(visited),
+ (long)(now - start_time));
+ }
+
+ free(rel_path);
+ response_free(resp);
+ free(url);
+ free(node);
+
+ /* POSIX usleep() takes values below one second */
+ if (rate_ms >= 1000)
+ sleep(rate_ms / 1000);
+ if (rate_ms % 1000)
+ usleep((rate_ms % 1000) * 1000);
+ }
+
+ now = time(NULL);
+ fprintf(stderr,
+ "\nDone: %d pages to %s/ in %lds\n",
+ pages_archived, output_dir,
+ (long)(now - start_time));
+
+ robots_free(robots);
+ queue_free(queue);
+ visited_free(visited);
+}
+
+int
+main(int argc, char *argv[])
+{
+ const char *url;
+ int opt;
+
+ while ((opt = getopt(argc, argv, "vrRd:o:a:h")) != -1) {
+ switch (opt) {
+ case 'v':
+ verbose = 1;
+ break;
+ case 'r':
+ recursive = 1;
+ break;
+ case 'R':
+ respect_robots = 0;
+ break;
+ case 'd':
+ max_depth = atoi(optarg);
+ if (max_depth < 1)
+ max_depth = 1;
+ break;
+ case 'o':
+ output_dir = optarg;
+ break;
+ case 'a':
+ author = optarg;
+ break;
+ case 'h': /* fallthrough */
+ default:
+ usage();
+ }
+ }
+
+ if (optind >= argc)
+ usage();
+
+ url = argv[optind];
+ base_domain = url_get_domain(url);
+ if (!output_dir)
+ output_dir = base_domain;
+
+ if (mkdir(output_dir, 0755) != 0 && errno != EEXIST)
+ die("cannot create directory: %s", output_dir);
+
+ fetch_init();
+
+ fprintf(stderr, "archiver-bot %s\n", ARCHIVER_VERSION);
+ fprintf(stderr, "Target: %s\n", url);
+ fprintf(stderr, "Output: %s/\n", output_dir);
+ if (recursive)
+ fprintf(stderr, "Mode: recursive (depth %d)\n",
+ max_depth);
+ if (!respect_robots)
+ fprintf(stderr, "Warning: ignoring robots.txt\n");
+ fprintf(stderr, "\n");
+
+ if (recursive)
+ crawl_site(url);
+ else
+ archive_page(url);
+
+ fetch_cleanup();
+ free(base_domain);
+ return 0;
+}
diff --git a/config.h b/config.h
@@ -0,0 +1,31 @@
+/* See LICENSE file for copyright and license details.
+ * archiver-bot - suckless web archiver
+ * configuration header
+ */
+
+#ifndef CONFIG_H
+#define CONFIG_H
+
+/* Archiver metadata */
+#define ARCHIVER_NAME "Kris Yotam"
+#define ARCHIVER_SITE "krisyotam.com"
+#define ARCHIVER_VERSION "0.2.0"
+
+/* Network settings */
+#define USER_AGENT "archiver-bot/0.2 (+https://krisyotam.com)"
+#define CONNECT_TIMEOUT 30L
+#define REQUEST_TIMEOUT 60L
+#define MAX_REDIRECTS 10L
+
+/* Crawl settings */
+#define MAX_DEPTH 5
+#define RATE_LIMIT_MS 1000 /* milliseconds between requests */
+#define MAX_FILE_SIZE (50 * 1024 * 1024) /* 50 MB max per resource */
+
+/* Output settings */
+#define OUTPUT_EXT ".gwtar.html"
+
+/* Progress reporting interval (pages between status lines) */
+#define PROGRESS_INTERVAL 10
+
+#endif /* CONFIG_H */
diff --git a/crawl.c b/crawl.c
@@ -0,0 +1,296 @@
+/* See LICENSE file for copyright and license details. */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "crawl.h"
+#include "util.h"
+
+/* FNV-1a hash - fast, good distribution for URL strings */
+static unsigned long
+fnv1a(const char *s)
+{
+ unsigned long h = 2166136261UL;
+
+ for (; *s; s++) {
+ h ^= (unsigned char)*s;
+ h *= 16777619UL;
+ }
+ return h;
+}
+
+UrlQueue *
+queue_new(void)
+{
+ UrlQueue *q = xmalloc(sizeof(UrlQueue));
+
+ q->head = NULL;
+ q->tail = NULL;
+ q->count = 0;
+ return q;
+}
+
+void
+queue_free(UrlQueue *q)
+{
+ QueueNode *n, *next;
+
+ if (!q)
+ return;
+ for (n = q->head; n; n = next) {
+ next = n->next;
+ free(n->url);
+ free(n);
+ }
+ free(q);
+}
+
+void
+queue_push(UrlQueue *q, const char *url, int depth)
+{
+ QueueNode *n = xmalloc(sizeof(QueueNode));
+
+ n->url = xstrdup(url);
+ n->depth = depth;
+ n->next = NULL;
+ if (q->tail) {
+ q->tail->next = n;
+ q->tail = n;
+ } else {
+ q->head = n;
+ q->tail = n;
+ }
+ q->count++;
+}
+
+QueueNode *
+queue_pop(UrlQueue *q)
+{
+ QueueNode *n;
+
+ if (!q->head)
+ return NULL;
+ n = q->head;
+ q->head = n->next;
+ if (!q->head)
+ q->tail = NULL;
+ q->count--;
+ return n;
+}
+
+int
+queue_empty(UrlQueue *q)
+{
+ return q->head == NULL;
+}
+
+size_t
+queue_size(UrlQueue *q)
+{
+ return q->count;
+}
+
+VisitedSet *
+visited_new(void)
+{
+ VisitedSet *v = xmalloc(sizeof(VisitedSet));
+
+ memset(v->buckets, 0, sizeof(v->buckets));
+ v->count = 0;
+ return v;
+}
+
+void
+visited_free(VisitedSet *v)
+{
+ HashNode *n, *next;
+ size_t i;
+
+ if (!v)
+ return;
+ for (i = 0; i < HT_SIZE; i++) {
+ for (n = v->buckets[i]; n; n = next) {
+ next = n->next;
+ free(n->url);
+ free(n);
+ }
+ }
+ free(v);
+}
+
+void
+visited_add(VisitedSet *v, const char *url)
+{
+ unsigned long h = fnv1a(url) % HT_SIZE;
+ HashNode *n;
+
+ /* Check for duplicate first */
+ for (n = v->buckets[h]; n; n = n->next) {
+ if (strcmp(n->url, url) == 0)
+ return;
+ }
+ n = xmalloc(sizeof(HashNode));
+ n->url = xstrdup(url);
+ n->next = v->buckets[h];
+ v->buckets[h] = n;
+ v->count++;
+}
+
+int
+visited_contains(VisitedSet *v, const char *url)
+{
+ unsigned long h = fnv1a(url) % HT_SIZE;
+ HashNode *n;
+
+ for (n = v->buckets[h]; n; n = n->next) {
+ if (strcmp(n->url, url) == 0)
+ return 1;
+ }
+ return 0;
+}
+
+size_t
+visited_count(VisitedSet *v)
+{
+ return v->count;
+}
+
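+/*
+ * Canonicalize a URL for deduplication: strip the fragment and
+ * query, drop a trailing slash on non-root paths, lowercase the
+ * host and remove default ports, e.g.
+ * "https://Example.com:443/a/?q#f" -> "https://example.com/a".
+ */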
+char *
+url_normalize(const char *url)
+{
+ char *norm, *p, *hash, *query;
+ size_t len;
+
+ norm = xstrdup(url);
+
+ /* Remove fragment */
+ hash = strchr(norm, '#');
+ if (hash)
+ *hash = '\0';
+
+ /* Remove query string */
+ query = strchr(norm, '?');
+ if (query)
+ *query = '\0';
+
+ /* Remove trailing slash (but not bare domain slash) */
+ len = strlen(norm);
+ if (len > 1 && norm[len - 1] == '/') {
+ /* Keep slash if it's just protocol://domain/ */
+ p = norm;
+ if (str_starts_with(p, "https://"))
+ p += 8;
+ else if (str_starts_with(p, "http://"))
+ p += 7;
+ /* Skip domain */
+ while (*p && *p != '/')
+ p++;
+ /* Only strip if there's path beyond domain */
+ if (p < norm + len - 1)
+ norm[len - 1] = '\0';
+ }
+
+ /* Lowercase the domain part */
+ p = norm;
+ if (str_starts_with(p, "https://"))
+ p += 8;
+ else if (str_starts_with(p, "http://"))
+ p += 7;
+ for (; *p && *p != '/'; p++)
+ *p = tolower((unsigned char)*p);
+
+ /* Remove default port :80 or :443 */
+ p = norm;
+ if (str_starts_with(p, "https://"))
+ p += 8;
+ else if (str_starts_with(p, "http://"))
+ p += 7;
+ {
+ char *colon = NULL;
+ char *slash = NULL;
+ char *scan;
+ int is_https;
+
+ is_https = str_starts_with(norm, "https://");
+ for (scan = p; *scan && *scan != '/'; scan++) {
+ if (*scan == ':')
+ colon = scan;
+ }
+ slash = scan;
+ if (colon) {
+ char port[8];
+ size_t plen = slash - colon - 1;
+
+ if (plen < sizeof(port)) {
+ memcpy(port, colon + 1, plen);
+ port[plen] = '\0';
+ if ((is_https && strcmp(port, "443") == 0) ||
+ (!is_https && strcmp(port, "80") == 0)) {
+ memmove(colon, slash,
+ strlen(slash) + 1);
+ }
+ }
+ }
+ }
+
+ return norm;
+}
+
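+/*
+ * Map a URL to a relative file path, e.g.
+ * "https://x.com/" -> "index.html"
+ * "https://x.com/docs/" -> "docs/index.html"
+ * "https://x.com/a/b.html" -> "a/b.html"
+ */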
+char *
+url_to_path(const char *url, const char *base_domain)
+{
+ const char *path_start;
+ char *path, *query, *hash, *new_path;
+ size_t len, new_len;
+
+ (void)base_domain;
+
+ path_start = url;
+ /* Skip protocol */
+ if (str_starts_with(url, "https://"))
+ path_start = url + 8;
+ else if (str_starts_with(url, "http://"))
+ path_start = url + 7;
+
+ /* Skip domain */
+ while (*path_start && *path_start != '/')
+ path_start++;
+
+ /* No path or just "/" -> index.html */
+ if (!*path_start || strcmp(path_start, "/") == 0)
+ return xstrdup("index.html");
+
+ /* Skip leading slash */
+ if (*path_start == '/')
+ path_start++;
+
+ /* Copy path, strip query/fragment */
+ path = xstrdup(path_start);
+ query = strchr(path, '?');
+ if (query)
+ *query = '\0';
+ hash = strchr(path, '#');
+ if (hash)
+ *hash = '\0';
+
+ /* Remove trailing slash */
+ len = strlen(path);
+ if (len > 0 && path[len - 1] == '/') {
+ path[len - 1] = '\0';
+ len--;
+ }
+
+ /* If path doesn't end in .html/.htm, treat as directory */
+ if (len > 0 && !str_ends_with(path, ".html") &&
+ !str_ends_with(path, ".htm")) {
+ new_len = len + 12;
+ new_path = xmalloc(new_len);
+ snprintf(new_path, new_len, "%s/index.html", path);
+ free(path);
+ path = new_path;
+ }
+
+ return path;
+}
diff --git a/crawl.h b/crawl.h
@@ -0,0 +1,57 @@
+/* See LICENSE file for copyright and license details. */
+
+#ifndef CRAWL_H
+#define CRAWL_H
+
+#include <stddef.h>
+
+/* Hash table size (prime, ~64k buckets) */
+#define HT_SIZE 65521
+
+/* URL queue for BFS crawling */
+typedef struct QueueNode {
+ char *url;
+ int depth;
+ struct QueueNode *next;
+} QueueNode;
+
+typedef struct {
+ QueueNode *head;
+ QueueNode *tail;
+ size_t count;
+} UrlQueue;
+
+/* Hash table node for visited URLs */
+typedef struct HashNode {
+ char *url;
+ struct HashNode *next;
+} HashNode;
+
+/* Hash table based visited set - O(1) lookup */
+typedef struct {
+ HashNode *buckets[HT_SIZE];
+ size_t count;
+} VisitedSet;
+
+/* Queue operations */
+UrlQueue *queue_new(void);
+void queue_free(UrlQueue *q);
+void queue_push(UrlQueue *q, const char *url, int depth);
+QueueNode *queue_pop(UrlQueue *q);
+int queue_empty(UrlQueue *q);
+size_t queue_size(UrlQueue *q);
+
+/* Visited set operations (hash table) */
+VisitedSet *visited_new(void);
+void visited_free(VisitedSet *v);
+void visited_add(VisitedSet *v, const char *url);
+int visited_contains(VisitedSet *v, const char *url);
+size_t visited_count(VisitedSet *v);
+
+/* URL normalization for comparison */
+char *url_normalize(const char *url);
+
+/* Get path component from URL for directory structure */
+char *url_to_path(const char *url, const char *base_domain);
+
+#endif /* CRAWL_H */
diff --git a/fetch.c b/fetch.c
@@ -0,0 +1,168 @@
+/* See LICENSE file for copyright and license details. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <curl/curl.h>
+
+#include "config.h"
+#include "fetch.h"
+#include "util.h"
+
+static CURL *curl_handle = NULL;
+
+static size_t
+write_cb(void *contents, size_t size, size_t nmemb, void *userp)
+{
+ size_t realsize = size * nmemb;
+ Response *resp = (Response *)userp;
+ char *ptr;
+
+ ptr = xrealloc(resp->data, resp->size + realsize + 1);
+ resp->data = ptr;
+ memcpy(&(resp->data[resp->size]), contents, realsize);
+ resp->size += realsize;
+ resp->data[resp->size] = '\0';
+ return realsize;
+}
+
+/*
+ * Check if an HTTP status code is transient (worth retrying).
+ * 429 = rate limited, 5xx = server errors
+ */
+static int
+is_transient(long code)
+{
+ return code == 429 || code == 500 || code == 502 ||
+ code == 503 || code == 504;
+}
+
+void
+fetch_init(void)
+{
+ curl_global_init(CURL_GLOBAL_ALL);
+ curl_handle = curl_easy_init();
+ if (!curl_handle)
+ die("curl_easy_init failed");
+}
+
+void
+fetch_cleanup(void)
+{
+ if (curl_handle) {
+ curl_easy_cleanup(curl_handle);
+ curl_handle = NULL;
+ }
+ curl_global_cleanup();
+}
+
+Response *
+fetch_url(const char *url)
+{
+ Response *resp;
+ CURLcode res;
+ char *ct, *effective_url;
+ int attempt;
+
+ for (attempt = 0; attempt < FETCH_MAX_RETRIES; attempt++) {
+ if (attempt > 0) {
+ unsigned int delay;
+
+ delay = FETCH_RETRY_BASE * (1 << (attempt - 1));
+ warn("retry %d/%d for %s (waiting %us)",
+ attempt, FETCH_MAX_RETRIES - 1, url, delay);
+ sleep(delay);
+ }
+
+ resp = xmalloc(sizeof(Response));
+ resp->data = xmalloc(1);
+ resp->data[0] = '\0';
+ resp->size = 0;
+ resp->content_type = NULL;
+ resp->status_code = 0;
+ resp->final_url = NULL;
+
+ curl_easy_reset(curl_handle);
+ curl_easy_setopt(curl_handle, CURLOPT_URL, url);
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION,
+ write_cb);
+ curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA,
+ (void *)resp);
+ curl_easy_setopt(curl_handle, CURLOPT_USERAGENT,
+ USER_AGENT);
+ curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L);
+ curl_easy_setopt(curl_handle, CURLOPT_MAXREDIRS,
+ MAX_REDIRECTS);
+ curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT,
+ CONNECT_TIMEOUT);
+ curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT,
+ REQUEST_TIMEOUT);
+ curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 1L);
+ curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYHOST, 2L);
+ curl_easy_setopt(curl_handle, CURLOPT_ACCEPT_ENCODING,
+ "");
+
+ res = curl_easy_perform(curl_handle);
+
+ if (res != CURLE_OK) {
+ /* Network-level failure */
+ if (res == CURLE_OPERATION_TIMEDOUT ||
+ res == CURLE_COULDNT_CONNECT ||
+ res == CURLE_GOT_NOTHING) {
+ warn("fetch: %s: %s",
+ url, curl_easy_strerror(res));
+ response_free(resp);
+ resp = NULL;
+ continue;
+ }
+ /* Non-transient curl error */
+ warn("fetch: %s: %s",
+ url, curl_easy_strerror(res));
+ response_free(resp);
+ return NULL;
+ }
+
+ curl_easy_getinfo(curl_handle,
+ CURLINFO_RESPONSE_CODE,
+ &resp->status_code);
+
+ ct = NULL;
+ if (curl_easy_getinfo(curl_handle,
+ CURLINFO_CONTENT_TYPE,
+ &ct) == CURLE_OK && ct)
+ resp->content_type = xstrdup(ct);
+
+ effective_url = NULL;
+ if (curl_easy_getinfo(curl_handle,
+ CURLINFO_EFFECTIVE_URL,
+ &effective_url) == CURLE_OK &&
+ effective_url)
+ resp->final_url = xstrdup(effective_url);
+
+ /* Retry on transient HTTP errors */
+ if (is_transient(resp->status_code)) {
+ response_free(resp);
+ resp = NULL;
+ continue;
+ }
+
+ return resp;
+ }
+
+ /* All retries exhausted */
+ warn("fetch: gave up on %s after %d attempts",
+ url, FETCH_MAX_RETRIES);
+ return resp;
+}
+
+void
+response_free(Response *resp)
+{
+ if (!resp)
+ return;
+ free(resp->data);
+ free(resp->content_type);
+ free(resp->final_url);
+ free(resp);
+}
diff --git a/fetch.h b/fetch.h
@@ -0,0 +1,31 @@
+/* See LICENSE file for copyright and license details. */
+
+#ifndef FETCH_H
+#define FETCH_H
+
+#include <stddef.h>
+
+/* Retry settings */
+#define FETCH_MAX_RETRIES 3
+#define FETCH_RETRY_BASE 2 /* base seconds for exponential backoff */
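+/* waits between attempts: 2s, then 4s (base * 2^(attempt-1)) */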
+
+/* Response structure */
+typedef struct {
+ char *data;
+ size_t size;
+ char *content_type;
+ long status_code;
+ char *final_url;
+} Response;
+
+/* Initialize/cleanup curl globally */
+void fetch_init(void);
+void fetch_cleanup(void);
+
+/* Fetch a URL with automatic retry on transient errors */
+Response *fetch_url(const char *url);
+
+/* Free a response */
+void response_free(Response *resp);
+
+#endif /* FETCH_H */
diff --git a/parse.c b/parse.c
@@ -0,0 +1,380 @@
+/* See LICENSE file for copyright and license details. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+
+#include "parse.h"
+#include "util.h"
+
+ResourceList *
+reslist_new(void)
+{
+ ResourceList *list = xmalloc(sizeof(ResourceList));
+ list->head = NULL;
+ list->tail = NULL;
+ list->count = 0;
+ return list;
+}
+
+void
+reslist_free(ResourceList *list)
+{
+ if (!list)
+ return;
+ Resource *r = list->head;
+ while (r) {
+ Resource *next = r->next;
+ free(r->url);
+ free(r);
+ r = next;
+ }
+ free(list);
+}
+
+int
+reslist_contains(ResourceList *list, const char *url)
+{
+ for (Resource *r = list->head; r; r = r->next)
+ if (strcmp(r->url, url) == 0)
+ return 1;
+ return 0;
+}
+
+void
+reslist_add(ResourceList *list, const char *url, ResourceType type)
+{
+ if (!url || !*url || reslist_contains(list, url))
+ return;
+
+ /* Skip data: URLs */
+ if (str_starts_with(url, "data:"))
+ return;
+
+ Resource *r = xmalloc(sizeof(Resource));
+ r->url = xstrdup(url);
+ r->type = type;
+ r->next = NULL;
+
+ if (list->tail) {
+ list->tail->next = r;
+ list->tail = r;
+ } else {
+ list->head = r;
+ list->tail = r;
+ }
+ list->count++;
+}
+
+/* Extract attribute value from tag */
+static char *
+get_attr(const char *tag, const char *attr)
+{
+ size_t attr_len = strlen(attr);
+ const char *p = tag;
+
+ while (*p) {
+ /* Skip whitespace */
+ while (*p && isspace((unsigned char)*p))
+ p++;
+
+ /* Check for attribute name */
+ if (strncasecmp(p, attr, attr_len) == 0) {
+ p += attr_len;
+ while (*p && isspace((unsigned char)*p))
+ p++;
+ if (*p == '=') {
+ p++;
+ while (*p && isspace((unsigned char)*p))
+ p++;
+
+ char quote = 0;
+ if (*p == '"' || *p == '\'') {
+ quote = *p++;
+ }
+
+ const char *start = p;
+ if (quote) {
+ while (*p && *p != quote)
+ p++;
+ } else {
+ while (*p && !isspace((unsigned char)*p) && *p != '>')
+ p++;
+ }
+
+ size_t len = p - start;
+ char *value = xmalloc(len + 1);
+ memcpy(value, start, len);
+ value[len] = '\0';
+ return value;
+ }
+ }
+
+ /* Skip to next attribute */
+ while (*p && !isspace((unsigned char)*p) && *p != '>')
+ p++;
+ }
+
+ return NULL;
+}
+
+/* Determine resource type from URL/tag */
+static ResourceType
+guess_resource_type(const char *url, const char *tag_name)
+{
+ if (!tag_name)
+ return RES_OTHER;
+
+ if (strcasecmp(tag_name, "img") == 0)
+ return RES_IMAGE;
+
+ if (strcasecmp(tag_name, "link") == 0) {
+ if (strstr(url, ".css") || strstr(url, "stylesheet"))
+ return RES_CSS;
+ if (strstr(url, ".woff") || strstr(url, ".ttf") || strstr(url, ".otf"))
+ return RES_FONT;
+ return RES_OTHER;
+ }
+
+ if (strcasecmp(tag_name, "a") == 0)
+ return RES_PAGE;
+
+ if (strcasecmp(tag_name, "script") == 0)
+ return RES_OTHER;
+
+ /* Check by extension */
+ char *lower = xstrdup(url);
+ str_tolower(lower);
+
+ ResourceType type = RES_OTHER;
+ if (strstr(lower, ".jpg") || strstr(lower, ".jpeg") ||
+ strstr(lower, ".png") || strstr(lower, ".gif") ||
+ strstr(lower, ".webp") || strstr(lower, ".svg") ||
+ strstr(lower, ".ico"))
+ type = RES_IMAGE;
+ else if (strstr(lower, ".css"))
+ type = RES_CSS;
+ else if (strstr(lower, ".woff") || strstr(lower, ".woff2") ||
+ strstr(lower, ".ttf") || strstr(lower, ".otf") ||
+ strstr(lower, ".eot"))
+ type = RES_FONT;
+
+ free(lower);
+ return type;
+}
+
+ResourceList *
+parse_html(const char *html, const char *base_url)
+{
+ ResourceList *list = reslist_new();
+ const char *p = html;
+
+ while (*p) {
+ /* Find tag start */
+ if (*p != '<') {
+ p++;
+ continue;
+ }
+ p++;
+
+ /* Skip comments */
+ if (str_starts_with(p, "!--")) {
+ p = strstr(p, "-->");
+ if (p)
+ p += 3;
+ else
+ break;
+ continue;
+ }
+
+ /* Get tag name */
+ const char *tag_start = p;
+ while (*p && !isspace((unsigned char)*p) && *p != '>' && *p != '/')
+ p++;
+
+ size_t tag_len = p - tag_start;
+ if (tag_len == 0 || tag_len > 20)
+ continue;
+
+ char tag_name[21];
+ memcpy(tag_name, tag_start, tag_len);
+ tag_name[tag_len] = '\0';
+
+ /* Find tag end */
+ const char *tag_end = strchr(p, '>');
+ if (!tag_end)
+ break;
+
+ /* Extract tag content for attribute parsing */
+ size_t content_len = tag_end - tag_start;
+ char *tag_content = xmalloc(content_len + 1);
+ memcpy(tag_content, tag_start, content_len);
+ tag_content[content_len] = '\0';
+
+ /* Check for relevant attributes based on tag */
+ char *url = NULL;
+
+ if (strcasecmp(tag_name, "img") == 0) {
+ url = get_attr(tag_content, "src");
+ if (!url)
+ url = get_attr(tag_content, "data-src");
+ } else if (strcasecmp(tag_name, "link") == 0) {
+ url = get_attr(tag_content, "href");
+ } else if (strcasecmp(tag_name, "script") == 0) {
+ url = get_attr(tag_content, "src");
+ } else if (strcasecmp(tag_name, "a") == 0) {
+ url = get_attr(tag_content, "href");
+ } else if (strcasecmp(tag_name, "source") == 0) {
+ url = get_attr(tag_content, "srcset");
+ if (!url)
+ url = get_attr(tag_content, "src");
+ }
+
+ if (url && *url) {
+ char *resolved = url_resolve(base_url, url);
+ ResourceType type = guess_resource_type(resolved, tag_name);
+ reslist_add(list, resolved, type);
+ free(resolved);
+ }
+
+ free(url);
+ free(tag_content);
+ p = tag_end + 1;
+ }
+
+ return list;
+}
+
+char *
+parse_title(const char *html)
+{
+ const char *start = strcasestr(html, "<title");
+ if (!start)
+ return xstrdup("Untitled");
+
+ start = strchr(start, '>');
+ if (!start)
+ return xstrdup("Untitled");
+ start++;
+
+ const char *end = strcasestr(start, "</title>");
+ if (!end)
+ return xstrdup("Untitled");
+
+ size_t len = end - start;
+ char *title = xmalloc(len + 1);
+ memcpy(title, start, len);
+ title[len] = '\0';
+
+ /* str_trim() may return an interior pointer; copy so the
+ * caller can free() the result */
+ char *trimmed = xstrdup(str_trim(title));
+ free(title);
+ return trimmed;
+}
+
+/* Helper to find and replace in string, returns new allocated string */
+static char *
+str_replace_first(const char *str, const char *old, size_t old_len, const char *new, size_t new_len)
+{
+ const char *pos = strstr(str, old);
+ if (!pos)
+ return xstrdup(str);
+
+ size_t before_len = pos - str;
+ size_t after_len = strlen(pos + old_len);
+ size_t result_len = before_len + new_len + after_len;
+
+ char *result = xmalloc(result_len + 1);
+ memcpy(result, str, before_len);
+ memcpy(result + before_len, new, new_len);
+ memcpy(result + before_len + new_len, pos + old_len, after_len + 1);
+
+ return result;
+}
+
+char *
+inline_resources(const char *html, const char *base_url,
+ char *(*fetch_and_encode)(const char *url, const char *base_url))
+{
+ char *result = xstrdup(html);
+ size_t search_offset = 0;
+
+ /* Process img tags */
+ while (1) {
+ const char *p = strcasestr(result + search_offset, "<img");
+ if (!p)
+ break;
+
+ const char *tag_end = strchr(p, '>');
+ if (!tag_end)
+ break;
+
+ /* Calculate offset for this tag */
+ size_t tag_offset = p - result;
+
+ /* Find src attribute */
+ const char *src_start = strcasestr(p, "src=");
+ if (!src_start || src_start > tag_end) {
+ /* No src, skip this img */
+ search_offset = (tag_end - result) + 1;
+ continue;
+ }
+
+ src_start += 4;
+ char quote = 0;
+ if (*src_start == '"' || *src_start == '\'')
+ quote = *src_start++;
+
+ const char *src_end = src_start;
+ if (quote) {
+ while (*src_end && *src_end != quote)
+ src_end++;
+ } else {
+ while (*src_end && !isspace((unsigned char)*src_end) && *src_end != '>')
+ src_end++;
+ }
+
+ /* Extract URL */
+ size_t url_len = src_end - src_start;
+ char *url = xmalloc(url_len + 1);
+ memcpy(url, src_start, url_len);
+ url[url_len] = '\0';
+
+ /* Skip if already data URI */
+ if (str_starts_with(url, "data:")) {
+ free(url);
+ search_offset = (tag_end - result) + 1;
+ continue;
+ }
+
+ /* Fetch and encode */
+ char *data_uri = fetch_and_encode(url, base_url);
+ if (data_uri) {
+ /* Rebuild the attribute exactly as it appears in the tag */
+ char *old_attr = xmalloc(url_len + 8);
+ if (quote)
+ snprintf(old_attr, url_len + 8, "src=%c%s%c",
+ quote, url, quote);
+ else
+ snprintf(old_attr, url_len + 8, "src=%s", url);
+
+ size_t new_attr_len = 5 + strlen(data_uri) + 2;
+ char *new_attr = xmalloc(new_attr_len + 1);
+ snprintf(new_attr, new_attr_len + 1, "src=\"%s\"", data_uri);
+
+ char *new_result = str_replace_first(result, old_attr, strlen(old_attr),
+ new_attr, strlen(new_attr));
+ /* Continue searching after the new data URI */
+ search_offset = tag_offset + strlen(new_attr);
+
+ free(result);
+ result = new_result;
+
+ free(old_attr);
+ free(new_attr);
+ free(data_uri);
+ } else {
+ /* Fetch failed, skip this img tag */
+ search_offset = (tag_end - result) + 1;
+ }
+
+ free(url);
+ }
+
+ return result;
+}
diff --git a/parse.h b/parse.h
@@ -0,0 +1,51 @@
+/* See LICENSE file for copyright and license details. */
+
+#ifndef PARSE_H
+#define PARSE_H
+
+#include <stddef.h>
+
+/* Resource types */
+typedef enum {
+ RES_IMAGE,
+ RES_CSS,
+ RES_FONT,
+ RES_PAGE,
+ RES_OTHER
+} ResourceType;
+
+/* Extracted resource */
+typedef struct Resource {
+ char *url;
+ ResourceType type;
+ struct Resource *next;
+} Resource;
+
+/* Resource list */
+typedef struct {
+ Resource *head;
+ Resource *tail;
+ size_t count;
+} ResourceList;
+
+/* Create/destroy resource list */
+ResourceList *reslist_new(void);
+void reslist_free(ResourceList *list);
+
+/* Add a resource (url is copied) */
+void reslist_add(ResourceList *list, const char *url, ResourceType type);
+
+/* Check if URL already in list */
+int reslist_contains(ResourceList *list, const char *url);
+
+/* Extract resources from HTML */
+ResourceList *parse_html(const char *html, const char *base_url);
+
+/* Extract title from HTML */
+char *parse_title(const char *html);
+
+/* Find and inline resources in HTML */
+char *inline_resources(const char *html, const char *base_url,
+ char *(*fetch_and_encode)(const char *url, const char *base_url));
+
+#endif /* PARSE_H */
diff --git a/robots.c b/robots.c
@@ -0,0 +1,200 @@
+/* See LICENSE file for copyright and license details. */
+
+#include <ctype.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "robots.h"
+#include "fetch.h"
+#include "util.h"
+
+/*
+ * Parse robots.txt content for our user-agent.
+ *
+ * We look for rules matching "archiver-bot" first,
+ * then fall back to "*" (wildcard) rules.
+ */
+Robots *
+robots_fetch(const char *domain)
+{
+ Robots *r;
+ Response *resp;
+ char url[1024];
+ char *line, *saveptr, *text;
+ int in_our_group, in_star_group, found_our_group;
+ int star_nrules, star_delay;
+ RobotsRule star_rules[MAX_RULES];
+
+ r = xmalloc(sizeof(Robots));
+ r->domain = xstrdup(domain);
+ r->nrules = 0;
+ r->crawl_delay = 0;
+
+ snprintf(url, sizeof(url), "https://%s/robots.txt", domain);
+ resp = fetch_url(url);
+ if (!resp || resp->status_code >= 400 || !resp->data) {
+ /* No robots.txt = everything allowed */
+ response_free(resp);
+ return r;
+ }
+
+ text = xstrdup(resp->data);
+ response_free(resp);
+
+ in_our_group = 0;
+ in_star_group = 0;
+ found_our_group = 0;
+ star_nrules = 0;
+ star_delay = 0;
+
+ for (line = strtok_r(text, "\n", &saveptr); line;
+ line = strtok_r(NULL, "\n", &saveptr)) {
+ char *trimmed, *colon, *key, *val;
+
+ trimmed = str_trim(line);
+
+ /* Skip empty lines and comments */
+ if (!*trimmed || *trimmed == '#')
+ continue;
+
+ /* Strip inline comments */
+ colon = strchr(trimmed, '#');
+ if (colon)
+ *colon = '\0';
+
+ /* Find key: value */
+ colon = strchr(trimmed, ':');
+ if (!colon)
+ continue;
+ *colon = '\0';
+ key = str_trim(trimmed);
+ val = str_trim(colon + 1);
+
+ if (strcasecmp(key, "user-agent") == 0) {
+ /* New user-agent group */
+ if (strcasestr(val, "archiver-bot")) {
+ in_our_group = 1;
+ in_star_group = 0;
+ found_our_group = 1;
+ } else if (strcmp(val, "*") == 0 &&
+ !found_our_group) {
+ in_star_group = 1;
+ in_our_group = 0;
+ } else {
+ in_our_group = 0;
+ in_star_group = 0;
+ }
+ continue;
+ }
+
+ if (strcasecmp(key, "disallow") == 0) {
+ if (in_our_group && r->nrules < MAX_RULES) {
+ r->rules[r->nrules].path = xstrdup(val);
+ r->rules[r->nrules].allow = 0;
+ r->nrules++;
+ } else if (in_star_group &&
+ star_nrules < MAX_RULES) {
+ star_rules[star_nrules].path = xstrdup(val);
+ star_rules[star_nrules].allow = 0;
+ star_nrules++;
+ }
+ } else if (strcasecmp(key, "allow") == 0) {
+ if (in_our_group && r->nrules < MAX_RULES) {
+ r->rules[r->nrules].path = xstrdup(val);
+ r->rules[r->nrules].allow = 1;
+ r->nrules++;
+ } else if (in_star_group &&
+ star_nrules < MAX_RULES) {
+ star_rules[star_nrules].path = xstrdup(val);
+ star_rules[star_nrules].allow = 1;
+ star_nrules++;
+ }
+ } else if (strcasecmp(key, "crawl-delay") == 0) {
+ int delay = atoi(val);
+
+ if (delay > 0) {
+ if (in_our_group)
+ r->crawl_delay = delay;
+ else if (in_star_group)
+ star_delay = delay;
+ }
+ }
+ }
+
+ /* If no specific rules for us, use wildcard rules */
+ if (!found_our_group && star_nrules > 0) {
+ int i;
+
+ for (i = 0; i < star_nrules; i++)
+ r->rules[i] = star_rules[i];
+ r->nrules = star_nrules;
+ r->crawl_delay = star_delay;
+ } else {
+ /* Free wildcard rules we did not adopt */
+ int i;
+
+ for (i = 0; i < star_nrules; i++)
+ free(star_rules[i].path);
+ }
+
+ free(text);
+ return r;
+}
+
+int
+robots_allowed(Robots *r, const char *path)
+{
+ int i, best_len, allowed;
+
+ if (!r || r->nrules == 0)
+ return 1;
+
+ /*
+ * Match the most specific (longest) rule.
+ * If multiple rules of same length, Allow wins.
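+ * E.g. "Disallow: /a" plus "Allow: /a/pub" permits "/a/pub/x".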
+ */
+ best_len = -1;
+ allowed = 1;
+
+ for (i = 0; i < r->nrules; i++) {
+ int plen = strlen(r->rules[i].path);
+
+ /* Empty disallow = allow all */
+ if (plen == 0 && !r->rules[i].allow)
+ continue;
+
+ if (strncmp(path, r->rules[i].path, plen) == 0) {
+ if (plen > best_len) {
+ best_len = plen;
+ allowed = r->rules[i].allow;
+ } else if (plen == best_len &&
+ r->rules[i].allow) {
+ allowed = 1;
+ }
+ }
+ }
+
+ return allowed;
+}
+
+int
+robots_delay(Robots *r)
+{
+ if (!r)
+ return 0;
+ return r->crawl_delay;
+}
+
+void
+robots_free(Robots *r)
+{
+ int i;
+
+ if (!r)
+ return;
+ for (i = 0; i < r->nrules; i++)
+ free(r->rules[i].path);
+ free(r->domain);
+ free(r);
+}
diff --git a/robots.h b/robots.h
@@ -0,0 +1,35 @@
+/* See LICENSE file for copyright and license details. */
+
+#ifndef ROBOTS_H
+#define ROBOTS_H
+
+/* Maximum rules per robots.txt */
+#define MAX_RULES 512
+
+/* Rule types */
+typedef struct {
+ char *path;
+ int allow; /* 1 = allow, 0 = disallow */
+} RobotsRule;
+
+/* Parsed robots.txt for a domain */
+typedef struct {
+ char *domain;
+ RobotsRule rules[MAX_RULES];
+ int nrules;
+ int crawl_delay; /* seconds, 0 = none specified */
+} Robots;
+
+/* Fetch and parse robots.txt for a domain */
+Robots *robots_fetch(const char *domain);
+
+/* Check if a path is allowed */
+int robots_allowed(Robots *r, const char *path);
+
+/* Get crawl delay in seconds (0 = none) */
+int robots_delay(Robots *r);
+
+/* Free robots struct */
+void robots_free(Robots *r);
+
+#endif /* ROBOTS_H */
diff --git a/util.c b/util.c
@@ -0,0 +1,307 @@
+/* See LICENSE file for copyright and license details. */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdarg.h>
+#include <ctype.h>
+#include <time.h>
+
+#include "util.h"
+
+static const char base64_table[] =
+ "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+void
+die(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ if (fmt[0] && fmt[strlen(fmt)-1] == ':') {
+ fputc(' ', stderr);
+ perror(NULL);
+ } else {
+ fputc('\n', stderr);
+ }
+ exit(1);
+}
+
+void
+warn(const char *fmt, ...)
+{
+ va_list ap;
+ va_start(ap, fmt);
+ fprintf(stderr, "warning: ");
+ vfprintf(stderr, fmt, ap);
+ va_end(ap);
+ fputc('\n', stderr);
+}
+
+void *
+xmalloc(size_t size)
+{
+ void *p = malloc(size);
+ if (!p)
+ die("malloc:");
+ return p;
+}
+
+void *
+xrealloc(void *ptr, size_t size)
+{
+ void *p = realloc(ptr, size);
+ if (!p)
+ die("realloc:");
+ return p;
+}
+
+char *
+xstrdup(const char *s)
+{
+ char *p = strdup(s);
+ if (!p)
+ die("strdup:");
+ return p;
+}
+
+char *
+base64_encode(const unsigned char *data, size_t input_len, size_t *output_len)
+{
+ size_t olen = 4 * ((input_len + 2) / 3);
+ char *encoded = xmalloc(olen + 1);
+
+ size_t i, j;
+ for (i = 0, j = 0; i < input_len;) {
+ unsigned int a = i < input_len ? data[i++] : 0;
+ unsigned int b = i < input_len ? data[i++] : 0;
+ unsigned int c = i < input_len ? data[i++] : 0;
+ unsigned int triple = (a << 16) | (b << 8) | c;
+
+ encoded[j++] = base64_table[(triple >> 18) & 0x3F];
+ encoded[j++] = base64_table[(triple >> 12) & 0x3F];
+ encoded[j++] = base64_table[(triple >> 6) & 0x3F];
+ encoded[j++] = base64_table[triple & 0x3F];
+ }
+
+ /* Add padding */
+ size_t mod = input_len % 3;
+ if (mod) {
+ encoded[olen - 1] = '=';
+ if (mod == 1)
+ encoded[olen - 2] = '=';
+ }
+
+ encoded[olen] = '\0';
+ if (output_len)
+ *output_len = olen;
+ return encoded;
+}
+
+int
+str_starts_with(const char *str, const char *prefix)
+{
+ return strncmp(str, prefix, strlen(prefix)) == 0;
+}
+
+int
+str_ends_with(const char *str, const char *suffix)
+{
+ size_t slen = strlen(str);
+ size_t suflen = strlen(suffix);
+ if (suflen > slen)
+ return 0;
+ return strcmp(str + slen - suflen, suffix) == 0;
+}
+
+char *
+str_tolower(char *str)
+{
+ for (char *p = str; *p; p++)
+ *p = tolower((unsigned char)*p);
+ return str;
+}
+
+char *
+str_trim(char *str)
+{
+ char *end;
+ while (isspace((unsigned char)*str))
+ str++;
+ if (*str == '\0')
+ return str;
+ end = str + strlen(str) - 1;
+ while (end > str && isspace((unsigned char)*end))
+ end--;
+ end[1] = '\0';
+ return str;
+}
+
+char *
+url_get_domain(const char *url)
+{
+ const char *start, *end;
+ char *domain;
+
+ /* Skip protocol */
+ if (str_starts_with(url, "https://"))
+ start = url + 8;
+ else if (str_starts_with(url, "http://"))
+ start = url + 7;
+ else
+ start = url;
+
+ /* Find end of domain */
+ end = start;
+ while (*end && *end != '/' && *end != ':' && *end != '?')
+ end++;
+
+ size_t len = end - start;
+ domain = xmalloc(len + 1);
+ memcpy(domain, start, len);
+ domain[len] = '\0';
+
+ return domain;
+}
+
+int
+url_same_domain(const char *url1, const char *url2)
+{
+ char *d1 = url_get_domain(url1);
+ char *d2 = url_get_domain(url2);
+ int same = strcasecmp(d1, d2) == 0;
+ free(d1);
+ free(d2);
+ return same;
+}
+
+char *
+url_resolve(const char *base, const char *relative)
+{
+ char *result;
+
+ /* Already absolute or a non-fetchable scheme */
+ if (str_starts_with(relative, "http://") ||
+ str_starts_with(relative, "https://") ||
+ str_starts_with(relative, "data:") ||
+ str_starts_with(relative, "mailto:") ||
+ str_starts_with(relative, "tel:") ||
+ str_starts_with(relative, "javascript:")) {
+ return xstrdup(relative);
+ }
+
+ /* Protocol-relative */
+ if (str_starts_with(relative, "//")) {
+ size_t len = 6 + strlen(relative);
+ result = xmalloc(len + 1);
+ snprintf(result, len + 1, "https:%s", relative);
+ return result;
+ }
+
+ char *domain = url_get_domain(base);
+ const char *proto = str_starts_with(base, "https://") ? "https://" : "http://";
+
+ /* Root-relative */
+ if (relative[0] == '/') {
+ size_t len = strlen(proto) + strlen(domain) + strlen(relative);
+ result = xmalloc(len + 1);
+ snprintf(result, len + 1, "%s%s%s", proto, domain, relative);
+ free(domain);
+ return result;
+ }
+
+ /* Find base path */
+ const char *path_start;
+ if (str_starts_with(base, "https://"))
+ path_start = base + 8;
+ else if (str_starts_with(base, "http://"))
+ path_start = base + 7;
+ else
+ path_start = base;
+
+ /* Skip domain */
+ while (*path_start && *path_start != '/')
+ path_start++;
+
+ /* Find last slash in path */
+ const char *last_slash = strrchr(path_start, '/');
+ size_t base_len = 0;
+ if (last_slash)
+ base_len = last_slash - path_start + 1;
+
+ /* A base with no path ("https://x.com") still needs a '/' */
+ size_t len = strlen(proto) + strlen(domain) + 1 + base_len + strlen(relative);
+ result = xmalloc(len + 1);
+ if (base_len)
+ snprintf(result, len + 1, "%s%s%.*s%s", proto, domain,
+ (int)base_len, path_start, relative);
+ else
+ snprintf(result, len + 1, "%s%s/%s", proto, domain, relative);
+
+ free(domain);
+ return result;
+}
+
+char *
+get_mime_type(const char *url)
+{
+ /* Strip query string */
+ char *copy = xstrdup(url);
+ char *query = strchr(copy, '?');
+ if (query)
+ *query = '\0';
+
+ str_tolower(copy);
+
+ const char *mime = "application/octet-stream";
+
+ if (str_ends_with(copy, ".jpg") || str_ends_with(copy, ".jpeg"))
+ mime = "image/jpeg";
+ else if (str_ends_with(copy, ".png"))
+ mime = "image/png";
+ else if (str_ends_with(copy, ".gif"))
+ mime = "image/gif";
+ else if (str_ends_with(copy, ".webp"))
+ mime = "image/webp";
+ else if (str_ends_with(copy, ".svg"))
+ mime = "image/svg+xml";
+ else if (str_ends_with(copy, ".ico"))
+ mime = "image/x-icon";
+ else if (str_ends_with(copy, ".css"))
+ mime = "text/css";
+ else if (str_ends_with(copy, ".js"))
+ mime = "application/javascript";
+ else if (str_ends_with(copy, ".woff"))
+ mime = "font/woff";
+ else if (str_ends_with(copy, ".woff2"))
+ mime = "font/woff2";
+ else if (str_ends_with(copy, ".ttf"))
+ mime = "font/ttf";
+ else if (str_ends_with(copy, ".otf"))
+ mime = "font/otf";
+ else if (str_ends_with(copy, ".eot"))
+ mime = "application/vnd.ms-fontobject";
+
+ free(copy);
+ return xstrdup(mime);
+}
+
+char *
+sanitize_filename(const char *url)
+{
+ char *domain = url_get_domain(url);
+ size_t len = strlen(domain) + 32;
+ char *filename = xmalloc(len);
+
+ /* Replace dots with underscores */
+ for (char *p = domain; *p; p++)
+ if (*p == '.')
+ *p = '_';
+
+ snprintf(filename, len, "%s", domain);
+ free(domain);
+ return filename;
+}
+
+char *
+get_iso_date(void)
+{
+ time_t t = time(NULL);
+ struct tm *tm = gmtime(&t);
+ char *buf = xmalloc(32);
+ strftime(buf, 32, "%Y-%m-%dT%H:%M:%SZ", tm);
+ return buf;
+}
diff --git a/util.h b/util.h
@@ -0,0 +1,39 @@
+/* See LICENSE file for copyright and license details. */
+
+#ifndef UTIL_H
+#define UTIL_H
+
+#include <stddef.h>
+
+/* Memory allocation with error handling */
+void *xmalloc(size_t size);
+void *xrealloc(void *ptr, size_t size);
+char *xstrdup(const char *s);
+
+/* Base64 encoding */
+char *base64_encode(const unsigned char *data, size_t input_len,
+ size_t *output_len);
+
+/* String utilities */
+int str_starts_with(const char *str, const char *prefix);
+int str_ends_with(const char *str, const char *suffix);
+char *str_tolower(char *str);
+char *str_trim(char *str);
+
+/* URL utilities */
+char *url_resolve(const char *base, const char *relative);
+char *url_get_domain(const char *url);
+int url_same_domain(const char *url1, const char *url2);
+
+/* File utilities */
+char *get_mime_type(const char *url);
+char *sanitize_filename(const char *url);
+
+/* Time utilities */
+char *get_iso_date(void);
+
+/* Error handling */
+void die(const char *fmt, ...);
+void warn(const char *fmt, ...);
+
+#endif /* UTIL_H */