archiver-bot

archiver-bot - suckless web archiver
Log | Files | Refs | README | LICENSE

commit 74179fb37667450f2a5ff117e81e7403adf7f710
Author: Kris Yotam <krisyotam@protonmail.com>
Date:   Sat, 14 Feb 2026 17:06:53 -0600

archiver-bot v0.2: hash table, robots.txt, retry logic

Diffstat:
A .gitignore | 7 +++++++
A CLAUDE.md | 145 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A LICENSE | 21 +++++++++++++++++++++
A Makefile | 45 +++++++++++++++++++++++++++++++++++++++++++++
A archiver.c | 694 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A config.h | 31 +++++++++++++++++++++++++++++++
A crawl.c | 296 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A crawl.h | 57 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A fetch.c | 168 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A fetch.h | 31 +++++++++++++++++++++++++++++++
A parse.c | 380 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A parse.h | 51 +++++++++++++++++++++++++++++++++++++++++++++++++++
A robots.c | 200 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A robots.h | 35 +++++++++++++++++++++++++++++++++++
A util.c | 307 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
A util.h | 39 +++++++++++++++++++++++++++++++++++++++
16 files changed, 2507 insertions(+), 0 deletions(-)
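
The retry logic added in fetch.c backs off exponentially between attempts. A minimal standalone sketch of the resulting delay schedule, using the constants this commit defines in fetch.h:

```c
/* Delay schedule used by fetch_url() below:
 * retry 1 waits 2s, retry 2 waits 4s (base 2, doubling). */
#include <stdio.h>

#define FETCH_MAX_RETRIES 3
#define FETCH_RETRY_BASE 2	/* base seconds for exponential backoff */

int
main(void)
{
	int attempt;
	unsigned int delay;

	for (attempt = 1; attempt < FETCH_MAX_RETRIES; attempt++) {
		delay = FETCH_RETRY_BASE * (1 << (attempt - 1));
		printf("retry %d/%d: wait %us\n",
		       attempt, FETCH_MAX_RETRIES - 1, delay);
	}
	return 0;
}
```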

diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+# Build artifacts
+archiver-bot
+*.o
+
+# Test output
+*.gwtar.html
+www.*/
diff --git a/CLAUDE.md b/CLAUDE.md
@@ -0,0 +1,145 @@
+# archiver-bot — CLAUDE.md
+
+## Project
+
+archiver-bot is a suckless web archiver written in C. It creates
+self-contained archives of websites with all resources (CSS, images,
+fonts) inlined as data URIs. Supports single-page and recursive
+whole-site archival with GWTAR (Gwern Web Tar Archive) format headers.
+
+## Coding Standards — Suckless C Style
+
+All code in this project MUST follow the suckless.org coding style:
+
+### Language
+- C99 (ISO/IEC 9899:1999), no extensions
+- POSIX.1-2008 (`_POSIX_C_SOURCE 200809L`)
+
+### Indentation & Whitespace
+- Tabs for indentation (1 tab = 1 level)
+- Spaces for alignment only, never for indentation
+- No tabs except at the beginning of a line
+- Maximum line length: 79 characters
+
+### Comments
+- Use `/* */` only, never `//`
+- Comment fallthrough cases in switch statements
+
+### Variables
+- All declarations at the top of the block
+- Pointer `*` adjacent to variable name: `char *p`, not `char* p`
+- No C99 `bool`; use `int` (0/1)
+- Global/static variables not used outside TU must be `static`
+
+### Functions
+- Return type on its own line
+- Function name at column 0 on next line (enables `grep ^funcname`)
+- Opening `{` on its own line for functions
+- Functions not used outside their file: `static`
+
+```c
+static void
+usage(void)
+{
+	fprintf(stderr, "usage: archiver-bot [-v] [-r] url\n");
+	exit(1);
+}
+```
+
+### Braces
+- Opening `{` on same line for control flow (if, for, while, switch)
+- Closing `}` on its own line unless continuing (else, do-while)
+- Use braces even for single statements when sibling branches use them
+
+### Naming
+- lowercase_with_underscores for functions and variables
+- UPPERCASE for macros and constants
+- CamelCase for typedef'd struct types
+- No `_t` suffix (reserved by POSIX)
+- Prefix module functions with module name
+
+### Control Flow
+- Space after `if`, `for`, `while`, `switch`
+- No space after `(` or before `)`
+- Use `goto` for cleanup/unwind, not nested ifs
+- Return/exit early on failure
+- Test against 0, not -1: `if (func() < 0)`
+
+### Error Handling
+- All allocation checked; goto cleanup on failure
+- `die()` for fatal errors (prints message, exits)
+- `warn()` for recoverable errors (prints, continues)
+
+### File Organization Order
+1. License header
+2. System includes (alphabetical)
+3. Local includes
+4. Macros
+5. Type definitions
+6. Function declarations
+7. Global variables
+8. Function definitions (same order as declarations)
+
+### Headers
+- System headers first, alphabetical
+- Local headers after blank line
+- No cyclic dependencies
+- Include only what is needed
+
+## Architecture
+
+### Module Layout
+
+| Module | Prefix | File | Responsibility |
+|--------|--------|------|----------------|
+| Main | — | archiver.c | Entry point, page archiving, CSS inlining, link rewriting, crawl orchestration |
+| Crawler | `queue_`, `visited_` | crawl.c | URL queue (BFS), visited set, URL normalization, path conversion |
+| Fetcher | `fetch_` | fetch.c | HTTP fetching via libcurl, response management |
+| Parser | `reslist_`, `parse_` | parse.c | HTML parsing, resource extraction, image inlining |
+| Utilities | `die`, `warn`, `x*`, `str_*`, `url_*` | util.c | Memory wrappers, string ops, URL helpers, base64, MIME types |
+| Config | — | config.h | Compile-time constants (timeouts, limits, user agent) |
+
+### Architecture Rules
+- **Separate compilation.** Every .c file compiles independently.
+- **No dynamic loading.** All features compiled in.
+- **libcurl only.** Single external dependency for HTTP.
+- **No `system()` calls.** Direct file I/O and libcurl only.
+- **Data URIs for inlining.** Resources encoded as base64 data URIs.
+- **Stateless functions preferred.** Minimize mutable global state.
+
+### Crawler Design Principles
+- **BFS traversal.** URL queue processes breadth-first by depth level.
+- **Same-domain only.** Never follow links to external domains.
+- **Politeness.** Rate limiting between requests (configurable).
+- **Depth control.** Hard limit on crawl depth to prevent runaway.
+- **URL normalization.** Canonical form for deduplication.
+- **Graceful degradation.** Skip failed resources, continue crawling.
+
+## Build
+
+```sh
+make            # build archiver-bot binary
+make clean      # remove build artifacts
+make install    # install to /usr/local/bin
+```
+
+Dependencies: `libcurl` (via pkg-config)
+
+## Usage
+
+```sh
+# Single page
+archiver-bot https://example.com/article
+
+# Whole site (recursive, depth 3)
+archiver-bot -r -d 3 https://example.com
+
+# Verbose with custom output dir
+archiver-bot -v -r -o ./archive https://example.com
+```
+
+## Git Conventions
+
+- No `Co-Authored-By: Claude` lines
+- Commit messages: imperative, <72 chars, no period
+- One logical change per commit
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT/X Consortium License
+
+(c) 2026 Kris Yotam <krisyotam@proton.me>
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+the rights to use, copy, modify, merge, publish, distribute, sublicense,
+and/or sell copies of the Software, and to permit persons to whom the
+Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+DEALINGS IN THE SOFTWARE.
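The error-handling rules above mandate goto-based unwinding instead of nested ifs. A minimal sketch of that pattern in the prescribed style; the function here is hypothetical and not part of this commit:

```c
/* Hypothetical illustration of the goto-cleanup rule; not in the diff. */
#include <stdio.h>

static int
copy_file(const char *src, const char *dst)
{
	FILE *in = NULL, *out = NULL;
	int c, ret = -1;

	in = fopen(src, "r");
	if (!in)
		goto cleanup;
	out = fopen(dst, "w");
	if (!out)
		goto cleanup;
	while ((c = fgetc(in)) != EOF)
		fputc(c, out);
	ret = 0;
cleanup:
	if (out)
		fclose(out);
	if (in)
		fclose(in);
	return ret;
}

int
main(void)
{
	return copy_file("a.txt", "b.txt") < 0;
}
```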
diff --git a/Makefile b/Makefile @@ -0,0 +1,45 @@ +# archiver-bot - suckless web archiver +# See LICENSE file for copyright and license details. + +VERSION = 0.2.0 + +# paths +PREFIX = /usr/local +MANPREFIX = $(PREFIX)/share/man + +# includes and libs +INCS = `pkg-config --cflags libcurl` +LIBS = `pkg-config --libs libcurl` + +# flags +CPPFLAGS = -D_DEFAULT_SOURCE -D_BSD_SOURCE -D_POSIX_C_SOURCE=200809L -DVERSION=\"$(VERSION)\" +CFLAGS = -std=c99 -pedantic -Wall -Wextra -Os $(INCS) $(CPPFLAGS) +LDFLAGS = $(LIBS) + +# compiler +CC = cc + +# sources +SRC = archiver.c crawl.c fetch.c parse.c robots.c util.c +OBJ = $(SRC:.c=.o) + +all: archiver-bot + +.c.o: + $(CC) $(CFLAGS) -c $< + +archiver-bot: $(OBJ) + $(CC) -o $@ $(OBJ) $(LDFLAGS) + +clean: + rm -f archiver-bot $(OBJ) + +install: all + mkdir -p $(DESTDIR)$(PREFIX)/bin + cp -f archiver-bot $(DESTDIR)$(PREFIX)/bin + chmod 755 $(DESTDIR)$(PREFIX)/bin/archiver-bot + +uninstall: + rm -f $(DESTDIR)$(PREFIX)/bin/archiver-bot + +.PHONY: all clean install uninstall diff --git a/archiver.c b/archiver.c @@ -0,0 +1,694 @@ +/* See LICENSE file for copyright and license details. + * + * archiver-bot - suckless web archiver + * + * Creates self-contained archives of websites with all + * resources inlined as data URIs. + */ + +#include <errno.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <sys/stat.h> +#include <time.h> +#include <unistd.h> + +#include "config.h" +#include "crawl.h" +#include "fetch.h" +#include "parse.h" +#include "robots.h" +#include "util.h" + +/* Global options */ +static int verbose = 0; +static int recursive = 0; +static int max_depth = MAX_DEPTH; +static int respect_robots = 1; +static const char *author = "Unknown"; +static const char *output_dir = NULL; +static char *base_domain = NULL; + +static void +usage(void) +{ + fprintf(stderr, + "usage: archiver-bot [-vrR] [-d depth] [-o dir]" + " [-a author] url\n" + "\n" + " -v verbose output\n" + " -r recursive (crawl entire site)\n" + " -R ignore robots.txt\n" + " -d depth max crawl depth (default: %d)\n" + " -o dir output directory\n" + " -a author site author name\n", + MAX_DEPTH); + exit(1); +} + +static int +mkdirp(const char *path) +{ + char *p, *sep; + + p = xstrdup(path); + for (sep = p + 1; *sep; sep++) { + if (*sep == '/') { + *sep = '\0'; + if (mkdir(p, 0755) != 0 && errno != EEXIST) { + free(p); + return -1; + } + *sep = '/'; + } + } + free(p); + return 0; +} + +static char * +fetch_and_encode(const char *url, const char *base) +{ + char *resolved, *mime, *b64, *data_uri, *semi; + Response *resp; + size_t b64_len, uri_len; + + resolved = url_resolve(base, url); + if (verbose) + fprintf(stderr, " resource: %s\n", resolved); + + resp = fetch_url(resolved); + if (!resp || resp->status_code >= 400 || resp->size == 0) { + free(resolved); + response_free(resp); + return NULL; + } + + if (resp->content_type) { + mime = xstrdup(resp->content_type); + semi = strchr(mime, ';'); + if (semi) + *semi = '\0'; + str_trim(mime); + } else { + mime = get_mime_type(resolved); + } + + b64 = base64_encode((unsigned char *)resp->data, + resp->size, &b64_len); + + uri_len = 5 + strlen(mime) + 8 + b64_len + 1; + data_uri = xmalloc(uri_len); + snprintf(data_uri, uri_len, "data:%s;base64,%s", mime, b64); + + free(mime); + free(b64); + free(resolved); + response_free(resp); + return data_uri; +} + +static char * +generate_header(const char *title, const char *source_url) +{ + char *date, *domain, *header; + size_t len; + + date = get_iso_date(); + domain = 
url_get_domain(source_url); + + len = 2048 + strlen(title) + strlen(source_url) + + strlen(author) + strlen(domain); + header = xmalloc(len); + + snprintf(header, len, + "<!--\n" + "========================================" + "========================================\n" + " GWTAR ARCHIVE\n" + "========================================" + "========================================\n" + "\n" + " Title: %s\n" + " Source URL: %s\n" + " Domain: %s\n" + " Author: %s\n" + "\n" + " Archived by: %s\n" + " Archived on: %s\n" + " Archive date: %s\n" + "\n" + " Generator: archiver-bot/%s\n" + " Format: GWTAR (Gwern Web Tar Archive)\n" + "\n" + "========================================" + "========================================\n" + "-->\n", + title, source_url, domain, author, + ARCHIVER_NAME, ARCHIVER_SITE, date, + ARCHIVER_VERSION); + + free(date); + free(domain); + return header; +} + +static int +path_depth(const char *path) +{ + int depth = 0; + const char *p; + + for (p = path; *p; p++) + if (*p == '/') + depth++; + return depth; +} + +static char * +make_relative_prefix(int depth) +{ + char *prefix; + size_t len; + int i; + + if (depth == 0) + return xstrdup(""); + len = depth * 3 + 1; + prefix = xmalloc(len); + prefix[0] = '\0'; + for (i = 0; i < depth; i++) + strcat(prefix, "../"); + return prefix; +} + +static char * +rewrite_links(char *html, const char *rel_path) +{ + int depth; + char *prefix, *result, *new_result; + size_t prefix_len, search_offset, pos, old_len, new_len; + const char *p; + + depth = path_depth(rel_path); + prefix = make_relative_prefix(depth); + prefix_len = strlen(prefix); + result = html; + + search_offset = 0; + while (1) { + p = strstr(result + search_offset, "href=\"/"); + if (!p) + break; + if (p[7] == '/') { + search_offset = (p - result) + 8; + continue; + } + pos = p - result + 6; + old_len = strlen(result); + new_len = old_len - 1 + prefix_len; + new_result = xmalloc(new_len + 1); + memcpy(new_result, result, pos); + memcpy(new_result + pos, prefix, prefix_len); + memcpy(new_result + pos + prefix_len, + result + pos + 1, old_len - pos); + free(result); + result = new_result; + search_offset = pos + prefix_len; + } + + search_offset = 0; + while (1) { + p = strstr(result + search_offset, "src=\"/"); + if (!p) + break; + if (p[6] == '/') { + search_offset = (p - result) + 7; + continue; + } + pos = p - result + 5; + old_len = strlen(result); + new_len = old_len - 1 + prefix_len; + new_result = xmalloc(new_len + 1); + memcpy(new_result, result, pos); + memcpy(new_result + pos, prefix, prefix_len); + memcpy(new_result + pos + prefix_len, + result + pos + 1, old_len - pos); + free(result); + result = new_result; + search_offset = pos + prefix_len; + } + + free(prefix); + return result; +} + +static char * +inline_css(char *html, const char *base) +{ + char *result, *tag, *href, *resolved; + char *new_tag, *new_result; + const char *link_start, *link_end; + const char *href_start, *href_end; + size_t search_offset, tag_offset, tag_len, href_len; + size_t new_tag_len, old_len, new_len, result_len; + Response *resp; + char quote; + + result = html; + search_offset = 0; + + while (1) { + link_start = strcasestr(result + search_offset, + "<link"); + if (!link_start) + break; + link_end = strchr(link_start, '>'); + if (!link_end) + break; + + tag_offset = link_start - result; + tag_len = link_end - link_start; + tag = xmalloc(tag_len + 1); + memcpy(tag, link_start, tag_len); + tag[tag_len] = '\0'; + + if (!strcasestr(tag, "stylesheet")) { + free(tag); + search_offset = 
(link_end - result) + 1; + continue; + } + + href_start = strcasestr(tag, "href="); + if (!href_start) { + free(tag); + search_offset = (link_end - result) + 1; + continue; + } + href_start += 5; + quote = 0; + if (*href_start == '"' || *href_start == '\'') + quote = *href_start++; + + href_end = href_start; + if (quote) { + while (*href_end && *href_end != quote) + href_end++; + } else { + while (*href_end && *href_end != ' ' && + *href_end != '>') + href_end++; + } + + href_len = href_end - href_start; + href = xmalloc(href_len + 1); + memcpy(href, href_start, href_len); + href[href_len] = '\0'; + + resolved = url_resolve(base, href); + if (verbose) + fprintf(stderr, " css: %s\n", resolved); + + resp = fetch_url(resolved); + free(resolved); + + if (resp && resp->status_code < 400 && + resp->size > 0) { + new_tag_len = 7 + resp->size + 8 + 1; + new_tag = xmalloc(new_tag_len); + snprintf(new_tag, new_tag_len, + "<style>%s</style>", resp->data); + + old_len = (link_end + 1) - link_start; + new_len = strlen(new_tag); + result_len = strlen(result); + + new_result = xmalloc( + result_len - old_len + new_len + 1); + memcpy(new_result, result, tag_offset); + memcpy(new_result + tag_offset, + new_tag, new_len); + memcpy(new_result + tag_offset + new_len, + link_end + 1, + result_len - tag_offset - old_len + 1); + + free(result); + result = new_result; + search_offset = tag_offset + new_len; + free(new_tag); + } else { + search_offset = (link_end - result) + 1; + } + + response_free(resp); + free(href); + free(tag); + } + + return result; +} + +static void +extract_links(const char *html, const char *base_url, + UrlQueue *queue, VisitedSet *visited, + int current_depth, Robots *robots) +{ + ResourceList *resources; + Resource *r; + char *norm, *path; + const char *pstart; + + if (current_depth >= max_depth) + return; + + resources = parse_html(html, base_url); + + for (r = resources->head; r; r = r->next) { + if (r->type != RES_PAGE) + continue; + + if (str_starts_with(r->url, "mailto:") || + str_starts_with(r->url, "tel:") || + str_starts_with(r->url, "javascript:") || + str_starts_with(r->url, "#")) + continue; + + if (!url_same_domain(r->url, base_url)) + continue; + + /* Check robots.txt */ + if (robots) { + pstart = r->url; + if (str_starts_with(pstart, "https://")) + pstart += 8; + else if (str_starts_with(pstart, "http://")) + pstart += 7; + while (*pstart && *pstart != '/') + pstart++; + path = xstrdup(pstart[0] ? 
pstart : "/"); + if (!robots_allowed(robots, path)) { + if (verbose) + fprintf(stderr, + " robots: blocked" + " %s\n", r->url); + free(path); + continue; + } + free(path); + } + + norm = url_normalize(r->url); + if (!visited_contains(visited, norm)) { + visited_add(visited, norm); + queue_push(queue, r->url, + current_depth + 1); + if (verbose) + fprintf(stderr, + " queued: %s\n", r->url); + } + free(norm); + } + + reslist_free(resources); +} + +static int +save_page(const char *url, const char *final_url, + const char *data, const char *rel_path) +{ + char *html, *title, *header; + char *full_path, *dir, *last_slash; + size_t full_path_len; + FILE *fp; + + html = xstrdup(data); + title = parse_title(html); + + if (verbose) + fprintf(stderr, " title: %s\n", title); + + html = inline_css(html, final_url); + html = inline_resources(html, final_url, + fetch_and_encode); + html = rewrite_links(html, rel_path); + + header = generate_header(title, url); + + full_path_len = strlen(output_dir) + 1 + + strlen(rel_path) + 1; + full_path = xmalloc(full_path_len); + snprintf(full_path, full_path_len, "%s/%s", + output_dir, rel_path); + + dir = xstrdup(full_path); + last_slash = strrchr(dir, '/'); + if (last_slash) { + *last_slash = '\0'; + mkdirp(dir); + mkdir(dir, 0755); + } + free(dir); + + fp = fopen(full_path, "w"); + if (!fp) { + warn("cannot write: %s", full_path); + free(full_path); + free(header); + free(html); + free(title); + return -1; + } + + fputs(header, fp); + fputs(html, fp); + fclose(fp); + + fprintf(stderr, " saved: %s\n", full_path); + + free(full_path); + free(header); + free(html); + free(title); + return 0; +} + +static int +archive_page(const char *url) +{ + Response *resp; + const char *final_url; + char *rel_path; + int ret; + + if (verbose) + fprintf(stderr, "[0] %s\n", url); + + resp = fetch_url(url); + if (!resp) { + warn("failed to fetch: %s", url); + return -1; + } + if (resp->status_code >= 400) { + warn("HTTP %ld: %s", resp->status_code, url); + response_free(resp); + return -1; + } + if (resp->content_type && + !strstr(resp->content_type, "text/html")) { + if (verbose) + fprintf(stderr, " skip non-HTML: %s\n", + resp->content_type); + response_free(resp); + return 0; + } + + final_url = resp->final_url ? 
resp->final_url : url; + rel_path = url_to_path(url, base_domain); + + ret = save_page(url, final_url, resp->data, rel_path); + + free(rel_path); + response_free(resp); + return ret; +} + +static void +crawl_site(const char *start_url) +{ + UrlQueue *queue; + VisitedSet *visited; + Robots *robots = NULL; + QueueNode *node; + Response *resp; + const char *final_url; + char *norm, *url, *rel_path; + int depth, pages_archived, rate_ms; + time_t start_time, now; + + queue = queue_new(); + visited = visited_new(); + + if (respect_robots) { + fprintf(stderr, "Fetching robots.txt for %s...\n", + base_domain); + robots = robots_fetch(base_domain); + if (robots->nrules > 0) + fprintf(stderr, " %d rules loaded\n", + robots->nrules); + else + fprintf(stderr, " no restrictions\n"); + + if (robots_delay(robots) > 0) { + rate_ms = robots_delay(robots) * 1000; + fprintf(stderr, " crawl-delay: %ds\n", + robots_delay(robots)); + } else { + rate_ms = RATE_LIMIT_MS; + } + } else { + rate_ms = RATE_LIMIT_MS; + } + + norm = url_normalize(start_url); + visited_add(visited, norm); + free(norm); + queue_push(queue, start_url, 0); + + pages_archived = 0; + start_time = time(NULL); + + while (!queue_empty(queue)) { + node = queue_pop(queue); + url = node->url; + depth = node->depth; + + resp = fetch_url(url); + if (!resp || resp->status_code >= 400) { + if (verbose) + fprintf(stderr, "[%d] SKIP %s\n", + depth, url); + response_free(resp); + free(url); + free(node); + continue; + } + + if (resp->content_type && + !strstr(resp->content_type, "text/html")) { + response_free(resp); + free(url); + free(node); + continue; + } + + final_url = resp->final_url ? + resp->final_url : url; + + fprintf(stderr, "[d=%d q=%zu v=%zu] %s\n", + depth, queue_size(queue), + visited_count(visited), url); + + extract_links(resp->data, final_url, queue, + visited, depth, robots); + + rel_path = url_to_path(url, base_domain); + save_page(url, final_url, resp->data, rel_path); + pages_archived++; + + if (pages_archived % PROGRESS_INTERVAL == 0) { + now = time(NULL); + fprintf(stderr, + "\n--- %d pages, %zu queued, " + "%zu visited, %lds ---\n\n", + pages_archived, + queue_size(queue), + visited_count(visited), + (long)(now - start_time)); + } + + free(rel_path); + response_free(resp); + free(url); + free(node); + + usleep(rate_ms * 1000); + } + + now = time(NULL); + fprintf(stderr, + "\nDone: %d pages to %s/ in %lds\n", + pages_archived, output_dir, + (long)(now - start_time)); + + robots_free(robots); + queue_free(queue); + visited_free(visited); +} + +int +main(int argc, char *argv[]) +{ + const char *url; + int opt; + + while ((opt = getopt(argc, argv, "vrRd:o:a:h")) != -1) { + switch (opt) { + case 'v': + verbose = 1; + break; + case 'r': + recursive = 1; + break; + case 'R': + respect_robots = 0; + break; + case 'd': + max_depth = atoi(optarg); + if (max_depth < 1) + max_depth = 1; + break; + case 'o': + output_dir = optarg; + break; + case 'a': + author = optarg; + break; + case 'h': /* fallthrough */ + default: + usage(); + } + } + + if (optind >= argc) + usage(); + + url = argv[optind]; + base_domain = url_get_domain(url); + if (!output_dir) + output_dir = base_domain; + + if (mkdir(output_dir, 0755) != 0 && errno != EEXIST) + die("cannot create directory: %s", output_dir); + + fetch_init(); + + fprintf(stderr, "archiver-bot %s\n", ARCHIVER_VERSION); + fprintf(stderr, "Target: %s\n", url); + fprintf(stderr, "Output: %s/\n", output_dir); + if (recursive) + fprintf(stderr, "Mode: recursive (depth %d)\n", + max_depth); + if 
(!respect_robots) + fprintf(stderr, "Warning: ignoring robots.txt\n"); + fprintf(stderr, "\n"); + + if (recursive) + crawl_site(url); + else + archive_page(url); + + fetch_cleanup(); + free(base_domain); + return 0; +} diff --git a/config.h b/config.h @@ -0,0 +1,31 @@ +/* See LICENSE file for copyright and license details. + * archiver-bot - suckless web archiver + * configuration header + */ + +#ifndef CONFIG_H +#define CONFIG_H + +/* Archiver metadata */ +#define ARCHIVER_NAME "Kris Yotam" +#define ARCHIVER_SITE "krisyotam.com" +#define ARCHIVER_VERSION "0.2.0" + +/* Network settings */ +#define USER_AGENT "archiver-bot/0.2 (+https://krisyotam.com)" +#define CONNECT_TIMEOUT 30L +#define REQUEST_TIMEOUT 60L +#define MAX_REDIRECTS 10L + +/* Crawl settings */ +#define MAX_DEPTH 5 +#define RATE_LIMIT_MS 1000 /* milliseconds between requests */ +#define MAX_FILE_SIZE (50 * 1024 * 1024) /* 50 MB max per resource */ + +/* Output settings */ +#define OUTPUT_EXT ".gwtar.html" + +/* Progress reporting interval (pages between status lines) */ +#define PROGRESS_INTERVAL 10 + +#endif /* CONFIG_H */ diff --git a/crawl.c b/crawl.c @@ -0,0 +1,296 @@ +/* See LICENSE file for copyright and license details. */ + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "crawl.h" +#include "util.h" + +/* FNV-1a hash - fast, good distribution for URL strings */ +static unsigned long +fnv1a(const char *s) +{ + unsigned long h = 2166136261UL; + + for (; *s; s++) { + h ^= (unsigned char)*s; + h *= 16777619UL; + } + return h; +} + +UrlQueue * +queue_new(void) +{ + UrlQueue *q = xmalloc(sizeof(UrlQueue)); + + q->head = NULL; + q->tail = NULL; + q->count = 0; + return q; +} + +void +queue_free(UrlQueue *q) +{ + QueueNode *n, *next; + + if (!q) + return; + for (n = q->head; n; n = next) { + next = n->next; + free(n->url); + free(n); + } + free(q); +} + +void +queue_push(UrlQueue *q, const char *url, int depth) +{ + QueueNode *n = xmalloc(sizeof(QueueNode)); + + n->url = xstrdup(url); + n->depth = depth; + n->next = NULL; + if (q->tail) { + q->tail->next = n; + q->tail = n; + } else { + q->head = n; + q->tail = n; + } + q->count++; +} + +QueueNode * +queue_pop(UrlQueue *q) +{ + QueueNode *n; + + if (!q->head) + return NULL; + n = q->head; + q->head = n->next; + if (!q->head) + q->tail = NULL; + q->count--; + return n; +} + +int +queue_empty(UrlQueue *q) +{ + return q->head == NULL; +} + +size_t +queue_size(UrlQueue *q) +{ + return q->count; +} + +VisitedSet * +visited_new(void) +{ + VisitedSet *v = xmalloc(sizeof(VisitedSet)); + + memset(v->buckets, 0, sizeof(v->buckets)); + v->count = 0; + return v; +} + +void +visited_free(VisitedSet *v) +{ + HashNode *n, *next; + size_t i; + + if (!v) + return; + for (i = 0; i < HT_SIZE; i++) { + for (n = v->buckets[i]; n; n = next) { + next = n->next; + free(n->url); + free(n); + } + } + free(v); +} + +void +visited_add(VisitedSet *v, const char *url) +{ + unsigned long h = fnv1a(url) % HT_SIZE; + HashNode *n; + + /* Check for duplicate first */ + for (n = v->buckets[h]; n; n = n->next) { + if (strcmp(n->url, url) == 0) + return; + } + n = xmalloc(sizeof(HashNode)); + n->url = xstrdup(url); + n->next = v->buckets[h]; + v->buckets[h] = n; + v->count++; +} + +int +visited_contains(VisitedSet *v, const char *url) +{ + unsigned long h = fnv1a(url) % HT_SIZE; + HashNode *n; + + for (n = v->buckets[h]; n; n = n->next) { + if (strcmp(n->url, url) == 0) + return 1; + } + return 0; +} + +size_t +visited_count(VisitedSet *v) +{ + return v->count; +} + 
+char * +url_normalize(const char *url) +{ + char *norm, *p, *hash, *query; + size_t len; + + norm = xstrdup(url); + + /* Remove fragment */ + hash = strchr(norm, '#'); + if (hash) + *hash = '\0'; + + /* Remove query string */ + query = strchr(norm, '?'); + if (query) + *query = '\0'; + + /* Remove trailing slash (but not bare domain slash) */ + len = strlen(norm); + if (len > 1 && norm[len - 1] == '/') { + /* Keep slash if it's just protocol://domain/ */ + p = norm; + if (str_starts_with(p, "https://")) + p += 8; + else if (str_starts_with(p, "http://")) + p += 7; + /* Skip domain */ + while (*p && *p != '/') + p++; + /* Only strip if there's path beyond domain */ + if (p < norm + len - 1) + norm[len - 1] = '\0'; + } + + /* Lowercase the domain part */ + p = norm; + if (str_starts_with(p, "https://")) + p += 8; + else if (str_starts_with(p, "http://")) + p += 7; + while (*p && *p != '/') + *p++ = tolower((unsigned char)*p); + + /* Remove default port :80 or :443 */ + p = norm; + if (str_starts_with(p, "https://")) + p += 8; + else if (str_starts_with(p, "http://")) + p += 7; + { + char *colon = NULL; + char *slash = NULL; + char *scan; + int is_https; + + is_https = str_starts_with(norm, "https://"); + for (scan = p; *scan && *scan != '/'; scan++) { + if (*scan == ':') + colon = scan; + } + slash = scan; + if (colon) { + char port[8]; + size_t plen = slash - colon - 1; + + if (plen < sizeof(port)) { + memcpy(port, colon + 1, plen); + port[plen] = '\0'; + if ((is_https && strcmp(port, "443") == 0) || + (!is_https && strcmp(port, "80") == 0)) { + memmove(colon, slash, + strlen(slash) + 1); + } + } + } + } + + return norm; +} + +char * +url_to_path(const char *url, const char *base_domain) +{ + const char *path_start; + char *path, *query, *hash, *new_path; + size_t len, new_len; + + (void)base_domain; + + path_start = url; + /* Skip protocol */ + if (str_starts_with(url, "https://")) + path_start = url + 8; + else if (str_starts_with(url, "http://")) + path_start = url + 7; + + /* Skip domain */ + while (*path_start && *path_start != '/') + path_start++; + + /* No path or just "/" -> index.html */ + if (!*path_start || strcmp(path_start, "/") == 0) + return xstrdup("index.html"); + + /* Skip leading slash */ + if (*path_start == '/') + path_start++; + + /* Copy path, strip query/fragment */ + path = xstrdup(path_start); + query = strchr(path, '?'); + if (query) + *query = '\0'; + hash = strchr(path, '#'); + if (hash) + *hash = '\0'; + + /* Remove trailing slash */ + len = strlen(path); + if (len > 0 && path[len - 1] == '/') { + path[len - 1] = '\0'; + len--; + } + + /* If path doesn't end in .html/.htm, treat as directory */ + if (len > 0 && !str_ends_with(path, ".html") && + !str_ends_with(path, ".htm")) { + new_len = len + 12; + new_path = xmalloc(new_len); + snprintf(new_path, new_len, "%s/index.html", path); + free(path); + path = new_path; + } + + return path; +} diff --git a/crawl.h b/crawl.h @@ -0,0 +1,57 @@ +/* See LICENSE file for copyright and license details. 
*/ + +#ifndef CRAWL_H +#define CRAWL_H + +#include <stddef.h> + +/* Hash table size (prime, ~64k buckets) */ +#define HT_SIZE 65521 + +/* URL queue for BFS crawling */ +typedef struct QueueNode { + char *url; + int depth; + struct QueueNode *next; +} QueueNode; + +typedef struct { + QueueNode *head; + QueueNode *tail; + size_t count; +} UrlQueue; + +/* Hash table node for visited URLs */ +typedef struct HashNode { + char *url; + struct HashNode *next; +} HashNode; + +/* Hash table based visited set - O(1) lookup */ +typedef struct { + HashNode *buckets[HT_SIZE]; + size_t count; +} VisitedSet; + +/* Queue operations */ +UrlQueue *queue_new(void); +void queue_free(UrlQueue *q); +void queue_push(UrlQueue *q, const char *url, int depth); +QueueNode *queue_pop(UrlQueue *q); +int queue_empty(UrlQueue *q); +size_t queue_size(UrlQueue *q); + +/* Visited set operations (hash table) */ +VisitedSet *visited_new(void); +void visited_free(VisitedSet *v); +void visited_add(VisitedSet *v, const char *url); +int visited_contains(VisitedSet *v, const char *url); +size_t visited_count(VisitedSet *v); + +/* URL normalization for comparison */ +char *url_normalize(const char *url); + +/* Get path component from URL for directory structure */ +char *url_to_path(const char *url, const char *base_domain); + +#endif /* CRAWL_H */ diff --git a/fetch.c b/fetch.c @@ -0,0 +1,168 @@ +/* See LICENSE file for copyright and license details. */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <curl/curl.h> + +#include "config.h" +#include "fetch.h" +#include "util.h" + +static CURL *curl_handle = NULL; + +static size_t +write_cb(void *contents, size_t size, size_t nmemb, void *userp) +{ + size_t realsize = size * nmemb; + Response *resp = (Response *)userp; + char *ptr; + + ptr = xrealloc(resp->data, resp->size + realsize + 1); + resp->data = ptr; + memcpy(&(resp->data[resp->size]), contents, realsize); + resp->size += realsize; + resp->data[resp->size] = '\0'; + return realsize; +} + +/* + * Check if an HTTP status code is transient (worth retrying). 
+ * 429 = rate limited, 5xx = server errors + */ +static int +is_transient(long code) +{ + return code == 429 || code == 500 || code == 502 || + code == 503 || code == 504; +} + +void +fetch_init(void) +{ + curl_global_init(CURL_GLOBAL_ALL); + curl_handle = curl_easy_init(); + if (!curl_handle) + die("curl_easy_init failed"); +} + +void +fetch_cleanup(void) +{ + if (curl_handle) { + curl_easy_cleanup(curl_handle); + curl_handle = NULL; + } + curl_global_cleanup(); +} + +Response * +fetch_url(const char *url) +{ + Response *resp; + CURLcode res; + char *ct, *effective_url; + int attempt; + + for (attempt = 0; attempt < FETCH_MAX_RETRIES; attempt++) { + if (attempt > 0) { + unsigned int delay; + + delay = FETCH_RETRY_BASE * (1 << (attempt - 1)); + warn("retry %d/%d for %s (waiting %us)", + attempt, FETCH_MAX_RETRIES - 1, url, delay); + sleep(delay); + } + + resp = xmalloc(sizeof(Response)); + resp->data = xmalloc(1); + resp->data[0] = '\0'; + resp->size = 0; + resp->content_type = NULL; + resp->status_code = 0; + resp->final_url = NULL; + + curl_easy_reset(curl_handle); + curl_easy_setopt(curl_handle, CURLOPT_URL, url); + curl_easy_setopt(curl_handle, CURLOPT_WRITEFUNCTION, + write_cb); + curl_easy_setopt(curl_handle, CURLOPT_WRITEDATA, + (void *)resp); + curl_easy_setopt(curl_handle, CURLOPT_USERAGENT, + USER_AGENT); + curl_easy_setopt(curl_handle, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl_handle, CURLOPT_MAXREDIRS, + MAX_REDIRECTS); + curl_easy_setopt(curl_handle, CURLOPT_CONNECTTIMEOUT, + CONNECT_TIMEOUT); + curl_easy_setopt(curl_handle, CURLOPT_TIMEOUT, + REQUEST_TIMEOUT); + curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYPEER, 1L); + curl_easy_setopt(curl_handle, CURLOPT_SSL_VERIFYHOST, 2L); + curl_easy_setopt(curl_handle, CURLOPT_ACCEPT_ENCODING, + ""); + + res = curl_easy_perform(curl_handle); + + if (res != CURLE_OK) { + /* Network-level failure */ + if (res == CURLE_OPERATION_TIMEDOUT || + res == CURLE_COULDNT_CONNECT || + res == CURLE_GOT_NOTHING) { + warn("fetch: %s: %s", + url, curl_easy_strerror(res)); + response_free(resp); + resp = NULL; + continue; + } + /* Non-transient curl error */ + warn("fetch: %s: %s", + url, curl_easy_strerror(res)); + response_free(resp); + return NULL; + } + + curl_easy_getinfo(curl_handle, + CURLINFO_RESPONSE_CODE, + &resp->status_code); + + ct = NULL; + if (curl_easy_getinfo(curl_handle, + CURLINFO_CONTENT_TYPE, + &ct) == CURLE_OK && ct) + resp->content_type = xstrdup(ct); + + effective_url = NULL; + if (curl_easy_getinfo(curl_handle, + CURLINFO_EFFECTIVE_URL, + &effective_url) == CURLE_OK && + effective_url) + resp->final_url = xstrdup(effective_url); + + /* Retry on transient HTTP errors */ + if (is_transient(resp->status_code)) { + response_free(resp); + resp = NULL; + continue; + } + + return resp; + } + + /* All retries exhausted */ + warn("fetch: gave up on %s after %d attempts", + url, FETCH_MAX_RETRIES); + return resp; +} + +void +response_free(Response *resp) +{ + if (!resp) + return; + free(resp->data); + free(resp->content_type); + free(resp->final_url); + free(resp); +} diff --git a/fetch.h b/fetch.h @@ -0,0 +1,31 @@ +/* See LICENSE file for copyright and license details. 
*/ + +#ifndef FETCH_H +#define FETCH_H + +#include <stddef.h> + +/* Retry settings */ +#define FETCH_MAX_RETRIES 3 +#define FETCH_RETRY_BASE 2 /* base seconds for exponential backoff */ + +/* Response structure */ +typedef struct { + char *data; + size_t size; + char *content_type; + long status_code; + char *final_url; +} Response; + +/* Initialize/cleanup curl globally */ +void fetch_init(void); +void fetch_cleanup(void); + +/* Fetch a URL with automatic retry on transient errors */ +Response *fetch_url(const char *url); + +/* Free a response */ +void response_free(Response *resp); + +#endif /* FETCH_H */ diff --git a/parse.c b/parse.c @@ -0,0 +1,380 @@ +/* See LICENSE file for copyright and license details. */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> + +#include "parse.h" +#include "util.h" + +ResourceList * +reslist_new(void) +{ + ResourceList *list = xmalloc(sizeof(ResourceList)); + list->head = NULL; + list->tail = NULL; + list->count = 0; + return list; +} + +void +reslist_free(ResourceList *list) +{ + if (!list) + return; + Resource *r = list->head; + while (r) { + Resource *next = r->next; + free(r->url); + free(r); + r = next; + } + free(list); +} + +int +reslist_contains(ResourceList *list, const char *url) +{ + for (Resource *r = list->head; r; r = r->next) + if (strcmp(r->url, url) == 0) + return 1; + return 0; +} + +void +reslist_add(ResourceList *list, const char *url, ResourceType type) +{ + if (!url || !*url || reslist_contains(list, url)) + return; + + /* Skip data: URLs */ + if (str_starts_with(url, "data:")) + return; + + Resource *r = xmalloc(sizeof(Resource)); + r->url = xstrdup(url); + r->type = type; + r->next = NULL; + + if (list->tail) { + list->tail->next = r; + list->tail = r; + } else { + list->head = r; + list->tail = r; + } + list->count++; +} + +/* Extract attribute value from tag */ +static char * +get_attr(const char *tag, const char *attr) +{ + size_t attr_len = strlen(attr); + const char *p = tag; + + while (*p) { + /* Skip whitespace */ + while (*p && isspace((unsigned char)*p)) + p++; + + /* Check for attribute name */ + if (strncasecmp(p, attr, attr_len) == 0) { + p += attr_len; + while (*p && isspace((unsigned char)*p)) + p++; + if (*p == '=') { + p++; + while (*p && isspace((unsigned char)*p)) + p++; + + char quote = 0; + if (*p == '"' || *p == '\'') { + quote = *p++; + } + + const char *start = p; + if (quote) { + while (*p && *p != quote) + p++; + } else { + while (*p && !isspace((unsigned char)*p) && *p != '>') + p++; + } + + size_t len = p - start; + char *value = xmalloc(len + 1); + memcpy(value, start, len); + value[len] = '\0'; + return value; + } + } + + /* Skip to next attribute */ + while (*p && !isspace((unsigned char)*p) && *p != '>') + p++; + } + + return NULL; +} + +/* Determine resource type from URL/tag */ +static ResourceType +guess_resource_type(const char *url, const char *tag_name) +{ + if (!tag_name) + return RES_OTHER; + + if (strcasecmp(tag_name, "img") == 0) + return RES_IMAGE; + + if (strcasecmp(tag_name, "link") == 0) { + if (strstr(url, ".css") || strstr(url, "stylesheet")) + return RES_CSS; + if (strstr(url, ".woff") || strstr(url, ".ttf") || strstr(url, ".otf")) + return RES_FONT; + return RES_OTHER; + } + + if (strcasecmp(tag_name, "a") == 0) + return RES_PAGE; + + if (strcasecmp(tag_name, "script") == 0) + return RES_OTHER; + + /* Check by extension */ + char *lower = xstrdup(url); + str_tolower(lower); + + ResourceType type = RES_OTHER; + if (strstr(lower, ".jpg") || 
strstr(lower, ".jpeg") || + strstr(lower, ".png") || strstr(lower, ".gif") || + strstr(lower, ".webp") || strstr(lower, ".svg") || + strstr(lower, ".ico")) + type = RES_IMAGE; + else if (strstr(lower, ".css")) + type = RES_CSS; + else if (strstr(lower, ".woff") || strstr(lower, ".woff2") || + strstr(lower, ".ttf") || strstr(lower, ".otf") || + strstr(lower, ".eot")) + type = RES_FONT; + + free(lower); + return type; +} + +ResourceList * +parse_html(const char *html, const char *base_url) +{ + ResourceList *list = reslist_new(); + const char *p = html; + + while (*p) { + /* Find tag start */ + if (*p != '<') { + p++; + continue; + } + p++; + + /* Skip comments */ + if (str_starts_with(p, "!--")) { + p = strstr(p, "-->"); + if (p) + p += 3; + else + break; + continue; + } + + /* Get tag name */ + const char *tag_start = p; + while (*p && !isspace((unsigned char)*p) && *p != '>' && *p != '/') + p++; + + size_t tag_len = p - tag_start; + if (tag_len == 0 || tag_len > 20) + continue; + + char tag_name[21]; + memcpy(tag_name, tag_start, tag_len); + tag_name[tag_len] = '\0'; + + /* Find tag end */ + const char *tag_end = strchr(p, '>'); + if (!tag_end) + break; + + /* Extract tag content for attribute parsing */ + size_t content_len = tag_end - tag_start; + char *tag_content = xmalloc(content_len + 1); + memcpy(tag_content, tag_start, content_len); + tag_content[content_len] = '\0'; + + /* Check for relevant attributes based on tag */ + char *url = NULL; + + if (strcasecmp(tag_name, "img") == 0) { + url = get_attr(tag_content, "src"); + if (!url) + url = get_attr(tag_content, "data-src"); + } else if (strcasecmp(tag_name, "link") == 0) { + url = get_attr(tag_content, "href"); + } else if (strcasecmp(tag_name, "script") == 0) { + url = get_attr(tag_content, "src"); + } else if (strcasecmp(tag_name, "a") == 0) { + url = get_attr(tag_content, "href"); + } else if (strcasecmp(tag_name, "source") == 0) { + url = get_attr(tag_content, "srcset"); + if (!url) + url = get_attr(tag_content, "src"); + } + + if (url && *url) { + char *resolved = url_resolve(base_url, url); + ResourceType type = guess_resource_type(resolved, tag_name); + reslist_add(list, resolved, type); + free(resolved); + } + + free(url); + free(tag_content); + p = tag_end + 1; + } + + return list; +} + +char * +parse_title(const char *html) +{ + const char *start = strcasestr(html, "<title"); + if (!start) + return xstrdup("Untitled"); + + start = strchr(start, '>'); + if (!start) + return xstrdup("Untitled"); + start++; + + const char *end = strcasestr(start, "</title>"); + if (!end) + return xstrdup("Untitled"); + + size_t len = end - start; + char *title = xmalloc(len + 1); + memcpy(title, start, len); + title[len] = '\0'; + + return str_trim(title); +} + +/* Helper to find and replace in string, returns new allocated string */ +static char * +str_replace_first(const char *str, const char *old, size_t old_len, const char *new, size_t new_len) +{ + const char *pos = strstr(str, old); + if (!pos) + return xstrdup(str); + + size_t before_len = pos - str; + size_t after_len = strlen(pos + old_len); + size_t result_len = before_len + new_len + after_len; + + char *result = xmalloc(result_len + 1); + memcpy(result, str, before_len); + memcpy(result + before_len, new, new_len); + memcpy(result + before_len + new_len, pos + old_len, after_len + 1); + + return result; +} + +char * +inline_resources(const char *html, const char *base_url, + char *(*fetch_and_encode)(const char *url, const char *base_url)) +{ + char *result = xstrdup(html); + 
size_t search_offset = 0; + + /* Process img tags */ + while (1) { + const char *p = strcasestr(result + search_offset, "<img"); + if (!p) + break; + + const char *tag_end = strchr(p, '>'); + if (!tag_end) + break; + + /* Calculate offset for this tag */ + size_t tag_offset = p - result; + + /* Find src attribute */ + const char *src_start = strcasestr(p, "src="); + if (!src_start || src_start > tag_end) { + /* No src, skip this img */ + search_offset = (tag_end - result) + 1; + continue; + } + + src_start += 4; + char quote = 0; + if (*src_start == '"' || *src_start == '\'') + quote = *src_start++; + + const char *src_end = src_start; + if (quote) { + while (*src_end && *src_end != quote) + src_end++; + } else { + while (*src_end && !isspace((unsigned char)*src_end) && *src_end != '>') + src_end++; + } + + /* Extract URL */ + size_t url_len = src_end - src_start; + char *url = xmalloc(url_len + 1); + memcpy(url, src_start, url_len); + url[url_len] = '\0'; + + /* Skip if already data URI */ + if (str_starts_with(url, "data:")) { + free(url); + search_offset = (tag_end - result) + 1; + continue; + } + + /* Fetch and encode */ + char *data_uri = fetch_and_encode(url, base_url); + if (data_uri) { + /* Build old and new strings for replacement */ + char *old_attr = xmalloc(url_len + 8); + snprintf(old_attr, url_len + 8, "src=%c%s%c", + quote ? quote : '"', url, quote ? quote : '"'); + + size_t new_attr_len = 5 + strlen(data_uri) + 2; + char *new_attr = xmalloc(new_attr_len + 1); + snprintf(new_attr, new_attr_len + 1, "src=\"%s\"", data_uri); + + char *new_result = str_replace_first(result, old_attr, strlen(old_attr), + new_attr, strlen(new_attr)); + /* Continue searching after the new data URI */ + search_offset = tag_offset + strlen(new_attr); + + free(result); + result = new_result; + + free(old_attr); + free(new_attr); + free(data_uri); + } else { + /* Fetch failed, skip this img tag */ + search_offset = (tag_end - result) + 1; + } + + free(url); + } + + return result; +} diff --git a/parse.h b/parse.h @@ -0,0 +1,51 @@ +/* See LICENSE file for copyright and license details. */ + +#ifndef PARSE_H +#define PARSE_H + +#include <stddef.h> + +/* Resource types */ +typedef enum { + RES_IMAGE, + RES_CSS, + RES_FONT, + RES_PAGE, + RES_OTHER +} ResourceType; + +/* Extracted resource */ +typedef struct Resource { + char *url; + ResourceType type; + struct Resource *next; +} Resource; + +/* Resource list */ +typedef struct { + Resource *head; + Resource *tail; + size_t count; +} ResourceList; + +/* Create/destroy resource list */ +ResourceList *reslist_new(void); +void reslist_free(ResourceList *list); + +/* Add a resource (url is copied) */ +void reslist_add(ResourceList *list, const char *url, ResourceType type); + +/* Check if URL already in list */ +int reslist_contains(ResourceList *list, const char *url); + +/* Extract resources from HTML */ +ResourceList *parse_html(const char *html, const char *base_url); + +/* Extract title from HTML */ +char *parse_title(const char *html); + +/* Find and inline resources in HTML */ +char *inline_resources(const char *html, const char *base_url, + char *(*fetch_and_encode)(const char *url, const char *base_url)); + +#endif /* PARSE_H */ diff --git a/robots.c b/robots.c @@ -0,0 +1,200 @@ +/* See LICENSE file for copyright and license details. */ + +#include <ctype.h> +#include <stdio.h> +#include <stdlib.h> +#include <string.h> + +#include "robots.h" +#include "fetch.h" +#include "util.h" + +/* + * Parse robots.txt content for our user-agent. 
+ * + * We look for rules matching "archiver-bot" first, + * then fall back to "*" (wildcard) rules. + */ +Robots * +robots_fetch(const char *domain) +{ + Robots *r; + Response *resp; + char url[1024]; + char *line, *saveptr, *text; + int in_our_group, in_star_group, found_our_group; + int star_nrules, star_delay; + RobotsRule star_rules[MAX_RULES]; + + r = xmalloc(sizeof(Robots)); + r->domain = xstrdup(domain); + r->nrules = 0; + r->crawl_delay = 0; + + snprintf(url, sizeof(url), "https://%s/robots.txt", domain); + resp = fetch_url(url); + if (!resp || resp->status_code >= 400 || !resp->data) { + /* No robots.txt = everything allowed */ + response_free(resp); + return r; + } + + text = xstrdup(resp->data); + response_free(resp); + + in_our_group = 0; + in_star_group = 0; + found_our_group = 0; + star_nrules = 0; + star_delay = 0; + + for (line = strtok_r(text, "\n", &saveptr); line; + line = strtok_r(NULL, "\n", &saveptr)) { + char *trimmed, *colon, *key, *val; + + trimmed = str_trim(line); + + /* Skip empty lines and comments */ + if (!*trimmed || *trimmed == '#') + continue; + + /* Strip inline comments */ + colon = strchr(trimmed, '#'); + if (colon) + *colon = '\0'; + + /* Find key: value */ + colon = strchr(trimmed, ':'); + if (!colon) + continue; + *colon = '\0'; + key = str_trim(trimmed); + val = str_trim(colon + 1); + + if (strcasecmp(key, "user-agent") == 0) { + /* New user-agent group */ + if (strcasestr(val, "archiver-bot")) { + in_our_group = 1; + in_star_group = 0; + found_our_group = 1; + } else if (strcmp(val, "*") == 0 && + !found_our_group) { + in_star_group = 1; + in_our_group = 0; + } else { + in_our_group = 0; + in_star_group = 0; + } + continue; + } + + if (strcasecmp(key, "disallow") == 0) { + if (in_our_group && r->nrules < MAX_RULES) { + r->rules[r->nrules].path = xstrdup(val); + r->rules[r->nrules].allow = 0; + r->nrules++; + } else if (in_star_group && + star_nrules < MAX_RULES) { + star_rules[star_nrules].path = xstrdup(val); + star_rules[star_nrules].allow = 0; + star_nrules++; + } + } else if (strcasecmp(key, "allow") == 0) { + if (in_our_group && r->nrules < MAX_RULES) { + r->rules[r->nrules].path = xstrdup(val); + r->rules[r->nrules].allow = 1; + r->nrules++; + } else if (in_star_group && + star_nrules < MAX_RULES) { + star_rules[star_nrules].path = xstrdup(val); + star_rules[star_nrules].allow = 1; + star_nrules++; + } + } else if (strcasecmp(key, "crawl-delay") == 0) { + int delay = atoi(val); + + if (delay > 0) { + if (in_our_group) + r->crawl_delay = delay; + else if (in_star_group) + star_delay = delay; + } + } + } + + /* If no specific rules for us, use wildcard rules */ + if (!found_our_group && star_nrules > 0) { + int i; + + for (i = 0; i < star_nrules; i++) + r->rules[i] = star_rules[i]; + r->nrules = star_nrules; + r->crawl_delay = star_delay; + } else if (!found_our_group) { + /* Free star rules if we didn't use them */ + int i; + + for (i = 0; i < star_nrules; i++) + free(star_rules[i].path); + } + + free(text); + return r; +} + +int +robots_allowed(Robots *r, const char *path) +{ + int i, best_len, allowed; + + if (!r || r->nrules == 0) + return 1; + + /* + * Match the most specific (longest) rule. + * If multiple rules of same length, Allow wins. 
+ */ + best_len = -1; + allowed = 1; + + for (i = 0; i < r->nrules; i++) { + int plen = strlen(r->rules[i].path); + + /* Empty disallow = allow all */ + if (plen == 0 && !r->rules[i].allow) + continue; + + if (strncmp(path, r->rules[i].path, plen) == 0) { + if (plen > best_len) { + best_len = plen; + allowed = r->rules[i].allow; + } else if (plen == best_len && + r->rules[i].allow) { + allowed = 1; + } + } + } + + return allowed; +} + +int +robots_delay(Robots *r) +{ + if (!r) + return 0; + return r->crawl_delay; +} + +void +robots_free(Robots *r) +{ + int i; + + if (!r) + return; + for (i = 0; i < r->nrules; i++) + free(r->rules[i].path); + free(r->domain); + free(r); +} diff --git a/robots.h b/robots.h @@ -0,0 +1,35 @@ +/* See LICENSE file for copyright and license details. */ + +#ifndef ROBOTS_H +#define ROBOTS_H + +/* Maximum rules per robots.txt */ +#define MAX_RULES 512 + +/* Rule types */ +typedef struct { + char *path; + int allow; /* 1 = allow, 0 = disallow */ +} RobotsRule; + +/* Parsed robots.txt for a domain */ +typedef struct { + char *domain; + RobotsRule rules[MAX_RULES]; + int nrules; + int crawl_delay; /* seconds, 0 = none specified */ +} Robots; + +/* Fetch and parse robots.txt for a domain */ +Robots *robots_fetch(const char *domain); + +/* Check if a path is allowed */ +int robots_allowed(Robots *r, const char *path); + +/* Get crawl delay in seconds (0 = none) */ +int robots_delay(Robots *r); + +/* Free robots struct */ +void robots_free(Robots *r); + +#endif /* ROBOTS_H */ diff --git a/util.c b/util.c @@ -0,0 +1,307 @@ +/* See LICENSE file for copyright and license details. */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <stdarg.h> +#include <ctype.h> +#include <time.h> + +#include "util.h" + +static const char base64_table[] = + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +void +die(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + if (fmt[0] && fmt[strlen(fmt)-1] == ':') { + fputc(' ', stderr); + perror(NULL); + } else { + fputc('\n', stderr); + } + exit(1); +} + +void +warn(const char *fmt, ...) +{ + va_list ap; + va_start(ap, fmt); + fprintf(stderr, "warning: "); + vfprintf(stderr, fmt, ap); + va_end(ap); + fputc('\n', stderr); +} + +void * +xmalloc(size_t size) +{ + void *p = malloc(size); + if (!p) + die("malloc:"); + return p; +} + +void * +xrealloc(void *ptr, size_t size) +{ + void *p = realloc(ptr, size); + if (!p) + die("realloc:"); + return p; +} + +char * +xstrdup(const char *s) +{ + char *p = strdup(s); + if (!p) + die("strdup:"); + return p; +} + +char * +base64_encode(const unsigned char *data, size_t input_len, size_t *output_len) +{ + size_t olen = 4 * ((input_len + 2) / 3); + char *encoded = xmalloc(olen + 1); + + size_t i, j; + for (i = 0, j = 0; i < input_len;) { + unsigned int a = i < input_len ? data[i++] : 0; + unsigned int b = i < input_len ? data[i++] : 0; + unsigned int c = i < input_len ? 
data[i++] : 0; + unsigned int triple = (a << 16) | (b << 8) | c; + + encoded[j++] = base64_table[(triple >> 18) & 0x3F]; + encoded[j++] = base64_table[(triple >> 12) & 0x3F]; + encoded[j++] = base64_table[(triple >> 6) & 0x3F]; + encoded[j++] = base64_table[triple & 0x3F]; + } + + /* Add padding */ + size_t mod = input_len % 3; + if (mod) { + encoded[olen - 1] = '='; + if (mod == 1) + encoded[olen - 2] = '='; + } + + encoded[olen] = '\0'; + if (output_len) + *output_len = olen; + return encoded; +} + +int +str_starts_with(const char *str, const char *prefix) +{ + return strncmp(str, prefix, strlen(prefix)) == 0; +} + +int +str_ends_with(const char *str, const char *suffix) +{ + size_t slen = strlen(str); + size_t suflen = strlen(suffix); + if (suflen > slen) + return 0; + return strcmp(str + slen - suflen, suffix) == 0; +} + +char * +str_tolower(char *str) +{ + for (char *p = str; *p; p++) + *p = tolower((unsigned char)*p); + return str; +} + +char * +str_trim(char *str) +{ + char *end; + while (isspace((unsigned char)*str)) + str++; + if (*str == '\0') + return str; + end = str + strlen(str) - 1; + while (end > str && isspace((unsigned char)*end)) + end--; + end[1] = '\0'; + return str; +} + +char * +url_get_domain(const char *url) +{ + const char *start, *end; + char *domain; + + /* Skip protocol */ + if (str_starts_with(url, "https://")) + start = url + 8; + else if (str_starts_with(url, "http://")) + start = url + 7; + else + start = url; + + /* Find end of domain */ + end = start; + while (*end && *end != '/' && *end != ':' && *end != '?') + end++; + + size_t len = end - start; + domain = xmalloc(len + 1); + memcpy(domain, start, len); + domain[len] = '\0'; + + return domain; +} + +int +url_same_domain(const char *url1, const char *url2) +{ + char *d1 = url_get_domain(url1); + char *d2 = url_get_domain(url2); + int same = strcasecmp(d1, d2) == 0; + free(d1); + free(d2); + return same; +} + +char * +url_resolve(const char *base, const char *relative) +{ + char *result; + + /* Already absolute */ + if (str_starts_with(relative, "http://") || + str_starts_with(relative, "https://") || + str_starts_with(relative, "data:")) { + return xstrdup(relative); + } + + /* Protocol-relative */ + if (str_starts_with(relative, "//")) { + size_t len = 6 + strlen(relative); + result = xmalloc(len + 1); + snprintf(result, len + 1, "https:%s", relative); + return result; + } + + char *domain = url_get_domain(base); + const char *proto = str_starts_with(base, "https://") ? 
"https://" : "http://"; + + /* Root-relative */ + if (relative[0] == '/') { + size_t len = strlen(proto) + strlen(domain) + strlen(relative); + result = xmalloc(len + 1); + snprintf(result, len + 1, "%s%s%s", proto, domain, relative); + free(domain); + return result; + } + + /* Find base path */ + const char *path_start; + if (str_starts_with(base, "https://")) + path_start = base + 8; + else if (str_starts_with(base, "http://")) + path_start = base + 7; + else + path_start = base; + + /* Skip domain */ + while (*path_start && *path_start != '/') + path_start++; + + /* Find last slash in path */ + const char *last_slash = strrchr(path_start, '/'); + if (!last_slash) + last_slash = path_start; + + size_t base_len = last_slash - path_start + 1; + size_t len = strlen(proto) + strlen(domain) + base_len + strlen(relative); + result = xmalloc(len + 1); + snprintf(result, len + 1, "%s%s%.*s%s", proto, domain, (int)base_len, path_start, relative); + + free(domain); + return result; +} + +char * +get_mime_type(const char *url) +{ + /* Strip query string */ + char *copy = xstrdup(url); + char *query = strchr(copy, '?'); + if (query) + *query = '\0'; + + str_tolower(copy); + + const char *mime = "application/octet-stream"; + + if (str_ends_with(copy, ".jpg") || str_ends_with(copy, ".jpeg")) + mime = "image/jpeg"; + else if (str_ends_with(copy, ".png")) + mime = "image/png"; + else if (str_ends_with(copy, ".gif")) + mime = "image/gif"; + else if (str_ends_with(copy, ".webp")) + mime = "image/webp"; + else if (str_ends_with(copy, ".svg")) + mime = "image/svg+xml"; + else if (str_ends_with(copy, ".ico")) + mime = "image/x-icon"; + else if (str_ends_with(copy, ".css")) + mime = "text/css"; + else if (str_ends_with(copy, ".js")) + mime = "application/javascript"; + else if (str_ends_with(copy, ".woff")) + mime = "font/woff"; + else if (str_ends_with(copy, ".woff2")) + mime = "font/woff2"; + else if (str_ends_with(copy, ".ttf")) + mime = "font/ttf"; + else if (str_ends_with(copy, ".otf")) + mime = "font/otf"; + else if (str_ends_with(copy, ".eot")) + mime = "application/vnd.ms-fontobject"; + + free(copy); + return xstrdup(mime); +} + +char * +sanitize_filename(const char *url) +{ + char *domain = url_get_domain(url); + size_t len = strlen(domain) + 32; + char *filename = xmalloc(len); + + /* Replace dots with underscores */ + for (char *p = domain; *p; p++) + if (*p == '.') + *p = '_'; + + snprintf(filename, len, "%s", domain); + free(domain); + return filename; +} + +char * +get_iso_date(void) +{ + time_t t = time(NULL); + struct tm *tm = gmtime(&t); + char *buf = xmalloc(32); + strftime(buf, 32, "%Y-%m-%dT%H:%M:%SZ", tm); + return buf; +} diff --git a/util.h b/util.h @@ -0,0 +1,39 @@ +/* See LICENSE file for copyright and license details. 
*/ + +#ifndef UTIL_H +#define UTIL_H + +#include <stddef.h> + +/* Memory allocation with error handling */ +void *xmalloc(size_t size); +void *xrealloc(void *ptr, size_t size); +char *xstrdup(const char *s); + +/* Base64 encoding */ +char *base64_encode(const unsigned char *data, size_t input_len, + size_t *output_len); + +/* String utilities */ +int str_starts_with(const char *str, const char *prefix); +int str_ends_with(const char *str, const char *suffix); +char *str_tolower(char *str); +char *str_trim(char *str); + +/* URL utilities */ +char *url_resolve(const char *base, const char *relative); +char *url_get_domain(const char *url); +int url_same_domain(const char *url1, const char *url2); + +/* File utilities */ +char *get_mime_type(const char *url); +char *sanitize_filename(const char *url); + +/* Time utilities */ +char *get_iso_date(void); + +/* Error handling */ +void die(const char *fmt, ...); +void warn(const char *fmt, ...); + +#endif /* UTIL_H */
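
Taken together, these headers compose into the data-URI pipeline that fetch_and_encode() in archiver.c drives. A minimal driver sketch against the declared APIs (the example URL is a placeholder; error paths abbreviated):

```c
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include "fetch.h"
#include "util.h"

int
main(void)
{
	Response *resp;
	char *mime, *b64, *uri;
	size_t b64_len, len;
	const char *url = "https://example.com/logo.png";

	fetch_init();
	resp = fetch_url(url);
	if (!resp || resp->status_code >= 400)
		die("fetch failed: %s", url);

	mime = get_mime_type(url);
	b64 = base64_encode((unsigned char *)resp->data,
	    resp->size, &b64_len);

	/* "data:" (5) + mime + ";base64," (8) + payload + NUL */
	len = 5 + strlen(mime) + 8 + b64_len + 1;
	uri = xmalloc(len);
	snprintf(uri, len, "data:%s;base64,%s", mime, b64);
	printf("%.60s...\n", uri);

	free(uri);
	free(b64);
	free(mime);
	response_free(resp);
	fetch_cleanup();
	return 0;
}
```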