sbot

Simple web archiver — self-contained GWTAR archives
git clone https://git.krisyotam.com/krisyotam/sbot.git
Log | Files | Refs | README | LICENSE
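
Example invocation (flags as defined in usage() in archiver.c; the URL and author name are placeholders):

    sbot -v -r -d 2 -o out -a "Jane Doe" https://example.com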

archiver.c (16244B)


      1 /* See LICENSE file for copyright and license details.
      2  *
      3  * sbot - Simple Archiver Bot
      4  *
      5  * Creates self-contained archives of websites with all
      6  * resources inlined as data URIs.
      7  */
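        /*
         * Inlining means that, e.g., <img src="logo.png"> is
         * rewritten as <img src="data:image/png;base64,iVBORw0...">,
         * so the archived page renders without network access.
         */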
      8 
      9 #include <errno.h>
     10 #include <stdio.h>
     11 #include <stdlib.h>
     12 #include <string.h>
     13 #include <sys/stat.h>
     14 #include <time.h>
     15 #include <unistd.h>
     16 
     17 #include "config.h"
     18 #include "crawl.h"
     19 #include "detect.h"
     20 #include "fetch.h"
     21 #include "parse.h"
     22 #include "robots.h"
     23 #include "util.h"
     24 
     25 /* Global options */
     26 static int verbose = 0;
     27 static int recursive = 0;
     28 static int max_depth = MAX_DEPTH;
     29 static int respect_robots = 1;
     30 static const char *author = "Unknown";
     31 static const char *output_dir = NULL;
     32 static char *base_domain = NULL;
     33 
     34 static void
     35 usage(void)
     36 {
     37 	fprintf(stderr,
     38 	    "usage: sbot [-vrR] [-d depth] [-o dir]"
     39 	    " [-a author] url\n"
     40 	    "\n"
     41 	    "  -v          verbose output\n"
     42 	    "  -r          recursive (crawl entire site)\n"
     43 	    "  -R          ignore robots.txt\n"
     44 	    "  -d depth    max crawl depth (default: %d)\n"
     45 	    "  -o dir      output directory\n"
     46 	    "  -a author   site author name\n",
     47 	    MAX_DEPTH);
     48 	exit(1);
     49 }
     50 
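        /*
         * Create every parent component of path (like mkdir -p),
         * but not the final component; callers create the leaf
         * directory themselves.
         */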
     51 static int
     52 mkdirp(const char *path)
     53 {
     54 	char *p, *sep;
     55 
     56 	p = xstrdup(path);
     57 	for (sep = p + 1; *sep; sep++) {
     58 		if (*sep == '/') {
     59 			*sep = '\0';
     60 			if (mkdir(p, 0755) != 0 && errno != EEXIST) {
     61 				free(p);
     62 				return -1;
     63 			}
     64 			*sep = '/';
     65 		}
     66 	}
     67 	free(p);
     68 	return 0;
     69 }
     70 
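        /*
         * Fetch one resource relative to base and return it as a
         * "data:<mime>;base64,<payload>" URI, or NULL on failure.
         */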
     71 static char *
     72 fetch_and_encode(const char *url, const char *base)
     73 {
     74 	char *resolved, *mime, *b64, *data_uri, *semi;
     75 	Response *resp;
     76 	size_t b64_len, uri_len;
     77 
     78 	resolved = url_resolve(base, url);
     79 	if (verbose)
     80 		fprintf(stderr, "    resource: %s\n", resolved);
     81 
     82 	resp = fetch_url(resolved);
     83 	if (!resp || resp->status_code >= 400 || resp->size == 0) {
     84 		free(resolved);
     85 		response_free(resp);
     86 		return NULL;
     87 	}
     88 
     89 	if (resp->content_type) {
     90 		mime = xstrdup(resp->content_type);
     91 		semi = strchr(mime, ';');
     92 		if (semi)
     93 			*semi = '\0';
     94 		str_trim(mime);
     95 	} else {
     96 		mime = get_mime_type(resolved);
     97 	}
     98 
     99 	b64 = base64_encode((unsigned char *)resp->data,
    100 	                     resp->size, &b64_len);
    101 
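         	/* "data:" (5) + mime + ";base64," (8) + payload + NUL */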
    102 	uri_len = 5 + strlen(mime) + 8 + b64_len + 1;
    103 	data_uri = xmalloc(uri_len);
    104 	snprintf(data_uri, uri_len, "data:%s;base64,%s", mime, b64);
    105 
    106 	free(mime);
    107 	free(b64);
    108 	free(resolved);
    109 	response_free(resp);
    110 	return data_uri;
    111 }
    112 
    113 static char *
    114 generate_header(const char *title, const char *source_url)
    115 {
    116 	char *date, *domain, *header;
    117 	size_t len;
    118 
    119 	date = get_iso_date();
    120 	domain = url_get_domain(source_url);
    121 
    122 	len = 2048 + strlen(title) + strlen(source_url) +
    123 	      strlen(author) + strlen(domain);
    124 	header = xmalloc(len);
    125 
    126 	snprintf(header, len,
    127 	    "<!--\n"
    128 	    "========================================"
    129 	    "========================================\n"
    130 	    "  GWTAR ARCHIVE\n"
    131 	    "========================================"
    132 	    "========================================\n"
    133 	    "\n"
    134 	    "  Title:        %s\n"
    135 	    "  Source URL:   %s\n"
    136 	    "  Domain:       %s\n"
    137 	    "  Author:       %s\n"
    138 	    "\n"
    139 	    "  Archived by:  %s\n"
    140 	    "  Archived on:  %s\n"
    141 	    "  Archive date: %s\n"
    142 	    "\n"
    143 	    "  Generator:    sbot/%s\n"
    144 	    "  Format:       GWTAR (Gwern Web Tar Archive)\n"
    145 	    "\n"
    146 	    "========================================"
    147 	    "========================================\n"
    148 	    "-->\n",
    149 	    title, source_url, domain, author,
    150 	    ARCHIVER_NAME, ARCHIVER_SITE, date,
    151 	    ARCHIVER_VERSION);
    152 
    153 	free(date);
    154 	free(domain);
    155 	return header;
    156 }
    157 
    158 static int
    159 path_depth(const char *path)
    160 {
    161 	int depth = 0;
    162 	const char *p;
    163 
    164 	for (p = path; *p; p++)
    165 		if (*p == '/')
    166 			depth++;
    167 	return depth;
    168 }
    169 
    170 static char *
    171 make_relative_prefix(int depth)
    172 {
    173 	char *prefix;
    174 	size_t len;
    175 	int i;
    176 
    177 	if (depth == 0)
    178 		return xstrdup("");
    179 	len = depth * 3 + 1;
    180 	prefix = xmalloc(len);
    181 	prefix[0] = '\0';
    182 	for (i = 0; i < depth; i++)
    183 		strcat(prefix, "../");
    184 	return prefix;
    185 }
    186 
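         /*
          * Rewrite root-relative links so pages work on disk: for a
          * page saved at blog/post/index.html (depth 2), e.g.
          * href="/css/site.css" becomes href="../../css/site.css".
          * Protocol-relative "//host/..." URLs are left alone.
          */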
    187 static char *
    188 rewrite_links(char *html, const char *rel_path)
    189 {
    190 	int depth;
    191 	char *prefix, *result, *new_result;
    192 	size_t prefix_len, search_offset, pos, old_len, new_len;
    193 	const char *p;
    194 
    195 	depth = path_depth(rel_path);
    196 	prefix = make_relative_prefix(depth);
    197 	prefix_len = strlen(prefix);
    198 	result = html;
    199 
    200 	search_offset = 0;
    201 	while (1) {
    202 		p = strstr(result + search_offset, "href=\"/");
    203 		if (!p)
    204 			break;
    205 		if (p[7] == '/') {
    206 			search_offset = (p - result) + 8;
    207 			continue;
    208 		}
    209 		pos = p - result + 6;
    210 		old_len = strlen(result);
    211 		new_len = old_len - 1 + prefix_len;
    212 		new_result = xmalloc(new_len + 1);
    213 		memcpy(new_result, result, pos);
    214 		memcpy(new_result + pos, prefix, prefix_len);
    215 		memcpy(new_result + pos + prefix_len,
    216 		       result + pos + 1, old_len - pos);
    217 		free(result);
    218 		result = new_result;
    219 		search_offset = pos + prefix_len;
    220 	}
    221 
    222 	search_offset = 0;
    223 	while (1) {
    224 		p = strstr(result + search_offset, "src=\"/");
    225 		if (!p)
    226 			break;
    227 		if (p[6] == '/') {
    228 			search_offset = (p - result) + 7;
    229 			continue;
    230 		}
    231 		pos = p - result + 5;
    232 		old_len = strlen(result);
    233 		new_len = old_len - 1 + prefix_len;
    234 		new_result = xmalloc(new_len + 1);
    235 		memcpy(new_result, result, pos);
    236 		memcpy(new_result + pos, prefix, prefix_len);
    237 		memcpy(new_result + pos + prefix_len,
    238 		       result + pos + 1, old_len - pos);
    239 		free(result);
    240 		result = new_result;
    241 		search_offset = pos + prefix_len;
    242 	}
    243 
    244 	free(prefix);
    245 	return result;
    246 }
    247 
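         /*
          * Replace each <link rel="stylesheet" href=...> tag with an
          * inline <style>...</style> block holding the fetched CSS;
          * on fetch failure the original tag is kept.
          */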
    248 static char *
    249 inline_css(char *html, const char *base)
    250 {
    251 	char *result, *tag, *href, *resolved;
    252 	char *new_tag, *new_result;
    253 	const char *link_start, *link_end;
    254 	const char *href_start, *href_end;
    255 	size_t search_offset, tag_offset, tag_len, href_len;
    256 	size_t new_tag_len, old_len, new_len, result_len;
    257 	Response *resp;
    258 	char quote;
    259 
    260 	result = html;
    261 	search_offset = 0;
    262 
    263 	while (1) {
    264 		link_start = strcasestr(result + search_offset,
    265 		                        "<link");
    266 		if (!link_start)
    267 			break;
    268 		link_end = strchr(link_start, '>');
    269 		if (!link_end)
    270 			break;
    271 
    272 		tag_offset = link_start - result;
    273 		tag_len = link_end - link_start;
    274 		tag = xmalloc(tag_len + 1);
    275 		memcpy(tag, link_start, tag_len);
    276 		tag[tag_len] = '\0';
    277 
    278 		if (!strcasestr(tag, "stylesheet")) {
    279 			free(tag);
    280 			search_offset = (link_end - result) + 1;
    281 			continue;
    282 		}
    283 
    284 		href_start = strcasestr(tag, "href=");
    285 		if (!href_start) {
    286 			free(tag);
    287 			search_offset = (link_end - result) + 1;
    288 			continue;
    289 		}
    290 		href_start += 5;
    291 		quote = 0;
    292 		if (*href_start == '"' || *href_start == '\'')
    293 			quote = *href_start++;
    294 
    295 		href_end = href_start;
    296 		if (quote) {
    297 			while (*href_end && *href_end != quote)
    298 				href_end++;
    299 		} else {
    300 			while (*href_end && *href_end != ' ' &&
    301 			       *href_end != '>')
    302 				href_end++;
    303 		}
    304 
    305 		href_len = href_end - href_start;
    306 		href = xmalloc(href_len + 1);
    307 		memcpy(href, href_start, href_len);
    308 		href[href_len] = '\0';
    309 
    310 		resolved = url_resolve(base, href);
    311 		if (verbose)
    312 			fprintf(stderr, "    css: %s\n", resolved);
    313 
    314 		resp = fetch_url(resolved);
    315 		free(resolved);
    316 
    317 		if (resp && resp->status_code < 400 &&
    318 		    resp->size > 0) {
     319 			new_tag_len = 7 + resp->size + 8 + 1; /* "<style>" + css + "</style>" + NUL */
    320 			new_tag = xmalloc(new_tag_len);
    321 			snprintf(new_tag, new_tag_len,
    322 			         "<style>%s</style>", resp->data);
    323 
    324 			old_len = (link_end + 1) - link_start;
    325 			new_len = strlen(new_tag);
    326 			result_len = strlen(result);
    327 
    328 			new_result = xmalloc(
    329 			    result_len - old_len + new_len + 1);
    330 			memcpy(new_result, result, tag_offset);
    331 			memcpy(new_result + tag_offset,
    332 			       new_tag, new_len);
    333 			memcpy(new_result + tag_offset + new_len,
    334 			       link_end + 1,
    335 			       result_len - tag_offset - old_len + 1);
    336 
    337 			free(result);
    338 			result = new_result;
    339 			search_offset = tag_offset + new_len;
    340 			free(new_tag);
    341 		} else {
    342 			search_offset = (link_end - result) + 1;
    343 		}
    344 
    345 		response_free(resp);
    346 		free(href);
    347 		free(tag);
    348 	}
    349 
    350 	return result;
    351 }
    352 
    353 static void
    354 extract_links(const char *html, const char *base_url,
    355               UrlQueue *queue, VisitedSet *visited,
    356               int current_depth, Robots *robots)
    357 {
    358 	ResourceList *resources;
    359 	Resource *r;
    360 	char *norm, *path;
    361 	const char *pstart;
    362 
    363 	if (current_depth >= max_depth)
    364 		return;
    365 
    366 	resources = parse_html(html, base_url);
    367 
    368 	for (r = resources->head; r; r = r->next) {
    369 		if (r->type != RES_PAGE)
    370 			continue;
    371 
    372 		if (str_starts_with(r->url, "mailto:") ||
    373 		    str_starts_with(r->url, "tel:") ||
    374 		    str_starts_with(r->url, "javascript:") ||
    375 		    str_starts_with(r->url, "#"))
    376 			continue;
    377 
    378 		if (!url_same_domain(r->url, base_url))
    379 			continue;
    380 
    381 		/* Check robots.txt */
    382 		if (robots) {
    383 			pstart = r->url;
    384 			if (str_starts_with(pstart, "https://"))
    385 				pstart += 8;
    386 			else if (str_starts_with(pstart, "http://"))
    387 				pstart += 7;
    388 			while (*pstart && *pstart != '/')
    389 				pstart++;
    390 			path = xstrdup(pstart[0] ? pstart : "/");
    391 			if (!robots_allowed(robots, path)) {
    392 				if (verbose)
    393 					fprintf(stderr,
    394 					    "    robots: blocked"
    395 					    " %s\n", r->url);
    396 				free(path);
    397 				continue;
    398 			}
    399 			free(path);
    400 		}
    401 
    402 		norm = url_normalize(r->url);
    403 		if (!visited_contains(visited, norm)) {
    404 			visited_add(visited, norm);
    405 			queue_push(queue, r->url,
    406 			           current_depth + 1);
    407 			if (verbose)
    408 				fprintf(stderr,
    409 				    "    queued: %s\n", r->url);
    410 		}
    411 		free(norm);
    412 	}
    413 
    414 	reslist_free(resources);
    415 }
    416 
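         /*
          * Archive one page: inline external CSS, convert remaining
          * resources to data URIs, rewrite root-relative links, then
          * write the GWTAR header followed by the processed HTML.
          */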
    417 static int
    418 save_page(const char *url, const char *final_url,
    419           const char *data, const char *rel_path)
    420 {
    421 	char *html, *title, *header;
    422 	char *full_path, *dir, *last_slash;
    423 	size_t full_path_len;
    424 	FILE *fp;
    425 
    426 	html = xstrdup(data);
    427 	title = parse_title(html);
    428 
    429 	if (verbose)
    430 		fprintf(stderr, "  title: %s\n", title);
    431 
    432 	html = inline_css(html, final_url);
    433 	html = inline_resources(html, final_url,
    434 	                        fetch_and_encode);
    435 	html = rewrite_links(html, rel_path);
    436 
    437 	header = generate_header(title, url);
    438 
    439 	full_path_len = strlen(output_dir) + 1 +
    440 	                strlen(rel_path) + 1;
    441 	full_path = xmalloc(full_path_len);
    442 	snprintf(full_path, full_path_len, "%s/%s",
    443 	         output_dir, rel_path);
    444 
    445 	dir = xstrdup(full_path);
    446 	last_slash = strrchr(dir, '/');
    447 	if (last_slash) {
    448 		*last_slash = '\0';
     449 		mkdirp(dir);      /* create parent components */
     450 		mkdir(dir, 0755); /* create the leaf directory */
    451 	}
    452 	free(dir);
    453 
    454 	fp = fopen(full_path, "w");
    455 	if (!fp) {
    456 		warn("cannot write: %s", full_path);
    457 		free(full_path);
    458 		free(header);
    459 		free(html);
    460 		free(title);
    461 		return -1;
    462 	}
    463 
    464 	fputs(header, fp);
    465 	fputs(html, fp);
    466 	fclose(fp);
    467 
    468 	fprintf(stderr, "  saved: %s\n", full_path);
    469 
    470 	free(full_path);
    471 	free(header);
    472 	free(html);
    473 	free(title);
    474 	return 0;
    475 }
    476 
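         /*
          * Single-page mode: fetch url, skip non-HTML responses,
          * report any detected CMS, and save the processed page.
          */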
    477 static int
    478 archive_page(const char *url)
    479 {
    480 	Response *resp;
    481 	const char *final_url;
    482 	char *rel_path;
    483 	int ret;
    484 
    485 	if (verbose)
    486 		fprintf(stderr, "[0] %s\n", url);
    487 
    488 	resp = fetch_url(url);
    489 	if (!resp) {
    490 		warn("failed to fetch: %s", url);
    491 		return -1;
    492 	}
    493 	if (resp->status_code >= 400) {
    494 		warn("HTTP %ld: %s", resp->status_code, url);
    495 		response_free(resp);
    496 		return -1;
    497 	}
    498 	if (resp->content_type &&
    499 	    !strstr(resp->content_type, "text/html")) {
    500 		if (verbose)
    501 			fprintf(stderr, "  skip non-HTML: %s\n",
    502 			        resp->content_type);
    503 		response_free(resp);
    504 		return 0;
    505 	}
    506 
    507 	/* Detect CMS type for informational output */
    508 	{
    509 		SiteInfo *sinfo;
    510 
    511 		sinfo = detect_site(resp->data, url);
    512 		if (sinfo->type != SITE_UNKNOWN)
    513 			fprintf(stderr, "  CMS: %s\n", sinfo->name);
    514 		siteinfo_free(sinfo);
    515 	}
    516 
    517 	final_url = resp->final_url ? resp->final_url : url;
    518 	rel_path = url_to_path(url, base_domain);
    519 
    520 	ret = save_page(url, final_url, resp->data, rel_path);
    521 
    522 	free(rel_path);
    523 	response_free(resp);
    524 	return ret;
    525 }
    526 
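         /*
          * Queue-driven crawl from start_url: honors robots.txt
          * rules and crawl-delay unless -R was given, seeds the
          * queue with CMS-specific URLs, and archives every
          * same-domain HTML page up to max_depth. Note the start
          * page is fetched once here for CMS detection and again
          * in the main loop.
          */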
    527 static void
    528 crawl_site(const char *start_url)
    529 {
    530 	UrlQueue *queue;
    531 	VisitedSet *visited;
    532 	Robots *robots = NULL;
    533 	QueueNode *node;
    534 	Response *resp;
    535 	const char *final_url;
    536 	char *norm, *url, *rel_path;
    537 	int depth, pages_archived, rate_ms;
    538 	time_t start_time, now;
    539 
    540 	queue = queue_new();
    541 	visited = visited_new();
    542 
    543 	if (respect_robots) {
    544 		fprintf(stderr, "Fetching robots.txt for %s...\n",
    545 		        base_domain);
    546 		robots = robots_fetch(base_domain);
    547 		if (robots->nrules > 0)
    548 			fprintf(stderr, "  %d rules loaded\n",
    549 			        robots->nrules);
    550 		else
    551 			fprintf(stderr, "  no restrictions\n");
    552 
    553 		if (robots_delay(robots) > 0) {
    554 			rate_ms = robots_delay(robots) * 1000;
    555 			fprintf(stderr, "  crawl-delay: %ds\n",
    556 			        robots_delay(robots));
    557 		} else {
    558 			rate_ms = RATE_LIMIT_MS;
    559 		}
    560 	} else {
    561 		rate_ms = RATE_LIMIT_MS;
    562 	}
    563 
    564 	norm = url_normalize(start_url);
    565 	visited_add(visited, norm);
    566 	free(norm);
    567 	queue_push(queue, start_url, 0);
    568 
    569 	/* Detect CMS type from the start page */
    570 	{
    571 		Response *detect_resp;
    572 		SiteInfo *sinfo;
    573 
    574 		detect_resp = fetch_url(start_url);
    575 		if (detect_resp && detect_resp->status_code < 400 &&
    576 		    detect_resp->data) {
    577 			sinfo = detect_site(detect_resp->data,
    578 			                    start_url);
    579 			if (sinfo->type != SITE_UNKNOWN) {
    580 				char **seeds;
    581 				int nseed, i;
    582 
    583 				fprintf(stderr,
    584 				    "Detected CMS: %s\n",
    585 				    sinfo->name);
    586 				if (sinfo->feed_url)
    587 					fprintf(stderr,
    588 					    "  feed: %s\n",
    589 					    sinfo->feed_url);
    590 				if (sinfo->sitemap_url)
    591 					fprintf(stderr,
    592 					    "  sitemap: %s\n",
    593 					    sinfo->sitemap_url);
    594 
    595 				/* Add CMS-specific seed URLs */
    596 				seeds = detect_seed_urls(sinfo,
    597 				    base_domain, &nseed);
    598 				for (i = 0; i < nseed; i++) {
    599 					norm = url_normalize(
    600 					    seeds[i]);
    601 					if (!visited_contains(
    602 					    visited, norm)) {
    603 						visited_add(
    604 						    visited, norm);
    605 						queue_push(queue,
    606 						    seeds[i], 0);
    607 						if (verbose)
    608 							fprintf(stderr,
    609 							    "  seed: %s\n",
    610 							    seeds[i]);
    611 					}
    612 					free(norm);
    613 					free(seeds[i]);
    614 				}
    615 				free(seeds);
    616 			}
    617 			siteinfo_free(sinfo);
    618 		}
    619 		response_free(detect_resp);
    620 	}
    621 
    622 	pages_archived = 0;
    623 	start_time = time(NULL);
    624 
    625 	while (!queue_empty(queue)) {
    626 		node = queue_pop(queue);
    627 		url = node->url;
    628 		depth = node->depth;
    629 
    630 		resp = fetch_url(url);
    631 		if (!resp || resp->status_code >= 400) {
    632 			if (verbose)
    633 				fprintf(stderr, "[%d] SKIP %s\n",
    634 				        depth, url);
    635 			response_free(resp);
    636 			free(url);
    637 			free(node);
    638 			continue;
    639 		}
    640 
    641 		if (resp->content_type &&
    642 		    !strstr(resp->content_type, "text/html")) {
    643 			response_free(resp);
    644 			free(url);
    645 			free(node);
    646 			continue;
    647 		}
    648 
    649 		final_url = resp->final_url ?
    650 		            resp->final_url : url;
    651 
    652 		fprintf(stderr, "[d=%d q=%zu v=%zu] %s\n",
    653 		        depth, queue_size(queue),
    654 		        visited_count(visited), url);
    655 
    656 		extract_links(resp->data, final_url, queue,
    657 		              visited, depth, robots);
    658 
    659 		rel_path = url_to_path(url, base_domain);
    660 		save_page(url, final_url, resp->data, rel_path);
    661 		pages_archived++;
    662 
    663 		if (pages_archived % PROGRESS_INTERVAL == 0) {
    664 			now = time(NULL);
    665 			fprintf(stderr,
    666 			    "\n--- %d pages, %zu queued, "
    667 			    "%zu visited, %lds ---\n\n",
    668 			    pages_archived,
    669 			    queue_size(queue),
    670 			    visited_count(visited),
    671 			    (long)(now - start_time));
    672 		}
    673 
    674 		free(rel_path);
    675 		response_free(resp);
    676 		free(url);
    677 		free(node);
    678 
    679 		usleep(rate_ms * 1000);
    680 	}
    681 
    682 	now = time(NULL);
    683 	fprintf(stderr,
    684 	    "\nDone: %d pages to %s/ in %lds\n",
    685 	    pages_archived, output_dir,
    686 	    (long)(now - start_time));
    687 
    688 	robots_free(robots);
    689 	queue_free(queue);
    690 	visited_free(visited);
    691 }
    692 
    693 int
    694 main(int argc, char *argv[])
    695 {
    696 	const char *url;
    697 	int opt;
    698 
    699 	while ((opt = getopt(argc, argv, "vrRd:o:a:h")) != -1) {
    700 		switch (opt) {
    701 		case 'v':
    702 			verbose = 1;
    703 			break;
    704 		case 'r':
    705 			recursive = 1;
    706 			break;
    707 		case 'R':
    708 			respect_robots = 0;
    709 			break;
    710 		case 'd':
    711 			max_depth = atoi(optarg);
    712 			if (max_depth < 1)
    713 				max_depth = 1;
    714 			break;
    715 		case 'o':
    716 			output_dir = optarg;
    717 			break;
    718 		case 'a':
    719 			author = optarg;
    720 			break;
    721 		case 'h': /* fallthrough */
    722 		default:
    723 			usage();
    724 		}
    725 	}
    726 
    727 	if (optind >= argc)
    728 		usage();
    729 
    730 	url = argv[optind];
    731 	base_domain = url_get_domain(url);
    732 	if (!output_dir)
    733 		output_dir = base_domain;
    734 
    735 	if (mkdir(output_dir, 0755) != 0 && errno != EEXIST)
    736 		die("cannot create directory: %s", output_dir);
    737 
    738 	fetch_init();
    739 
    740 	fprintf(stderr, "sbot %s\n", ARCHIVER_VERSION);
    741 	fprintf(stderr, "Target: %s\n", url);
    742 	fprintf(stderr, "Output: %s/\n", output_dir);
    743 	if (recursive)
    744 		fprintf(stderr, "Mode: recursive (depth %d)\n",
    745 		        max_depth);
    746 	if (!respect_robots)
    747 		fprintf(stderr, "Warning: ignoring robots.txt\n");
    748 	fprintf(stderr, "\n");
    749 
    750 	if (recursive)
    751 		crawl_site(url);
    752 	else
    753 		archive_page(url);
    754 
    755 	fetch_cleanup();
    756 	free(base_domain);
    757 	return 0;
    758 }