archiver.c (16244B)
1 /* See LICENSE file for copyright and license details. 2 * 3 * sbot - Simple Archiver Bot 4 * 5 * Creates self-contained archives of websites with all 6 * resources inlined as data URIs. 7 */ 8 9 #include <errno.h> 10 #include <stdio.h> 11 #include <stdlib.h> 12 #include <string.h> 13 #include <sys/stat.h> 14 #include <time.h> 15 #include <unistd.h> 16 17 #include "config.h" 18 #include "crawl.h" 19 #include "detect.h" 20 #include "fetch.h" 21 #include "parse.h" 22 #include "robots.h" 23 #include "util.h" 24 25 /* Global options */ 26 static int verbose = 0; 27 static int recursive = 0; 28 static int max_depth = MAX_DEPTH; 29 static int respect_robots = 1; 30 static const char *author = "Unknown"; 31 static const char *output_dir = NULL; 32 static char *base_domain = NULL; 33 34 static void 35 usage(void) 36 { 37 fprintf(stderr, 38 "usage: sbot [-vrR] [-d depth] [-o dir]" 39 " [-a author] url\n" 40 "\n" 41 " -v verbose output\n" 42 " -r recursive (crawl entire site)\n" 43 " -R ignore robots.txt\n" 44 " -d depth max crawl depth (default: %d)\n" 45 " -o dir output directory\n" 46 " -a author site author name\n", 47 MAX_DEPTH); 48 exit(1); 49 } 50 51 static int 52 mkdirp(const char *path) 53 { 54 char *p, *sep; 55 56 p = xstrdup(path); 57 for (sep = p + 1; *sep; sep++) { 58 if (*sep == '/') { 59 *sep = '\0'; 60 if (mkdir(p, 0755) != 0 && errno != EEXIST) { 61 free(p); 62 return -1; 63 } 64 *sep = '/'; 65 } 66 } 67 free(p); 68 return 0; 69 } 70 71 static char * 72 fetch_and_encode(const char *url, const char *base) 73 { 74 char *resolved, *mime, *b64, *data_uri, *semi; 75 Response *resp; 76 size_t b64_len, uri_len; 77 78 resolved = url_resolve(base, url); 79 if (verbose) 80 fprintf(stderr, " resource: %s\n", resolved); 81 82 resp = fetch_url(resolved); 83 if (!resp || resp->status_code >= 400 || resp->size == 0) { 84 free(resolved); 85 response_free(resp); 86 return NULL; 87 } 88 89 if (resp->content_type) { 90 mime = xstrdup(resp->content_type); 91 semi = strchr(mime, ';'); 92 if (semi) 93 *semi = '\0'; 94 str_trim(mime); 95 } else { 96 mime = get_mime_type(resolved); 97 } 98 99 b64 = base64_encode((unsigned char *)resp->data, 100 resp->size, &b64_len); 101 102 uri_len = 5 + strlen(mime) + 8 + b64_len + 1; 103 data_uri = xmalloc(uri_len); 104 snprintf(data_uri, uri_len, "data:%s;base64,%s", mime, b64); 105 106 free(mime); 107 free(b64); 108 free(resolved); 109 response_free(resp); 110 return data_uri; 111 } 112 113 static char * 114 generate_header(const char *title, const char *source_url) 115 { 116 char *date, *domain, *header; 117 size_t len; 118 119 date = get_iso_date(); 120 domain = url_get_domain(source_url); 121 122 len = 2048 + strlen(title) + strlen(source_url) + 123 strlen(author) + strlen(domain); 124 header = xmalloc(len); 125 126 snprintf(header, len, 127 "<!--\n" 128 "========================================" 129 "========================================\n" 130 " GWTAR ARCHIVE\n" 131 "========================================" 132 "========================================\n" 133 "\n" 134 " Title: %s\n" 135 " Source URL: %s\n" 136 " Domain: %s\n" 137 " Author: %s\n" 138 "\n" 139 " Archived by: %s\n" 140 " Archived on: %s\n" 141 " Archive date: %s\n" 142 "\n" 143 " Generator: sbot/%s\n" 144 " Format: GWTAR (Gwern Web Tar Archive)\n" 145 "\n" 146 "========================================" 147 "========================================\n" 148 "-->\n", 149 title, source_url, domain, author, 150 ARCHIVER_NAME, ARCHIVER_SITE, date, 151 ARCHIVER_VERSION); 152 153 free(date); 154 free(domain); 155 return header; 156 } 157 158 static int 159 path_depth(const char *path) 160 { 161 int depth = 0; 162 const char *p; 163 164 for (p = path; *p; p++) 165 if (*p == '/') 166 depth++; 167 return depth; 168 } 169 170 static char * 171 make_relative_prefix(int depth) 172 { 173 char *prefix; 174 size_t len; 175 int i; 176 177 if (depth == 0) 178 return xstrdup(""); 179 len = depth * 3 + 1; 180 prefix = xmalloc(len); 181 prefix[0] = '\0'; 182 for (i = 0; i < depth; i++) 183 strcat(prefix, "../"); 184 return prefix; 185 } 186 187 static char * 188 rewrite_links(char *html, const char *rel_path) 189 { 190 int depth; 191 char *prefix, *result, *new_result; 192 size_t prefix_len, search_offset, pos, old_len, new_len; 193 const char *p; 194 195 depth = path_depth(rel_path); 196 prefix = make_relative_prefix(depth); 197 prefix_len = strlen(prefix); 198 result = html; 199 200 search_offset = 0; 201 while (1) { 202 p = strstr(result + search_offset, "href=\"/"); 203 if (!p) 204 break; 205 if (p[7] == '/') { 206 search_offset = (p - result) + 8; 207 continue; 208 } 209 pos = p - result + 6; 210 old_len = strlen(result); 211 new_len = old_len - 1 + prefix_len; 212 new_result = xmalloc(new_len + 1); 213 memcpy(new_result, result, pos); 214 memcpy(new_result + pos, prefix, prefix_len); 215 memcpy(new_result + pos + prefix_len, 216 result + pos + 1, old_len - pos); 217 free(result); 218 result = new_result; 219 search_offset = pos + prefix_len; 220 } 221 222 search_offset = 0; 223 while (1) { 224 p = strstr(result + search_offset, "src=\"/"); 225 if (!p) 226 break; 227 if (p[6] == '/') { 228 search_offset = (p - result) + 7; 229 continue; 230 } 231 pos = p - result + 5; 232 old_len = strlen(result); 233 new_len = old_len - 1 + prefix_len; 234 new_result = xmalloc(new_len + 1); 235 memcpy(new_result, result, pos); 236 memcpy(new_result + pos, prefix, prefix_len); 237 memcpy(new_result + pos + prefix_len, 238 result + pos + 1, old_len - pos); 239 free(result); 240 result = new_result; 241 search_offset = pos + prefix_len; 242 } 243 244 free(prefix); 245 return result; 246 } 247 248 static char * 249 inline_css(char *html, const char *base) 250 { 251 char *result, *tag, *href, *resolved; 252 char *new_tag, *new_result; 253 const char *link_start, *link_end; 254 const char *href_start, *href_end; 255 size_t search_offset, tag_offset, tag_len, href_len; 256 size_t new_tag_len, old_len, new_len, result_len; 257 Response *resp; 258 char quote; 259 260 result = html; 261 search_offset = 0; 262 263 while (1) { 264 link_start = strcasestr(result + search_offset, 265 "<link"); 266 if (!link_start) 267 break; 268 link_end = strchr(link_start, '>'); 269 if (!link_end) 270 break; 271 272 tag_offset = link_start - result; 273 tag_len = link_end - link_start; 274 tag = xmalloc(tag_len + 1); 275 memcpy(tag, link_start, tag_len); 276 tag[tag_len] = '\0'; 277 278 if (!strcasestr(tag, "stylesheet")) { 279 free(tag); 280 search_offset = (link_end - result) + 1; 281 continue; 282 } 283 284 href_start = strcasestr(tag, "href="); 285 if (!href_start) { 286 free(tag); 287 search_offset = (link_end - result) + 1; 288 continue; 289 } 290 href_start += 5; 291 quote = 0; 292 if (*href_start == '"' || *href_start == '\'') 293 quote = *href_start++; 294 295 href_end = href_start; 296 if (quote) { 297 while (*href_end && *href_end != quote) 298 href_end++; 299 } else { 300 while (*href_end && *href_end != ' ' && 301 *href_end != '>') 302 href_end++; 303 } 304 305 href_len = href_end - href_start; 306 href = xmalloc(href_len + 1); 307 memcpy(href, href_start, href_len); 308 href[href_len] = '\0'; 309 310 resolved = url_resolve(base, href); 311 if (verbose) 312 fprintf(stderr, " css: %s\n", resolved); 313 314 resp = fetch_url(resolved); 315 free(resolved); 316 317 if (resp && resp->status_code < 400 && 318 resp->size > 0) { 319 new_tag_len = 7 + resp->size + 8 + 1; 320 new_tag = xmalloc(new_tag_len); 321 snprintf(new_tag, new_tag_len, 322 "<style>%s</style>", resp->data); 323 324 old_len = (link_end + 1) - link_start; 325 new_len = strlen(new_tag); 326 result_len = strlen(result); 327 328 new_result = xmalloc( 329 result_len - old_len + new_len + 1); 330 memcpy(new_result, result, tag_offset); 331 memcpy(new_result + tag_offset, 332 new_tag, new_len); 333 memcpy(new_result + tag_offset + new_len, 334 link_end + 1, 335 result_len - tag_offset - old_len + 1); 336 337 free(result); 338 result = new_result; 339 search_offset = tag_offset + new_len; 340 free(new_tag); 341 } else { 342 search_offset = (link_end - result) + 1; 343 } 344 345 response_free(resp); 346 free(href); 347 free(tag); 348 } 349 350 return result; 351 } 352 353 static void 354 extract_links(const char *html, const char *base_url, 355 UrlQueue *queue, VisitedSet *visited, 356 int current_depth, Robots *robots) 357 { 358 ResourceList *resources; 359 Resource *r; 360 char *norm, *path; 361 const char *pstart; 362 363 if (current_depth >= max_depth) 364 return; 365 366 resources = parse_html(html, base_url); 367 368 for (r = resources->head; r; r = r->next) { 369 if (r->type != RES_PAGE) 370 continue; 371 372 if (str_starts_with(r->url, "mailto:") || 373 str_starts_with(r->url, "tel:") || 374 str_starts_with(r->url, "javascript:") || 375 str_starts_with(r->url, "#")) 376 continue; 377 378 if (!url_same_domain(r->url, base_url)) 379 continue; 380 381 /* Check robots.txt */ 382 if (robots) { 383 pstart = r->url; 384 if (str_starts_with(pstart, "https://")) 385 pstart += 8; 386 else if (str_starts_with(pstart, "http://")) 387 pstart += 7; 388 while (*pstart && *pstart != '/') 389 pstart++; 390 path = xstrdup(pstart[0] ? pstart : "/"); 391 if (!robots_allowed(robots, path)) { 392 if (verbose) 393 fprintf(stderr, 394 " robots: blocked" 395 " %s\n", r->url); 396 free(path); 397 continue; 398 } 399 free(path); 400 } 401 402 norm = url_normalize(r->url); 403 if (!visited_contains(visited, norm)) { 404 visited_add(visited, norm); 405 queue_push(queue, r->url, 406 current_depth + 1); 407 if (verbose) 408 fprintf(stderr, 409 " queued: %s\n", r->url); 410 } 411 free(norm); 412 } 413 414 reslist_free(resources); 415 } 416 417 static int 418 save_page(const char *url, const char *final_url, 419 const char *data, const char *rel_path) 420 { 421 char *html, *title, *header; 422 char *full_path, *dir, *last_slash; 423 size_t full_path_len; 424 FILE *fp; 425 426 html = xstrdup(data); 427 title = parse_title(html); 428 429 if (verbose) 430 fprintf(stderr, " title: %s\n", title); 431 432 html = inline_css(html, final_url); 433 html = inline_resources(html, final_url, 434 fetch_and_encode); 435 html = rewrite_links(html, rel_path); 436 437 header = generate_header(title, url); 438 439 full_path_len = strlen(output_dir) + 1 + 440 strlen(rel_path) + 1; 441 full_path = xmalloc(full_path_len); 442 snprintf(full_path, full_path_len, "%s/%s", 443 output_dir, rel_path); 444 445 dir = xstrdup(full_path); 446 last_slash = strrchr(dir, '/'); 447 if (last_slash) { 448 *last_slash = '\0'; 449 mkdirp(dir); 450 mkdir(dir, 0755); 451 } 452 free(dir); 453 454 fp = fopen(full_path, "w"); 455 if (!fp) { 456 warn("cannot write: %s", full_path); 457 free(full_path); 458 free(header); 459 free(html); 460 free(title); 461 return -1; 462 } 463 464 fputs(header, fp); 465 fputs(html, fp); 466 fclose(fp); 467 468 fprintf(stderr, " saved: %s\n", full_path); 469 470 free(full_path); 471 free(header); 472 free(html); 473 free(title); 474 return 0; 475 } 476 477 static int 478 archive_page(const char *url) 479 { 480 Response *resp; 481 const char *final_url; 482 char *rel_path; 483 int ret; 484 485 if (verbose) 486 fprintf(stderr, "[0] %s\n", url); 487 488 resp = fetch_url(url); 489 if (!resp) { 490 warn("failed to fetch: %s", url); 491 return -1; 492 } 493 if (resp->status_code >= 400) { 494 warn("HTTP %ld: %s", resp->status_code, url); 495 response_free(resp); 496 return -1; 497 } 498 if (resp->content_type && 499 !strstr(resp->content_type, "text/html")) { 500 if (verbose) 501 fprintf(stderr, " skip non-HTML: %s\n", 502 resp->content_type); 503 response_free(resp); 504 return 0; 505 } 506 507 /* Detect CMS type for informational output */ 508 { 509 SiteInfo *sinfo; 510 511 sinfo = detect_site(resp->data, url); 512 if (sinfo->type != SITE_UNKNOWN) 513 fprintf(stderr, " CMS: %s\n", sinfo->name); 514 siteinfo_free(sinfo); 515 } 516 517 final_url = resp->final_url ? resp->final_url : url; 518 rel_path = url_to_path(url, base_domain); 519 520 ret = save_page(url, final_url, resp->data, rel_path); 521 522 free(rel_path); 523 response_free(resp); 524 return ret; 525 } 526 527 static void 528 crawl_site(const char *start_url) 529 { 530 UrlQueue *queue; 531 VisitedSet *visited; 532 Robots *robots = NULL; 533 QueueNode *node; 534 Response *resp; 535 const char *final_url; 536 char *norm, *url, *rel_path; 537 int depth, pages_archived, rate_ms; 538 time_t start_time, now; 539 540 queue = queue_new(); 541 visited = visited_new(); 542 543 if (respect_robots) { 544 fprintf(stderr, "Fetching robots.txt for %s...\n", 545 base_domain); 546 robots = robots_fetch(base_domain); 547 if (robots->nrules > 0) 548 fprintf(stderr, " %d rules loaded\n", 549 robots->nrules); 550 else 551 fprintf(stderr, " no restrictions\n"); 552 553 if (robots_delay(robots) > 0) { 554 rate_ms = robots_delay(robots) * 1000; 555 fprintf(stderr, " crawl-delay: %ds\n", 556 robots_delay(robots)); 557 } else { 558 rate_ms = RATE_LIMIT_MS; 559 } 560 } else { 561 rate_ms = RATE_LIMIT_MS; 562 } 563 564 norm = url_normalize(start_url); 565 visited_add(visited, norm); 566 free(norm); 567 queue_push(queue, start_url, 0); 568 569 /* Detect CMS type from the start page */ 570 { 571 Response *detect_resp; 572 SiteInfo *sinfo; 573 574 detect_resp = fetch_url(start_url); 575 if (detect_resp && detect_resp->status_code < 400 && 576 detect_resp->data) { 577 sinfo = detect_site(detect_resp->data, 578 start_url); 579 if (sinfo->type != SITE_UNKNOWN) { 580 char **seeds; 581 int nseed, i; 582 583 fprintf(stderr, 584 "Detected CMS: %s\n", 585 sinfo->name); 586 if (sinfo->feed_url) 587 fprintf(stderr, 588 " feed: %s\n", 589 sinfo->feed_url); 590 if (sinfo->sitemap_url) 591 fprintf(stderr, 592 " sitemap: %s\n", 593 sinfo->sitemap_url); 594 595 /* Add CMS-specific seed URLs */ 596 seeds = detect_seed_urls(sinfo, 597 base_domain, &nseed); 598 for (i = 0; i < nseed; i++) { 599 norm = url_normalize( 600 seeds[i]); 601 if (!visited_contains( 602 visited, norm)) { 603 visited_add( 604 visited, norm); 605 queue_push(queue, 606 seeds[i], 0); 607 if (verbose) 608 fprintf(stderr, 609 " seed: %s\n", 610 seeds[i]); 611 } 612 free(norm); 613 free(seeds[i]); 614 } 615 free(seeds); 616 } 617 siteinfo_free(sinfo); 618 } 619 response_free(detect_resp); 620 } 621 622 pages_archived = 0; 623 start_time = time(NULL); 624 625 while (!queue_empty(queue)) { 626 node = queue_pop(queue); 627 url = node->url; 628 depth = node->depth; 629 630 resp = fetch_url(url); 631 if (!resp || resp->status_code >= 400) { 632 if (verbose) 633 fprintf(stderr, "[%d] SKIP %s\n", 634 depth, url); 635 response_free(resp); 636 free(url); 637 free(node); 638 continue; 639 } 640 641 if (resp->content_type && 642 !strstr(resp->content_type, "text/html")) { 643 response_free(resp); 644 free(url); 645 free(node); 646 continue; 647 } 648 649 final_url = resp->final_url ? 650 resp->final_url : url; 651 652 fprintf(stderr, "[d=%d q=%zu v=%zu] %s\n", 653 depth, queue_size(queue), 654 visited_count(visited), url); 655 656 extract_links(resp->data, final_url, queue, 657 visited, depth, robots); 658 659 rel_path = url_to_path(url, base_domain); 660 save_page(url, final_url, resp->data, rel_path); 661 pages_archived++; 662 663 if (pages_archived % PROGRESS_INTERVAL == 0) { 664 now = time(NULL); 665 fprintf(stderr, 666 "\n--- %d pages, %zu queued, " 667 "%zu visited, %lds ---\n\n", 668 pages_archived, 669 queue_size(queue), 670 visited_count(visited), 671 (long)(now - start_time)); 672 } 673 674 free(rel_path); 675 response_free(resp); 676 free(url); 677 free(node); 678 679 usleep(rate_ms * 1000); 680 } 681 682 now = time(NULL); 683 fprintf(stderr, 684 "\nDone: %d pages to %s/ in %lds\n", 685 pages_archived, output_dir, 686 (long)(now - start_time)); 687 688 robots_free(robots); 689 queue_free(queue); 690 visited_free(visited); 691 } 692 693 int 694 main(int argc, char *argv[]) 695 { 696 const char *url; 697 int opt; 698 699 while ((opt = getopt(argc, argv, "vrRd:o:a:h")) != -1) { 700 switch (opt) { 701 case 'v': 702 verbose = 1; 703 break; 704 case 'r': 705 recursive = 1; 706 break; 707 case 'R': 708 respect_robots = 0; 709 break; 710 case 'd': 711 max_depth = atoi(optarg); 712 if (max_depth < 1) 713 max_depth = 1; 714 break; 715 case 'o': 716 output_dir = optarg; 717 break; 718 case 'a': 719 author = optarg; 720 break; 721 case 'h': /* fallthrough */ 722 default: 723 usage(); 724 } 725 } 726 727 if (optind >= argc) 728 usage(); 729 730 url = argv[optind]; 731 base_domain = url_get_domain(url); 732 if (!output_dir) 733 output_dir = base_domain; 734 735 if (mkdir(output_dir, 0755) != 0 && errno != EEXIST) 736 die("cannot create directory: %s", output_dir); 737 738 fetch_init(); 739 740 fprintf(stderr, "sbot %s\n", ARCHIVER_VERSION); 741 fprintf(stderr, "Target: %s\n", url); 742 fprintf(stderr, "Output: %s/\n", output_dir); 743 if (recursive) 744 fprintf(stderr, "Mode: recursive (depth %d)\n", 745 max_depth); 746 if (!respect_robots) 747 fprintf(stderr, "Warning: ignoring robots.txt\n"); 748 fprintf(stderr, "\n"); 749 750 if (recursive) 751 crawl_site(url); 752 else 753 archive_page(url); 754 755 fetch_cleanup(); 756 free(base_domain); 757 return 0; 758 }