sbot

Simple web archiver — self-contained GWTAR archives
git clone https://git.krisyotam.com/krisyotam/sbot.git
Log | Files | Refs | README | LICENSE

crawl.c (5142B)


      1 /* See LICENSE file for copyright and license details. */
      2 
      3 #include <ctype.h>
      4 #include <stdio.h>
      5 #include <stdlib.h>
      6 #include <string.h>
      7 
      8 #include "crawl.h"
      9 #include "util.h"
     10 
     11 /* FNV-1a hash - fast, good distribution for URL strings */
     12 static unsigned long
     13 fnv1a(const char *s)
     14 {
     15 	unsigned long h = 2166136261UL;
     16 
     17 	for (; *s; s++) {
     18 		h ^= (unsigned char)*s;
     19 		h *= 16777619UL;
     20 	}
     21 	return h;
     22 }
     23 
     24 UrlQueue *
     25 queue_new(void)
     26 {
     27 	UrlQueue *q = xmalloc(sizeof(UrlQueue));
     28 
     29 	q->head = NULL;
     30 	q->tail = NULL;
     31 	q->count = 0;
     32 	return q;
     33 }
     34 
     35 void
     36 queue_free(UrlQueue *q)
     37 {
     38 	QueueNode *n, *next;
     39 
     40 	if (!q)
     41 		return;
     42 	for (n = q->head; n; n = next) {
     43 		next = n->next;
     44 		free(n->url);
     45 		free(n);
     46 	}
     47 	free(q);
     48 }
     49 
     50 void
     51 queue_push(UrlQueue *q, const char *url, int depth)
     52 {
     53 	QueueNode *n = xmalloc(sizeof(QueueNode));
     54 
     55 	n->url = xstrdup(url);
     56 	n->depth = depth;
     57 	n->next = NULL;
     58 	if (q->tail) {
     59 		q->tail->next = n;
     60 		q->tail = n;
     61 	} else {
     62 		q->head = n;
     63 		q->tail = n;
     64 	}
     65 	q->count++;
     66 }
     67 
     68 QueueNode *
     69 queue_pop(UrlQueue *q)
     70 {
     71 	QueueNode *n;
     72 
     73 	if (!q->head)
     74 		return NULL;
     75 	n = q->head;
     76 	q->head = n->next;
     77 	if (!q->head)
     78 		q->tail = NULL;
     79 	q->count--;
     80 	return n;
     81 }
     82 
     83 int
     84 queue_empty(UrlQueue *q)
     85 {
     86 	return q->head == NULL;
     87 }
     88 
     89 size_t
     90 queue_size(UrlQueue *q)
     91 {
     92 	return q->count;
     93 }
     94 
     95 VisitedSet *
     96 visited_new(void)
     97 {
     98 	VisitedSet *v = xmalloc(sizeof(VisitedSet));
     99 
    100 	memset(v->buckets, 0, sizeof(v->buckets));
    101 	v->count = 0;
    102 	return v;
    103 }
    104 
    105 void
    106 visited_free(VisitedSet *v)
    107 {
    108 	HashNode *n, *next;
    109 	size_t i;
    110 
    111 	if (!v)
    112 		return;
    113 	for (i = 0; i < HT_SIZE; i++) {
    114 		for (n = v->buckets[i]; n; n = next) {
    115 			next = n->next;
    116 			free(n->url);
    117 			free(n);
    118 		}
    119 	}
    120 	free(v);
    121 }
    122 
    123 void
    124 visited_add(VisitedSet *v, const char *url)
    125 {
    126 	unsigned long h = fnv1a(url) % HT_SIZE;
    127 	HashNode *n;
    128 
    129 	/* Check for duplicate first */
    130 	for (n = v->buckets[h]; n; n = n->next) {
    131 		if (strcmp(n->url, url) == 0)
    132 			return;
    133 	}
    134 	n = xmalloc(sizeof(HashNode));
    135 	n->url = xstrdup(url);
    136 	n->next = v->buckets[h];
    137 	v->buckets[h] = n;
    138 	v->count++;
    139 }
    140 
    141 int
    142 visited_contains(VisitedSet *v, const char *url)
    143 {
    144 	unsigned long h = fnv1a(url) % HT_SIZE;
    145 	HashNode *n;
    146 
    147 	for (n = v->buckets[h]; n; n = n->next) {
    148 		if (strcmp(n->url, url) == 0)
    149 			return 1;
    150 	}
    151 	return 0;
    152 }
    153 
    154 size_t
    155 visited_count(VisitedSet *v)
    156 {
    157 	return v->count;
    158 }
    159 
    160 char *
    161 url_normalize(const char *url)
    162 {
    163 	char *norm, *p, *hash, *query;
    164 	size_t len;
    165 
    166 	norm = xstrdup(url);
    167 
    168 	/* Remove fragment */
    169 	hash = strchr(norm, '#');
    170 	if (hash)
    171 		*hash = '\0';
    172 
    173 	/* Remove query string */
    174 	query = strchr(norm, '?');
    175 	if (query)
    176 		*query = '\0';
    177 
    178 	/* Remove trailing slash (but not bare domain slash) */
    179 	len = strlen(norm);
    180 	if (len > 1 && norm[len - 1] == '/') {
    181 		/* Keep slash if it's just protocol://domain/ */
    182 		p = norm;
    183 		if (str_starts_with(p, "https://"))
    184 			p += 8;
    185 		else if (str_starts_with(p, "http://"))
    186 			p += 7;
    187 		/* Skip domain */
    188 		while (*p && *p != '/')
    189 			p++;
    190 		/* Only strip if there's path beyond domain */
    191 		if (p < norm + len - 1)
    192 			norm[len - 1] = '\0';
    193 	}
    194 
    195 	/* Lowercase the domain part */
    196 	p = norm;
    197 	if (str_starts_with(p, "https://"))
    198 		p += 8;
    199 	else if (str_starts_with(p, "http://"))
    200 		p += 7;
    201 	while (*p && *p != '/')
    202 		*p++ = tolower((unsigned char)*p);
    203 
    204 	/* Remove default port :80 or :443 */
    205 	p = norm;
    206 	if (str_starts_with(p, "https://"))
    207 		p += 8;
    208 	else if (str_starts_with(p, "http://"))
    209 		p += 7;
    210 	{
    211 		char *colon = NULL;
    212 		char *slash = NULL;
    213 		char *scan;
    214 		int is_https;
    215 
    216 		is_https = str_starts_with(norm, "https://");
    217 		for (scan = p; *scan && *scan != '/'; scan++) {
    218 			if (*scan == ':')
    219 				colon = scan;
    220 		}
    221 		slash = scan;
    222 		if (colon) {
    223 			char port[8];
    224 			size_t plen = slash - colon - 1;
    225 
    226 			if (plen < sizeof(port)) {
    227 				memcpy(port, colon + 1, plen);
    228 				port[plen] = '\0';
    229 				if ((is_https && strcmp(port, "443") == 0) ||
    230 				    (!is_https && strcmp(port, "80") == 0)) {
    231 					memmove(colon, slash,
    232 						strlen(slash) + 1);
    233 				}
    234 			}
    235 		}
    236 	}
    237 
    238 	return norm;
    239 }
    240 
    241 char *
    242 url_to_path(const char *url, const char *base_domain)
    243 {
    244 	const char *path_start;
    245 	char *path, *query, *hash, *new_path;
    246 	size_t len, new_len;
    247 
    248 	(void)base_domain;
    249 
    250 	path_start = url;
    251 	/* Skip protocol */
    252 	if (str_starts_with(url, "https://"))
    253 		path_start = url + 8;
    254 	else if (str_starts_with(url, "http://"))
    255 		path_start = url + 7;
    256 
    257 	/* Skip domain */
    258 	while (*path_start && *path_start != '/')
    259 		path_start++;
    260 
    261 	/* No path or just "/" -> index.html */
    262 	if (!*path_start || strcmp(path_start, "/") == 0)
    263 		return xstrdup("index.html");
    264 
    265 	/* Skip leading slash */
    266 	if (*path_start == '/')
    267 		path_start++;
    268 
    269 	/* Copy path, strip query/fragment */
    270 	path = xstrdup(path_start);
    271 	query = strchr(path, '?');
    272 	if (query)
    273 		*query = '\0';
    274 	hash = strchr(path, '#');
    275 	if (hash)
    276 		*hash = '\0';
    277 
    278 	/* Remove trailing slash */
    279 	len = strlen(path);
    280 	if (len > 0 && path[len - 1] == '/') {
    281 		path[len - 1] = '\0';
    282 		len--;
    283 	}
    284 
    285 	/* If path doesn't end in .html/.htm, treat as directory */
    286 	if (len > 0 && !str_ends_with(path, ".html") &&
    287 	    !str_ends_with(path, ".htm")) {
    288 		new_len = len + 12;
    289 		new_path = xmalloc(new_len);
    290 		snprintf(new_path, new_len, "%s/index.html", path);
    291 		free(path);
    292 		path = new_path;
    293 	}
    294 
    295 	return path;
    296 }