crawl.c (5142B)
1 /* See LICENSE file for copyright and license details. */ 2 3 #include <ctype.h> 4 #include <stdio.h> 5 #include <stdlib.h> 6 #include <string.h> 7 8 #include "crawl.h" 9 #include "util.h" 10 11 /* FNV-1a hash - fast, good distribution for URL strings */ 12 static unsigned long 13 fnv1a(const char *s) 14 { 15 unsigned long h = 2166136261UL; 16 17 for (; *s; s++) { 18 h ^= (unsigned char)*s; 19 h *= 16777619UL; 20 } 21 return h; 22 } 23 24 UrlQueue * 25 queue_new(void) 26 { 27 UrlQueue *q = xmalloc(sizeof(UrlQueue)); 28 29 q->head = NULL; 30 q->tail = NULL; 31 q->count = 0; 32 return q; 33 } 34 35 void 36 queue_free(UrlQueue *q) 37 { 38 QueueNode *n, *next; 39 40 if (!q) 41 return; 42 for (n = q->head; n; n = next) { 43 next = n->next; 44 free(n->url); 45 free(n); 46 } 47 free(q); 48 } 49 50 void 51 queue_push(UrlQueue *q, const char *url, int depth) 52 { 53 QueueNode *n = xmalloc(sizeof(QueueNode)); 54 55 n->url = xstrdup(url); 56 n->depth = depth; 57 n->next = NULL; 58 if (q->tail) { 59 q->tail->next = n; 60 q->tail = n; 61 } else { 62 q->head = n; 63 q->tail = n; 64 } 65 q->count++; 66 } 67 68 QueueNode * 69 queue_pop(UrlQueue *q) 70 { 71 QueueNode *n; 72 73 if (!q->head) 74 return NULL; 75 n = q->head; 76 q->head = n->next; 77 if (!q->head) 78 q->tail = NULL; 79 q->count--; 80 return n; 81 } 82 83 int 84 queue_empty(UrlQueue *q) 85 { 86 return q->head == NULL; 87 } 88 89 size_t 90 queue_size(UrlQueue *q) 91 { 92 return q->count; 93 } 94 95 VisitedSet * 96 visited_new(void) 97 { 98 VisitedSet *v = xmalloc(sizeof(VisitedSet)); 99 100 memset(v->buckets, 0, sizeof(v->buckets)); 101 v->count = 0; 102 return v; 103 } 104 105 void 106 visited_free(VisitedSet *v) 107 { 108 HashNode *n, *next; 109 size_t i; 110 111 if (!v) 112 return; 113 for (i = 0; i < HT_SIZE; i++) { 114 for (n = v->buckets[i]; n; n = next) { 115 next = n->next; 116 free(n->url); 117 free(n); 118 } 119 } 120 free(v); 121 } 122 123 void 124 visited_add(VisitedSet *v, const char *url) 125 { 126 unsigned long h = fnv1a(url) % HT_SIZE; 127 HashNode *n; 128 129 /* Check for duplicate first */ 130 for (n = v->buckets[h]; n; n = n->next) { 131 if (strcmp(n->url, url) == 0) 132 return; 133 } 134 n = xmalloc(sizeof(HashNode)); 135 n->url = xstrdup(url); 136 n->next = v->buckets[h]; 137 v->buckets[h] = n; 138 v->count++; 139 } 140 141 int 142 visited_contains(VisitedSet *v, const char *url) 143 { 144 unsigned long h = fnv1a(url) % HT_SIZE; 145 HashNode *n; 146 147 for (n = v->buckets[h]; n; n = n->next) { 148 if (strcmp(n->url, url) == 0) 149 return 1; 150 } 151 return 0; 152 } 153 154 size_t 155 visited_count(VisitedSet *v) 156 { 157 return v->count; 158 } 159 160 char * 161 url_normalize(const char *url) 162 { 163 char *norm, *p, *hash, *query; 164 size_t len; 165 166 norm = xstrdup(url); 167 168 /* Remove fragment */ 169 hash = strchr(norm, '#'); 170 if (hash) 171 *hash = '\0'; 172 173 /* Remove query string */ 174 query = strchr(norm, '?'); 175 if (query) 176 *query = '\0'; 177 178 /* Remove trailing slash (but not bare domain slash) */ 179 len = strlen(norm); 180 if (len > 1 && norm[len - 1] == '/') { 181 /* Keep slash if it's just protocol://domain/ */ 182 p = norm; 183 if (str_starts_with(p, "https://")) 184 p += 8; 185 else if (str_starts_with(p, "http://")) 186 p += 7; 187 /* Skip domain */ 188 while (*p && *p != '/') 189 p++; 190 /* Only strip if there's path beyond domain */ 191 if (p < norm + len - 1) 192 norm[len - 1] = '\0'; 193 } 194 195 /* Lowercase the domain part */ 196 p = norm; 197 if (str_starts_with(p, "https://")) 198 p += 8; 199 else if (str_starts_with(p, "http://")) 200 p += 7; 201 while (*p && *p != '/') 202 *p++ = tolower((unsigned char)*p); 203 204 /* Remove default port :80 or :443 */ 205 p = norm; 206 if (str_starts_with(p, "https://")) 207 p += 8; 208 else if (str_starts_with(p, "http://")) 209 p += 7; 210 { 211 char *colon = NULL; 212 char *slash = NULL; 213 char *scan; 214 int is_https; 215 216 is_https = str_starts_with(norm, "https://"); 217 for (scan = p; *scan && *scan != '/'; scan++) { 218 if (*scan == ':') 219 colon = scan; 220 } 221 slash = scan; 222 if (colon) { 223 char port[8]; 224 size_t plen = slash - colon - 1; 225 226 if (plen < sizeof(port)) { 227 memcpy(port, colon + 1, plen); 228 port[plen] = '\0'; 229 if ((is_https && strcmp(port, "443") == 0) || 230 (!is_https && strcmp(port, "80") == 0)) { 231 memmove(colon, slash, 232 strlen(slash) + 1); 233 } 234 } 235 } 236 } 237 238 return norm; 239 } 240 241 char * 242 url_to_path(const char *url, const char *base_domain) 243 { 244 const char *path_start; 245 char *path, *query, *hash, *new_path; 246 size_t len, new_len; 247 248 (void)base_domain; 249 250 path_start = url; 251 /* Skip protocol */ 252 if (str_starts_with(url, "https://")) 253 path_start = url + 8; 254 else if (str_starts_with(url, "http://")) 255 path_start = url + 7; 256 257 /* Skip domain */ 258 while (*path_start && *path_start != '/') 259 path_start++; 260 261 /* No path or just "/" -> index.html */ 262 if (!*path_start || strcmp(path_start, "/") == 0) 263 return xstrdup("index.html"); 264 265 /* Skip leading slash */ 266 if (*path_start == '/') 267 path_start++; 268 269 /* Copy path, strip query/fragment */ 270 path = xstrdup(path_start); 271 query = strchr(path, '?'); 272 if (query) 273 *query = '\0'; 274 hash = strchr(path, '#'); 275 if (hash) 276 *hash = '\0'; 277 278 /* Remove trailing slash */ 279 len = strlen(path); 280 if (len > 0 && path[len - 1] == '/') { 281 path[len - 1] = '\0'; 282 len--; 283 } 284 285 /* If path doesn't end in .html/.htm, treat as directory */ 286 if (len > 0 && !str_ends_with(path, ".html") && 287 !str_ends_with(path, ".htm")) { 288 new_len = len + 12; 289 new_path = xmalloc(new_len); 290 snprintf(new_path, new_len, "%s/index.html", path); 291 free(path); 292 path = new_path; 293 } 294 295 return path; 296 }