crawl.h (1327B)
/* See LICENSE file for copyright and license details. */

#ifndef CRAWL_H
#define CRAWL_H

#include <stddef.h>

/* Number of buckets in the visited-URL hash table (prime, ~64k). */
#define HT_SIZE 65521

/*
 * Forward typedefs: let the self-referential list nodes below refer to
 * themselves by their short names.
 */
typedef struct QueueNode QueueNode;
typedef struct HashNode HashNode;

/*
 * One entry of the URL queue used for BFS crawling.
 * NOTE(review): whether `url` is owned by the node is not visible in this
 * header -- confirm against the implementation.
 */
struct QueueNode {
	char *url;        /* URL string carried by this entry */
	int depth;        /* depth value passed to queue_push() for this URL */
	QueueNode *next;  /* following entry in the list */
};

/* Linked queue of URLs for BFS crawling. */
typedef struct {
	QueueNode *head;  /* first entry */
	QueueNode *tail;  /* last entry */
	size_t count;     /* entry counter */
} UrlQueue;

/* Chain node of the visited-URL hash table. */
struct HashNode {
	char *url;        /* URL string stored in this node */
	HashNode *next;   /* next node in the same bucket chain */
};

/*
 * Visited set backed by a hash table (intended for O(1) lookup).
 * The bucket array is embedded, so one VisitedSet holds HT_SIZE pointers
 * inline; prefer heap or static storage over the stack.
 */
typedef struct {
	HashNode *buckets[HT_SIZE];  /* per-bucket chain heads */
	size_t count;                /* URL counter */
} VisitedSet;

/*
 * Queue operations.
 * NOTE(review): allocation-failure behaviour, whether queue_push() copies
 * `url`, and who releases the node returned by queue_pop() are not visible
 * in this header -- confirm in the implementation before relying on them.
 */
UrlQueue *queue_new(void);
void queue_free(UrlQueue *q);
void queue_push(UrlQueue *q, const char *url, int depth);
QueueNode *queue_pop(UrlQueue *q);
int queue_empty(UrlQueue *q);
size_t queue_size(UrlQueue *q);

/*
 * Visited set operations (hash table).
 * NOTE(review): presumably visited_contains() returns non-zero when `url`
 * is present -- the exact return convention is not shown here; verify.
 */
VisitedSet *visited_new(void);
void visited_free(VisitedSet *v);
void visited_add(VisitedSet *v, const char *url);
int visited_contains(VisitedSet *v, const char *url);
size_t visited_count(VisitedSet *v);

/*
 * URL normalization for comparison.
 * NOTE(review): the non-const `char *` return suggests the caller receives
 * a buffer it may have to free -- confirm.
 */
char *url_normalize(const char *url);

/*
 * Get the path component of a URL, for building the directory structure.
 * NOTE(review): how `base_domain` is used and who owns the returned string
 * are not visible in this header -- confirm.
 */
char *url_to_path(const char *url, const char *base_domain);

#endif /* CRAWL_H */