sbot

Simple web archiver — self-contained GWTAR archives
git clone https://git.krisyotam.com/krisyotam/sbot.git
Log | Files | Refs | README | LICENSE

crawl.h (1327B)


/* See LICENSE file for copyright and license details. */

#ifndef CRAWL_H
#define CRAWL_H

#include <stddef.h>

/* Hash table size for the visited set: 65521 is the largest prime
 * below 2^16, giving ~64k buckets and an even modulo distribution. */
#define HT_SIZE 65521
/* URL queue for BFS crawling */

/* Singly linked node in the FIFO crawl queue. */
typedef struct QueueNode {
	char *url;              /* URL string; NOTE(review): presumably a heap
	                         * copy owned by the queue — confirm in crawl.c */
	int depth;              /* BFS depth at which this URL was discovered */
	struct QueueNode *next; /* next node toward the tail, NULL at the end */
} QueueNode;
     17 
/* FIFO queue of URLs: pushed at the tail, popped at the head. */
typedef struct {
	QueueNode *head; /* dequeue end; NULL when the queue is empty */
	QueueNode *tail; /* enqueue end; NULL when the queue is empty */
	size_t count;    /* number of nodes currently queued */
} UrlQueue;
     23 
/* Hash table node for visited URLs */

/* Chain entry for one bucket of the visited set (separate chaining). */
typedef struct HashNode {
	char *url;             /* URL string; NOTE(review): presumably a heap
	                        * copy owned by the set — confirm in crawl.c */
	struct HashNode *next; /* next entry in this bucket's chain */
} HashNode;
     29 
/* Hash table based visited set - O(1) expected lookup via HT_SIZE
 * buckets with separate chaining. */
typedef struct {
	HashNode *buckets[HT_SIZE]; /* fixed bucket array, chained on collision */
	size_t count;               /* total number of URLs stored */
} VisitedSet;
     35 
/* Queue operations */

/* Allocate an empty queue; NOTE(review): presumably returns NULL on
 * allocation failure — confirm in crawl.c. Free with queue_free(). */
UrlQueue *queue_new(void);
/* Release the queue and all remaining nodes. */
void queue_free(UrlQueue *q);
/* Append url (with its BFS depth) to the tail.
 * NOTE(review): presumably copies url — confirm in crawl.c. */
void queue_push(UrlQueue *q, const char *url, int depth);
/* Detach and return the head node, or NULL if empty.
 * NOTE(review): caller presumably frees node and node->url — confirm. */
QueueNode *queue_pop(UrlQueue *q);
/* Nonzero iff the queue holds no nodes. */
int queue_empty(UrlQueue *q);
/* Number of nodes currently queued. */
size_t queue_size(UrlQueue *q);
     43 
/* Visited set operations (hash table) */

/* Allocate an empty visited set; NOTE(review): presumably returns NULL
 * on allocation failure — confirm in crawl.c. Free with visited_free(). */
VisitedSet *visited_new(void);
/* Release the set and every stored URL string. */
void visited_free(VisitedSet *v);
/* Record url as visited.
 * NOTE(review): presumably copies url — confirm in crawl.c. */
void visited_add(VisitedSet *v, const char *url);
/* Nonzero iff url has been added before. */
int visited_contains(VisitedSet *v, const char *url);
/* Number of distinct URLs stored. */
size_t visited_count(VisitedSet *v);
     50 
/* URL normalization for comparison.
 * NOTE(review): presumably returns a newly allocated canonical form the
 * caller must free — confirm against the implementation in crawl.c. */
char *url_normalize(const char *url);
     53 
/* Get path component from URL for the on-disk directory structure,
 * relative to base_domain.
 * NOTE(review): presumably returns a newly allocated string the caller
 * must free — confirm against the implementation in crawl.c. */
char *url_to_path(const char *url, const char *base_domain);
     56 
     57 #endif /* CRAWL_H */