sbot

Simple web archiver — self-contained GWTAR archives
git clone https://git.krisyotam.com/krisyotam/sbot.git
Log | Files | Refs | README | LICENSE

parse.h (1176B)


      1 /* See LICENSE file for copyright and license details. */
      2 
      3 #ifndef PARSE_H
      4 #define PARSE_H
      5 
      6 #include <stddef.h>
      7 
      8 /* Resource types: the category tag stored in Resource.type and passed to reslist_add(). */
      9 typedef enum {
     10     RES_IMAGE,
     11     RES_CSS,
     12     RES_FONT,
     13     RES_PAGE,
     14     RES_OTHER  /* NOTE(review): presumably the fallback for anything not covered above -- confirm in parse.c */
     15 } ResourceType;
     16 
     17 /* Extracted resource: one singly linked node pairing a URL string with its ResourceType. */
     18 typedef struct Resource {
     19     char *url;  /* per the reslist_add() comment this is a copy of the caller's string */
     20     ResourceType type;
     21     struct Resource *next;  /* following node; NOTE(review): presumably NULL on the last node -- confirm */
     22 } Resource;
     23 
     24 /* Resource list: head and tail pointers into a chain of Resource nodes, plus a size_t count. */
     25 typedef struct {
     26     Resource *head;
     27     Resource *tail;  /* NOTE(review): presumably kept so reslist_add() can append without walking the chain -- confirm */
     28     size_t count;  /* NOTE(review): presumably the number of nodes currently linked -- confirm */
     29 } ResourceList;
     30 
     31 /* Create/destroy resource list. NOTE(review): presumably reslist_new() returns NULL on allocation failure and reslist_free() also releases every node and its url -- confirm in parse.c. */
     32 ResourceList *reslist_new(void);
     33 void reslist_free(ResourceList *list);
     34 
     35 /* Add a resource (url is copied, so the caller's string is not retained). Returns void, so a failure to add cannot be reported through the return value. */
     36 void reslist_add(ResourceList *list, const char *url, ResourceType type);
     37 
     38 /* Check if URL already in list. NOTE(review): presumably returns nonzero when found and 0 otherwise, comparing by exact string match -- confirm in parse.c. */
     39 int reslist_contains(ResourceList *list, const char *url);
     40 
     41 /* Extract resources from HTML into a ResourceList. NOTE(review): presumably base_url resolves relative links and the caller releases the result with reslist_free() -- confirm in parse.c. */
     42 ResourceList *parse_html(const char *html, const char *base_url);
     43 
     44 /* Extract title from HTML. Returns a non-const char *; NOTE(review): presumably heap-allocated and caller-owned, with NULL when no title is found -- confirm in parse.c. */
     45 char *parse_title(const char *html);
     46 
     47 /* Find and inline resources in HTML. fetch_and_encode is a caller-supplied callback taking (url, base_url) and returning char *; NOTE(review): presumably it yields the encoded resource data and the returned HTML is a new caller-owned string -- confirm in parse.c. */
     48 char *inline_resources(const char *html, const char *base_url,
     49                        char *(*fetch_and_encode)(const char *url, const char *base_url));
     50 
     51 #endif /* PARSE_H */