parse.c - sbot - Simple web archiver — self-contained GWTAR archives

parse.c (10170B)
      1 /* See LICENSE file for copyright and license details. */
      2 
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 #include <ctype.h>
      7 
      8 #include "parse.h"
      9 #include "util.h"
     10 
     11 ResourceList *
     12 reslist_new(void)
     13 {
     14     ResourceList *list = xmalloc(sizeof(ResourceList));
     15     list->head = NULL;
     16     list->tail = NULL;
     17     list->count = 0;
     18     return list;
     19 }
     20 
     21 void
     22 reslist_free(ResourceList *list)
     23 {
     24     if (!list)
     25         return;
     26     Resource *r = list->head;
     27     while (r) {
     28         Resource *next = r->next;
     29         free(r->url);
     30         free(r);
     31         r = next;
     32     }
     33     free(list);
     34 }
     35 
     36 int
     37 reslist_contains(ResourceList *list, const char *url)
     38 {
     39     for (Resource *r = list->head; r; r = r->next)
     40         if (strcmp(r->url, url) == 0)
     41             return 1;
     42     return 0;
     43 }
     44 
     45 void
     46 reslist_add(ResourceList *list, const char *url, ResourceType type)
     47 {
     48     if (!url || !*url || reslist_contains(list, url))
     49         return;
     50 
     51     /* Skip data: URLs */
     52     if (str_starts_with(url, "data:"))
     53         return;
     54 
     55     Resource *r = xmalloc(sizeof(Resource));
     56     r->url = xstrdup(url);
     57     r->type = type;
     58     r->next = NULL;
     59 
     60     if (list->tail) {
     61         list->tail->next = r;
     62         list->tail = r;
     63     } else {
     64         list->head = r;
     65         list->tail = r;
     66     }
     67     list->count++;
     68 }
     69 
     70 /* Extract attribute value from tag */
     71 static char *
     72 get_attr(const char *tag, const char *attr)
     73 {
     74     size_t attr_len = strlen(attr);
     75     const char *p = tag;
     76 
     77     while (*p) {
     78         /* Skip whitespace */
     79         while (*p && isspace((unsigned char)*p))
     80             p++;
     81 
     82         /* Check for attribute name */
     83         if (strncasecmp(p, attr, attr_len) == 0) {
     84             p += attr_len;
     85             while (*p && isspace((unsigned char)*p))
     86                 p++;
     87             if (*p == '=') {
     88                 p++;
     89                 while (*p && isspace((unsigned char)*p))
     90                     p++;
     91 
     92                 char quote = 0;
     93                 if (*p == '"' || *p == '\'') {
     94                     quote = *p++;
     95                 }
     96 
     97                 const char *start = p;
     98                 if (quote) {
     99                     while (*p && *p != quote)
    100                         p++;
    101                 } else {
    102                     while (*p && !isspace((unsigned char)*p) && *p != '>')
    103                         p++;
    104                 }
    105 
    106                 size_t len = p - start;
    107                 char *value = xmalloc(len + 1);
    108                 memcpy(value, start, len);
    109                 value[len] = '\0';
    110                 return value;
    111             }
    112         }
    113 
    114         /* Skip to next attribute */
    115         while (*p && !isspace((unsigned char)*p) && *p != '>')
    116             p++;
    117     }
    118 
    119     return NULL;
    120 }
    121 
    122 /* Determine resource type from URL/tag */
    123 static ResourceType
    124 guess_resource_type(const char *url, const char *tag_name)
    125 {
    126     if (!tag_name)
    127         return RES_OTHER;
    128 
    129     if (strcasecmp(tag_name, "img") == 0)
    130         return RES_IMAGE;
    131 
    132     if (strcasecmp(tag_name, "link") == 0) {
    133         if (strstr(url, ".css") || strstr(url, "stylesheet"))
    134             return RES_CSS;
    135         if (strstr(url, ".woff") || strstr(url, ".ttf") || strstr(url, ".otf"))
    136             return RES_FONT;
    137         return RES_OTHER;
    138     }
    139 
    140     if (strcasecmp(tag_name, "a") == 0)
    141         return RES_PAGE;
    142 
    143     if (strcasecmp(tag_name, "script") == 0)
    144         return RES_OTHER;
    145 
    146     /* Check by extension */
    147     char *lower = xstrdup(url);
    148     str_tolower(lower);
    149 
    150     ResourceType type = RES_OTHER;
    151     if (strstr(lower, ".jpg") || strstr(lower, ".jpeg") ||
    152         strstr(lower, ".png") || strstr(lower, ".gif") ||
    153         strstr(lower, ".webp") || strstr(lower, ".svg") ||
    154         strstr(lower, ".ico"))
    155         type = RES_IMAGE;
    156     else if (strstr(lower, ".css"))
    157         type = RES_CSS;
    158     else if (strstr(lower, ".woff") || strstr(lower, ".woff2") ||
    159              strstr(lower, ".ttf") || strstr(lower, ".otf") ||
    160              strstr(lower, ".eot"))
    161         type = RES_FONT;
    162 
    163     free(lower);
    164     return type;
    165 }
    166 
    167 ResourceList *
    168 parse_html(const char *html, const char *base_url)
    169 {
    170     ResourceList *list = reslist_new();
    171     const char *p = html;
    172 
    173     while (*p) {
    174         /* Find tag start */
    175         if (*p != '<') {
    176             p++;
    177             continue;
    178         }
    179         p++;
    180 
    181         /* Skip comments */
    182         if (str_starts_with(p, "!--")) {
    183             p = strstr(p, "-->");
    184             if (p)
    185                 p += 3;
    186             else
    187                 break;
    188             continue;
    189         }
    190 
    191         /* Get tag name */
    192         const char *tag_start = p;
    193         while (*p && !isspace((unsigned char)*p) && *p != '>' && *p != '/')
    194             p++;
    195 
    196         size_t tag_len = p - tag_start;
    197         if (tag_len == 0 || tag_len > 20)
    198             continue;
    199 
    200         char tag_name[21];
    201         memcpy(tag_name, tag_start, tag_len);
    202         tag_name[tag_len] = '\0';
    203 
    204         /* Find tag end */
    205         const char *tag_end = strchr(p, '>');
    206         if (!tag_end)
    207             break;
    208 
    209         /* Extract tag content for attribute parsing */
    210         size_t content_len = tag_end - tag_start;
    211         char *tag_content = xmalloc(content_len + 1);
    212         memcpy(tag_content, tag_start, content_len);
    213         tag_content[content_len] = '\0';
    214 
    215         /* Check for relevant attributes based on tag */
    216         char *url = NULL;
    217 
    218         if (strcasecmp(tag_name, "img") == 0) {
    219             url = get_attr(tag_content, "src");
    220             if (!url)
    221                 url = get_attr(tag_content, "data-src");
    222         } else if (strcasecmp(tag_name, "link") == 0) {
    223             url = get_attr(tag_content, "href");
    224         } else if (strcasecmp(tag_name, "script") == 0) {
    225             url = get_attr(tag_content, "src");
    226         } else if (strcasecmp(tag_name, "a") == 0) {
    227             url = get_attr(tag_content, "href");
    228         } else if (strcasecmp(tag_name, "source") == 0) {
    229             url = get_attr(tag_content, "srcset");
    230             if (!url)
    231                 url = get_attr(tag_content, "src");
    232         }
    233 
    234         if (url && *url) {
    235             char *resolved = url_resolve(base_url, url);
    236             ResourceType type = guess_resource_type(resolved, tag_name);
    237             reslist_add(list, resolved, type);
    238             free(resolved);
    239         }
    240 
    241         free(url);
    242         free(tag_content);
    243         p = tag_end + 1;
    244     }
    245 
    246     return list;
    247 }
    248 
    249 char *
    250 parse_title(const char *html)
    251 {
    252     const char *start = strcasestr(html, "<title");
    253     if (!start)
    254         return xstrdup("Untitled");
    255 
    256     start = strchr(start, '>');
    257     if (!start)
    258         return xstrdup("Untitled");
    259     start++;
    260 
    261     const char *end = strcasestr(start, "</title>");
    262     if (!end)
    263         return xstrdup("Untitled");
    264 
    265     size_t len = end - start;
    266     char *title = xmalloc(len + 1);
    267     memcpy(title, start, len);
    268     title[len] = '\0';
    269 
    270     return str_trim(title);
    271 }
    272 
    273 /* Helper to find and replace in string, returns new allocated string */
    274 static char *
    275 str_replace_first(const char *str, const char *old, size_t old_len, const char *new, size_t new_len)
    276 {
    277     const char *pos = strstr(str, old);
    278     if (!pos)
    279         return xstrdup(str);
    280 
    281     size_t before_len = pos - str;
    282     size_t after_len = strlen(pos + old_len);
    283     size_t result_len = before_len + new_len + after_len;
    284 
    285     char *result = xmalloc(result_len + 1);
    286     memcpy(result, str, before_len);
    287     memcpy(result + before_len, new, new_len);
    288     memcpy(result + before_len + new_len, pos + old_len, after_len + 1);
    289 
    290     return result;
    291 }
    292 
    293 char *
    294 inline_resources(const char *html, const char *base_url,
    295                  char *(*fetch_and_encode)(const char *url, const char *base_url))
    296 {
    297     char *result = xstrdup(html);
    298     size_t search_offset = 0;
    299 
    300     /* Process img tags */
    301     while (1) {
    302         const char *p = strcasestr(result + search_offset, "<img");
    303         if (!p)
    304             break;
    305 
    306         const char *tag_end = strchr(p, '>');
    307         if (!tag_end)
    308             break;
    309 
    310         /* Calculate offset for this tag */
    311         size_t tag_offset = p - result;
    312 
    313         /* Find src attribute */
    314         const char *src_start = strcasestr(p, "src=");
    315         if (!src_start || src_start > tag_end) {
    316             /* No src, skip this img */
    317             search_offset = (tag_end - result) + 1;
    318             continue;
    319         }
    320 
    321         src_start += 4;
    322         char quote = 0;
    323         if (*src_start == '"' || *src_start == '\'')
    324             quote = *src_start++;
    325 
    326         const char *src_end = src_start;
    327         if (quote) {
    328             while (*src_end && *src_end != quote)
    329                 src_end++;
    330         } else {
    331             while (*src_end && !isspace((unsigned char)*src_end) && *src_end != '>')
    332                 src_end++;
    333         }
    334 
    335         /* Extract URL */
    336         size_t url_len = src_end - src_start;
    337         char *url = xmalloc(url_len + 1);
    338         memcpy(url, src_start, url_len);
    339         url[url_len] = '\0';
    340 
    341         /* Skip if already data URI */
    342         if (str_starts_with(url, "data:")) {
    343             free(url);
    344             search_offset = (tag_end - result) + 1;
    345             continue;
    346         }
    347 
    348         /* Fetch and encode */
    349         char *data_uri = fetch_and_encode(url, base_url);
    350         if (data_uri) {
    351             /* Build old and new strings for replacement */
    352             char *old_attr = xmalloc(url_len + 8);
    353             snprintf(old_attr, url_len + 8, "src=%c%s%c",
    354                      quote ? quote : '"', url, quote ? quote : '"');
    355 
    356             size_t new_attr_len = 5 + strlen(data_uri) + 2;
    357             char *new_attr = xmalloc(new_attr_len + 1);
    358             snprintf(new_attr, new_attr_len + 1, "src=\"%s\"", data_uri);
    359 
    360             char *new_result = str_replace_first(result, old_attr, strlen(old_attr),
    361                                                   new_attr, strlen(new_attr));
    362             /* Continue searching after the new data URI */
    363             search_offset = tag_offset + strlen(new_attr);
    364 
    365             free(result);
    366             result = new_result;
    367 
    368             free(old_attr);
    369             free(new_attr);
    370             free(data_uri);
    371         } else {
    372             /* Fetch failed, skip this img tag */
    373             search_offset = (tag_end - result) + 1;
    374         }
    375 
    376         free(url);
    377     }
    378 
    379     return result;
    380 }
	sbot Simple web archiver — self-contained GWTAR archives
	git clone https://git.krisyotam.com/krisyotam/sbot.git
	Log \| Files \| Refs \| README \| LICENSE