parse.c (10170B)
1 /* See LICENSE file for copyright and license details. */ 2 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 #include <ctype.h> 7 8 #include "parse.h" 9 #include "util.h" 10 11 ResourceList * 12 reslist_new(void) 13 { 14 ResourceList *list = xmalloc(sizeof(ResourceList)); 15 list->head = NULL; 16 list->tail = NULL; 17 list->count = 0; 18 return list; 19 } 20 21 void 22 reslist_free(ResourceList *list) 23 { 24 if (!list) 25 return; 26 Resource *r = list->head; 27 while (r) { 28 Resource *next = r->next; 29 free(r->url); 30 free(r); 31 r = next; 32 } 33 free(list); 34 } 35 36 int 37 reslist_contains(ResourceList *list, const char *url) 38 { 39 for (Resource *r = list->head; r; r = r->next) 40 if (strcmp(r->url, url) == 0) 41 return 1; 42 return 0; 43 } 44 45 void 46 reslist_add(ResourceList *list, const char *url, ResourceType type) 47 { 48 if (!url || !*url || reslist_contains(list, url)) 49 return; 50 51 /* Skip data: URLs */ 52 if (str_starts_with(url, "data:")) 53 return; 54 55 Resource *r = xmalloc(sizeof(Resource)); 56 r->url = xstrdup(url); 57 r->type = type; 58 r->next = NULL; 59 60 if (list->tail) { 61 list->tail->next = r; 62 list->tail = r; 63 } else { 64 list->head = r; 65 list->tail = r; 66 } 67 list->count++; 68 } 69 70 /* Extract attribute value from tag */ 71 static char * 72 get_attr(const char *tag, const char *attr) 73 { 74 size_t attr_len = strlen(attr); 75 const char *p = tag; 76 77 while (*p) { 78 /* Skip whitespace */ 79 while (*p && isspace((unsigned char)*p)) 80 p++; 81 82 /* Check for attribute name */ 83 if (strncasecmp(p, attr, attr_len) == 0) { 84 p += attr_len; 85 while (*p && isspace((unsigned char)*p)) 86 p++; 87 if (*p == '=') { 88 p++; 89 while (*p && isspace((unsigned char)*p)) 90 p++; 91 92 char quote = 0; 93 if (*p == '"' || *p == '\'') { 94 quote = *p++; 95 } 96 97 const char *start = p; 98 if (quote) { 99 while (*p && *p != quote) 100 p++; 101 } else { 102 while (*p && !isspace((unsigned char)*p) && *p != '>') 103 p++; 104 } 105 106 size_t len = p - start; 107 char *value = xmalloc(len + 1); 108 memcpy(value, start, len); 109 value[len] = '\0'; 110 return value; 111 } 112 } 113 114 /* Skip to next attribute */ 115 while (*p && !isspace((unsigned char)*p) && *p != '>') 116 p++; 117 } 118 119 return NULL; 120 } 121 122 /* Determine resource type from URL/tag */ 123 static ResourceType 124 guess_resource_type(const char *url, const char *tag_name) 125 { 126 if (!tag_name) 127 return RES_OTHER; 128 129 if (strcasecmp(tag_name, "img") == 0) 130 return RES_IMAGE; 131 132 if (strcasecmp(tag_name, "link") == 0) { 133 if (strstr(url, ".css") || strstr(url, "stylesheet")) 134 return RES_CSS; 135 if (strstr(url, ".woff") || strstr(url, ".ttf") || strstr(url, ".otf")) 136 return RES_FONT; 137 return RES_OTHER; 138 } 139 140 if (strcasecmp(tag_name, "a") == 0) 141 return RES_PAGE; 142 143 if (strcasecmp(tag_name, "script") == 0) 144 return RES_OTHER; 145 146 /* Check by extension */ 147 char *lower = xstrdup(url); 148 str_tolower(lower); 149 150 ResourceType type = RES_OTHER; 151 if (strstr(lower, ".jpg") || strstr(lower, ".jpeg") || 152 strstr(lower, ".png") || strstr(lower, ".gif") || 153 strstr(lower, ".webp") || strstr(lower, ".svg") || 154 strstr(lower, ".ico")) 155 type = RES_IMAGE; 156 else if (strstr(lower, ".css")) 157 type = RES_CSS; 158 else if (strstr(lower, ".woff") || strstr(lower, ".woff2") || 159 strstr(lower, ".ttf") || strstr(lower, ".otf") || 160 strstr(lower, ".eot")) 161 type = RES_FONT; 162 163 free(lower); 164 return type; 165 } 166 167 ResourceList * 168 parse_html(const char *html, const char *base_url) 169 { 170 ResourceList *list = reslist_new(); 171 const char *p = html; 172 173 while (*p) { 174 /* Find tag start */ 175 if (*p != '<') { 176 p++; 177 continue; 178 } 179 p++; 180 181 /* Skip comments */ 182 if (str_starts_with(p, "!--")) { 183 p = strstr(p, "-->"); 184 if (p) 185 p += 3; 186 else 187 break; 188 continue; 189 } 190 191 /* Get tag name */ 192 const char *tag_start = p; 193 while (*p && !isspace((unsigned char)*p) && *p != '>' && *p != '/') 194 p++; 195 196 size_t tag_len = p - tag_start; 197 if (tag_len == 0 || tag_len > 20) 198 continue; 199 200 char tag_name[21]; 201 memcpy(tag_name, tag_start, tag_len); 202 tag_name[tag_len] = '\0'; 203 204 /* Find tag end */ 205 const char *tag_end = strchr(p, '>'); 206 if (!tag_end) 207 break; 208 209 /* Extract tag content for attribute parsing */ 210 size_t content_len = tag_end - tag_start; 211 char *tag_content = xmalloc(content_len + 1); 212 memcpy(tag_content, tag_start, content_len); 213 tag_content[content_len] = '\0'; 214 215 /* Check for relevant attributes based on tag */ 216 char *url = NULL; 217 218 if (strcasecmp(tag_name, "img") == 0) { 219 url = get_attr(tag_content, "src"); 220 if (!url) 221 url = get_attr(tag_content, "data-src"); 222 } else if (strcasecmp(tag_name, "link") == 0) { 223 url = get_attr(tag_content, "href"); 224 } else if (strcasecmp(tag_name, "script") == 0) { 225 url = get_attr(tag_content, "src"); 226 } else if (strcasecmp(tag_name, "a") == 0) { 227 url = get_attr(tag_content, "href"); 228 } else if (strcasecmp(tag_name, "source") == 0) { 229 url = get_attr(tag_content, "srcset"); 230 if (!url) 231 url = get_attr(tag_content, "src"); 232 } 233 234 if (url && *url) { 235 char *resolved = url_resolve(base_url, url); 236 ResourceType type = guess_resource_type(resolved, tag_name); 237 reslist_add(list, resolved, type); 238 free(resolved); 239 } 240 241 free(url); 242 free(tag_content); 243 p = tag_end + 1; 244 } 245 246 return list; 247 } 248 249 char * 250 parse_title(const char *html) 251 { 252 const char *start = strcasestr(html, "<title"); 253 if (!start) 254 return xstrdup("Untitled"); 255 256 start = strchr(start, '>'); 257 if (!start) 258 return xstrdup("Untitled"); 259 start++; 260 261 const char *end = strcasestr(start, "</title>"); 262 if (!end) 263 return xstrdup("Untitled"); 264 265 size_t len = end - start; 266 char *title = xmalloc(len + 1); 267 memcpy(title, start, len); 268 title[len] = '\0'; 269 270 return str_trim(title); 271 } 272 273 /* Helper to find and replace in string, returns new allocated string */ 274 static char * 275 str_replace_first(const char *str, const char *old, size_t old_len, const char *new, size_t new_len) 276 { 277 const char *pos = strstr(str, old); 278 if (!pos) 279 return xstrdup(str); 280 281 size_t before_len = pos - str; 282 size_t after_len = strlen(pos + old_len); 283 size_t result_len = before_len + new_len + after_len; 284 285 char *result = xmalloc(result_len + 1); 286 memcpy(result, str, before_len); 287 memcpy(result + before_len, new, new_len); 288 memcpy(result + before_len + new_len, pos + old_len, after_len + 1); 289 290 return result; 291 } 292 293 char * 294 inline_resources(const char *html, const char *base_url, 295 char *(*fetch_and_encode)(const char *url, const char *base_url)) 296 { 297 char *result = xstrdup(html); 298 size_t search_offset = 0; 299 300 /* Process img tags */ 301 while (1) { 302 const char *p = strcasestr(result + search_offset, "<img"); 303 if (!p) 304 break; 305 306 const char *tag_end = strchr(p, '>'); 307 if (!tag_end) 308 break; 309 310 /* Calculate offset for this tag */ 311 size_t tag_offset = p - result; 312 313 /* Find src attribute */ 314 const char *src_start = strcasestr(p, "src="); 315 if (!src_start || src_start > tag_end) { 316 /* No src, skip this img */ 317 search_offset = (tag_end - result) + 1; 318 continue; 319 } 320 321 src_start += 4; 322 char quote = 0; 323 if (*src_start == '"' || *src_start == '\'') 324 quote = *src_start++; 325 326 const char *src_end = src_start; 327 if (quote) { 328 while (*src_end && *src_end != quote) 329 src_end++; 330 } else { 331 while (*src_end && !isspace((unsigned char)*src_end) && *src_end != '>') 332 src_end++; 333 } 334 335 /* Extract URL */ 336 size_t url_len = src_end - src_start; 337 char *url = xmalloc(url_len + 1); 338 memcpy(url, src_start, url_len); 339 url[url_len] = '\0'; 340 341 /* Skip if already data URI */ 342 if (str_starts_with(url, "data:")) { 343 free(url); 344 search_offset = (tag_end - result) + 1; 345 continue; 346 } 347 348 /* Fetch and encode */ 349 char *data_uri = fetch_and_encode(url, base_url); 350 if (data_uri) { 351 /* Build old and new strings for replacement */ 352 char *old_attr = xmalloc(url_len + 8); 353 snprintf(old_attr, url_len + 8, "src=%c%s%c", 354 quote ? quote : '"', url, quote ? quote : '"'); 355 356 size_t new_attr_len = 5 + strlen(data_uri) + 2; 357 char *new_attr = xmalloc(new_attr_len + 1); 358 snprintf(new_attr, new_attr_len + 1, "src=\"%s\"", data_uri); 359 360 char *new_result = str_replace_first(result, old_attr, strlen(old_attr), 361 new_attr, strlen(new_attr)); 362 /* Continue searching after the new data URI */ 363 search_offset = tag_offset + strlen(new_attr); 364 365 free(result); 366 result = new_result; 367 368 free(old_attr); 369 free(new_attr); 370 free(data_uri); 371 } else { 372 /* Fetch failed, skip this img tag */ 373 search_offset = (tag_end - result) + 1; 374 } 375 376 free(url); 377 } 378 379 return result; 380 }