detect.c - sbot - Simple web archiver — self-contained GWTAR archives

detect.c (9459B)
      1 /* See LICENSE file for copyright and license details. */
      2 
      3 #include <stdio.h>
      4 #include <stdlib.h>
      5 #include <string.h>
      6 
      7 #include "detect.h"
      8 #include "util.h"
      9 
     10 /* CMS signature strings found in HTML */
     11 static const char *wp_sigs[] = {
     12 	"wp-content/",
     13 	"wp-includes/",
     14 	"wp-json/",
     15 	"/xmlrpc.php",
     16 	"name=\"generator\" content=\"WordPress",
     17 	"powered by WordPress",
     18 	NULL
     19 };
     20 
     21 static const char *blogger_sigs[] = {
     22 	"blogger.com",
     23 	"blogspot.com",
     24 	"content=\"blogger\"",
     25 	"name=\"generator\" content=\"Blogger",
     26 	"b:skin",
     27 	"b:template",
     28 	NULL
     29 };
     30 
     31 static const char *hugo_sigs[] = {
     32 	"name=\"generator\" content=\"Hugo",
     33 	"powered by Hugo",
     34 	"Hugo --",
     35 	NULL
     36 };
     37 
     38 static const char *jekyll_sigs[] = {
     39 	"name=\"generator\" content=\"Jekyll",
     40 	"powered by Jekyll",
     41 	"jekyll-",
     42 	NULL
     43 };
     44 
     45 static const char *ghost_sigs[] = {
     46 	"content=\"Ghost",
     47 	"ghost-",
     48 	"ghost/api/",
     49 	"class=\"gh-",
     50 	NULL
     51 };
     52 
     53 static const char *drupal_sigs[] = {
     54 	"Drupal.settings",
     55 	"name=\"generator\" content=\"Drupal",
     56 	"/sites/default/files/",
     57 	"/modules/",
     58 	NULL
     59 };
     60 
     61 static const char *mediawiki_sigs[] = {
     62 	"name=\"generator\" content=\"MediaWiki",
     63 	"wgArticleId",
     64 	"mw-content-text",
     65 	"/wiki/",
     66 	NULL
     67 };
     68 
     69 /* Check if HTML contains any signature from a list */
     70 static int
     71 match_sigs(const char *html, const char **sigs)
     72 {
     73 	int i, hits;
     74 
     75 	hits = 0;
     76 	for (i = 0; sigs[i]; i++) {
     77 		if (strcasestr(html, sigs[i]))
     78 			hits++;
     79 	}
     80 	return hits;
     81 }
     82 
     83 static const char *
     84 sitetype_name(SiteType type)
     85 {
     86 	switch (type) {
     87 	case SITE_WORDPRESS: return "WordPress";
     88 	case SITE_BLOGGER:   return "Blogger";
     89 	case SITE_HUGO:      return "Hugo";
     90 	case SITE_JEKYLL:    return "Jekyll";
     91 	case SITE_GHOST:     return "Ghost";
     92 	case SITE_DRUPAL:    return "Drupal";
     93 	case SITE_MEDIAWIKI: return "MediaWiki";
     94 	default:             return "Unknown";
     95 	}
     96 }
     97 
     98 /* Extract feed URL from <link> tags */
     99 static char *
    100 find_feed_url(const char *html, const char *base_url)
    101 {
    102 	const char *p, *href_start, *href_end;
    103 	char *tag, *href;
    104 	size_t tag_len, href_len;
    105 	char quote;
    106 
    107 	p = html;
    108 	while ((p = strcasestr(p, "<link")) != NULL) {
    109 		const char *end = strchr(p, '>');
    110 
    111 		if (!end)
    112 			break;
    113 
    114 		tag_len = end - p;
    115 		tag = xmalloc(tag_len + 1);
    116 		memcpy(tag, p, tag_len);
    117 		tag[tag_len] = '\0';
    118 
    119 		/* Check for RSS/Atom type */
    120 		if (strcasestr(tag, "application/rss+xml") ||
    121 		    strcasestr(tag, "application/atom+xml")) {
    122 			href_start = strcasestr(tag, "href=");
    123 			if (href_start) {
    124 				href_start += 5;
    125 				quote = 0;
    126 				if (*href_start == '"' ||
    127 				    *href_start == '\'')
    128 					quote = *href_start++;
    129 
    130 				href_end = href_start;
    131 				if (quote) {
    132 					while (*href_end &&
    133 					       *href_end != quote)
    134 						href_end++;
    135 				} else {
    136 					while (*href_end &&
    137 					       *href_end != ' ' &&
    138 					       *href_end != '>')
    139 						href_end++;
    140 				}
    141 
    142 				href_len = href_end - href_start;
    143 				href = xmalloc(href_len + 1);
    144 				memcpy(href, href_start, href_len);
    145 				href[href_len] = '\0';
    146 
    147 				free(tag);
    148 
    149 				/* Resolve relative URL */
    150 				if (str_starts_with(href, "http")) {
    151 					return href;
    152 				} else {
    153 					char *resolved;
    154 
    155 					resolved = url_resolve(
    156 					    base_url, href);
    157 					free(href);
    158 					return resolved;
    159 				}
    160 			}
    161 		}
    162 
    163 		free(tag);
    164 		p = end + 1;
    165 	}
    166 
    167 	return NULL;
    168 }
    169 
    170 SiteInfo *
    171 detect_site(const char *html, const char *url)
    172 {
    173 	SiteInfo *info;
    174 	int wp, bl, hu, jk, gh, dr, mw;
    175 	int best;
    176 	char *domain;
    177 
    178 	info = xmalloc(sizeof(SiteInfo));
    179 	info->type = SITE_UNKNOWN;
    180 	info->name = "Unknown";
    181 	info->feed_url = NULL;
    182 	info->api_url = NULL;
    183 	info->sitemap_url = NULL;
    184 	info->has_json_api = 0;
    185 
    186 	/* Count signature matches for each CMS */
    187 	wp = match_sigs(html, wp_sigs);
    188 	bl = match_sigs(html, blogger_sigs);
    189 	hu = match_sigs(html, hugo_sigs);
    190 	jk = match_sigs(html, jekyll_sigs);
    191 	gh = match_sigs(html, ghost_sigs);
    192 	dr = match_sigs(html, drupal_sigs);
    193 	mw = match_sigs(html, mediawiki_sigs);
    194 
    195 	/* Pick the CMS with the most signature hits */
    196 	best = 0;
    197 
    198 	if (wp > best) { info->type = SITE_WORDPRESS; best = wp; }
    199 	if (bl > best) { info->type = SITE_BLOGGER; best = bl; }
    200 	if (hu > best) { info->type = SITE_HUGO; best = hu; }
    201 	if (jk > best) { info->type = SITE_JEKYLL; best = jk; }
    202 	if (gh > best) { info->type = SITE_GHOST; best = gh; }
    203 	if (dr > best) { info->type = SITE_DRUPAL; best = dr; }
    204 	if (mw > best) { info->type = SITE_MEDIAWIKI; best = mw; }
    205 
    206 	/* Require at least 1 hit */
    207 	if (best < 1) {
    208 		info->type = SITE_UNKNOWN;
    209 		info->name = "Unknown";
    210 		return info;
    211 	}
    212 
    213 	info->name = sitetype_name(info->type);
    214 	domain = url_get_domain(url);
    215 
    216 	/* Set CMS-specific hints */
    217 	switch (info->type) {
    218 	case SITE_WORDPRESS:
    219 		info->has_json_api = 1;
    220 		info->api_url = xmalloc(
    221 		    strlen("https://") + strlen(domain) +
    222 		    strlen("/wp-json/wp/v2/") + 1);
    223 		sprintf(info->api_url, "https://%s/wp-json/wp/v2/",
    224 		        domain);
    225 		info->sitemap_url = xmalloc(
    226 		    strlen("https://") + strlen(domain) +
    227 		    strlen("/wp-sitemap.xml") + 1);
    228 		sprintf(info->sitemap_url,
    229 		        "https://%s/wp-sitemap.xml", domain);
    230 		break;
    231 	case SITE_BLOGGER:
    232 		info->has_json_api = 1;
    233 		/* Blogger Atom feed */
    234 		info->feed_url = xmalloc(
    235 		    strlen("https://") + strlen(domain) +
    236 		    strlen("/feeds/posts/default") + 1);
    237 		sprintf(info->feed_url,
    238 		        "https://%s/feeds/posts/default", domain);
    239 		break;
    240 	case SITE_HUGO:
    241 		info->sitemap_url = xmalloc(
    242 		    strlen("https://") + strlen(domain) +
    243 		    strlen("/sitemap.xml") + 1);
    244 		sprintf(info->sitemap_url,
    245 		        "https://%s/sitemap.xml", domain);
    246 		break;
    247 	case SITE_JEKYLL:
    248 		info->sitemap_url = xmalloc(
    249 		    strlen("https://") + strlen(domain) +
    250 		    strlen("/sitemap.xml") + 1);
    251 		sprintf(info->sitemap_url,
    252 		        "https://%s/sitemap.xml", domain);
    253 		break;
    254 	case SITE_GHOST:
    255 		info->has_json_api = 1;
    256 		info->api_url = xmalloc(
    257 		    strlen("https://") + strlen(domain) +
    258 		    strlen("/ghost/api/content/") + 1);
    259 		sprintf(info->api_url,
    260 		        "https://%s/ghost/api/content/", domain);
    261 		info->sitemap_url = xmalloc(
    262 		    strlen("https://") + strlen(domain) +
    263 		    strlen("/sitemap.xml") + 1);
    264 		sprintf(info->sitemap_url,
    265 		        "https://%s/sitemap.xml", domain);
    266 		break;
    267 	case SITE_DRUPAL:
    268 		info->sitemap_url = xmalloc(
    269 		    strlen("https://") + strlen(domain) +
    270 		    strlen("/sitemap.xml") + 1);
    271 		sprintf(info->sitemap_url,
    272 		        "https://%s/sitemap.xml", domain);
    273 		break;
    274 	case SITE_MEDIAWIKI:
    275 		info->has_json_api = 1;
    276 		info->api_url = xmalloc(
    277 		    strlen("https://") + strlen(domain) +
    278 		    strlen("/w/api.php") + 1);
    279 		sprintf(info->api_url,
    280 		        "https://%s/w/api.php", domain);
    281 		break;
    282 	default:
    283 		break;
    284 	}
    285 
    286 	/* Try to find feed URL from HTML if not set */
    287 	if (!info->feed_url)
    288 		info->feed_url = find_feed_url(html, url);
    289 
    290 	free(domain);
    291 	return info;
    292 }
    293 
    294 void
    295 siteinfo_free(SiteInfo *info)
    296 {
    297 	if (!info)
    298 		return;
    299 	free(info->feed_url);
    300 	free(info->api_url);
    301 	free(info->sitemap_url);
    302 	free(info);
    303 }
    304 
    305 /*
    306  * Parse a simple sitemap.xml to extract <loc> URLs.
    307  * Returns array of URL strings, sets *count.
    308  * Caller frees the array and each string.
    309  */
    310 char **
    311 detect_sitemap_urls(SiteInfo *info, const char *domain, int *count)
    312 {
    313 	char **urls;
    314 	int capacity, n;
    315 
    316 	(void)info;
    317 	(void)domain;
    318 
    319 	capacity = 64;
    320 	n = 0;
    321 	urls = xmalloc(capacity * sizeof(char *));
    322 
    323 	*count = n;
    324 	return urls;
    325 }
    326 
    327 /*
    328  * Get additional seed URLs based on CMS type.
    329  * For WordPress: /feed/, /wp-sitemap.xml
    330  * For Hugo/Jekyll: /sitemap.xml, /index.xml
    331  * For Blogger: /feeds/posts/default
    332  */
    333 char **
    334 detect_seed_urls(SiteInfo *info, const char *domain, int *count)
    335 {
    336 	char **urls;
    337 	int n;
    338 	size_t len;
    339 
    340 	n = 0;
    341 	urls = xmalloc(8 * sizeof(char *));
    342 
    343 	switch (info->type) {
    344 	case SITE_WORDPRESS:
    345 		len = strlen("https://") + strlen(domain) +
    346 		      strlen("/feed/") + 1;
    347 		urls[n] = xmalloc(len);
    348 		sprintf(urls[n], "https://%s/feed/", domain);
    349 		n++;
    350 		len = strlen("https://") + strlen(domain) +
    351 		      strlen("/wp-sitemap.xml") + 1;
    352 		urls[n] = xmalloc(len);
    353 		sprintf(urls[n], "https://%s/wp-sitemap.xml", domain);
    354 		n++;
    355 		break;
    356 	case SITE_BLOGGER:
    357 		len = strlen("https://") + strlen(domain) +
    358 		      strlen("/feeds/posts/default") + 1;
    359 		urls[n] = xmalloc(len);
    360 		sprintf(urls[n], "https://%s/feeds/posts/default",
    361 		        domain);
    362 		n++;
    363 		len = strlen("https://") + strlen(domain) +
    364 		      strlen("/sitemap.xml") + 1;
    365 		urls[n] = xmalloc(len);
    366 		sprintf(urls[n], "https://%s/sitemap.xml", domain);
    367 		n++;
    368 		break;
    369 	case SITE_HUGO:
    370 		/* fallthrough */
    371 	case SITE_JEKYLL:
    372 		len = strlen("https://") + strlen(domain) +
    373 		      strlen("/sitemap.xml") + 1;
    374 		urls[n] = xmalloc(len);
    375 		sprintf(urls[n], "https://%s/sitemap.xml", domain);
    376 		n++;
    377 		len = strlen("https://") + strlen(domain) +
    378 		      strlen("/index.xml") + 1;
    379 		urls[n] = xmalloc(len);
    380 		sprintf(urls[n], "https://%s/index.xml", domain);
    381 		n++;
    382 		break;
    383 	case SITE_GHOST:
    384 		len = strlen("https://") + strlen(domain) +
    385 		      strlen("/sitemap.xml") + 1;
    386 		urls[n] = xmalloc(len);
    387 		sprintf(urls[n], "https://%s/sitemap.xml", domain);
    388 		n++;
    389 		break;
    390 	case SITE_DRUPAL:
    391 		len = strlen("https://") + strlen(domain) +
    392 		      strlen("/sitemap.xml") + 1;
    393 		urls[n] = xmalloc(len);
    394 		sprintf(urls[n], "https://%s/sitemap.xml", domain);
    395 		n++;
    396 		break;
    397 	case SITE_MEDIAWIKI:
    398 		len = strlen("https://") + strlen(domain) +
    399 		      strlen("/wiki/Special:AllPages") + 1;
    400 		urls[n] = xmalloc(len);
    401 		sprintf(urls[n], "https://%s/wiki/Special:AllPages",
    402 		        domain);
    403 		n++;
    404 		break;
    405 	default:
    406 		break;
    407 	}
    408 
    409 	*count = n;
    410 	return urls;
    411 }
	sbot Simple web archiver — self-contained GWTAR archives
	git clone https://git.krisyotam.com/krisyotam/sbot.git
	Log \| Files \| Refs \| README \| LICENSE