detect.c (9459B)
1 /* See LICENSE file for copyright and license details. */ 2 3 #include <stdio.h> 4 #include <stdlib.h> 5 #include <string.h> 6 7 #include "detect.h" 8 #include "util.h" 9 10 /* CMS signature strings found in HTML */ 11 static const char *wp_sigs[] = { 12 "wp-content/", 13 "wp-includes/", 14 "wp-json/", 15 "/xmlrpc.php", 16 "name=\"generator\" content=\"WordPress", 17 "powered by WordPress", 18 NULL 19 }; 20 21 static const char *blogger_sigs[] = { 22 "blogger.com", 23 "blogspot.com", 24 "content=\"blogger\"", 25 "name=\"generator\" content=\"Blogger", 26 "b:skin", 27 "b:template", 28 NULL 29 }; 30 31 static const char *hugo_sigs[] = { 32 "name=\"generator\" content=\"Hugo", 33 "powered by Hugo", 34 "Hugo --", 35 NULL 36 }; 37 38 static const char *jekyll_sigs[] = { 39 "name=\"generator\" content=\"Jekyll", 40 "powered by Jekyll", 41 "jekyll-", 42 NULL 43 }; 44 45 static const char *ghost_sigs[] = { 46 "content=\"Ghost", 47 "ghost-", 48 "ghost/api/", 49 "class=\"gh-", 50 NULL 51 }; 52 53 static const char *drupal_sigs[] = { 54 "Drupal.settings", 55 "name=\"generator\" content=\"Drupal", 56 "/sites/default/files/", 57 "/modules/", 58 NULL 59 }; 60 61 static const char *mediawiki_sigs[] = { 62 "name=\"generator\" content=\"MediaWiki", 63 "wgArticleId", 64 "mw-content-text", 65 "/wiki/", 66 NULL 67 }; 68 69 /* Check if HTML contains any signature from a list */ 70 static int 71 match_sigs(const char *html, const char **sigs) 72 { 73 int i, hits; 74 75 hits = 0; 76 for (i = 0; sigs[i]; i++) { 77 if (strcasestr(html, sigs[i])) 78 hits++; 79 } 80 return hits; 81 } 82 83 static const char * 84 sitetype_name(SiteType type) 85 { 86 switch (type) { 87 case SITE_WORDPRESS: return "WordPress"; 88 case SITE_BLOGGER: return "Blogger"; 89 case SITE_HUGO: return "Hugo"; 90 case SITE_JEKYLL: return "Jekyll"; 91 case SITE_GHOST: return "Ghost"; 92 case SITE_DRUPAL: return "Drupal"; 93 case SITE_MEDIAWIKI: return "MediaWiki"; 94 default: return "Unknown"; 95 } 96 } 97 98 /* Extract feed URL from <link> tags */ 99 static char * 100 find_feed_url(const char *html, const char *base_url) 101 { 102 const char *p, *href_start, *href_end; 103 char *tag, *href; 104 size_t tag_len, href_len; 105 char quote; 106 107 p = html; 108 while ((p = strcasestr(p, "<link")) != NULL) { 109 const char *end = strchr(p, '>'); 110 111 if (!end) 112 break; 113 114 tag_len = end - p; 115 tag = xmalloc(tag_len + 1); 116 memcpy(tag, p, tag_len); 117 tag[tag_len] = '\0'; 118 119 /* Check for RSS/Atom type */ 120 if (strcasestr(tag, "application/rss+xml") || 121 strcasestr(tag, "application/atom+xml")) { 122 href_start = strcasestr(tag, "href="); 123 if (href_start) { 124 href_start += 5; 125 quote = 0; 126 if (*href_start == '"' || 127 *href_start == '\'') 128 quote = *href_start++; 129 130 href_end = href_start; 131 if (quote) { 132 while (*href_end && 133 *href_end != quote) 134 href_end++; 135 } else { 136 while (*href_end && 137 *href_end != ' ' && 138 *href_end != '>') 139 href_end++; 140 } 141 142 href_len = href_end - href_start; 143 href = xmalloc(href_len + 1); 144 memcpy(href, href_start, href_len); 145 href[href_len] = '\0'; 146 147 free(tag); 148 149 /* Resolve relative URL */ 150 if (str_starts_with(href, "http")) { 151 return href; 152 } else { 153 char *resolved; 154 155 resolved = url_resolve( 156 base_url, href); 157 free(href); 158 return resolved; 159 } 160 } 161 } 162 163 free(tag); 164 p = end + 1; 165 } 166 167 return NULL; 168 } 169 170 SiteInfo * 171 detect_site(const char *html, const char *url) 172 { 173 SiteInfo *info; 174 int wp, bl, hu, jk, gh, dr, mw; 175 int best; 176 char *domain; 177 178 info = xmalloc(sizeof(SiteInfo)); 179 info->type = SITE_UNKNOWN; 180 info->name = "Unknown"; 181 info->feed_url = NULL; 182 info->api_url = NULL; 183 info->sitemap_url = NULL; 184 info->has_json_api = 0; 185 186 /* Count signature matches for each CMS */ 187 wp = match_sigs(html, wp_sigs); 188 bl = match_sigs(html, blogger_sigs); 189 hu = match_sigs(html, hugo_sigs); 190 jk = match_sigs(html, jekyll_sigs); 191 gh = match_sigs(html, ghost_sigs); 192 dr = match_sigs(html, drupal_sigs); 193 mw = match_sigs(html, mediawiki_sigs); 194 195 /* Pick the CMS with the most signature hits */ 196 best = 0; 197 198 if (wp > best) { info->type = SITE_WORDPRESS; best = wp; } 199 if (bl > best) { info->type = SITE_BLOGGER; best = bl; } 200 if (hu > best) { info->type = SITE_HUGO; best = hu; } 201 if (jk > best) { info->type = SITE_JEKYLL; best = jk; } 202 if (gh > best) { info->type = SITE_GHOST; best = gh; } 203 if (dr > best) { info->type = SITE_DRUPAL; best = dr; } 204 if (mw > best) { info->type = SITE_MEDIAWIKI; best = mw; } 205 206 /* Require at least 1 hit */ 207 if (best < 1) { 208 info->type = SITE_UNKNOWN; 209 info->name = "Unknown"; 210 return info; 211 } 212 213 info->name = sitetype_name(info->type); 214 domain = url_get_domain(url); 215 216 /* Set CMS-specific hints */ 217 switch (info->type) { 218 case SITE_WORDPRESS: 219 info->has_json_api = 1; 220 info->api_url = xmalloc( 221 strlen("https://") + strlen(domain) + 222 strlen("/wp-json/wp/v2/") + 1); 223 sprintf(info->api_url, "https://%s/wp-json/wp/v2/", 224 domain); 225 info->sitemap_url = xmalloc( 226 strlen("https://") + strlen(domain) + 227 strlen("/wp-sitemap.xml") + 1); 228 sprintf(info->sitemap_url, 229 "https://%s/wp-sitemap.xml", domain); 230 break; 231 case SITE_BLOGGER: 232 info->has_json_api = 1; 233 /* Blogger Atom feed */ 234 info->feed_url = xmalloc( 235 strlen("https://") + strlen(domain) + 236 strlen("/feeds/posts/default") + 1); 237 sprintf(info->feed_url, 238 "https://%s/feeds/posts/default", domain); 239 break; 240 case SITE_HUGO: 241 info->sitemap_url = xmalloc( 242 strlen("https://") + strlen(domain) + 243 strlen("/sitemap.xml") + 1); 244 sprintf(info->sitemap_url, 245 "https://%s/sitemap.xml", domain); 246 break; 247 case SITE_JEKYLL: 248 info->sitemap_url = xmalloc( 249 strlen("https://") + strlen(domain) + 250 strlen("/sitemap.xml") + 1); 251 sprintf(info->sitemap_url, 252 "https://%s/sitemap.xml", domain); 253 break; 254 case SITE_GHOST: 255 info->has_json_api = 1; 256 info->api_url = xmalloc( 257 strlen("https://") + strlen(domain) + 258 strlen("/ghost/api/content/") + 1); 259 sprintf(info->api_url, 260 "https://%s/ghost/api/content/", domain); 261 info->sitemap_url = xmalloc( 262 strlen("https://") + strlen(domain) + 263 strlen("/sitemap.xml") + 1); 264 sprintf(info->sitemap_url, 265 "https://%s/sitemap.xml", domain); 266 break; 267 case SITE_DRUPAL: 268 info->sitemap_url = xmalloc( 269 strlen("https://") + strlen(domain) + 270 strlen("/sitemap.xml") + 1); 271 sprintf(info->sitemap_url, 272 "https://%s/sitemap.xml", domain); 273 break; 274 case SITE_MEDIAWIKI: 275 info->has_json_api = 1; 276 info->api_url = xmalloc( 277 strlen("https://") + strlen(domain) + 278 strlen("/w/api.php") + 1); 279 sprintf(info->api_url, 280 "https://%s/w/api.php", domain); 281 break; 282 default: 283 break; 284 } 285 286 /* Try to find feed URL from HTML if not set */ 287 if (!info->feed_url) 288 info->feed_url = find_feed_url(html, url); 289 290 free(domain); 291 return info; 292 } 293 294 void 295 siteinfo_free(SiteInfo *info) 296 { 297 if (!info) 298 return; 299 free(info->feed_url); 300 free(info->api_url); 301 free(info->sitemap_url); 302 free(info); 303 } 304 305 /* 306 * Parse a simple sitemap.xml to extract <loc> URLs. 307 * Returns array of URL strings, sets *count. 308 * Caller frees the array and each string. 309 */ 310 char ** 311 detect_sitemap_urls(SiteInfo *info, const char *domain, int *count) 312 { 313 char **urls; 314 int capacity, n; 315 316 (void)info; 317 (void)domain; 318 319 capacity = 64; 320 n = 0; 321 urls = xmalloc(capacity * sizeof(char *)); 322 323 *count = n; 324 return urls; 325 } 326 327 /* 328 * Get additional seed URLs based on CMS type. 329 * For WordPress: /feed/, /wp-sitemap.xml 330 * For Hugo/Jekyll: /sitemap.xml, /index.xml 331 * For Blogger: /feeds/posts/default 332 */ 333 char ** 334 detect_seed_urls(SiteInfo *info, const char *domain, int *count) 335 { 336 char **urls; 337 int n; 338 size_t len; 339 340 n = 0; 341 urls = xmalloc(8 * sizeof(char *)); 342 343 switch (info->type) { 344 case SITE_WORDPRESS: 345 len = strlen("https://") + strlen(domain) + 346 strlen("/feed/") + 1; 347 urls[n] = xmalloc(len); 348 sprintf(urls[n], "https://%s/feed/", domain); 349 n++; 350 len = strlen("https://") + strlen(domain) + 351 strlen("/wp-sitemap.xml") + 1; 352 urls[n] = xmalloc(len); 353 sprintf(urls[n], "https://%s/wp-sitemap.xml", domain); 354 n++; 355 break; 356 case SITE_BLOGGER: 357 len = strlen("https://") + strlen(domain) + 358 strlen("/feeds/posts/default") + 1; 359 urls[n] = xmalloc(len); 360 sprintf(urls[n], "https://%s/feeds/posts/default", 361 domain); 362 n++; 363 len = strlen("https://") + strlen(domain) + 364 strlen("/sitemap.xml") + 1; 365 urls[n] = xmalloc(len); 366 sprintf(urls[n], "https://%s/sitemap.xml", domain); 367 n++; 368 break; 369 case SITE_HUGO: 370 /* fallthrough */ 371 case SITE_JEKYLL: 372 len = strlen("https://") + strlen(domain) + 373 strlen("/sitemap.xml") + 1; 374 urls[n] = xmalloc(len); 375 sprintf(urls[n], "https://%s/sitemap.xml", domain); 376 n++; 377 len = strlen("https://") + strlen(domain) + 378 strlen("/index.xml") + 1; 379 urls[n] = xmalloc(len); 380 sprintf(urls[n], "https://%s/index.xml", domain); 381 n++; 382 break; 383 case SITE_GHOST: 384 len = strlen("https://") + strlen(domain) + 385 strlen("/sitemap.xml") + 1; 386 urls[n] = xmalloc(len); 387 sprintf(urls[n], "https://%s/sitemap.xml", domain); 388 n++; 389 break; 390 case SITE_DRUPAL: 391 len = strlen("https://") + strlen(domain) + 392 strlen("/sitemap.xml") + 1; 393 urls[n] = xmalloc(len); 394 sprintf(urls[n], "https://%s/sitemap.xml", domain); 395 n++; 396 break; 397 case SITE_MEDIAWIKI: 398 len = strlen("https://") + strlen(domain) + 399 strlen("/wiki/Special:AllPages") + 1; 400 urls[n] = xmalloc(len); 401 sprintf(urls[n], "https://%s/wiki/Special:AllPages", 402 domain); 403 n++; 404 break; 405 default: 406 break; 407 } 408 409 *count = n; 410 return urls; 411 }